1 /* mmgetaa.c - functions for mmap()ed access to libraries */
2 
3 /* $Id: mmgetaa.c 1153 2013-05-20 13:29:29Z wrp $ */
4 
5 /* copyright (c) 1999, 2000, 2014 by William  R. Pearson and The Rector &
6    Visitors of the University of Virginia */
7 
8 /* Licensed under the Apache License, Version 2.0 (the "License");
9    you may not use this file except in compliance with the License.
10    You may obtain a copy of the License at
11 
12    http://www.apache.org/licenses/LICENSE-2.0
13 
14    Unless required by applicable law or agreed to in writing,
15    software distributed under this License is distributed on an "AS
16    IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
17    express or implied.  See the License for the specific language
18    governing permissions and limitations under the License.
19 */
20 
21 /*
22   This is one of two alternative files that can be used to
23   read a database.  The two files are nmgetaa.c, and mmgetaa.c
24   (nxgetaa.c has been retired).
25 
26   nmgetlib.c and mmgetaa.c are used together. nmgetlib.c provides
27   the same functions as nxgetaa.c if memory mapping is not used,
28   mmgetaa.c provides the database reading functions if memory
29   mapping is used. The decision to use memory mapping is made on
30   a file-by-file basis.
31 */
32 
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <unistd.h>
36 #include <limits.h>
37 #include <string.h>
38 #include <errno.h>
39 
40 #include <sys/types.h>
41 #include <sys/stat.h>
42 #include <sys/mman.h>
43 #include <fcntl.h>
44 
45 #define MAXLINE 512
46 #define EOSEQ 0
47 
48 #define XTERNAL
49 #include "uascii.h"
50 /* #include "upam.h" */
51 #undef XTERNAL
52 
53 #define GCGBIN 6
54 
55 #ifndef MAP_FILE
56 #define MAP_FILE 0
57 #endif
58 
59 #include "defs.h"
60 #include "structs.h"
61 #include "mm_file.h"
62 
63 extern int64_t bl2_long8_cvt(int64_t);
64 extern int bl2_uint4_cvt(int);
65 extern void newname(char *, char *, char *, int);
66 
67 long crck(char *, int);
68 extern void src_int4_read(FILE *fd,  int *val);
69 extern void src_long4_read(FILE *fd,  long  *valp);
70 extern void src_long8_read(FILE *fd,  int64_t *val);
71 
72 int
73 agetlib_mb(unsigned char *seq,int maxs,char *libstr,int n_libstr,fseek_t *libpos,
74 	   int *lcont, struct lmf_str *m_fd, long *l_off);
75 
76 extern void
77 aranlib(char *str, int cnt, fseek_t seek, char *libstr, struct lmf_str *lm_fd);
78 void
79 aranlib_mb(char *str, int cnt, fseek_t seek, char *libstr, struct lmf_str *lm_fd);
80 
81 /* mmap()ed functions */
82 #ifdef USE_MMAP
83 int agetlibm(); void aranlibm();
84 int lgetlibm(); void lranlibm();
85 void vranlibm();
86 int gcg_getlibm();
87 
88 int (*getlibam[])()={
89   agetlibm,lgetlibm, NULL, NULL,NULL,agetlibm,gcg_getlibm
90 };
91 
92 void (*ranlibam[])()={
93   aranlibm,lranlibm,NULL,NULL,NULL,vranlibm,vranlibm
94 };
95 #endif
96 
97 int
98 bmap_get_mmap_chain(struct seqr_chain *cur_seqr_chain,
99 			struct lmf_str *m_fd, struct db_str *db);
100 
101 /* load_mmap() loads the d_pos[] and s_pos[] arrays for rapid access */
102 /* 24-July-2011 -- checks and maps bsq file */
103 
can_mmap(int lib_type)104 int can_mmap(int lib_type) {
105   return (getlibam[lib_type] != NULL);
106 }
107 
108 
109 struct lmf_str *
load_mmap(FILE * libi,char * sname,int lib_type,int ldnaseq,struct lmf_str * m_fd)110 load_mmap(FILE *libi,	/* fd for already open ".xin" file */
111 	  char *sname,	/* name of sequence database file */
112 	  int lib_type,	/* 0-Fasta, 5-vms_pir, 6-gcg_binary */
113 	  int ldnaseq,	/* 1 for DNA, 0 for protein */
114 	  struct lmf_str *m_fd)
115 {
116   char format[4];
117   char bname[MAX_FN], xbname[MAX_FN];
118   int i, lib_aa;
119   fseek_t f_size;
120   long lf_size;
121   struct stat statbuf;
122   int max_cnt;
123   fseek_t *d_pos_arr, *s_pos_arr, *b_pos_arr;
124   int mm_flag, mm64_flag, mmb_flag;
125   FILE *libi_b;		/* FILE * for .xin_b file */
126   int *tmp_pos_arr;
127 
128   /* first check that the necessary indices are up-to-date */
129   /* read the offsets in ".xin" file */
130   if (fread(format,1,4,libi)==0) {
131     fprintf(stderr," cannot read .xin format\n");
132     return NULL;
133   }
134 
135   mm64_flag = (format[2]>=1);	/* 4 bytes or 8 bytes for long? */
136 
137 #ifndef BIG_LIB64
138   if (mm64_flag) {return NULL;}
139 #endif
140 
141   if (format[3]!=lib_type) {
142     fprintf(stderr," cannot read format %d != lib_type %d\n",
143 	    format[3],lib_type);
144     return NULL;
145   }
146 
147   src_int4_read(libi,&lib_aa);
148   if (lib_aa == ldnaseq) { /* database residue mismatch */
149     fprintf(stderr," residue type mismatch %s != %s (.xin) in %s\n",
150 	    (lib_aa ? "DNA" : "prot."),(ldnaseq ? "prot." : "DNA"),
151 	    sname);
152     return NULL;
153   }
154 
155   /* everything looks good, allocate an lmf_str */
156   m_fd->lib_aa = lib_aa;
157 
158   /* get ascii file size from index */
159   if (mm64_flag) src_long8_read(libi,&f_size);
160   else {
161     src_long4_read(libi,&lf_size);
162     f_size = lf_size;
163   }
164 
165   if (sizeof(char *) < sizeof(fseek_t) && f_size > UINT_MAX) {
166     fprintf(stderr,"\n *** Warning *** database too large (%lld) for 32-bit mmap()\n",f_size);
167     return NULL;
168   }
169 
170   /* check for .bsq binary mapping */
171   newname(bname,sname,"bsq",sizeof(bname));
172   mm_flag = (m_fd->mmap_fd=open(bname,O_RDONLY) >= 0);
173   mmb_flag = 0;
174   if (mm_flag) {
175     mmb_flag = 1;
176 
177     /* fstat the binary sequence file */
178     if(stat(bname, &statbuf) < 0) {
179       fprintf(stderr," cannot stat %s for mmap()", sname);
180       perror("...");
181       mmb_flag = 0;
182       goto next_mmap;
183     }
184 
185     /* now open the .xin_b file and read the offsets */
186     newname(xbname, sname, "xin_b",sizeof(xbname));
187     if ((libi_b = fopen(xbname,"r"))==NULL) {
188       fprintf(stderr,"Cannot open %s binary index file\n",xbname);
189       mmb_flag = 0;
190       goto next_mmap;
191     }
192 
193     /* now read the .xin_b file */
194     if (fread(format,1,4,libi_b)==0) {
195       fprintf(stderr," cannot read .xin_b format\n");
196       mmb_flag = 0;
197       goto next_mmap;
198 
199     }
200     src_int4_read(libi_b,&lib_aa);
201     /* get .bsq file size from .xin_b */
202     src_long8_read(libi_b,&f_size);
203     if (f_size != statbuf.st_size) {
204       fprintf(stderr," %s file size (%lld) and expected size (%lld) don't match\n",
205 	      bname,statbuf.st_size,f_size);
206       mmb_flag = 0;
207       goto next_mmap;
208     }
209 
210     /* now, start to open mmap()ed file */
211     mmb_flag=((m_fd->mmap_fd=open(bname,O_RDONLY))>=0);
212 
213     if (!mmb_flag) {
214       fprintf(stderr," cannot open %s for mmap()", bname);
215       perror("...");
216       goto next_mmap;
217     }
218 
219     /* the index file and library file are open and the sizes match */
220     /* allocate the m_file struct and  map the file */
221 
222     m_fd->st_size = statbuf.st_size;
223     if((m_fd->mmap_base =
224 	mmap(NULL, m_fd->st_size, PROT_READ,
225 	     MAP_FILE | MAP_SHARED, m_fd->mmap_fd, 0)) == (char *) -1) {
226       mm_flag = 0;
227 #ifdef DEBUG
228       fprintf(stderr," cannot mmap %s", bname);
229       perror("...");
230 #endif
231     }
232 
233     /* now finish reading the index file */
234     src_int4_read(libi_b,&max_cnt);
235 
236     src_long8_read(libi_b,&m_fd->tot_len);
237     src_long4_read(libi_b,&lf_size);
238     m_fd->max_len = lf_size;
239     /* get seqbuf_max */
240     src_long4_read(libi_b,&lf_size);
241     /* get seqbuf_dup */
242     src_long4_read(libi_b,&lf_size);
243 
244 #ifdef DEBUG
245     fprintf(stderr,
246 	    "\n%s\tformat: %c%c%d %d; max_cnt: %d; tot_len: %lld max_len: %ld\n",
247 	    sname,format[0],format[1],format[2],format[3],
248 	    max_cnt,m_fd->tot_len,m_fd->max_len);
249 #endif
250 
251     /* allocate array of description pointers */
252 
253     if ((b_pos_arr=(fseek_t *)calloc(max_cnt+1, sizeof(fseek_t)))==NULL) {
254       fprintf(stderr," cannot allocate %d for binary seq array\n",max_cnt+1);
255       exit(1);
256     }
257 
258     /* now read the binary offsets (b_pos_arr) */
259     if (fread(b_pos_arr,sizeof(fseek_t),max_cnt+1,libi_b)!=
260 	max_cnt+1) {
261       fprintf(stderr," error reading bseq offsets: %s\n",xbname);
262       return NULL;
263     }
264 
265 #ifndef IS_BIG_ENDIAN
266     for (i=0; i<=max_cnt; i++) {
267       b_pos_arr[i] = bl2_long8_cvt(b_pos_arr[i]);
268     }
269 #endif
270     /* now have the b_pos_arr[] allocated and read, close libi_b */
271     fclose(libi_b);
272   }
273 
274  next_mmap:
275   /* here if no mmb_flag or mmb read failed */
276   if (!mmb_flag) {
277     /* now, start to open mmap()ed file */
278     mm_flag=((m_fd->mmap_fd=open(sname,O_RDONLY))>=0);
279     if (!mm_flag) {
280       fprintf(stderr," cannot open %s for mmap()", sname);
281       perror("...");
282       return NULL;	/* file did not open */
283     }
284 
285     /* fstat the library file and get size */
286     if(fstat(m_fd->mmap_fd, &statbuf) < 0) {
287       fprintf(stderr," cannot stat %s for mmap()", sname);
288       perror("...");
289       m_fd->mm_flg = 0;
290       goto finish;
291     }
292 
293     /* check for identical sizes - if different, do not mmap */
294     if (f_size != statbuf.st_size) {
295       fprintf(stderr," %s file size (%lld) and expected size (%lld) don't match\n",
296 	      sname,statbuf.st_size,f_size);
297       mm_flag = 0;
298       goto finish;
299     }
300 
301     /* the index file and library file are open and the sizes match */
302     /* allocate the m_file struct and  map the file */
303 
304     m_fd->st_size = statbuf.st_size;
305     if((m_fd->mmap_base =
306 	mmap(NULL, m_fd->st_size, PROT_READ,
307 	     MAP_FILE | MAP_SHARED, m_fd->mmap_fd, 0)) == (char *) -1) {
308       mm_flag = 0;
309 #ifdef DEBUG
310       fprintf(stderr," cannot mmap %s", sname);
311       perror("...");
312 #endif
313     }
314   }
315 
316   /* here, we have a memory mapped file, and if mmb_flag, we have the
317      b_pos_arr[] read, but not s_pos_arr[] or d_pos_arr[] */
318 
319  finish:
320   close(m_fd->mmap_fd);
321   if (!mm_flag) { return NULL; }
322 
323   /* now finish reading the index file */
324   src_int4_read(libi,&max_cnt);
325 
326   if (mm64_flag) {
327     src_long8_read(libi,&m_fd->tot_len);
328   }
329   else {
330     src_long4_read(libi,&lf_size);
331     m_fd->tot_len = lf_size;
332   }
333   src_long4_read(libi,&lf_size);
334   m_fd->max_len = lf_size;
335 
336 #ifdef DEBUG
337   fprintf(stderr,
338 	  "\n%s\tformat: %c%c%d %d; max_cnt: %d; tot_len: %lld max_len: %ld\n",
339 	  sname,format[0],format[1],format[2],format[3],
340 	  max_cnt,m_fd->tot_len,m_fd->max_len);
341 #endif
342 
343   /* allocate array of description pointers */
344   if (!mm64_flag) {
345     if ((tmp_pos_arr=(int *)calloc(max_cnt+1,sizeof(int)))==NULL) {
346       fprintf(stderr," cannot allocate %d for tmp_pos array\n",
347 	      max_cnt+1);
348 		return NULL;
349     }
350   }
351 
352   if ((d_pos_arr=(fseek_t *)calloc(max_cnt+1, sizeof(fseek_t)))==NULL) {
353     fprintf(stderr," cannot allocate %d for desc. array\n",max_cnt+1);
354     exit(1);
355   }
356 
357   /* read m_fd->d_pos[max_cnt+1] */
358   if (mm64_flag) {
359     if (fread(d_pos_arr,sizeof(fseek_t),max_cnt+1,libi)!=
360 	max_cnt+1) {
361       fprintf(stderr," error reading desc. offsets: %s\n",sname);
362       return NULL;
363     }
364   }
365   else {
366     if (fread(tmp_pos_arr,sizeof(int),max_cnt+1,libi)!=
367 	max_cnt+1) {
368       fprintf(stderr," error reading desc. offsets: %s\n",sname);
369       return NULL;
370     }
371 #ifdef DEBUG
372     fprintf(stderr,"d_pos_crc: %ld\n",
373 	    crck((char *)tmp_pos_arr,sizeof(int)*(max_cnt+1)));
374 #endif
375   }
376 
377 #ifndef IS_BIG_ENDIAN
378   if (mm64_flag)
379     for (i=0; i<=max_cnt; i++) {
380       d_pos_arr[i] = bl2_long8_cvt(d_pos_arr[i]);
381     }
382   else
383     for (i=0; i<=max_cnt; i++) {
384       d_pos_arr[i] = bl2_uint4_cvt(tmp_pos_arr[i]);
385     }
386 #else
387   if (!mm64_flag) {
388     for (i=0; i<=max_cnt; i++) {
389       d_pos_arr[i] = tmp_pos_arr[i];
390     }
391   }
392 #endif
393 
394 #ifdef DEBUG
395   for (i=0; i<max_cnt-1; i++) {
396     if (d_pos_arr[i+1] <= d_pos_arr[i] )
397       fprintf(stderr," ** dpos_error [%d]\t%lld\t%lld\n",
398 	      i,d_pos_arr[i],d_pos_arr[i+1]);
399   }
400 #endif
401 
402   /* allocate array of sequence pointers */
403   if ((s_pos_arr=(fseek_t *)calloc(max_cnt+1,sizeof(fseek_t)))==NULL) {
404     fprintf(stderr," cannot allocate %d for seq. array\n",max_cnt+1);
405     exit(1);
406   }
407 
408   /* read m_fd->s_pos[max_cnt+1] */
409   if (mm64_flag) {
410     if (fread(s_pos_arr,sizeof(fseek_t),max_cnt+1,libi)!=
411 	max_cnt+1) {
412       fprintf(stderr," error reading seq offsets: %s\n",sname);
413       return NULL;
414     }
415   }
416   else {
417     if (fread(tmp_pos_arr,sizeof(int),max_cnt+1,libi)!=
418 	max_cnt+1) {
419       fprintf(stderr," error reading seq offsets: %s\n",sname);
420       return NULL;
421     }
422 #ifdef DEBUG
423     fprintf(stderr,"s_pos_crc: %ld\n",
424 	    crck((char *)tmp_pos_arr,sizeof(int)*(max_cnt+1)));
425 #endif
426   }
427 
428 #ifndef IS_BIG_ENDIAN
429   if (mm64_flag)
430     for (i=0; i<=max_cnt; i++)
431       s_pos_arr[i] = bl2_long8_cvt(s_pos_arr[i]);
432   else
433     for (i=0; i<=max_cnt; i++)
434       s_pos_arr[i] = (long)bl2_uint4_cvt(tmp_pos_arr[i]);
435 #else
436   if (!mm64_flag)
437     for (i=0; i<=max_cnt; i++)
438       s_pos_arr[i] = (long)tmp_pos_arr[i];
439 #endif
440 
441 #ifdef DEBUG
442   for (i=1; i<max_cnt-1; i++) {
443     if (s_pos_arr[i+1]<s_pos_arr[i])
444       fprintf(stderr," ** spos_error [%d]\t%lld\t%lld\n",
445 	      i,s_pos_arr[i],s_pos_arr[i]);
446   }
447 #endif
448 
449   if (!mm64_flag) free(tmp_pos_arr);
450 
451   m_fd->max_cnt = max_cnt;
452   m_fd->d_pos_arr = d_pos_arr;
453   m_fd->s_pos_arr = s_pos_arr;
454   if (mmb_flag)   m_fd->b_pos_arr = b_pos_arr;
455   m_fd->lpos = 0;
456   m_fd->lb_type = lib_type;
457   m_fd->getlib = getlibam[lib_type];
458   m_fd->ranlib = ranlibam[lib_type];
459   m_fd->get_mmap_chain = NULL;
460   m_fd->mm_flg = 1;
461   if (mmb_flag) {
462     m_fd->getlib = agetlib_mb;
463     m_fd->ranlib = aranlib_mb;
464     m_fd->get_mmap_chain = bmap_get_mmap_chain;
465   }
466 
467   /*   check_mmap(m_fd,-2); */
468 
469   return m_fd;
470 }
471 
mgets(char * s,int n,struct lmf_str * m_fd)472 char *mgets (char *s, int n, struct lmf_str *m_fd)
473 {
474   char *cs, *mfp;
475 
476   mfp = m_fd->mmap_addr;
477   cs = s;
478 
479   while (--n > 0 && (*mfp != (char)EOF))
480     if ((*cs++ = *mfp++) == '\n') break;
481   *cs = '\0';
482 
483   m_fd->mmap_addr = mfp;
484   return (*mfp == (char)EOF && cs == s) ? NULL : s;
485 }
486 
487 int
agetlibm(unsigned char * seq,int maxs,char * libstr,int n_libstr,fseek_t * libpos,int * lcont,struct lmf_str * m_fd,long * l_off)488 agetlibm(unsigned char *seq,
489 	 int maxs,
490 	 char *libstr,
491 	 int n_libstr,
492 	 fseek_t *libpos,
493 	 int *lcont,
494 	 struct lmf_str *m_fd,
495 	 long *l_off)
496 {
497   register unsigned char *cp, *seqp;
498   register int *ap;
499   int sel_status;
500   char *desc;
501   int lpos;		/* entry number in library */
502   long l;
503   unsigned char *seqm, *seqm1;
504   char *bp;
505   static long seq_len, desc_len;
506   static unsigned char *cp_max;
507 
508   *l_off = 1;
509 
510   lpos = m_fd->lpos;
511 
512   seqp = seq;
513   seqm = &seq[maxs-9];
514   seqm1 = seqm-1;
515 
516   ap = m_fd->sascii;
517 
518   if (*lcont==0) {
519   start_seq:
520     if (lpos >= m_fd->max_cnt) return (-1);
521     seq_len = m_fd->d_pos_arr[lpos+1] - m_fd->s_pos_arr[lpos];
522     desc_len = m_fd->s_pos_arr[lpos] - m_fd->d_pos_arr[lpos]-m_fd->acc_off;
523     if (seq_len < 0 || (seq_len > m_fd->max_len && seq_len > (m_fd->max_len*5)/4)) {
524       fprintf(stderr," ** sequence over-run: %ld at %d\n",seq_len,lpos);
525       return(-1);
526     }
527     *libpos = (fseek_t)lpos;
528 
529     desc = m_fd->mmap_base+m_fd->d_pos_arr[lpos]+m_fd->acc_off;
530     strncpy(libstr,desc,n_libstr-1);
531     libstr[n_libstr-1]='\0';
532 
533     if ((m_fd->sel_acc_p != NULL) &&
534 	(sel_status = (m_fd->sel_acc_p)(libstr, 0, m_fd->sel_local)) <= 0) {
535       if (sel_status < 0) return (-1);
536       lpos++;
537       goto start_seq;
538     }
539 
540     if ((bp=strchr(libstr,'\r'))!=NULL) *bp='\0';
541     if ((bp=strchr(libstr,'\n'))!=NULL) *bp='\0';
542 
543     if (n_libstr > MAX_UID) {
544       bp = libstr;
545       while (*bp++) if ( *bp=='\001' || *bp=='\t') *bp=' ';
546     }
547 
548     /* find @C:offset */
549     if ((bp = memchr(desc+desc_len-15,'@', 14)) && !strncmp(bp+1,"C:",2)) {
550       *l_off = atol(bp+3);	/* this addresses an apparent bug in sscanf for non-null terminated strings */
551     }
552 
553     m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lpos];
554     cp_max = (unsigned char *)(m_fd->mmap_addr+seq_len);
555   }
556 
557   for (cp=(unsigned char *)m_fd->mmap_addr; seqp<seqm1; ) {
558     if ((*seqp++=ap[*cp++])<NA &&
559 	(*seqp++=ap[*cp++])<NA &&
560 	(*seqp++=ap[*cp++])<NA &&
561 	(*seqp++=ap[*cp++])<NA &&
562 	(*seqp++=ap[*cp++])<NA &&
563 	(*seqp++=ap[*cp++])<NA &&
564 	(*seqp++=ap[*cp++])<NA &&
565 	(*seqp++=ap[*cp++])<NA &&
566 	(*seqp++=ap[*cp++])<NA &&
567 	(*seqp++=ap[*cp++])<NA) continue;
568     --seqp;
569     if (cp >= cp_max) break;
570   }
571   m_fd->mmap_addr = (char *)cp;
572 
573   if (seqp>=seqm1) (*lcont)++;
574   else {
575     *lcont=0;
576     lpos++;
577     m_fd->lpos = lpos;
578   }
579   *seqp = EOSEQ;
580   /*   if ((int)(seqp-seq)==0) return 1; */
581   return (int)(seqp-seq);
582 }
583 
584 int
agetlib_mb(unsigned char * seq,int maxs,char * libstr,int n_libstr,fseek_t * libpos,int * lcont,struct lmf_str * m_fd,long * l_off)585 agetlib_mb(unsigned char *seq,
586 	   int maxs,
587 	   char *libstr,
588 	   int n_libstr,
589 	   fseek_t *libpos,
590 	   int *lcont,
591 	   struct lmf_str *m_fd,
592 	   long *l_off)
593 {
594   int lpos, seq_len;		/* entry number in library */
595 
596   *l_off = 1;
597   lpos = m_fd->lpos++;
598 
599   if (lpos >= m_fd->max_cnt) return (-1);
600   seq_len = m_fd->b_pos_arr[lpos+1] - m_fd->b_pos_arr[lpos]-1;
601   if (seq_len < 0 || (seq_len > m_fd->max_len && seq_len > (m_fd->max_len*5)/4)) {
602     fprintf(stderr," ** sequence over-run: %d at %d\n",seq_len,lpos);
603     return(-1);
604   }
605   *libpos = (fseek_t)lpos;
606 
607   strncpy(libstr,"",n_libstr-1);
608 
609   memcpy(seq, m_fd->mmap_base+m_fd->b_pos_arr[lpos], min(seq_len+1,maxs));
610 
611   *lcont=0;
612   /*   if ((int)(seqp-seq)==0) return 1; */
613   return seq_len;
614 }
615 
616 void
aranlibm(char * str,int cnt,fseek_t libpos,char * libstr,struct lmf_str * m_fd)617 aranlibm(char *str,
618 	 int cnt,
619 	 fseek_t libpos,
620 	 char *libstr,
621 	 struct lmf_str *m_fd)
622 {
623   char *bp;
624   int llen;
625   int lpos;
626 
627   lpos = (int) libpos;
628 
629   llen = m_fd->s_pos_arr[lpos]-m_fd->d_pos_arr[lpos];
630 
631   if (llen >= cnt) llen = cnt-1;
632 
633   strncpy(str,m_fd->mmap_base+m_fd->d_pos_arr[lpos]+1,llen);
634   str[llen]='\0';
635   if ((bp = strchr(str,'\r'))!=NULL) *bp='\0';
636   if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
637   bp = str;
638   while (*bp++) if ( *bp=='\001' || *bp=='\t') *bp=' ';
639   m_fd->lpos = lpos;
640 }
641 
642 /* aranlib_mb is a hybrid of aranlib/aranlibm that uses
643    s_pos_arr[lpos] to get the fseek offset for the description
644 */
645 
646 void
aranlib_mb(char * str,int cnt,fseek_t libpos,char * libstr,struct lmf_str * m_fd)647 aranlib_mb(char *str,
648 	   int cnt,
649 	   fseek_t libpos,
650 	   char *libstr,
651 	   struct lmf_str *m_fd)
652 {
653   char *bp;
654   int llen;
655   int lpos, fs_pos;
656 
657   aranlib(str, cnt, (fseek_t)m_fd->d_pos_arr[(int)libpos],
658 	  libstr, m_fd);
659   m_fd->lpos = libpos;
660 }
661 
662 /* bmap_get_mmap_chain fills cur_seqr_chain with sequence pointers
663    from the memory mapped file at *m_fd
664 
665    because the database is opened read-only, this code only works with
666    an amino acid mapping identical to that used by blastdbcmd, aa_b2toa[]
667 
668    bmap_get_mmap_chain must return EOF AND a set of sequences for the
669    comp_lib9.c/next_seqr_chain() logic to work properly.
670 */
671 
bmap_get_mmap_chain(struct seqr_chain * cur_seqr_chain,struct lmf_str * m_fd,struct db_str * db)672 int bmap_get_mmap_chain(struct seqr_chain *cur_seqr_chain,
673 			struct lmf_str *m_fd, struct db_str *db) {
674   int i, lib_cnt;
675   struct seq_record *seq_a, *seq_p;
676   struct mseq_record *mseq_a, *mseq_p;
677 
678   lib_cnt = m_fd->lpos;
679   if (lib_cnt >= m_fd->max_cnt) return EOF;
680   seq_a = cur_seqr_chain->seqr_base;
681   mseq_a = cur_seqr_chain->mseqr_base;
682 
683   for (i=0; i < cur_seqr_chain->max_chain_seqs; i++) {
684     if (lib_cnt >= m_fd->max_cnt) break;
685     seq_p = &seq_a[i];
686     mseq_p = &mseq_a[i];
687     seq_p->n1 = m_fd->b_pos_arr[lib_cnt+1] - m_fd->b_pos_arr[lib_cnt]-1; /* value is +1 off to get the NULL */
688 
689     db->entries++;
690     db->length += seq_p->n1;
691     if (db->length > LONG_MAX) {
692       db->length -= LONG_MAX; db->carry++;
693     }
694 
695     mseq_p->m_file_p = m_fd;
696     mseq_p->n1tot_p=NULL;
697     mseq_p->cont = 0;
698     seq_p->index = mseq_p->index = mseq_p->lseek = lib_cnt;
699 #ifndef DEBUG
700     mseq_p->libstr[0] = '\0';
701 #else
702 #endif
703     seq_p->aa1b = (unsigned char *)(m_fd->mmap_base + m_fd->b_pos_arr[lib_cnt++]);
704     seq_p->l_offset = 0;
705     seq_p->l_off = 1;
706 #if DEBUG
707     seq_p->adler32_crc = mseq_p->adler32_crc = adler32(1L,seq_p->aa1b,seq_p->n1);
708 #endif
709   }
710   cur_seqr_chain->cur_seq_cnt = i;
711   m_fd->lpos = lib_cnt;
712   if (lib_cnt >= m_fd->max_cnt) return EOF;
713   else return i;
714 }
715 
716 /* there is no vgetlibm() because vgetlibm() and agetlibm() are
717    identical - the difference in the two file formats relates to the
718    location of the sequence, which is already available in spos_arr[].
719 
720    however vranlibm must accomodate both type 5 and 6 files;
721    type 6 has extra stuff after the seq_id.
722 */
723 
724 void
vranlibm(char * str,int cnt,fseek_t libpos,char * libstr,struct lmf_str * m_fd)725 vranlibm(char *str,
726 	 int cnt,
727 	 fseek_t libpos,
728 	 char *libstr,
729 	 struct lmf_str *m_fd)
730 {
731   char *bp, *mp;
732   int llen;
733   int lpos;
734 
735   lpos = (int)libpos;
736 
737   llen = m_fd->s_pos_arr[lpos]-m_fd->d_pos_arr[lpos];
738 
739   mp = m_fd->mmap_base+m_fd->d_pos_arr[lpos];
740 
741   strncpy(str,mp+4,20);
742   str[20]='\0';
743   if ((bp=strchr(str,' '))!=NULL) *(bp+1) = '\0';
744   else if ((bp=strchr(str,'\n'))!=NULL) *bp = ' ';
745   bp = strchr(mp,'\n');
746 
747   llen -= (bp-mp)-5;
748   if (llen >  cnt-strlen(str)) llen = cnt-strlen(str)-1;
749 
750   strncat(str,bp+1,llen);
751   if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
752   str[cnt-1]='\0';
753   m_fd->lpos = lpos;
754 }
755 
756 void
close_mmap(struct lmf_str * m_fd)757 close_mmap(struct lmf_str *m_fd) {
758   free(m_fd->s_pos_arr);
759   free(m_fd->d_pos_arr);
760   if (m_fd->mm_flg) {
761     munmap(m_fd->mmap_base,m_fd->st_size);
762     free(m_fd);
763   }
764   m_fd->mm_flg=0;
765 }
766 
767 #ifndef min
768 #define min(x,y) ((x) > (y) ? (y) : (x))
769 #endif
770 
771 static int gcg_bton[4]={2,4,1,3};
772 
773 int
gcg_getlibm(unsigned char * seq,int maxs,char * libstr,int n_libstr,fseek_t * libpos,int * lcont,struct lmf_str * m_fd,long * l_off)774 gcg_getlibm(unsigned char *seq,
775 	    int maxs,
776 	    char *libstr,
777 	    int n_libstr,
778 	    fseek_t *libpos,
779 	    int *lcont,
780 	    struct lmf_str *m_fd,
781 	    long *l_off)
782 {
783   char dummy[20];
784   char gcg_date[6];
785   char gcg_type[10];
786   register unsigned char *cp, *seqp, stmp;
787   register int *ap, lpos;
788   unsigned char *seqm, *seqm1;
789   long r_block, b_block, r_fact, r16_block;
790 
791   *l_off = 1;
792 
793   seqp = seq;
794   seqm = &seq[maxs-9];
795 
796   ap = m_fd->sascii;
797   lpos = m_fd->lpos;
798 
799   if (*lcont==0) {
800     if (lpos >= m_fd->max_cnt) return (-1);
801     sscanf(m_fd->mmap_base+m_fd->d_pos_arr[lpos]+4,"%s %s %s %s %ld\n",
802 	   libstr,gcg_date,gcg_type,dummy,&(m_fd->gcg_len));
803 
804     m_fd->gcg_binary = (gcg_type[0]=='2');
805 
806     libstr[12]='\0';
807     *libpos = lpos;
808     m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lpos];
809   }
810 
811   r_block = b_block = min((size_t)(seqm-seqp),m_fd->gcg_len);
812   if (m_fd->gcg_binary) {
813     r_block = (r_block+3)/4;
814   }
815 
816   cp=(unsigned char *)m_fd->mmap_addr;
817   if (!m_fd->gcg_binary) {
818     r_fact = 1;
819     r16_block = r_block/16;
820     while (r16_block-- > 0) {
821       *seqp++ = ap[*cp++];
822       *seqp++ = ap[*cp++];
823       *seqp++ = ap[*cp++];
824       *seqp++ = ap[*cp++];
825       *seqp++ = ap[*cp++];
826       *seqp++ = ap[*cp++];
827       *seqp++ = ap[*cp++];
828       *seqp++ = ap[*cp++];
829       *seqp++ = ap[*cp++];
830       *seqp++ = ap[*cp++];
831       *seqp++ = ap[*cp++];
832       *seqp++ = ap[*cp++];
833       *seqp++ = ap[*cp++];
834       *seqp++ = ap[*cp++];
835       *seqp++ = ap[*cp++];
836       *seqp++ = ap[*cp++];
837     }
838     while (seqp<seq+r_block) *seqp++ = ap[*cp++];
839   }
840   else if (m_fd->gcg_binary) {
841     r_fact = 4;
842     r16_block = r_block/8;
843     while(r16_block-- > 0) {
844       stmp = *cp++;
845       *seqp++ = gcg_bton[(stmp>>6) &3];
846       *seqp++ = gcg_bton[(stmp>>4) &3];
847       *seqp++ = gcg_bton[(stmp>>2) &3];
848       *seqp++ = gcg_bton[(stmp) &3];
849       stmp = *cp++;
850       *seqp++ = gcg_bton[(stmp>>6) &3];
851       *seqp++ = gcg_bton[(stmp>>4) &3];
852       *seqp++ = gcg_bton[(stmp>>2) &3];
853       *seqp++ = gcg_bton[(stmp) &3];
854       stmp = *cp++;
855       *seqp++ = gcg_bton[(stmp>>6) &3];
856       *seqp++ = gcg_bton[(stmp>>4) &3];
857       *seqp++ = gcg_bton[(stmp>>2) &3];
858       *seqp++ = gcg_bton[(stmp) &3];
859       stmp = *cp++;
860       *seqp++ = gcg_bton[(stmp>>6) &3];
861       *seqp++ = gcg_bton[(stmp>>4) &3];
862       *seqp++ = gcg_bton[(stmp>>2) &3];
863       *seqp++ = gcg_bton[(stmp) &3];
864       stmp = *cp++;
865       *seqp++ = gcg_bton[(stmp>>6) &3];
866       *seqp++ = gcg_bton[(stmp>>4) &3];
867       *seqp++ = gcg_bton[(stmp>>2) &3];
868       *seqp++ = gcg_bton[(stmp) &3];
869       stmp = *cp++;
870       *seqp++ = gcg_bton[(stmp>>6) &3];
871       *seqp++ = gcg_bton[(stmp>>4) &3];
872       *seqp++ = gcg_bton[(stmp>>2) &3];
873       *seqp++ = gcg_bton[(stmp) &3];
874       stmp = *cp++;
875       *seqp++ = gcg_bton[(stmp>>6) &3];
876       *seqp++ = gcg_bton[(stmp>>4) &3];
877       *seqp++ = gcg_bton[(stmp>>2) &3];
878       *seqp++ = gcg_bton[(stmp) &3];
879       stmp = *cp++;
880       *seqp++ = gcg_bton[(stmp>>6) &3];
881       *seqp++ = gcg_bton[(stmp>>4) &3];
882       *seqp++ = gcg_bton[(stmp>>2) &3];
883       *seqp++ = gcg_bton[(stmp) &3];
884     }
885 
886     while (seqp < seq+4*r_block) {
887       stmp = *cp++;
888       *seqp++ = gcg_bton[(stmp>>6) &3];
889       *seqp++ = gcg_bton[(stmp>>4) &3];
890       *seqp++ = gcg_bton[(stmp>>2) &3];
891       *seqp++ = gcg_bton[(stmp) &3];
892     }
893   }
894   if (r_fact * r_block >= m_fd->gcg_len) {
895     *lcont = 0;
896     m_fd->lpos++;
897   }
898   else {
899     if (m_fd->gcg_binary) b_block = 4*r_block;
900     m_fd->gcg_len -= b_block;
901     (*lcont)++;
902   }
903 
904   seq[b_block] = EOSEQ;
905   /*   if (b_block==0) return 1; else */
906   return b_block;
907 }
908 
909 void lget_ann_m(struct lmf_str *lm_fd, char *libstr, int n_libstr);
910 
911 int
lgetlibm(unsigned char * seq,int maxs,char * libstr,int n_libstr,fseek_t * libpos,int * lcont,struct lmf_str * m_fd,long * l_off)912 lgetlibm(unsigned char *seq,
913 	 int maxs,
914 	 char *libstr,
915 	 int n_libstr,
916 	 fseek_t *libpos,
917 	 int *lcont,
918 	 struct lmf_str *m_fd,
919 	 long *l_off)
920 {
921   register unsigned char *cp, *seqp;
922   register int *ap, lpos;
923   unsigned char *seqm, *seqm1;
924 
925   *l_off = 1;
926 
927   seqp = seq;
928   seqm = &seq[maxs-11];
929   seqm1 = seqm-1;
930 
931   lpos = m_fd->lpos;
932   ap = m_fd->sascii;
933 
934   if (*lcont==0) {
935     if (lpos >= m_fd->max_cnt) return (-1);
936 
937     if (n_libstr <= 21) {
938       strncpy(libstr,m_fd->mmap_base+m_fd->d_pos_arr[lpos]+12,12);
939       libstr[12]='\0';
940     }
941     else {
942       lget_ann_m(m_fd,libstr,n_libstr);
943     }
944     *libpos = lpos;
945 
946     m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lpos];
947     cp = (unsigned char *)m_fd->mmap_addr;
948   }
949   else cp = (unsigned char *)m_fd->mmap_addr;
950 
951   while (seqp<seqm1) {
952     if (*cp=='/' && *(cp-1)=='\n') break;
953     if ((*seqp++=ap[*cp++])<NA &&
954 	(*seqp++=ap[*cp++])<NA &&
955 	(*seqp++=ap[*cp++])<NA &&
956 	(*seqp++=ap[*cp++])<NA &&
957 	(*seqp++=ap[*cp++])<NA &&
958 	(*seqp++=ap[*cp++])<NA &&
959 	(*seqp++=ap[*cp++])<NA &&
960 	(*seqp++=ap[*cp++])<NA &&
961 	(*seqp++=ap[*cp++])<NA &&
962 	(*seqp++=ap[*cp++])<NA &&
963 	(*seqp++=ap[*cp++])<NA) continue;
964     --seqp;
965     if (*cp=='\n' && *(cp+1)==' ') cp += 11;
966   }
967 
968   if (seqp>=seqm1) {
969     (*lcont)++;
970     m_fd->mmap_addr = (char *)cp;
971   }
972   else {
973     *lcont=0;
974     m_fd->lpos++;
975   }
976 
977   *seqp = EOSEQ;
978   return (int)(seqp-seq);
979 }
980 
981 void
lget_ann_m(struct lmf_str * lm_fd,char * libstr,int n_libstr)982 lget_ann_m(struct lmf_str *lm_fd, char *libstr, int n_libstr) {
983   char *bp, *bp_gid, locus[120], desc[120], acc[120], ver[120];
984 
985   /* copy in locus from lm_fd->lline */
986   strncpy(locus,&lm_fd->mmap_addr[12],sizeof(locus));
987   if ((bp=strchr(locus,' '))!=NULL) *(bp+1) = '\0';
988 
989   /* get description */
990   mgets(desc,sizeof(desc),lm_fd);
991   while (desc[0]!='D' || desc[1]!='E' || strncmp(desc,"DEFINITION",10))
992     mgets(desc,sizeof(desc),lm_fd);
993   if ((bp = strchr(&desc[12],'\n'))!=NULL) *bp='\0';
994 
995   /* get accession */
996   mgets(acc,sizeof(acc),lm_fd);
997   while (acc[0]!='A' || acc[1]!='C' || strncmp(acc,"ACCESSION",9)) {
998     mgets(acc,sizeof(acc),lm_fd);
999     if (acc[0]=='O' && acc[1]=='R' && strncmp(acc,"ORIGIN",6)==0)
1000       break;
1001   }
1002   if ((bp = strchr(&acc[12],'\n'))!=NULL) *bp='\0';
1003   if ((bp = strchr(&acc[12],' '))!=NULL) *bp='\0';
1004 
1005   /* get version */
1006   mgets(ver,sizeof(ver),lm_fd);
1007   while (ver[0]!='V' || ver[1]!='E' || strncmp(ver,"VERSION",7)) {
1008     mgets(ver,sizeof(ver),lm_fd);
1009     if (ver[0]=='O' && ver[1]=='R' && strncmp(ver,"ORIGIN",6)==0)
1010       break;
1011   }
1012   if ((bp = strchr(&ver[12],'\n'))!=NULL) *bp='\0';
1013 
1014       /* extract gi:123456 from version line */
1015   bp_gid = strchr(&ver[12],':');
1016   if (bp_gid != NULL) {
1017     if ((bp=strchr(bp_gid+1,' '))!=NULL) *bp='\0';
1018     bp_gid++;
1019   }
1020   if ((bp = strchr(&ver[12],' '))!=NULL) *bp='\0';
1021 
1022       /* build up FASTA header line */
1023   if (bp_gid != NULL) {
1024     strncpy(libstr,"gi|",n_libstr-1);
1025     strncat(libstr,bp_gid,n_libstr-4);
1026     strncat(libstr,"|gb|",n_libstr-20);
1027   }
1028   else {libstr[0]='\0';}
1029 
1030   /* if we have a version number, use it, otherwise accession,
1031 	 otherwise locus/description */
1032 
1033   if (ver[0]=='V') {
1034     strncat(libstr,&ver[12],n_libstr-1-strlen(libstr));
1035     strncat(libstr,"|",n_libstr-1-strlen(libstr));
1036   }
1037   else if (acc[0]=='A') {
1038     strncat(libstr,&acc[12],n_libstr-1-strlen(libstr));
1039     strncat(libstr," ",n_libstr-1-strlen(libstr));
1040   }
1041 
1042   strncat(libstr,locus,n_libstr-1-strlen(libstr));
1043   strncat(libstr,&desc[11],n_libstr-1-strlen(libstr));
1044   libstr[n_libstr-1]='\0';
1045 }
1046 
1047 void
lranlibm(char * str,int cnt,fseek_t seek,char * libstr,struct lmf_str * m_fd)1048 lranlibm(char *str,
1049 	 int cnt,
1050 	 fseek_t seek,
1051 	 char *libstr,
1052 	 struct lmf_str *m_fd)
1053 {
1054   lget_ann_m(m_fd,str,cnt);
1055 
1056   str[cnt-1]='\0';
1057 
1058   m_fd->lpos = seek;
1059 }
1060 
1061 static int check_status=0;
1062 
1063 void
check_mmap(struct lmf_str * m_fd,long ntt)1064 check_mmap(struct lmf_str *m_fd,long ntt) {
1065 
1066   int i, seq_len, ok_stat;
1067 
1068   ok_stat = 1;
1069   if ( ++check_status > 5) return;
1070 
1071   fprintf(stderr," ** checking %s %ld**\n", m_fd->lb_name,ntt);
1072   for (i=0; i<m_fd->max_cnt; i++) {
1073     seq_len = m_fd->d_pos_arr[i+1] - m_fd->s_pos_arr[i];
1074     if (seq_len < 0 || (seq_len > m_fd->max_len && seq_len > (m_fd->max_len*5)/4)) {
1075       fprintf(stderr,"%d:\t%lld\t%lld\t%lld\n",
1076 	      i,m_fd->d_pos_arr[i],m_fd->s_pos_arr[i],
1077 	      m_fd->d_pos_arr[i+1]-m_fd->s_pos_arr[i]);
1078       ok_stat=0;
1079     }
1080   }
1081   if (ok_stat) {
1082     if (check_status) fprintf(stderr," ** check_mmap OK %s %ld**\n",
1083 			      m_fd->lb_name,ntt);
1084   }
1085 }
1086 
1087 #ifdef DEBUG
1088 /*  C H K 3  --  Compute a type-3 Kermit block check.  */
1089 /*
1090  Calculate the 16-bit CRC of a null-terminated string using a byte-oriented
1091  tableless algorithm invented by Andy Lowry (Columbia University).  The
1092  magic number 010201 is derived from the CRC-CCITT polynomial x^16+x^12+x^5+1.
1093  Note - this function could be adapted for strings containing imbedded 0's
1094  by including a length argument.
1095 */
1096 long
crck(s,n)1097 crck(s,n)
1098     char *s; int n;
1099 {
1100     unsigned int c, q;
1101     long crc = 0;
1102 
1103     while (n-->0) {
1104 	c = *s++;
1105 	/* if (parity)*/
1106 	c &= 0177;
1107 	q = (crc ^ c) & 017;		/* Low-order nibble */
1108 	crc = (crc >> 4) ^ (q * 010201);
1109 	q = (crc ^ (c >> 4)) & 017;	/* High order nibble */
1110 	crc = (crc >> 4) ^ (q * 010201);
1111     }
1112     return(crc);
1113 }
1114 #endif
1115