1 /* mmgetaa.c - functions for mmap()ed access to libraries */
2
3 /* $Id: mmgetaa.c 1153 2013-05-20 13:29:29Z wrp $ */
4
5 /* copyright (c) 1999, 2000, 2014 by William R. Pearson and The Rector &
6 Visitors of the University of Virginia */
7
8 /* Licensed under the Apache License, Version 2.0 (the "License");
9 you may not use this file except in compliance with the License.
10 You may obtain a copy of the License at
11
12 http://www.apache.org/licenses/LICENSE-2.0
13
14 Unless required by applicable law or agreed to in writing,
15 software distributed under this License is distributed on an "AS
16 IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
17 express or implied. See the License for the specific language
18 governing permissions and limitations under the License.
19 */
20
21 /*
22 This is one of two alternative files that can be used to
23 read a database. The two files are nmgetaa.c, and mmgetaa.c
24 (nxgetaa.c has been retired).
25
26 nmgetlib.c and mmgetaa.c are used together. nmgetlib.c provides
27 the same functions as nxgetaa.c if memory mapping is not used,
28 mmgetaa.c provides the database reading functions if memory
29 mapping is used. The decision to use memory mapping is made on
30 a file-by-file basis.
31 */
32
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <unistd.h>
36 #include <limits.h>
37 #include <string.h>
38 #include <errno.h>
39
40 #include <sys/types.h>
41 #include <sys/stat.h>
42 #include <sys/mman.h>
43 #include <fcntl.h>
44
45 #define MAXLINE 512
46 #define EOSEQ 0
47
48 #define XTERNAL
49 #include "uascii.h"
50 /* #include "upam.h" */
51 #undef XTERNAL
52
53 #define GCGBIN 6
54
55 #ifndef MAP_FILE
56 #define MAP_FILE 0
57 #endif
58
59 #include "defs.h"
60 #include "structs.h"
61 #include "mm_file.h"
62
63 extern int64_t bl2_long8_cvt(int64_t);
64 extern int bl2_uint4_cvt(int);
65 extern void newname(char *, char *, char *, int);
66
67 long crck(char *, int);
68 extern void src_int4_read(FILE *fd, int *val);
69 extern void src_long4_read(FILE *fd, long *valp);
70 extern void src_long8_read(FILE *fd, int64_t *val);
71
72 int
73 agetlib_mb(unsigned char *seq,int maxs,char *libstr,int n_libstr,fseek_t *libpos,
74 int *lcont, struct lmf_str *m_fd, long *l_off);
75
76 extern void
77 aranlib(char *str, int cnt, fseek_t seek, char *libstr, struct lmf_str *lm_fd);
78 void
79 aranlib_mb(char *str, int cnt, fseek_t seek, char *libstr, struct lmf_str *lm_fd);
80
81 /* mmap()ed functions */
82 #ifdef USE_MMAP
83 int agetlibm(); void aranlibm();
84 int lgetlibm(); void lranlibm();
85 void vranlibm();
86 int gcg_getlibm();
87
88 int (*getlibam[])()={
89 agetlibm,lgetlibm, NULL, NULL,NULL,agetlibm,gcg_getlibm
90 };
91
92 void (*ranlibam[])()={
93 aranlibm,lranlibm,NULL,NULL,NULL,vranlibm,vranlibm
94 };
95 #endif
96
97 int
98 bmap_get_mmap_chain(struct seqr_chain *cur_seqr_chain,
99 struct lmf_str *m_fd, struct db_str *db);
100
101 /* load_mmap() loads the d_pos[] and s_pos[] arrays for rapid access */
102 /* 24-July-2011 -- checks and maps bsq file */
103
can_mmap(int lib_type)104 int can_mmap(int lib_type) {
105 return (getlibam[lib_type] != NULL);
106 }
107
108
109 struct lmf_str *
load_mmap(FILE * libi,char * sname,int lib_type,int ldnaseq,struct lmf_str * m_fd)110 load_mmap(FILE *libi, /* fd for already open ".xin" file */
111 char *sname, /* name of sequence database file */
112 int lib_type, /* 0-Fasta, 5-vms_pir, 6-gcg_binary */
113 int ldnaseq, /* 1 for DNA, 0 for protein */
114 struct lmf_str *m_fd)
115 {
116 char format[4];
117 char bname[MAX_FN], xbname[MAX_FN];
118 int i, lib_aa;
119 fseek_t f_size;
120 long lf_size;
121 struct stat statbuf;
122 int max_cnt;
123 fseek_t *d_pos_arr, *s_pos_arr, *b_pos_arr;
124 int mm_flag, mm64_flag, mmb_flag;
125 FILE *libi_b; /* FILE * for .xin_b file */
126 int *tmp_pos_arr;
127
128 /* first check that the necessary indices are up-to-date */
129 /* read the offsets in ".xin" file */
130 if (fread(format,1,4,libi)==0) {
131 fprintf(stderr," cannot read .xin format\n");
132 return NULL;
133 }
134
135 mm64_flag = (format[2]>=1); /* 4 bytes or 8 bytes for long? */
136
137 #ifndef BIG_LIB64
138 if (mm64_flag) {return NULL;}
139 #endif
140
141 if (format[3]!=lib_type) {
142 fprintf(stderr," cannot read format %d != lib_type %d\n",
143 format[3],lib_type);
144 return NULL;
145 }
146
147 src_int4_read(libi,&lib_aa);
148 if (lib_aa == ldnaseq) { /* database residue mismatch */
149 fprintf(stderr," residue type mismatch %s != %s (.xin) in %s\n",
150 (lib_aa ? "DNA" : "prot."),(ldnaseq ? "prot." : "DNA"),
151 sname);
152 return NULL;
153 }
154
155 /* everything looks good, allocate an lmf_str */
156 m_fd->lib_aa = lib_aa;
157
158 /* get ascii file size from index */
159 if (mm64_flag) src_long8_read(libi,&f_size);
160 else {
161 src_long4_read(libi,&lf_size);
162 f_size = lf_size;
163 }
164
165 if (sizeof(char *) < sizeof(fseek_t) && f_size > UINT_MAX) {
166 fprintf(stderr,"\n *** Warning *** database too large (%lld) for 32-bit mmap()\n",f_size);
167 return NULL;
168 }
169
170 /* check for .bsq binary mapping */
171 newname(bname,sname,"bsq",sizeof(bname));
172 mm_flag = (m_fd->mmap_fd=open(bname,O_RDONLY) >= 0);
173 mmb_flag = 0;
174 if (mm_flag) {
175 mmb_flag = 1;
176
177 /* fstat the binary sequence file */
178 if(stat(bname, &statbuf) < 0) {
179 fprintf(stderr," cannot stat %s for mmap()", sname);
180 perror("...");
181 mmb_flag = 0;
182 goto next_mmap;
183 }
184
185 /* now open the .xin_b file and read the offsets */
186 newname(xbname, sname, "xin_b",sizeof(xbname));
187 if ((libi_b = fopen(xbname,"r"))==NULL) {
188 fprintf(stderr,"Cannot open %s binary index file\n",xbname);
189 mmb_flag = 0;
190 goto next_mmap;
191 }
192
193 /* now read the .xin_b file */
194 if (fread(format,1,4,libi_b)==0) {
195 fprintf(stderr," cannot read .xin_b format\n");
196 mmb_flag = 0;
197 goto next_mmap;
198
199 }
200 src_int4_read(libi_b,&lib_aa);
201 /* get .bsq file size from .xin_b */
202 src_long8_read(libi_b,&f_size);
203 if (f_size != statbuf.st_size) {
204 fprintf(stderr," %s file size (%lld) and expected size (%lld) don't match\n",
205 bname,statbuf.st_size,f_size);
206 mmb_flag = 0;
207 goto next_mmap;
208 }
209
210 /* now, start to open mmap()ed file */
211 mmb_flag=((m_fd->mmap_fd=open(bname,O_RDONLY))>=0);
212
213 if (!mmb_flag) {
214 fprintf(stderr," cannot open %s for mmap()", bname);
215 perror("...");
216 goto next_mmap;
217 }
218
219 /* the index file and library file are open and the sizes match */
220 /* allocate the m_file struct and map the file */
221
222 m_fd->st_size = statbuf.st_size;
223 if((m_fd->mmap_base =
224 mmap(NULL, m_fd->st_size, PROT_READ,
225 MAP_FILE | MAP_SHARED, m_fd->mmap_fd, 0)) == (char *) -1) {
226 mm_flag = 0;
227 #ifdef DEBUG
228 fprintf(stderr," cannot mmap %s", bname);
229 perror("...");
230 #endif
231 }
232
233 /* now finish reading the index file */
234 src_int4_read(libi_b,&max_cnt);
235
236 src_long8_read(libi_b,&m_fd->tot_len);
237 src_long4_read(libi_b,&lf_size);
238 m_fd->max_len = lf_size;
239 /* get seqbuf_max */
240 src_long4_read(libi_b,&lf_size);
241 /* get seqbuf_dup */
242 src_long4_read(libi_b,&lf_size);
243
244 #ifdef DEBUG
245 fprintf(stderr,
246 "\n%s\tformat: %c%c%d %d; max_cnt: %d; tot_len: %lld max_len: %ld\n",
247 sname,format[0],format[1],format[2],format[3],
248 max_cnt,m_fd->tot_len,m_fd->max_len);
249 #endif
250
251 /* allocate array of description pointers */
252
253 if ((b_pos_arr=(fseek_t *)calloc(max_cnt+1, sizeof(fseek_t)))==NULL) {
254 fprintf(stderr," cannot allocate %d for binary seq array\n",max_cnt+1);
255 exit(1);
256 }
257
258 /* now read the binary offsets (b_pos_arr) */
259 if (fread(b_pos_arr,sizeof(fseek_t),max_cnt+1,libi_b)!=
260 max_cnt+1) {
261 fprintf(stderr," error reading bseq offsets: %s\n",xbname);
262 return NULL;
263 }
264
265 #ifndef IS_BIG_ENDIAN
266 for (i=0; i<=max_cnt; i++) {
267 b_pos_arr[i] = bl2_long8_cvt(b_pos_arr[i]);
268 }
269 #endif
270 /* now have the b_pos_arr[] allocated and read, close libi_b */
271 fclose(libi_b);
272 }
273
274 next_mmap:
275 /* here if no mmb_flag or mmb read failed */
276 if (!mmb_flag) {
277 /* now, start to open mmap()ed file */
278 mm_flag=((m_fd->mmap_fd=open(sname,O_RDONLY))>=0);
279 if (!mm_flag) {
280 fprintf(stderr," cannot open %s for mmap()", sname);
281 perror("...");
282 return NULL; /* file did not open */
283 }
284
285 /* fstat the library file and get size */
286 if(fstat(m_fd->mmap_fd, &statbuf) < 0) {
287 fprintf(stderr," cannot stat %s for mmap()", sname);
288 perror("...");
289 m_fd->mm_flg = 0;
290 goto finish;
291 }
292
293 /* check for identical sizes - if different, do not mmap */
294 if (f_size != statbuf.st_size) {
295 fprintf(stderr," %s file size (%lld) and expected size (%lld) don't match\n",
296 sname,statbuf.st_size,f_size);
297 mm_flag = 0;
298 goto finish;
299 }
300
301 /* the index file and library file are open and the sizes match */
302 /* allocate the m_file struct and map the file */
303
304 m_fd->st_size = statbuf.st_size;
305 if((m_fd->mmap_base =
306 mmap(NULL, m_fd->st_size, PROT_READ,
307 MAP_FILE | MAP_SHARED, m_fd->mmap_fd, 0)) == (char *) -1) {
308 mm_flag = 0;
309 #ifdef DEBUG
310 fprintf(stderr," cannot mmap %s", sname);
311 perror("...");
312 #endif
313 }
314 }
315
316 /* here, we have a memory mapped file, and if mmb_flag, we have the
317 b_pos_arr[] read, but not s_pos_arr[] or d_pos_arr[] */
318
319 finish:
320 close(m_fd->mmap_fd);
321 if (!mm_flag) { return NULL; }
322
323 /* now finish reading the index file */
324 src_int4_read(libi,&max_cnt);
325
326 if (mm64_flag) {
327 src_long8_read(libi,&m_fd->tot_len);
328 }
329 else {
330 src_long4_read(libi,&lf_size);
331 m_fd->tot_len = lf_size;
332 }
333 src_long4_read(libi,&lf_size);
334 m_fd->max_len = lf_size;
335
336 #ifdef DEBUG
337 fprintf(stderr,
338 "\n%s\tformat: %c%c%d %d; max_cnt: %d; tot_len: %lld max_len: %ld\n",
339 sname,format[0],format[1],format[2],format[3],
340 max_cnt,m_fd->tot_len,m_fd->max_len);
341 #endif
342
343 /* allocate array of description pointers */
344 if (!mm64_flag) {
345 if ((tmp_pos_arr=(int *)calloc(max_cnt+1,sizeof(int)))==NULL) {
346 fprintf(stderr," cannot allocate %d for tmp_pos array\n",
347 max_cnt+1);
348 return NULL;
349 }
350 }
351
352 if ((d_pos_arr=(fseek_t *)calloc(max_cnt+1, sizeof(fseek_t)))==NULL) {
353 fprintf(stderr," cannot allocate %d for desc. array\n",max_cnt+1);
354 exit(1);
355 }
356
357 /* read m_fd->d_pos[max_cnt+1] */
358 if (mm64_flag) {
359 if (fread(d_pos_arr,sizeof(fseek_t),max_cnt+1,libi)!=
360 max_cnt+1) {
361 fprintf(stderr," error reading desc. offsets: %s\n",sname);
362 return NULL;
363 }
364 }
365 else {
366 if (fread(tmp_pos_arr,sizeof(int),max_cnt+1,libi)!=
367 max_cnt+1) {
368 fprintf(stderr," error reading desc. offsets: %s\n",sname);
369 return NULL;
370 }
371 #ifdef DEBUG
372 fprintf(stderr,"d_pos_crc: %ld\n",
373 crck((char *)tmp_pos_arr,sizeof(int)*(max_cnt+1)));
374 #endif
375 }
376
377 #ifndef IS_BIG_ENDIAN
378 if (mm64_flag)
379 for (i=0; i<=max_cnt; i++) {
380 d_pos_arr[i] = bl2_long8_cvt(d_pos_arr[i]);
381 }
382 else
383 for (i=0; i<=max_cnt; i++) {
384 d_pos_arr[i] = bl2_uint4_cvt(tmp_pos_arr[i]);
385 }
386 #else
387 if (!mm64_flag) {
388 for (i=0; i<=max_cnt; i++) {
389 d_pos_arr[i] = tmp_pos_arr[i];
390 }
391 }
392 #endif
393
394 #ifdef DEBUG
395 for (i=0; i<max_cnt-1; i++) {
396 if (d_pos_arr[i+1] <= d_pos_arr[i] )
397 fprintf(stderr," ** dpos_error [%d]\t%lld\t%lld\n",
398 i,d_pos_arr[i],d_pos_arr[i+1]);
399 }
400 #endif
401
402 /* allocate array of sequence pointers */
403 if ((s_pos_arr=(fseek_t *)calloc(max_cnt+1,sizeof(fseek_t)))==NULL) {
404 fprintf(stderr," cannot allocate %d for seq. array\n",max_cnt+1);
405 exit(1);
406 }
407
408 /* read m_fd->s_pos[max_cnt+1] */
409 if (mm64_flag) {
410 if (fread(s_pos_arr,sizeof(fseek_t),max_cnt+1,libi)!=
411 max_cnt+1) {
412 fprintf(stderr," error reading seq offsets: %s\n",sname);
413 return NULL;
414 }
415 }
416 else {
417 if (fread(tmp_pos_arr,sizeof(int),max_cnt+1,libi)!=
418 max_cnt+1) {
419 fprintf(stderr," error reading seq offsets: %s\n",sname);
420 return NULL;
421 }
422 #ifdef DEBUG
423 fprintf(stderr,"s_pos_crc: %ld\n",
424 crck((char *)tmp_pos_arr,sizeof(int)*(max_cnt+1)));
425 #endif
426 }
427
428 #ifndef IS_BIG_ENDIAN
429 if (mm64_flag)
430 for (i=0; i<=max_cnt; i++)
431 s_pos_arr[i] = bl2_long8_cvt(s_pos_arr[i]);
432 else
433 for (i=0; i<=max_cnt; i++)
434 s_pos_arr[i] = (long)bl2_uint4_cvt(tmp_pos_arr[i]);
435 #else
436 if (!mm64_flag)
437 for (i=0; i<=max_cnt; i++)
438 s_pos_arr[i] = (long)tmp_pos_arr[i];
439 #endif
440
441 #ifdef DEBUG
442 for (i=1; i<max_cnt-1; i++) {
443 if (s_pos_arr[i+1]<s_pos_arr[i])
444 fprintf(stderr," ** spos_error [%d]\t%lld\t%lld\n",
445 i,s_pos_arr[i],s_pos_arr[i]);
446 }
447 #endif
448
449 if (!mm64_flag) free(tmp_pos_arr);
450
451 m_fd->max_cnt = max_cnt;
452 m_fd->d_pos_arr = d_pos_arr;
453 m_fd->s_pos_arr = s_pos_arr;
454 if (mmb_flag) m_fd->b_pos_arr = b_pos_arr;
455 m_fd->lpos = 0;
456 m_fd->lb_type = lib_type;
457 m_fd->getlib = getlibam[lib_type];
458 m_fd->ranlib = ranlibam[lib_type];
459 m_fd->get_mmap_chain = NULL;
460 m_fd->mm_flg = 1;
461 if (mmb_flag) {
462 m_fd->getlib = agetlib_mb;
463 m_fd->ranlib = aranlib_mb;
464 m_fd->get_mmap_chain = bmap_get_mmap_chain;
465 }
466
467 /* check_mmap(m_fd,-2); */
468
469 return m_fd;
470 }
471
mgets(char * s,int n,struct lmf_str * m_fd)472 char *mgets (char *s, int n, struct lmf_str *m_fd)
473 {
474 char *cs, *mfp;
475
476 mfp = m_fd->mmap_addr;
477 cs = s;
478
479 while (--n > 0 && (*mfp != (char)EOF))
480 if ((*cs++ = *mfp++) == '\n') break;
481 *cs = '\0';
482
483 m_fd->mmap_addr = mfp;
484 return (*mfp == (char)EOF && cs == s) ? NULL : s;
485 }
486
487 int
agetlibm(unsigned char * seq,int maxs,char * libstr,int n_libstr,fseek_t * libpos,int * lcont,struct lmf_str * m_fd,long * l_off)488 agetlibm(unsigned char *seq,
489 int maxs,
490 char *libstr,
491 int n_libstr,
492 fseek_t *libpos,
493 int *lcont,
494 struct lmf_str *m_fd,
495 long *l_off)
496 {
497 register unsigned char *cp, *seqp;
498 register int *ap;
499 int sel_status;
500 char *desc;
501 int lpos; /* entry number in library */
502 long l;
503 unsigned char *seqm, *seqm1;
504 char *bp;
505 static long seq_len, desc_len;
506 static unsigned char *cp_max;
507
508 *l_off = 1;
509
510 lpos = m_fd->lpos;
511
512 seqp = seq;
513 seqm = &seq[maxs-9];
514 seqm1 = seqm-1;
515
516 ap = m_fd->sascii;
517
518 if (*lcont==0) {
519 start_seq:
520 if (lpos >= m_fd->max_cnt) return (-1);
521 seq_len = m_fd->d_pos_arr[lpos+1] - m_fd->s_pos_arr[lpos];
522 desc_len = m_fd->s_pos_arr[lpos] - m_fd->d_pos_arr[lpos]-m_fd->acc_off;
523 if (seq_len < 0 || (seq_len > m_fd->max_len && seq_len > (m_fd->max_len*5)/4)) {
524 fprintf(stderr," ** sequence over-run: %ld at %d\n",seq_len,lpos);
525 return(-1);
526 }
527 *libpos = (fseek_t)lpos;
528
529 desc = m_fd->mmap_base+m_fd->d_pos_arr[lpos]+m_fd->acc_off;
530 strncpy(libstr,desc,n_libstr-1);
531 libstr[n_libstr-1]='\0';
532
533 if ((m_fd->sel_acc_p != NULL) &&
534 (sel_status = (m_fd->sel_acc_p)(libstr, 0, m_fd->sel_local)) <= 0) {
535 if (sel_status < 0) return (-1);
536 lpos++;
537 goto start_seq;
538 }
539
540 if ((bp=strchr(libstr,'\r'))!=NULL) *bp='\0';
541 if ((bp=strchr(libstr,'\n'))!=NULL) *bp='\0';
542
543 if (n_libstr > MAX_UID) {
544 bp = libstr;
545 while (*bp++) if ( *bp=='\001' || *bp=='\t') *bp=' ';
546 }
547
548 /* find @C:offset */
549 if ((bp = memchr(desc+desc_len-15,'@', 14)) && !strncmp(bp+1,"C:",2)) {
550 *l_off = atol(bp+3); /* this addresses an apparent bug in sscanf for non-null terminated strings */
551 }
552
553 m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lpos];
554 cp_max = (unsigned char *)(m_fd->mmap_addr+seq_len);
555 }
556
557 for (cp=(unsigned char *)m_fd->mmap_addr; seqp<seqm1; ) {
558 if ((*seqp++=ap[*cp++])<NA &&
559 (*seqp++=ap[*cp++])<NA &&
560 (*seqp++=ap[*cp++])<NA &&
561 (*seqp++=ap[*cp++])<NA &&
562 (*seqp++=ap[*cp++])<NA &&
563 (*seqp++=ap[*cp++])<NA &&
564 (*seqp++=ap[*cp++])<NA &&
565 (*seqp++=ap[*cp++])<NA &&
566 (*seqp++=ap[*cp++])<NA &&
567 (*seqp++=ap[*cp++])<NA) continue;
568 --seqp;
569 if (cp >= cp_max) break;
570 }
571 m_fd->mmap_addr = (char *)cp;
572
573 if (seqp>=seqm1) (*lcont)++;
574 else {
575 *lcont=0;
576 lpos++;
577 m_fd->lpos = lpos;
578 }
579 *seqp = EOSEQ;
580 /* if ((int)(seqp-seq)==0) return 1; */
581 return (int)(seqp-seq);
582 }
583
584 int
agetlib_mb(unsigned char * seq,int maxs,char * libstr,int n_libstr,fseek_t * libpos,int * lcont,struct lmf_str * m_fd,long * l_off)585 agetlib_mb(unsigned char *seq,
586 int maxs,
587 char *libstr,
588 int n_libstr,
589 fseek_t *libpos,
590 int *lcont,
591 struct lmf_str *m_fd,
592 long *l_off)
593 {
594 int lpos, seq_len; /* entry number in library */
595
596 *l_off = 1;
597 lpos = m_fd->lpos++;
598
599 if (lpos >= m_fd->max_cnt) return (-1);
600 seq_len = m_fd->b_pos_arr[lpos+1] - m_fd->b_pos_arr[lpos]-1;
601 if (seq_len < 0 || (seq_len > m_fd->max_len && seq_len > (m_fd->max_len*5)/4)) {
602 fprintf(stderr," ** sequence over-run: %d at %d\n",seq_len,lpos);
603 return(-1);
604 }
605 *libpos = (fseek_t)lpos;
606
607 strncpy(libstr,"",n_libstr-1);
608
609 memcpy(seq, m_fd->mmap_base+m_fd->b_pos_arr[lpos], min(seq_len+1,maxs));
610
611 *lcont=0;
612 /* if ((int)(seqp-seq)==0) return 1; */
613 return seq_len;
614 }
615
616 void
aranlibm(char * str,int cnt,fseek_t libpos,char * libstr,struct lmf_str * m_fd)617 aranlibm(char *str,
618 int cnt,
619 fseek_t libpos,
620 char *libstr,
621 struct lmf_str *m_fd)
622 {
623 char *bp;
624 int llen;
625 int lpos;
626
627 lpos = (int) libpos;
628
629 llen = m_fd->s_pos_arr[lpos]-m_fd->d_pos_arr[lpos];
630
631 if (llen >= cnt) llen = cnt-1;
632
633 strncpy(str,m_fd->mmap_base+m_fd->d_pos_arr[lpos]+1,llen);
634 str[llen]='\0';
635 if ((bp = strchr(str,'\r'))!=NULL) *bp='\0';
636 if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
637 bp = str;
638 while (*bp++) if ( *bp=='\001' || *bp=='\t') *bp=' ';
639 m_fd->lpos = lpos;
640 }
641
642 /* aranlib_mb is a hybrid of aranlib/aranlibm that uses
643 s_pos_arr[lpos] to get the fseek offset for the description
644 */
645
646 void
aranlib_mb(char * str,int cnt,fseek_t libpos,char * libstr,struct lmf_str * m_fd)647 aranlib_mb(char *str,
648 int cnt,
649 fseek_t libpos,
650 char *libstr,
651 struct lmf_str *m_fd)
652 {
653 char *bp;
654 int llen;
655 int lpos, fs_pos;
656
657 aranlib(str, cnt, (fseek_t)m_fd->d_pos_arr[(int)libpos],
658 libstr, m_fd);
659 m_fd->lpos = libpos;
660 }
661
662 /* bmap_get_mmap_chain fills cur_seqr_chain with sequence pointers
663 from the memory mapped file at *m_fd
664
665 because the database is opened read-only, this code only works with
666 an amino acid mapping identical to that used by blastdbcmd, aa_b2toa[]
667
668 bmap_get_mmap_chain must return EOF AND a set of sequences for the
669 comp_lib9.c/next_seqr_chain() logic to work properly.
670 */
671
bmap_get_mmap_chain(struct seqr_chain * cur_seqr_chain,struct lmf_str * m_fd,struct db_str * db)672 int bmap_get_mmap_chain(struct seqr_chain *cur_seqr_chain,
673 struct lmf_str *m_fd, struct db_str *db) {
674 int i, lib_cnt;
675 struct seq_record *seq_a, *seq_p;
676 struct mseq_record *mseq_a, *mseq_p;
677
678 lib_cnt = m_fd->lpos;
679 if (lib_cnt >= m_fd->max_cnt) return EOF;
680 seq_a = cur_seqr_chain->seqr_base;
681 mseq_a = cur_seqr_chain->mseqr_base;
682
683 for (i=0; i < cur_seqr_chain->max_chain_seqs; i++) {
684 if (lib_cnt >= m_fd->max_cnt) break;
685 seq_p = &seq_a[i];
686 mseq_p = &mseq_a[i];
687 seq_p->n1 = m_fd->b_pos_arr[lib_cnt+1] - m_fd->b_pos_arr[lib_cnt]-1; /* value is +1 off to get the NULL */
688
689 db->entries++;
690 db->length += seq_p->n1;
691 if (db->length > LONG_MAX) {
692 db->length -= LONG_MAX; db->carry++;
693 }
694
695 mseq_p->m_file_p = m_fd;
696 mseq_p->n1tot_p=NULL;
697 mseq_p->cont = 0;
698 seq_p->index = mseq_p->index = mseq_p->lseek = lib_cnt;
699 #ifndef DEBUG
700 mseq_p->libstr[0] = '\0';
701 #else
702 #endif
703 seq_p->aa1b = (unsigned char *)(m_fd->mmap_base + m_fd->b_pos_arr[lib_cnt++]);
704 seq_p->l_offset = 0;
705 seq_p->l_off = 1;
706 #if DEBUG
707 seq_p->adler32_crc = mseq_p->adler32_crc = adler32(1L,seq_p->aa1b,seq_p->n1);
708 #endif
709 }
710 cur_seqr_chain->cur_seq_cnt = i;
711 m_fd->lpos = lib_cnt;
712 if (lib_cnt >= m_fd->max_cnt) return EOF;
713 else return i;
714 }
715
716 /* there is no vgetlibm() because vgetlibm() and agetlibm() are
717 identical - the difference in the two file formats relates to the
718 location of the sequence, which is already available in spos_arr[].
719
720 however vranlibm must accomodate both type 5 and 6 files;
721 type 6 has extra stuff after the seq_id.
722 */
723
724 void
vranlibm(char * str,int cnt,fseek_t libpos,char * libstr,struct lmf_str * m_fd)725 vranlibm(char *str,
726 int cnt,
727 fseek_t libpos,
728 char *libstr,
729 struct lmf_str *m_fd)
730 {
731 char *bp, *mp;
732 int llen;
733 int lpos;
734
735 lpos = (int)libpos;
736
737 llen = m_fd->s_pos_arr[lpos]-m_fd->d_pos_arr[lpos];
738
739 mp = m_fd->mmap_base+m_fd->d_pos_arr[lpos];
740
741 strncpy(str,mp+4,20);
742 str[20]='\0';
743 if ((bp=strchr(str,' '))!=NULL) *(bp+1) = '\0';
744 else if ((bp=strchr(str,'\n'))!=NULL) *bp = ' ';
745 bp = strchr(mp,'\n');
746
747 llen -= (bp-mp)-5;
748 if (llen > cnt-strlen(str)) llen = cnt-strlen(str)-1;
749
750 strncat(str,bp+1,llen);
751 if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
752 str[cnt-1]='\0';
753 m_fd->lpos = lpos;
754 }
755
756 void
close_mmap(struct lmf_str * m_fd)757 close_mmap(struct lmf_str *m_fd) {
758 free(m_fd->s_pos_arr);
759 free(m_fd->d_pos_arr);
760 if (m_fd->mm_flg) {
761 munmap(m_fd->mmap_base,m_fd->st_size);
762 free(m_fd);
763 }
764 m_fd->mm_flg=0;
765 }
766
767 #ifndef min
768 #define min(x,y) ((x) > (y) ? (y) : (x))
769 #endif
770
771 static int gcg_bton[4]={2,4,1,3};
772
773 int
gcg_getlibm(unsigned char * seq,int maxs,char * libstr,int n_libstr,fseek_t * libpos,int * lcont,struct lmf_str * m_fd,long * l_off)774 gcg_getlibm(unsigned char *seq,
775 int maxs,
776 char *libstr,
777 int n_libstr,
778 fseek_t *libpos,
779 int *lcont,
780 struct lmf_str *m_fd,
781 long *l_off)
782 {
783 char dummy[20];
784 char gcg_date[6];
785 char gcg_type[10];
786 register unsigned char *cp, *seqp, stmp;
787 register int *ap, lpos;
788 unsigned char *seqm, *seqm1;
789 long r_block, b_block, r_fact, r16_block;
790
791 *l_off = 1;
792
793 seqp = seq;
794 seqm = &seq[maxs-9];
795
796 ap = m_fd->sascii;
797 lpos = m_fd->lpos;
798
799 if (*lcont==0) {
800 if (lpos >= m_fd->max_cnt) return (-1);
801 sscanf(m_fd->mmap_base+m_fd->d_pos_arr[lpos]+4,"%s %s %s %s %ld\n",
802 libstr,gcg_date,gcg_type,dummy,&(m_fd->gcg_len));
803
804 m_fd->gcg_binary = (gcg_type[0]=='2');
805
806 libstr[12]='\0';
807 *libpos = lpos;
808 m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lpos];
809 }
810
811 r_block = b_block = min((size_t)(seqm-seqp),m_fd->gcg_len);
812 if (m_fd->gcg_binary) {
813 r_block = (r_block+3)/4;
814 }
815
816 cp=(unsigned char *)m_fd->mmap_addr;
817 if (!m_fd->gcg_binary) {
818 r_fact = 1;
819 r16_block = r_block/16;
820 while (r16_block-- > 0) {
821 *seqp++ = ap[*cp++];
822 *seqp++ = ap[*cp++];
823 *seqp++ = ap[*cp++];
824 *seqp++ = ap[*cp++];
825 *seqp++ = ap[*cp++];
826 *seqp++ = ap[*cp++];
827 *seqp++ = ap[*cp++];
828 *seqp++ = ap[*cp++];
829 *seqp++ = ap[*cp++];
830 *seqp++ = ap[*cp++];
831 *seqp++ = ap[*cp++];
832 *seqp++ = ap[*cp++];
833 *seqp++ = ap[*cp++];
834 *seqp++ = ap[*cp++];
835 *seqp++ = ap[*cp++];
836 *seqp++ = ap[*cp++];
837 }
838 while (seqp<seq+r_block) *seqp++ = ap[*cp++];
839 }
840 else if (m_fd->gcg_binary) {
841 r_fact = 4;
842 r16_block = r_block/8;
843 while(r16_block-- > 0) {
844 stmp = *cp++;
845 *seqp++ = gcg_bton[(stmp>>6) &3];
846 *seqp++ = gcg_bton[(stmp>>4) &3];
847 *seqp++ = gcg_bton[(stmp>>2) &3];
848 *seqp++ = gcg_bton[(stmp) &3];
849 stmp = *cp++;
850 *seqp++ = gcg_bton[(stmp>>6) &3];
851 *seqp++ = gcg_bton[(stmp>>4) &3];
852 *seqp++ = gcg_bton[(stmp>>2) &3];
853 *seqp++ = gcg_bton[(stmp) &3];
854 stmp = *cp++;
855 *seqp++ = gcg_bton[(stmp>>6) &3];
856 *seqp++ = gcg_bton[(stmp>>4) &3];
857 *seqp++ = gcg_bton[(stmp>>2) &3];
858 *seqp++ = gcg_bton[(stmp) &3];
859 stmp = *cp++;
860 *seqp++ = gcg_bton[(stmp>>6) &3];
861 *seqp++ = gcg_bton[(stmp>>4) &3];
862 *seqp++ = gcg_bton[(stmp>>2) &3];
863 *seqp++ = gcg_bton[(stmp) &3];
864 stmp = *cp++;
865 *seqp++ = gcg_bton[(stmp>>6) &3];
866 *seqp++ = gcg_bton[(stmp>>4) &3];
867 *seqp++ = gcg_bton[(stmp>>2) &3];
868 *seqp++ = gcg_bton[(stmp) &3];
869 stmp = *cp++;
870 *seqp++ = gcg_bton[(stmp>>6) &3];
871 *seqp++ = gcg_bton[(stmp>>4) &3];
872 *seqp++ = gcg_bton[(stmp>>2) &3];
873 *seqp++ = gcg_bton[(stmp) &3];
874 stmp = *cp++;
875 *seqp++ = gcg_bton[(stmp>>6) &3];
876 *seqp++ = gcg_bton[(stmp>>4) &3];
877 *seqp++ = gcg_bton[(stmp>>2) &3];
878 *seqp++ = gcg_bton[(stmp) &3];
879 stmp = *cp++;
880 *seqp++ = gcg_bton[(stmp>>6) &3];
881 *seqp++ = gcg_bton[(stmp>>4) &3];
882 *seqp++ = gcg_bton[(stmp>>2) &3];
883 *seqp++ = gcg_bton[(stmp) &3];
884 }
885
886 while (seqp < seq+4*r_block) {
887 stmp = *cp++;
888 *seqp++ = gcg_bton[(stmp>>6) &3];
889 *seqp++ = gcg_bton[(stmp>>4) &3];
890 *seqp++ = gcg_bton[(stmp>>2) &3];
891 *seqp++ = gcg_bton[(stmp) &3];
892 }
893 }
894 if (r_fact * r_block >= m_fd->gcg_len) {
895 *lcont = 0;
896 m_fd->lpos++;
897 }
898 else {
899 if (m_fd->gcg_binary) b_block = 4*r_block;
900 m_fd->gcg_len -= b_block;
901 (*lcont)++;
902 }
903
904 seq[b_block] = EOSEQ;
905 /* if (b_block==0) return 1; else */
906 return b_block;
907 }
908
909 void lget_ann_m(struct lmf_str *lm_fd, char *libstr, int n_libstr);
910
911 int
lgetlibm(unsigned char * seq,int maxs,char * libstr,int n_libstr,fseek_t * libpos,int * lcont,struct lmf_str * m_fd,long * l_off)912 lgetlibm(unsigned char *seq,
913 int maxs,
914 char *libstr,
915 int n_libstr,
916 fseek_t *libpos,
917 int *lcont,
918 struct lmf_str *m_fd,
919 long *l_off)
920 {
921 register unsigned char *cp, *seqp;
922 register int *ap, lpos;
923 unsigned char *seqm, *seqm1;
924
925 *l_off = 1;
926
927 seqp = seq;
928 seqm = &seq[maxs-11];
929 seqm1 = seqm-1;
930
931 lpos = m_fd->lpos;
932 ap = m_fd->sascii;
933
934 if (*lcont==0) {
935 if (lpos >= m_fd->max_cnt) return (-1);
936
937 if (n_libstr <= 21) {
938 strncpy(libstr,m_fd->mmap_base+m_fd->d_pos_arr[lpos]+12,12);
939 libstr[12]='\0';
940 }
941 else {
942 lget_ann_m(m_fd,libstr,n_libstr);
943 }
944 *libpos = lpos;
945
946 m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lpos];
947 cp = (unsigned char *)m_fd->mmap_addr;
948 }
949 else cp = (unsigned char *)m_fd->mmap_addr;
950
951 while (seqp<seqm1) {
952 if (*cp=='/' && *(cp-1)=='\n') break;
953 if ((*seqp++=ap[*cp++])<NA &&
954 (*seqp++=ap[*cp++])<NA &&
955 (*seqp++=ap[*cp++])<NA &&
956 (*seqp++=ap[*cp++])<NA &&
957 (*seqp++=ap[*cp++])<NA &&
958 (*seqp++=ap[*cp++])<NA &&
959 (*seqp++=ap[*cp++])<NA &&
960 (*seqp++=ap[*cp++])<NA &&
961 (*seqp++=ap[*cp++])<NA &&
962 (*seqp++=ap[*cp++])<NA &&
963 (*seqp++=ap[*cp++])<NA) continue;
964 --seqp;
965 if (*cp=='\n' && *(cp+1)==' ') cp += 11;
966 }
967
968 if (seqp>=seqm1) {
969 (*lcont)++;
970 m_fd->mmap_addr = (char *)cp;
971 }
972 else {
973 *lcont=0;
974 m_fd->lpos++;
975 }
976
977 *seqp = EOSEQ;
978 return (int)(seqp-seq);
979 }
980
981 void
lget_ann_m(struct lmf_str * lm_fd,char * libstr,int n_libstr)982 lget_ann_m(struct lmf_str *lm_fd, char *libstr, int n_libstr) {
983 char *bp, *bp_gid, locus[120], desc[120], acc[120], ver[120];
984
985 /* copy in locus from lm_fd->lline */
986 strncpy(locus,&lm_fd->mmap_addr[12],sizeof(locus));
987 if ((bp=strchr(locus,' '))!=NULL) *(bp+1) = '\0';
988
989 /* get description */
990 mgets(desc,sizeof(desc),lm_fd);
991 while (desc[0]!='D' || desc[1]!='E' || strncmp(desc,"DEFINITION",10))
992 mgets(desc,sizeof(desc),lm_fd);
993 if ((bp = strchr(&desc[12],'\n'))!=NULL) *bp='\0';
994
995 /* get accession */
996 mgets(acc,sizeof(acc),lm_fd);
997 while (acc[0]!='A' || acc[1]!='C' || strncmp(acc,"ACCESSION",9)) {
998 mgets(acc,sizeof(acc),lm_fd);
999 if (acc[0]=='O' && acc[1]=='R' && strncmp(acc,"ORIGIN",6)==0)
1000 break;
1001 }
1002 if ((bp = strchr(&acc[12],'\n'))!=NULL) *bp='\0';
1003 if ((bp = strchr(&acc[12],' '))!=NULL) *bp='\0';
1004
1005 /* get version */
1006 mgets(ver,sizeof(ver),lm_fd);
1007 while (ver[0]!='V' || ver[1]!='E' || strncmp(ver,"VERSION",7)) {
1008 mgets(ver,sizeof(ver),lm_fd);
1009 if (ver[0]=='O' && ver[1]=='R' && strncmp(ver,"ORIGIN",6)==0)
1010 break;
1011 }
1012 if ((bp = strchr(&ver[12],'\n'))!=NULL) *bp='\0';
1013
1014 /* extract gi:123456 from version line */
1015 bp_gid = strchr(&ver[12],':');
1016 if (bp_gid != NULL) {
1017 if ((bp=strchr(bp_gid+1,' '))!=NULL) *bp='\0';
1018 bp_gid++;
1019 }
1020 if ((bp = strchr(&ver[12],' '))!=NULL) *bp='\0';
1021
1022 /* build up FASTA header line */
1023 if (bp_gid != NULL) {
1024 strncpy(libstr,"gi|",n_libstr-1);
1025 strncat(libstr,bp_gid,n_libstr-4);
1026 strncat(libstr,"|gb|",n_libstr-20);
1027 }
1028 else {libstr[0]='\0';}
1029
1030 /* if we have a version number, use it, otherwise accession,
1031 otherwise locus/description */
1032
1033 if (ver[0]=='V') {
1034 strncat(libstr,&ver[12],n_libstr-1-strlen(libstr));
1035 strncat(libstr,"|",n_libstr-1-strlen(libstr));
1036 }
1037 else if (acc[0]=='A') {
1038 strncat(libstr,&acc[12],n_libstr-1-strlen(libstr));
1039 strncat(libstr," ",n_libstr-1-strlen(libstr));
1040 }
1041
1042 strncat(libstr,locus,n_libstr-1-strlen(libstr));
1043 strncat(libstr,&desc[11],n_libstr-1-strlen(libstr));
1044 libstr[n_libstr-1]='\0';
1045 }
1046
1047 void
lranlibm(char * str,int cnt,fseek_t seek,char * libstr,struct lmf_str * m_fd)1048 lranlibm(char *str,
1049 int cnt,
1050 fseek_t seek,
1051 char *libstr,
1052 struct lmf_str *m_fd)
1053 {
1054 lget_ann_m(m_fd,str,cnt);
1055
1056 str[cnt-1]='\0';
1057
1058 m_fd->lpos = seek;
1059 }
1060
1061 static int check_status=0;
1062
1063 void
check_mmap(struct lmf_str * m_fd,long ntt)1064 check_mmap(struct lmf_str *m_fd,long ntt) {
1065
1066 int i, seq_len, ok_stat;
1067
1068 ok_stat = 1;
1069 if ( ++check_status > 5) return;
1070
1071 fprintf(stderr," ** checking %s %ld**\n", m_fd->lb_name,ntt);
1072 for (i=0; i<m_fd->max_cnt; i++) {
1073 seq_len = m_fd->d_pos_arr[i+1] - m_fd->s_pos_arr[i];
1074 if (seq_len < 0 || (seq_len > m_fd->max_len && seq_len > (m_fd->max_len*5)/4)) {
1075 fprintf(stderr,"%d:\t%lld\t%lld\t%lld\n",
1076 i,m_fd->d_pos_arr[i],m_fd->s_pos_arr[i],
1077 m_fd->d_pos_arr[i+1]-m_fd->s_pos_arr[i]);
1078 ok_stat=0;
1079 }
1080 }
1081 if (ok_stat) {
1082 if (check_status) fprintf(stderr," ** check_mmap OK %s %ld**\n",
1083 m_fd->lb_name,ntt);
1084 }
1085 }
1086
1087 #ifdef DEBUG
1088 /* C H K 3 -- Compute a type-3 Kermit block check. */
1089 /*
1090 Calculate the 16-bit CRC of a null-terminated string using a byte-oriented
1091 tableless algorithm invented by Andy Lowry (Columbia University). The
1092 magic number 010201 is derived from the CRC-CCITT polynomial x^16+x^12+x^5+1.
1093 Note - this function could be adapted for strings containing imbedded 0's
1094 by including a length argument.
1095 */
1096 long
crck(s,n)1097 crck(s,n)
1098 char *s; int n;
1099 {
1100 unsigned int c, q;
1101 long crc = 0;
1102
1103 while (n-->0) {
1104 c = *s++;
1105 /* if (parity)*/
1106 c &= 0177;
1107 q = (crc ^ c) & 017; /* Low-order nibble */
1108 crc = (crc >> 4) ^ (q * 010201);
1109 q = (crc ^ (c >> 4)) & 017; /* High order nibble */
1110 crc = (crc >> 4) ^ (q * 010201);
1111 }
1112 return(crc);
1113 }
1114 #endif
1115