1 static char rcsid[] = "$Id: indexdb.c 222798 2020-06-03 21:55:53Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11
12 #include "indexdb.h"
13 #include "indexdbdef.h"
14
15
16 #ifdef WORDS_BIGENDIAN
17 #include "bigendian.h"
18 #else
19 #include "littleendian.h"
20 #endif
21
22 #include <stdio.h>
23 #include <stddef.h>
24 #include <stdlib.h>
25 #include <string.h> /* For memset */
26 #include <ctype.h> /* For toupper */
27 #include <sys/mman.h> /* For munmap */
28
29 #ifdef HAVE_UNISTD_H
30 #include <unistd.h> /* For lseek and close */
31 #endif
32 #ifdef HAVE_SYS_TYPES_H
33 #include <sys/types.h> /* For off_t */
34 #endif
35 #if HAVE_DIRENT_H
36 # include <dirent.h>
37 # define NAMLEN(dirent) strlen((dirent)->d_name)
38 #else
39 # define dirent direct
40 # define NAMLEN(dirent) (dirent)->d_namlen
41 # if HAVE_SYS_NDIR_H
42 # include <sys/ndir.h>
43 # endif
44 # if HAVE_SYS_DIR_H
45 # include <sys/dir.h>
46 # endif
47 # if HAVE_NDIR_H
48 # include <ndir.h>
49 # endif
50 #endif
51
52 #include "mem.h"
53 #include "assert.h"
54 #include "fopen.h"
55 #include "types.h" /* For Oligospace_T */
56 #include "filesuffix.h"
57
58 #include "compress.h"
59 #include "interval.h"
60 #include "complement.h"
61 #include "bitpack64-read.h" /* For Bitpack64_block_offsets */
62 #include "bitpack64-readtwo.h"
63
64 #ifdef HAVE_PTHREAD
65 #include <pthread.h> /* sys/types.h already included above */
66 #endif
67
68 #if defined(HAVE_SSE2)
69 #include <emmintrin.h>
70 #endif
71 #ifdef HAVE_SSE4_1
72 #include <smmintrin.h>
73 #endif
74 #ifdef HAVE_AVX2
75 #include <immintrin.h>
76 #endif
77 #ifdef HAVE_AVX512
78 #include <immintrin.h>
79 #endif
80
81
82
83
84 #define MAXENTRIES 20
85
86 /* Note: NONMODULAR is the old behavior. Now we store only when
87 startposition % index1interval == 0 */
88
89
90 /* Low-level codon hacking */
91 #ifdef DEBUG
92 #define debug(x) x
93 #else
94 #define debug(x)
95 #endif
96
97 /* Calls to Indexdb_read */
98 #ifdef DEBUG0
99 #define debug0(x) x
100 #else
101 #define debug0(x)
102 #endif
103
104 /* Shifting of high word to low word for PMAP */
105 #ifdef DEBUG2
106 #define debug2(x) x
107 #else
108 #define debug2(x)
109 #endif
110
111
112 #ifdef PMAP
113 #if (defined(DEBUG) || defined(DEBUG0))
114 static Width_T index1part_aa;
115 #endif
116
117 void
Indexdb_setup(Width_T index1part_aa_in)118 Indexdb_setup (Width_T index1part_aa_in) {
119 #if (defined(DEBUG) || defined(DEBUG0))
120 index1part_aa = index1part_aa_in;
121 #endif
122 return;
123 }
124
125 #else
126
127 #define poly_A 0U
128 static Oligospace_T poly_T; /* Was LOW12MER 0x00FFFFFF */
129
130 #if (defined(DEBUG) || defined(DEBUG0))
131 static Width_T index1part;
132 #endif
133
134 void
Indexdb_setup(Width_T index1part_in)135 Indexdb_setup (Width_T index1part_in) {
136 #if (defined(DEBUG) || defined(DEBUG0))
137 index1part = index1part_in;
138 #endif
139
140 #ifdef HAVE_64_BIT
141 poly_T = ~(~0ULL << 2*index1part_in);
142 #else
143 poly_T = ~(~0U << 2*index1part_in);
144 #endif
145
146 return;
147 }
148 #endif
149
150
151 #define T Indexdb_T
152
153
154 #ifdef LARGE_GENOMES
155 void
Indexdb_free(T * old)156 Indexdb_free (T *old) {
157 if (*old) {
158
159 if ((*old)->positions_high_access == NOT_USED) {
160 /* Skip */
161
162 } else if ((*old)->positions_high_access == ALLOCATED_PRIVATE) {
163 FREE((*old)->positions_high);
164
165 } else if ((*old)->positions_high_access == ALLOCATED_SHARED) {
166 Access_deallocate((*old)->positions_high,(*old)->positions_high_shmid,(*old)->positions_high_key);
167
168 #ifdef HAVE_MMAP
169 } else if ((*old)->positions_high_access == MMAPPED) {
170 munmap((void *) (*old)->positions_high,(*old)->positions_high_len);
171 close((*old)->positions_high_fd);
172 #endif
173 } else {
174 close((*old)->positions_high_fd);
175 }
176
177 if ((*old)->positions_access == ALLOCATED_PRIVATE) {
178 FREE((*old)->positions);
179
180 } else if ((*old)->positions_access == ALLOCATED_SHARED) {
181 Access_deallocate((*old)->positions,(*old)->positions_shmid,(*old)->positions_key);
182
183 #ifdef HAVE_MMAP
184 } else if ((*old)->positions_access == MMAPPED) {
185 munmap((void *) (*old)->positions,(*old)->positions_len);
186 close((*old)->positions_fd);
187 #endif
188 } else {
189 close((*old)->positions_fd);
190 }
191
192 #ifdef HAVE_PTHREAD
193 if ((*old)->positions_high_access == FILEIO || (*old)->positions_access == FILEIO) {
194 pthread_mutex_destroy(&(*old)->positions_read_mutex);
195 }
196 #endif
197
198
199 if ((*old)->offsetsstrm_access == ALLOCATED_PRIVATE) {
200 FREE((*old)->offsetsstrm);
201
202 } else if ((*old)->offsetsstrm_access == ALLOCATED_SHARED) {
203 Access_deallocate((*old)->offsetsstrm,(*old)->offsetsstrm_shmid,(*old)->offsetsstrm_key);
204
205 #ifdef HAVE_MMAP
206 } else if ((*old)->offsetsstrm_access == MMAPPED) {
207 munmap((void *) (*old)->offsetsstrm,(*old)->offsetsstrm_len);
208 close((*old)->offsetsstrm_fd);
209 #endif
210 }
211
212 if ((*old)->offsetsmeta_access == ALLOCATED_PRIVATE) {
213 FREE((*old)->offsetsmeta);
214 } else if ((*old)->offsetsmeta_access == ALLOCATED_SHARED) {
215 Access_deallocate((*old)->offsetsmeta,(*old)->offsetsmeta_shmid,(*old)->offsetsmeta_key);
216 #ifdef HAVE_MMAP
217 } else if ((*old)->offsetsmeta_access == MMAPPED) {
218 munmap((void *) (*old)->offsetsmeta,(*old)->offsetsmeta_len);
219 close((*old)->offsetsmeta_fd);
220 #endif
221 }
222
223 if ((*old)->offsetspages_access == ALLOCATED_PRIVATE) {
224 FREE((*old)->offsetspages);
225 } else if ((*old)->offsetspages_access == ALLOCATED_SHARED) {
226 Access_deallocate((*old)->offsetspages,(*old)->offsetspages_shmid,(*old)->offsetspages_key);
227 #ifdef HAVE_MMAP
228 } else if ((*old)->offsetspages_access == MMAPPED) {
229 munmap((void *) (*old)->offsetspages,(*old)->offsetspages_len);
230 close((*old)->offsetspages_fd);
231 #endif
232 }
233
234 FREE(*old);
235 }
236 return;
237 }
238
239 #else
240
241 void
Indexdb_free(T * old)242 Indexdb_free (T *old) {
243 if (*old) {
244
245 if ((*old)->positions_access == ALLOCATED_PRIVATE) {
246 FREE((*old)->positions);
247
248 } else if ((*old)->positions_access == ALLOCATED_SHARED) {
249 Access_deallocate((*old)->positions,(*old)->positions_shmid,(*old)->positions_key);
250
251 #ifdef HAVE_MMAP
252 } else if ((*old)->positions_access == MMAPPED) {
253 munmap((void *) (*old)->positions,(*old)->positions_len);
254 close((*old)->positions_fd);
255 #endif
256 } else if ((*old)->positions_access == FILEIO) {
257 #ifdef HAVE_PTHREAD
258 pthread_mutex_destroy(&(*old)->positions_read_mutex);
259 #endif
260 close((*old)->positions_fd);
261 }
262
263 if ((*old)->offsetsstrm_access == ALLOCATED_PRIVATE) {
264 FREE((*old)->offsetsstrm);
265
266 } else if ((*old)->offsetsstrm_access == ALLOCATED_SHARED) {
267 Access_deallocate((*old)->offsetsstrm,(*old)->offsetsstrm_shmid,(*old)->offsetsstrm_key);
268
269 #ifdef HAVE_MMAP
270 } else if ((*old)->offsetsstrm_access == MMAPPED) {
271 munmap((void *) (*old)->offsetsstrm,(*old)->offsetsstrm_len);
272 close((*old)->offsetsstrm_fd);
273 #endif
274 }
275
276 if ((*old)->offsetsmeta_access == ALLOCATED_PRIVATE) {
277 FREE((*old)->offsetsmeta);
278 } else if ((*old)->offsetsmeta_access == ALLOCATED_SHARED) {
279 Access_deallocate((*old)->offsetsmeta,(*old)->offsetsmeta_shmid,(*old)->offsetsmeta_key);
280 #ifdef HAVE_MMAP
281 } else if ((*old)->offsetsmeta_access == MMAPPED) {
282 munmap((void *) (*old)->offsetsmeta,(*old)->offsetsmeta_len);
283 close((*old)->offsetsmeta_fd);
284 #endif
285 }
286
287 FREE(*old);
288 }
289 return;
290 }
291
292 #endif
293
294
295 Width_T
Indexdb_interval(T this)296 Indexdb_interval (T this) {
297 return this->index1interval;
298 }
299
300
301 #ifdef LARGE_GENOMES
302 bool
Indexdb_positions_fileio_p(T this)303 Indexdb_positions_fileio_p (T this) {
304 if (this->positions_high_access == FILEIO || this->positions_access == FILEIO) {
305 return true;
306 } else {
307 return false;
308 }
309 }
310 #else
311 bool
Indexdb_positions_fileio_p(T this)312 Indexdb_positions_fileio_p (T this) {
313 if (this->positions_access == FILEIO) {
314 return true;
315 } else {
316 return false;
317 }
318 }
319 #endif
320
321
322
323 static Oligospace_T
power(int base,Width_T exponent)324 power (int base, Width_T exponent) {
325 #ifdef OLIGOSPACE_NOT_LONG
326 Oligospace_T result = 1U;
327 #else
328 Oligospace_T result = 1UL;
329 #endif
330 int i;
331
332 for (i = 0; i < exponent; i++) {
333 result *= base;
334 }
335 return result;
336 }
337
338 #if 0
339 double
340 Indexdb_mean_size_old (T this, Mode_T mode, Width_T index1part) {
341 Oligospace_T oligospace, n;
342
343 #ifdef PMAP
344 /* index1part should be in aa */
345 n = oligospace = power(this->alphabet_size,index1part);
346 #else
347 n = oligospace = power(4,index1part);
348 if (mode != STANDARD) {
349 n = power(3,index1part);
350 }
351 #endif
352
353 #ifdef WORDS_BIGENDIAN
354 /* Also holds for ALLOCATED_PRIVATE and ALLOCATED_SHARED */
355 return (double) Bigendian_convert_uint(this->offsetsstrm[Bigendian_convert_uint(this->offsetsmeta[oligospace/this->blocksize])])/(double) n;
356 #else
357 printf("index1part %d, blocksize %d, oligospace %u\n",index1part,this->blocksize,oligospace);
358 printf("%u\n",this->offsetsmeta[oligospace/this->blocksize]);
359 printf("%u\n",this->offsetsstrm[this->offsetsmeta[oligospace/this->blocksize]]);
360
361 return (double) this->offsetsstrm[this->offsetsmeta[oligospace/this->blocksize]]/(double) n;
362 #endif
363 }
364 #endif
365
366
367 double
Indexdb_mean_size(T this,Mode_T mode,Width_T index1part)368 Indexdb_mean_size (T this, Mode_T mode, Width_T index1part) {
369 Oligospace_T oligospace, n;
370
371 #ifdef PMAP
372 /* index1part should be in aa */
373 n = oligospace = power(this->alphabet_size,index1part);
374 #else
375 n = oligospace = power(4,index1part);
376 if (mode != STANDARD) {
377 n = power(3,index1part);
378 }
379 #endif
380
381 return (double) this->total_npositions/(double) n;
382 }
383
384
385
386
387 static Indexdb_filenames_T
Indexdb_filenames_new(char * pages_filename,char * pointers_filename,char * offsets_filename,char * positions_high_filename,char * positions_filename,char * pointers_basename_ptr,char * offsets_basename_ptr,char * positions_high_basename_ptr,char * positions_basename_ptr,char * pointers_index1info_ptr,char * offsets_index1info_ptr,char * positions_high_index1info_ptr,char * positions_index1info_ptr)388 Indexdb_filenames_new (char *pages_filename, char *pointers_filename, char *offsets_filename,
389 char *positions_high_filename, char *positions_filename,
390 char *pointers_basename_ptr, char *offsets_basename_ptr,
391 char *positions_high_basename_ptr, char *positions_basename_ptr,
392 char *pointers_index1info_ptr, char *offsets_index1info_ptr,
393 char *positions_high_index1info_ptr, char *positions_index1info_ptr) {
394 Indexdb_filenames_T new = (Indexdb_filenames_T) MALLOC(sizeof(*new));
395
396 new->pages_filename = pages_filename;
397 new->pointers_filename = pointers_filename;
398 new->offsets_filename = offsets_filename;
399 new->positions_high_filename = positions_high_filename;
400 new->positions_filename = positions_filename;
401
402 new->pointers_basename_ptr = pointers_basename_ptr;
403 new->offsets_basename_ptr = offsets_basename_ptr;
404 new->positions_high_basename_ptr = positions_high_basename_ptr;
405 new->positions_basename_ptr = positions_basename_ptr;
406
407 new->pointers_index1info_ptr = pointers_index1info_ptr;
408 new->offsets_index1info_ptr = offsets_index1info_ptr;
409 new->positions_high_index1info_ptr = positions_high_index1info_ptr;
410 new->positions_index1info_ptr = positions_index1info_ptr;
411
412 return new;
413 }
414
415 void
Indexdb_filenames_free(Indexdb_filenames_T * old)416 Indexdb_filenames_free (Indexdb_filenames_T *old) {
417
418 FREE((*old)->pages_filename);
419 FREE((*old)->pointers_filename);
420 FREE((*old)->offsets_filename);
421 FREE((*old)->positions_high_filename);
422 FREE((*old)->positions_filename);
423
424 FREE(*old);
425
426 return;
427 }
428
429
430 Indexdb_filenames_T
Indexdb_get_filenames_no_compression(Width_T * index1part,Width_T * index1interval,char * genomesubdir,char * fileroot,char * idx_filesuffix,char * snps_root,Width_T required_interval,bool offsets_only_p)431 Indexdb_get_filenames_no_compression (Width_T *index1part, Width_T *index1interval,
432 char *genomesubdir, char *fileroot, char *idx_filesuffix, char *snps_root,
433 Width_T required_interval, bool offsets_only_p) {
434 char *offsets_filename, *positions_high_filename, *positions_filename,
435 *offsets_basename_ptr, *positions_high_basename_ptr, *positions_basename_ptr,
436 *offsets_index1info_ptr, *positions_high_index1info_ptr, *positions_index1info_ptr;
437
438 char *base_filename, *filename;
439 char *pattern, interval_char, digit_string[2], *p, *q;
440 #ifndef PMAP
441 char tens;
442 #endif
443 char ones;
444 Width_T found_index1part, found_interval;
445 int rootlength, patternlength;
446
447 char *locoffsets_suffix, *offsets_suffix, *positions_high_suffix, *positions_suffix;
448 struct dirent *entry;
449 DIR *dp;
450
451
452 if (snps_root == NULL) {
453 locoffsets_suffix = "locoffsets";
454 offsets_suffix = "offsets";
455 positions_high_suffix = POSITIONS_HIGH_FILESUFFIX;
456 positions_suffix = POSITIONS_FILESUFFIX;
457 } else {
458 locoffsets_suffix = (char *) CALLOC(strlen("locoffsets")+strlen(".")+strlen(snps_root)+1,sizeof(char));
459 sprintf(locoffsets_suffix,"%s.%s","locoffsets",snps_root);
460 offsets_suffix = (char *) CALLOC(strlen("offsets")+strlen(".")+strlen(snps_root)+1,sizeof(char));
461 sprintf(offsets_suffix,"%s.%s","offsets",snps_root);
462 positions_high_suffix = (char *) CALLOC(strlen(POSITIONS_HIGH_FILESUFFIX)+strlen(".")+strlen(snps_root)+1,sizeof(char));
463 sprintf(positions_high_suffix,"%s.%s",POSITIONS_HIGH_FILESUFFIX,snps_root);
464 positions_suffix = (char *) CALLOC(strlen(POSITIONS_FILESUFFIX)+strlen(".")+strlen(snps_root)+1,sizeof(char));
465 sprintf(positions_suffix,"%s.%s",POSITIONS_FILESUFFIX,snps_root);
466 }
467
468 *index1part = 0;
469 *index1interval = 1000;
470 base_filename = (char *) NULL;
471
472 if ((dp = opendir(genomesubdir)) == NULL) {
473 fprintf(stderr,"Unable to open directory %s\n",genomesubdir);
474 return (Indexdb_filenames_T) NULL;
475 }
476
477 pattern = (char *) CALLOC(strlen(fileroot)+strlen(".")+strlen(idx_filesuffix)+1,sizeof(char));
478 sprintf(pattern,"%s.%s",fileroot,idx_filesuffix);
479 patternlength = strlen(pattern);
480
481 digit_string[1] = '\0'; /* Needed for atoi */
482 while ((entry = readdir(dp)) != NULL) {
483 filename = entry->d_name;
484 if (!strncmp(filename,pattern,patternlength)) {
485 p = &(filename[strlen(pattern)]); /* Points after idx_filesuffix, e.g., "ref" */
486 if ((q = strstr(p,locoffsets_suffix)) != NULL && !strcmp(q,locoffsets_suffix)) {
487 /* Skip refXXXlocoffsets */
488 } else if ((q = strstr(p,offsets_suffix)) != NULL && !strcmp(q,offsets_suffix)) {
489 if (q - p == 3) {
490 #ifdef PMAP
491 /* e.g., pf67offsets */
492 if (sscanf(p,"%c%c",&ones,&interval_char) == 2) {
493 digit_string[0] = ones;
494 found_index1part = atoi(digit_string);
495
496 digit_string[0] = interval_char;
497 found_interval = atoi(digit_string);
498 } else {
499 abort();
500 }
501 #else
502 /* e.g., ref123offsets */
503 if (sscanf(p,"%c%c%c",&tens,&ones,&interval_char) == 3) {
504 digit_string[0] = tens;
505 found_index1part = 10*atoi(digit_string);
506 digit_string[0] = ones;
507 found_index1part += atoi(digit_string);
508
509 digit_string[0] = interval_char;
510 found_interval = atoi(digit_string);
511 } else {
512 abort();
513 }
514 #endif
515
516 } else if (q - p == 1) {
517 /* Old style, e.g, idx or ref3 */
518 if (sscanf(p,"%c",&interval_char) == 1) {
519 if (interval_char == 'x') {
520 found_interval = 6;
521 } else {
522 digit_string[0] = interval_char;
523 found_interval = atoi(digit_string);
524 }
525 } else {
526 abort();
527 }
528 #ifdef PMAP
529 found_index1part = found_interval;
530 #else
531 found_index1part = 12;
532 #endif
533
534 } else {
535 fprintf(stderr,"Cannot parse part between %s and offsets in filename %s\n",idx_filesuffix,filename);
536 abort();
537 if (snps_root != NULL) {
538 FREE(locoffsets_suffix);
539 FREE(offsets_suffix);
540 FREE(positions_high_suffix);
541 FREE(positions_suffix);
542 }
543 return (Indexdb_filenames_T) NULL;
544 }
545
546 if (required_interval != 0) {
547 if (found_interval == required_interval) {
548 *index1part = found_index1part;
549 *index1interval = found_interval;
550 FREE(base_filename);
551 base_filename = (char *) CALLOC(strlen(filename)+1,sizeof(char));
552 strcpy(base_filename,filename);
553 }
554 } else {
555 if (found_interval < *index1interval) {
556 *index1part = found_index1part;
557 *index1interval = found_interval;
558 FREE(base_filename);
559 base_filename = (char *) CALLOC(strlen(filename)+1,sizeof(char));
560 strcpy(base_filename,filename);
561 }
562 }
563 }
564 }
565 }
566
567 FREE(pattern);
568
569 if (closedir(dp) < 0) {
570 fprintf(stderr,"Unable to close directory %s\n",genomesubdir);
571 }
572
573 /* Construct full filenames */
574 if (base_filename == NULL) {
575 #if 0
576 fprintf(stderr,"Cannot find offsets file containing %s and %s",idx_filesuffix,offsets_suffix);
577 if (required_interval > 0) {
578 fprintf(stderr," and having sampling interval of %d",required_interval);
579 }
580 fprintf(stderr,"\n");
581 #endif
582
583 /* offsets_filename = (char *) NULL; */
584 /* positions_high_filename = (char *) NULL; */
585 /* positions_filename = (char *) NULL; */
586 if (snps_root != NULL) {
587 FREE(locoffsets_suffix);
588 FREE(offsets_suffix);
589 FREE(positions_high_suffix);
590 FREE(positions_suffix);
591 }
592 return (Indexdb_filenames_T) NULL;
593
594 } else {
595 offsets_filename = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+strlen(base_filename)+1,sizeof(char));
596 offsets_basename_ptr = &(offsets_filename[strlen(genomesubdir)+strlen("/")]);
597 offsets_index1info_ptr = &(offsets_basename_ptr[patternlength]);
598
599 sprintf(offsets_filename,"%s/%s",genomesubdir,base_filename);
600 if (Access_file_exists_p(offsets_filename) == false) {
601 fprintf(stderr,"Offsets filename %s does not exist\n",offsets_filename);
602 FREE(offsets_filename);
603 /* offsets_filename = (char *) NULL; */
604 /* positions_high_filename = (char *) NULL; */
605 /* positions_filename = (char *) NULL; */
606 FREE(base_filename);
607 if (snps_root != NULL) {
608 FREE(locoffsets_suffix);
609 FREE(offsets_suffix);
610 FREE(positions_high_suffix);
611 FREE(positions_suffix);
612 }
613 return (Indexdb_filenames_T) NULL;
614 }
615
616
617 if ((q = strstr(base_filename,offsets_suffix)) == NULL) {
618 abort();
619 } else {
620 rootlength = q - base_filename;
621 }
622
623 positions_high_filename = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+rootlength+strlen(positions_high_suffix)+1,sizeof(char));
624 positions_high_basename_ptr = &(positions_high_filename[strlen(genomesubdir)+strlen("/")]);
625 positions_high_index1info_ptr = &(positions_high_basename_ptr[patternlength]);
626
627 positions_filename = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+rootlength+strlen(positions_suffix)+1,sizeof(char));
628 positions_basename_ptr = &(positions_filename[strlen(genomesubdir)+strlen("/")]);
629 positions_index1info_ptr = &(positions_basename_ptr[patternlength]);
630
631 sprintf(positions_high_filename,"%s/",genomesubdir);
632 strncpy(positions_high_basename_ptr,base_filename,rootlength);
633 strcpy(&(positions_high_basename_ptr[rootlength]),positions_high_suffix);
634
635 sprintf(positions_filename,"%s/",genomesubdir);
636 strncpy(positions_basename_ptr,base_filename,rootlength);
637 strcpy(&(positions_basename_ptr[rootlength]),positions_suffix);
638
639 if (offsets_only_p == true) {
640 /* Do not look for a positions file */
641 } else if (Access_file_exists_p(positions_filename) == false) {
642 fprintf(stderr,"Positions filename %s does not exist\n",positions_filename);
643 FREE(offsets_filename);
644 FREE(positions_high_filename);
645 FREE(positions_filename);
646 /* offsets_filename = (char *) NULL; */
647 /* positions_high_filename = (char *) NULL; */
648 /* positions_filename = (char *) NULL; */
649 FREE(base_filename);
650 if (snps_root != NULL) {
651 FREE(offsets_suffix);
652 FREE(positions_high_suffix);
653 FREE(positions_suffix);
654 }
655 return (Indexdb_filenames_T) NULL;
656 }
657
658 if (Access_file_exists_p(positions_high_filename) == false) {
659 /* Not a large genome */
660 FREE(positions_high_filename);
661 positions_high_filename = (char *) NULL;
662 }
663
664 if (snps_root != NULL) {
665 FREE(offsets_suffix);
666 FREE(positions_high_suffix);
667 FREE(positions_suffix);
668 }
669
670 FREE(base_filename);
671 fprintf(stderr,"Looking for index files in directory %s (offsets not compressed)\n",genomesubdir);
672 fprintf(stderr," Offsets file is %s\n",offsets_basename_ptr);
673 if (positions_high_filename == NULL) {
674 fprintf(stderr," Positions file is %s\n",positions_basename_ptr);
675 } else {
676 fprintf(stderr," Positions files are %s and %s\n",positions_high_basename_ptr,positions_basename_ptr);
677 }
678 return Indexdb_filenames_new(/*pages_filename*/NULL,/*pointers_filename*/NULL,offsets_filename,
679 positions_high_filename,positions_filename,
680 /*pointers_basename_ptr*/NULL,offsets_basename_ptr,
681 positions_high_basename_ptr,positions_basename_ptr,
682 /*pointers_index1info_ptr*/NULL,offsets_index1info_ptr,
683 positions_high_index1info_ptr,positions_index1info_ptr);
684 }
685 }
686
687
688
689 #ifdef PMAP
690 #define BASE_KMER_SAMPLING 3 /* e.g., 677 */
691 #define KMER_SAMPLING 2 /* e.g., 77 */
692 #else
693 #define BASE_KMER_SAMPLING 3 /* e.g., 153 */
694 #define KMER_SAMPLING 1 /* e.g., 3 */
695 #endif
696
697
698 Indexdb_filenames_T
Indexdb_get_filenames_bitpack(Alphabet_T * alphabet,Alphabet_T required_alphabet,Width_T * index1part,Width_T * index1interval,char * genomesubdir,char * fileroot,char * idx_filesuffix,char * snps_root,Width_T required_index1part,Width_T required_interval,Blocksize_T blocksize,bool offsets_only_p)699 Indexdb_get_filenames_bitpack (
700 #ifdef PMAP
701 Alphabet_T *alphabet, Alphabet_T required_alphabet,
702 #endif
703 Width_T *index1part, Width_T *index1interval,
704 char *genomesubdir, char *fileroot, char *idx_filesuffix, char *snps_root,
705 Width_T required_index1part, Width_T required_interval,
706 Blocksize_T blocksize, bool offsets_only_p) {
707 char *offsetspages_filename, *offsetsmeta_filename, *offsetsstrm_filename,
708 *positions_high_filename, *positions_filename,
709 *offsetspages_basename_ptr, *offsetsmeta_basename_ptr, *offsetsstrm_basename_ptr,
710 *positions_high_basename_ptr, *positions_basename_ptr,
711 *offsetsmeta_index1info_ptr, *offsetsstrm_index1info_ptr,
712 *positions_high_index1info_ptr, *positions_index1info_ptr;
713 /* char *offsetspages_index1info_ptr; */
714
715 char *base_filename, *filename;
716 #ifdef PMAP
717 char *pattern1, *pattern2, *a;
718 int patternlength1, patternlength2, alphabet_strlen;
719 Alphabet_T found_alphabet;
720 #else
721 char *pattern;
722 char tens;
723 #endif
724 char interval_char, digit_string[2], *p, *q;
725 Width_T found_index1part = 0, found_interval = 0;
726 int rootlength, patternlength;
727
728 char ones;
729 char *locoffsetsstrm_suffix, *offsetspages_suffix, *offsetsmeta_suffix, *offsetsstrm_suffix,
730 *positions_high_suffix, *positions_suffix;
731 struct dirent *entry;
732 DIR *dp;
733
734
735 if (snps_root == NULL) {
736 if (blocksize == 32) {
737 offsetspages_suffix = "offsets32pages";
738 offsetsmeta_suffix = "offsets32meta";
739 locoffsetsstrm_suffix = "locoffsets32strm";
740 offsetsstrm_suffix = "offsets32strm";
741 positions_high_suffix = POSITIONS_HIGH_FILESUFFIX;
742 positions_suffix = POSITIONS_FILESUFFIX;
743 } else if (blocksize == 64) {
744 offsetspages_suffix = "offsets64pages";
745 offsetsmeta_suffix = "offsets64meta";
746 locoffsetsstrm_suffix = "locoffsets64strm";
747 offsetsstrm_suffix = "offsets64strm";
748 positions_high_suffix = POSITIONS_HIGH_FILESUFFIX;
749 positions_suffix = POSITIONS_FILESUFFIX;
750 } else {
751 fprintf(stderr,"Unexpected blocksize %d\n",blocksize);
752 abort();
753 }
754 } else {
755 if (blocksize == 32) {
756 offsetspages_suffix = (char *) CALLOC(strlen("offsets32pages.")+strlen(snps_root)+1,sizeof(char));
757 offsetsmeta_suffix = (char *) CALLOC(strlen("offsets32meta.")+strlen(snps_root)+1,sizeof(char));
758 locoffsetsstrm_suffix = (char *) CALLOC(strlen("locoffsets32strm.")+strlen(snps_root)+1,sizeof(char));
759 offsetsstrm_suffix = (char *) CALLOC(strlen("offsets32strm.")+strlen(snps_root)+1,sizeof(char));
760 positions_high_suffix = (char *) CALLOC(strlen(POSITIONS_HIGH_FILESUFFIX)+strlen(".")+strlen(snps_root)+1,sizeof(char));
761 positions_suffix = (char *) CALLOC(strlen(POSITIONS_FILESUFFIX)+strlen(".")+strlen(snps_root)+1,sizeof(char));
762
763 sprintf(offsetspages_suffix,"offsets32pages.%s",snps_root);
764 sprintf(offsetsmeta_suffix,"offsets32meta.%s",snps_root);
765 sprintf(locoffsetsstrm_suffix,"locoffsets32strm.%s",snps_root);
766 sprintf(offsetsstrm_suffix,"offsets32strm.%s",snps_root);
767 sprintf(positions_high_suffix,"%s.%s",POSITIONS_HIGH_FILESUFFIX,snps_root);
768 sprintf(positions_suffix,"%s.%s",POSITIONS_FILESUFFIX,snps_root);
769
770 } else if (blocksize == 64) {
771 offsetspages_suffix = (char *) CALLOC(strlen("offsets64pages.")+strlen(snps_root)+1,sizeof(char));
772 offsetsmeta_suffix = (char *) CALLOC(strlen("offsets64meta.")+strlen(snps_root)+1,sizeof(char));
773 locoffsetsstrm_suffix = (char *) CALLOC(strlen("locoffsets64strm.")+strlen(snps_root)+1,sizeof(char));
774 offsetsstrm_suffix = (char *) CALLOC(strlen("offsets64strm.")+strlen(snps_root)+1,sizeof(char));
775 positions_high_suffix = (char *) CALLOC(strlen(POSITIONS_HIGH_FILESUFFIX)+strlen(".")+strlen(snps_root)+1,sizeof(char));
776 positions_suffix = (char *) CALLOC(strlen(POSITIONS_FILESUFFIX)+strlen(".")+strlen(snps_root)+1,sizeof(char));
777
778 sprintf(offsetspages_suffix,"offsets64pages.%s",snps_root);
779 sprintf(offsetsmeta_suffix,"offsets64meta.%s",snps_root);
780 sprintf(locoffsetsstrm_suffix,"locoffsets64strm.%s",snps_root);
781 sprintf(offsetsstrm_suffix,"offsets64strm.%s",snps_root);
782 sprintf(positions_high_suffix,"%s.%s",POSITIONS_HIGH_FILESUFFIX,snps_root);
783 sprintf(positions_suffix,"%s.%s",POSITIONS_FILESUFFIX,snps_root);
784
785 } else {
786 fprintf(stderr,"Unexpected blocksize %d\n",blocksize);
787 abort();
788 }
789 }
790
791
792 #ifdef PMAP
793 *alphabet = NALPHABETS + 1;
794 #endif
795 *index1part = 0;
796 *index1interval = 1000;
797 base_filename = (char *) NULL;
798
799 if ((dp = opendir(genomesubdir)) == NULL) {
800 fprintf(stderr,"Unable to open directory %s\n",genomesubdir);
801 return (Indexdb_filenames_T) NULL;
802 } else {
803 fprintf(stderr,"Looking for genome %s in directory %s\n",fileroot,genomesubdir);
804 }
805
806 #ifdef PMAP
807 pattern1 = (char *) CALLOC(strlen(fileroot)+strlen(".")+1,sizeof(char)); /* e.g., "hg19." */
808 sprintf(pattern1,"%s.",fileroot);
809 patternlength1 = strlen(pattern1);
810
811 pattern2 = (char *) CALLOC(strlen(".")+strlen(idx_filesuffix)+1,sizeof(char)); /* e.g., ".pr" */
812 sprintf(pattern2,".%s",idx_filesuffix);
813 patternlength2 = strlen(pattern2);
814
815 digit_string[1] = '\0'; /* Needed for atoi */
816 while ((entry = readdir(dp)) != NULL) {
817 filename = entry->d_name;
818 if (!strncmp(filename,pattern1,patternlength1)) {
819 a = &(filename[strlen(pattern1)]); /* Points after fileroot, e.g., "hg19." */
820 if ((p = strstr(a,pattern2)) != NULL && (q = strstr(p,locoffsetsstrm_suffix)) != NULL && !strcmp(q,locoffsetsstrm_suffix)) {
821 /* Skip */
822 } else if ((p = strstr(a,pattern2)) != NULL && (q = strstr(p,offsetsstrm_suffix)) != NULL && !strcmp(q,offsetsstrm_suffix)) {
823 if ((found_alphabet = Alphabet_find(a)) != AA0) {
824 alphabet_strlen = p - a;
825 p += patternlength2;
826
827 if (q - p == KMER_SAMPLING) {
828 /* Latest style, e.g., pf77 */
829 if (sscanf(p,"%c%c",&ones,&interval_char) == 2) {
830 digit_string[0] = ones;
831 found_index1part = atoi(digit_string);
832
833 digit_string[0] = interval_char;
834 found_interval = atoi(digit_string);
835 } else {
836 abort();
837 }
838
839 #if 0
840 } else if (q - p == BASE_KMER_SAMPLING) {
841 /* Previous style, e.g., pf677. No longer supported. */
842 if (sscanf(p,"%c%c%c",&ones0,&ones,&interval_char) == 3) {
843 digit_string[0] = ones0;
844 found_basesize = atoi(digit_string);
845
846 digit_string[0] = ones;
847 found_index1part = atoi(digit_string);
848
849 digit_string[0] = interval_char;
850 found_interval = atoi(digit_string);
851 } else {
852 abort();
853 }
854 #endif
855
856 } else {
857 /* fprintf(stderr,"Cannot parse part between %s and offsets in filename %s\n",idx_filesuffix,filename); */
858 if (snps_root != NULL) {
859 FREE(locoffsetsstrm_suffix);
860 FREE(offsetsstrm_suffix);
861 FREE(offsetsmeta_suffix);
862 FREE(offsetspages_suffix);
863 FREE(positions_high_suffix);
864 FREE(positions_suffix);
865 }
866 return (Indexdb_filenames_T) NULL;
867 }
868
869 if ((required_alphabet == AA0 || found_alphabet == required_alphabet) &&
870 (required_index1part == 0 || found_index1part == required_index1part) &&
871 (required_interval == 0 || found_interval == required_interval)) {
872 if (required_alphabet == AA0 && found_alphabet > *alphabet) {
873 /* Skip, since we have already found an earlier alphabet */
874 } else if (required_index1part == 0 && found_index1part < *index1part) {
875 /* Skip, since we have already found a larger index1part */
876 } else if (required_interval == 0 && found_interval > *index1interval) {
877 /* Skip, since we have already found a smaller interval */
878 } else {
879 patternlength = patternlength1 + alphabet_strlen + patternlength2;
880 /* *basesize = found_basesize; */
881 *index1part = found_index1part;
882 *index1interval = found_interval;
883 *alphabet = found_alphabet;
884 FREE(base_filename);
885 base_filename = (char *) CALLOC(strlen(filename)+1,sizeof(char));
886 strcpy(base_filename,filename);
887 }
888 }
889 }
890 }
891 }
892 }
893
894 FREE(pattern2);
895 FREE(pattern1);
896
897 #else
898
899 pattern = (char *) CALLOC(strlen(fileroot)+strlen(".")+strlen(idx_filesuffix)+1,sizeof(char));
900 sprintf(pattern,"%s.%s",fileroot,idx_filesuffix);
901 patternlength = strlen(pattern);
902
903 digit_string[1] = '\0'; /* Needed for atoi */
904 while ((entry = readdir(dp)) != NULL) {
905 filename = entry->d_name;
906 if (!strncmp(filename,pattern,patternlength)) {
907 p = &(filename[strlen(pattern)]); /* Points after idx_filesuffix, e.g., "ref" */
908 if ((q = strstr(p,locoffsetsstrm_suffix)) != NULL && !strcmp(q,locoffsetsstrm_suffix)) {
909 /* Skip */
910
911 } else if ((q = strstr(p,offsetsstrm_suffix)) != NULL && !strcmp(q,offsetsstrm_suffix)) {
912
913 if (q - p == BASE_KMER_SAMPLING) {
914 /* New style, e.g., ref153 */
915 if (sscanf(p,"%c%c%c",&tens,&ones,&interval_char) == 3) {
916 digit_string[0] = tens;
917 found_index1part = 10*atoi(digit_string);
918 digit_string[0] = ones;
919 found_index1part += atoi(digit_string);
920
921 digit_string[0] = interval_char;
922 found_interval = atoi(digit_string);
923 } else {
924 abort();
925 }
926
927 } else {
928 fprintf(stderr,"Cannot parse part between %s and offsets in filename %s: found %ld characters, expecting %d\n",
929 idx_filesuffix,filename,q-p,BASE_KMER_SAMPLING);
930 abort();
931 if (snps_root != NULL) {
932 FREE(locoffsetsstrm_suffix);
933 FREE(offsetsstrm_suffix);
934 FREE(offsetsmeta_suffix);
935 FREE(offsetspages_suffix);
936 FREE(positions_high_suffix);
937 FREE(positions_suffix);
938 }
939 return (Indexdb_filenames_T) NULL;
940 }
941
942 if ((required_index1part == 0 || found_index1part == required_index1part) &&
943 (required_interval == 0 || found_interval == required_interval)) {
944 if (required_index1part == 0 && found_index1part < *index1part) {
945 /* Skip, since we have already found a larger index1part */
946 } else if (required_interval == 0 && found_interval > *index1interval) {
947 /* Skip, since we have already found a smaller interval */
948 } else {
949 *index1part = found_index1part;
950 *index1interval = found_interval;
951 FREE(base_filename);
952 base_filename = (char *) CALLOC(strlen(filename)+1,sizeof(char));
953 strcpy(base_filename,filename);
954 }
955 }
956 }
957 }
958 }
959
960 FREE(pattern);
961 #endif
962
963
964 if (closedir(dp) < 0) {
965 fprintf(stderr,"Unable to close directory %s\n",genomesubdir);
966 }
967
968 /* Construct full filenames */
969 if (base_filename == NULL) {
970 /* offsetspages_filename = (char *) NULL; */
971 /* offsetsmeta_filename = (char *) NULL; */
972 /* offsetsstrm_filename = (char *) NULL; */
973 /* positions_high_filename = (char *) NULL; */
974 /* positions_filename = (char *) NULL; */
975 if (snps_root != NULL) {
976 FREE(locoffsetsstrm_suffix);
977 FREE(offsetsstrm_suffix);
978 FREE(offsetsmeta_suffix);
979 FREE(offsetspages_suffix);
980 FREE(positions_high_suffix);
981 FREE(positions_suffix);
982 }
983 return (Indexdb_filenames_T) NULL;
984
985 } else {
986 offsetsstrm_filename = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+strlen(base_filename)+1,sizeof(char));
987 offsetsstrm_basename_ptr = &(offsetsstrm_filename[strlen(genomesubdir)+strlen("/")]);
988 offsetsstrm_index1info_ptr = &(offsetsstrm_basename_ptr[patternlength]);
989
990 sprintf(offsetsstrm_filename,"%s/%s",genomesubdir,base_filename);
991 if (Access_file_exists_p(offsetsstrm_filename) == false) {
992 fprintf(stderr,"Offsets filename %s does not exist\n",offsetsstrm_filename);
993 FREE(offsetsstrm_filename);
994 /* offsetsstrm_filename = (char *) NULL; */
995 /* positions_high_filename = (char *) NULL; */
996 /* positions_filename = (char *) NULL; */
997 FREE(base_filename);
998 if (snps_root != NULL) {
999 FREE(locoffsetsstrm_suffix);
1000 FREE(offsetsstrm_suffix);
1001 FREE(offsetsmeta_suffix);
1002 FREE(offsetspages_suffix);
1003 FREE(positions_high_suffix);
1004 FREE(positions_suffix);
1005 }
1006 return (Indexdb_filenames_T) NULL;
1007 }
1008
1009
1010 if ((q = strstr(base_filename,offsetsstrm_suffix)) == NULL) {
1011 abort();
1012 } else {
1013 rootlength = q - base_filename;
1014 }
1015
1016 offsetsmeta_filename = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+rootlength+strlen(offsetsmeta_suffix)+1,sizeof(char));
1017 offsetsmeta_basename_ptr = &(offsetsmeta_filename[strlen(genomesubdir)+strlen("/")]);
1018 offsetsmeta_index1info_ptr = &(offsetsmeta_basename_ptr[patternlength]);
1019
1020 sprintf(offsetsmeta_filename,"%s/",genomesubdir);
1021 strncpy(offsetsmeta_basename_ptr,base_filename,rootlength);
1022 strcpy(&(offsetsmeta_basename_ptr[rootlength]),offsetsmeta_suffix);
1023
1024 if (Access_file_exists_p(offsetsmeta_filename) == false) {
1025 fprintf(stderr,"Offsetsmeta filename %s does not exist\n",offsetsmeta_filename);
1026 FREE(offsetsstrm_filename);
1027 /* offsetsstrm_filename = (char *) NULL; */
1028 FREE(base_filename);
1029 if (snps_root != NULL) {
1030 FREE(locoffsetsstrm_suffix);
1031 FREE(offsetsstrm_suffix);
1032 FREE(offsetsmeta_suffix);
1033 FREE(positions_high_suffix);
1034 FREE(positions_suffix);
1035 }
1036 return (Indexdb_filenames_T) NULL;
1037 }
1038
1039
1040 offsetspages_filename = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+rootlength+strlen(offsetspages_suffix)+1,sizeof(char));
1041 offsetspages_basename_ptr = &(offsetspages_filename[strlen(genomesubdir)+strlen("/")]);
1042 /* offsetspages_index1info_ptr = &(offsetspages_basename_ptr[patternlength]); */
1043
1044 sprintf(offsetspages_filename,"%s/",genomesubdir);
1045 strncpy(offsetspages_basename_ptr,base_filename,rootlength);
1046 strcpy(&(offsetspages_basename_ptr[rootlength]),offsetspages_suffix);
1047 if (Access_file_exists_p(offsetspages_filename) == false) {
1048 /* Not a huge genome */
1049 FREE(offsetspages_filename);
1050 offsetspages_filename = (char *) NULL;
1051 }
1052
1053 positions_high_filename = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+rootlength+strlen(positions_high_suffix)+1,sizeof(char));
1054 positions_high_basename_ptr = &(positions_high_filename[strlen(genomesubdir)+strlen("/")]);
1055 positions_high_index1info_ptr = &(positions_high_basename_ptr[patternlength]);
1056
1057 positions_filename = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+rootlength+strlen(positions_suffix)+1,sizeof(char));
1058 positions_basename_ptr = &(positions_filename[strlen(genomesubdir)+strlen("/")]);
1059 positions_index1info_ptr = &(positions_basename_ptr[patternlength]);
1060
1061 sprintf(positions_high_filename,"%s/",genomesubdir);
1062 strncpy(positions_high_basename_ptr,base_filename,rootlength);
1063 strcpy(&(positions_high_basename_ptr[rootlength]),positions_high_suffix);
1064
1065 sprintf(positions_filename,"%s/",genomesubdir);
1066 strncpy(positions_basename_ptr,base_filename,rootlength);
1067 strcpy(&(positions_basename_ptr[rootlength]),positions_suffix);
1068
1069 if (offsets_only_p == true) {
1070 /* Do not look for a positions file */
1071 } else if (Access_file_exists_p(positions_filename) == false) {
1072 /* Try newer naming scheme: ref153positions instead of ref12153positions */
1073 sprintf(positions_high_filename,"%s/",genomesubdir);
1074 strncpy(positions_high_basename_ptr,base_filename,rootlength-BASE_KMER_SAMPLING); /* e.g., skip "153" */
1075 strncpy(&(positions_high_basename_ptr[rootlength-BASE_KMER_SAMPLING]),&(base_filename[rootlength-BASE_KMER_SAMPLING]),BASE_KMER_SAMPLING);
1076 strcpy(&(positions_high_basename_ptr[rootlength]),positions_high_suffix);
1077
1078 sprintf(positions_filename,"%s/",genomesubdir);
1079 strncpy(positions_basename_ptr,base_filename,rootlength-BASE_KMER_SAMPLING); /* e.g., skip "153" */
1080 strncpy(&(positions_basename_ptr[rootlength-BASE_KMER_SAMPLING]),&(base_filename[rootlength-BASE_KMER_SAMPLING]),BASE_KMER_SAMPLING);
1081 strcpy(&(positions_basename_ptr[rootlength]),positions_suffix);
1082
1083 if (Access_file_exists_p(positions_filename) == false) {
1084 fprintf(stderr,"Positions filename %s does not exist\n",positions_filename);
1085 FREE(offsetspages_filename);
1086 FREE(offsetsmeta_filename);
1087 FREE(offsetsstrm_filename);
1088 FREE(positions_high_filename);
1089 FREE(positions_filename);
1090 /* offsetsmeta_filename = (char *) NULL; */
1091 /* offsetsstrm_filename = (char *) NULL; */
1092 /* positions_high_filename = (char *) NULL; */
1093 /* positions_filename = (char *) NULL; */
1094 FREE(base_filename);
1095 if (snps_root != NULL) {
1096 FREE(locoffsetsstrm_suffix);
1097 FREE(offsetsstrm_suffix);
1098 FREE(offsetsmeta_suffix);
1099 FREE(offsetspages_suffix);
1100 FREE(positions_high_suffix);
1101 FREE(positions_suffix);
1102 }
1103 return (Indexdb_filenames_T) NULL;
1104 }
1105 }
1106
1107 if (Access_file_exists_p(positions_high_filename) == false) {
1108 /* Not a large genome */
1109 FREE(positions_high_filename);
1110 positions_high_filename = (char *) NULL;
1111 }
1112
1113 if (snps_root != NULL) {
1114 FREE(locoffsetsstrm_suffix);
1115 FREE(offsetsstrm_suffix);
1116 FREE(offsetsmeta_suffix);
1117 FREE(offsetspages_suffix);
1118 FREE(positions_high_suffix);
1119 FREE(positions_suffix);
1120 }
1121
1122 FREE(base_filename);
1123
1124 fprintf(stderr,"Looking for index files in directory %s\n",genomesubdir);
1125 if (offsetspages_filename != NULL) {
1126 fprintf(stderr," Pages file is %s\n",offsetspages_basename_ptr);
1127 }
1128 fprintf(stderr," Pointers file is %s\n",offsetsmeta_basename_ptr);
1129 fprintf(stderr," Offsets file is %s\n",offsetsstrm_basename_ptr);
1130 if (positions_high_filename == NULL) {
1131 fprintf(stderr," Positions file is %s\n",positions_basename_ptr);
1132 } else {
1133 fprintf(stderr," Positions files are %s and %s\n",positions_high_basename_ptr,positions_basename_ptr);
1134 }
1135 return Indexdb_filenames_new(offsetspages_filename,offsetsmeta_filename,offsetsstrm_filename,
1136 positions_high_filename,positions_filename,
1137 offsetsmeta_basename_ptr,offsetsstrm_basename_ptr,
1138 positions_high_basename_ptr,positions_basename_ptr,
1139 offsetsmeta_index1info_ptr,offsetsstrm_index1info_ptr,
1140 positions_high_index1info_ptr,positions_index1info_ptr);
1141
1142 }
1143 }
1144
1145
1146 Indexdb_filenames_T
Indexdb_get_filenames(int * compression_type,Alphabet_T * alphabet,Alphabet_T required_alphabet,Width_T * index1part,Width_T * index1interval,char * genomesubdir,char * fileroot,char * idx_filesuffix,char * snps_root,Width_T required_index1part,Width_T required_interval,bool offsets_only_p)1147 Indexdb_get_filenames (int *compression_type,
1148 #ifdef PMAP
1149 Alphabet_T *alphabet, Alphabet_T required_alphabet,
1150 #endif
1151 Width_T *index1part, Width_T *index1interval, char *genomesubdir,
1152 char *fileroot, char *idx_filesuffix, char *snps_root,
1153 Width_T required_index1part, Width_T required_interval,
1154 bool offsets_only_p) {
1155 Indexdb_filenames_T filenames;
1156
1157 if ((filenames = Indexdb_get_filenames_no_compression(&(*index1part),&(*index1interval),
1158 genomesubdir,fileroot,idx_filesuffix,snps_root,
1159 required_interval,offsets_only_p)) != NULL) {
1160 *compression_type = NO_COMPRESSION;
1161 return filenames;
1162
1163
1164 } else if ((filenames = Indexdb_get_filenames_bitpack(
1165 #ifdef PMAP
1166 &(*alphabet),required_alphabet,
1167 #endif
1168 &(*index1part),&(*index1interval),
1169 genomesubdir,fileroot,idx_filesuffix,snps_root,
1170 required_index1part,required_interval,
1171 /*blocksize*/64,offsets_only_p)) != NULL) {
1172 *compression_type = BITPACK64_COMPRESSION;
1173 return filenames;
1174
1175 } else {
1176 return (Indexdb_filenames_T) NULL;
1177 }
1178 }
1179
1180
1181 #if 0
1182 void
1183 Indexdb_shmem_remove (char *genomesubdir, char *fileroot, char *idx_filesuffix, char *snps_root,
1184 #ifdef PMAP
1185 Alphabet_T *alphabet, int *alphabet_size, Alphabet_T required_alphabet,
1186 #endif
1187 Width_T required_index1part, Width_T required_interval, bool expand_offsets_p) {
1188 Indexdb_filenames_T filenames;
1189 int index1part, index1interval;
1190
1191 if ((filenames = Indexdb_get_filenames_no_compression(&index1part,&index1interval,
1192 genomesubdir,fileroot,idx_filesuffix,snps_root,
1193 required_interval,/*offsets_only_p*/false)) != NULL) {
1194 /* Try non-compressed files */
1195 Access_shmem_remove(filenames->offsets_filename);
1196
1197 } else if ((filenames = Indexdb_get_filenames_bitpack(
1198 #ifdef PMAP
1199 &(*alphabet),required_alphabet,
1200 #endif
1201 &index1part,&index1interval,
1202 genomesubdir,fileroot,idx_filesuffix,snps_root,
1203 required_index1part,required_interval,
1204 /*blocksize*/64,/*offsets_only_p*/false)) != NULL) {
1205 if (expand_offsets_p == true) {
1206 /* ALLOCATED_PRIVATE */
1207
1208 } else {
1209 Access_shmem_remove(filenames->pointers_filename);
1210 Access_shmem_remove(filenames->offsets_filename);
1211 #ifdef LARGE_GENOMES
1212 if (filenames->pages_filename != NULL) {
1213 Access_shmem_remove(filenames->pages_filename);
1214 }
1215 #endif
1216 }
1217 }
1218
1219 #ifdef LARGE_GENOMES
1220 Access_shmem_remove(filenames->positions_high_filename);
1221 Access_shmem_remove(filenames->positions_filename);
1222 #else
1223 Access_shmem_remove(filenames->positions_filename);
1224 #endif
1225
1226 Indexdb_filenames_free(&filenames);
1227
1228 return;
1229 }
1230 #endif
1231
1232
1233 static void
load_offsets(T new,Indexdb_filenames_T filenames,char * idx_filesuffix,char * snps_root,Access_mode_T offsetsstrm_access,bool sharedp,bool multiple_sequences_p,bool preload_shared_memory_p,bool unload_shared_memory_p)1234 load_offsets (T new, Indexdb_filenames_T filenames, char *idx_filesuffix, char *snps_root,
1235 Access_mode_T offsetsstrm_access, bool sharedp,
1236 bool multiple_sequences_p, bool preload_shared_memory_p,
1237 bool unload_shared_memory_p) {
1238 char *comma;
1239 double seconds;
1240 #ifdef HAVE_MMAP
1241 int npages;
1242 #endif
1243
1244 fprintf(stderr,"Offsets compression type: bitpack64\n");
1245 new->compression_type = BITPACK64_COMPRESSION;
1246 new->blocksize = 64; /* Used to be determined by 4^(kmer - basesize), but now fixed at 64 */
1247
1248 /* (1) Offsetsmeta: always ALLOCATED */
1249 if (snps_root) {
1250 fprintf(stderr,"Allocating memory for %s (%s) offset pointers, kmer %d, interval %d...",
1251 idx_filesuffix,snps_root,new->index1part,new->index1interval);
1252 } else {
1253 fprintf(stderr,"Allocating memory for %s offset pointers, kmer %d, interval %d...",
1254 idx_filesuffix,new->index1part,new->index1interval);
1255 }
1256
1257 if (
1258 #ifndef HAVE_MMAP
1259 0 &&
1260 #endif
1261 multiple_sequences_p == false && preload_shared_memory_p == false && unload_shared_memory_p == false) {
1262 #ifdef HAVE_MMAP
1263 new->offsetsmeta = (UINT4 *) Access_mmap(&new->offsetsmeta_fd,&new->offsetsmeta_len,&seconds,
1264 filenames->pointers_filename,/*randomp*/false);
1265 new->offsetsmeta_access = MMAPPED;
1266
1267 #endif
1268 } else if (sharedp == true) {
1269 new->offsetsmeta = (UINT4 *) Access_allocate_shared(&new->offsetsmeta_access,&new->offsetsmeta_shmid,&new->offsetsmeta_key,
1270 &new->offsetsmeta_fd,&new->offsetsmeta_len,&seconds,
1271 filenames->pointers_filename,sizeof(UINT4));
1272 } else {
1273 new->offsetsmeta = (UINT4 *) Access_allocate_private(&new->offsetsmeta_access,&new->offsetsmeta_len,&seconds,
1274 filenames->pointers_filename,sizeof(UINT4));
1275 }
1276
1277 if (new->offsetsmeta == NULL) {
1278 fprintf(stderr,"insufficient memory for allocating %s\n",filenames->pointers_filename);
1279 exit(9);
1280 } else {
1281 comma = Genomicpos_commafmt(new->offsetsmeta_len);
1282 fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1283 FREE(comma);
1284 /* new->offsetsmeta_access set by Access_allocate */
1285 }
1286
1287
1288 /* (2) Offsetsstrm: could be ALLOCATED or MMAPPED +/- PRELOAD */
1289 if (offsetsstrm_access == USE_ALLOCATE) {
1290 if (snps_root) {
1291 fprintf(stderr,"Allocating memory for %s (%s) offsets, kmer %d, interval %d...",
1292 idx_filesuffix,snps_root,new->index1part,new->index1interval);
1293 } else {
1294 fprintf(stderr,"Allocating memory for %s offsets, kmer %d, interval %d...",
1295 idx_filesuffix,new->index1part,new->index1interval);
1296 }
1297
1298 if (sharedp == true) {
1299 new->offsetsstrm = (UINT4 *) Access_allocate_shared(&new->offsetsstrm_access,&new->offsetsstrm_shmid,&new->offsetsstrm_key,
1300 &new->offsetsstrm_fd,&new->offsetsstrm_len,&seconds,
1301 filenames->offsets_filename,sizeof(UINT4));
1302 } else {
1303 new->offsetsstrm = (UINT4 *) Access_allocate_private(&new->offsetsstrm_access,&new->offsetsstrm_len,&seconds,
1304 filenames->offsets_filename,sizeof(UINT4));
1305 }
1306
1307 if (new->offsetsstrm == NULL) {
1308 fprintf(stderr,"insufficient memory for allocating %s\n",filenames->offsets_filename);
1309 exit(9);
1310 } else {
1311 comma = Genomicpos_commafmt(new->offsetsstrm_len);
1312 fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1313 FREE(comma);
1314 }
1315
1316 #ifdef HAVE_MMAP
1317 } else if (offsetsstrm_access == USE_MMAP_PRELOAD) {
1318 if (snps_root) {
1319 fprintf(stderr,"Pre-loading %s (%s) offsets, kmer %d, interval %d...",
1320 idx_filesuffix,snps_root,new->index1part,new->index1interval);
1321 } else {
1322 fprintf(stderr,"Pre-loading %s offsets, kmer %d, interval %d...",
1323 idx_filesuffix,new->index1part,new->index1interval);
1324 }
1325
1326 new->offsetsstrm = (UINT4 *) Access_mmap_and_preload(&new->offsetsstrm_fd,&new->offsetsstrm_len,&npages,&seconds,
1327 filenames->offsets_filename,sizeof(UINT4));
1328 if (new->offsetsstrm == NULL) {
1329 fprintf(stderr,"insufficient memory\n");
1330 exit(9);
1331 } else {
1332 comma = Genomicpos_commafmt(new->offsetsstrm_len);
1333 fprintf(stderr,"done (%s bytes, %d pages, %.2f sec)\n",comma,npages,seconds);
1334 FREE(comma);
1335 new->offsetsstrm_access = MMAPPED;
1336 }
1337 #endif
1338
1339 #ifdef HAVE_MMAP
1340 } else if (offsetsstrm_access == USE_MMAP_ONLY) {
1341 if (snps_root) {
1342 fprintf(stderr,"Memory mapping %s (%s) offsets, kmer %d, interval %d...",
1343 idx_filesuffix,snps_root,new->index1part,new->index1interval);
1344 } else {
1345 fprintf(stderr,"Memory mapping %s offsets, kmer %d, interval %d...",
1346 idx_filesuffix,new->index1part,new->index1interval);
1347 }
1348
1349 new->offsetsstrm = (UINT4 *) Access_mmap(&new->offsetsstrm_fd,&new->offsetsstrm_len,&seconds,
1350 filenames->offsets_filename,/*randomp*/false);
1351 if (new->offsetsstrm == NULL) {
1352 fprintf(stderr,"Insufficient memory for memory mapping %s\n",filenames->offsets_filename);
1353 exit(9);
1354 } else {
1355 comma = Genomicpos_commafmt(new->offsetsstrm_len);
1356 fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1357 FREE(comma);
1358 new->offsetsstrm_access = MMAPPED;
1359 }
1360 #endif
1361
1362 } else if (offsetsstrm_access == USE_FILEIO) {
1363 fprintf(stderr,"Offsetsstrm file I/O access of %s not allowed\n",filenames->offsets_filename);
1364 exit(9);
1365
1366 } else {
1367 fprintf(stderr,"Don't recognize offsetsstrm_access type %d\n",offsetsstrm_access);
1368 abort();
1369 }
1370
1371
1372 #if defined(LARGE_GENOMES) || defined(UTILITYP)
1373 /* (3) Pages: always ALLOCATED */
1374 if (filenames->pages_filename == NULL) {
1375 new->hugep = false;
1376 new->offsetspages_access = NOT_USED;
1377 new->offsetspages_len = 0;
1378 new->offsetspages = (UINT4 *) MALLOC(1*sizeof(UINT4));
1379 new->offsetspages[0] = (UINT4) -1;
1380
1381 } else {
1382 new->hugep = true;
1383 if (
1384 #ifndef HAVE_MMAP
1385 0 &&
1386 #endif
1387 multiple_sequences_p == false && preload_shared_memory_p == false && unload_shared_memory_p == false) {
1388 #ifdef HAVE_MMAP
1389 new->offsetspages = (UINT4 *) Access_mmap(&new->offsetspages_fd,&new->offsetspages_len,&seconds,
1390 filenames->pages_filename,/*randomp*/false);
1391 new->offsetspages_access = MMAPPED;
1392 #endif
1393 } else if (sharedp == true) {
1394 new->offsetspages = (UINT4 *) Access_allocate_shared(&new->offsetspages_access,&new->offsetspages_shmid,&new->offsetspages_key,
1395 &new->offsetspages_fd,&new->offsetspages_len,&seconds,
1396 filenames->pages_filename,sizeof(UINT4));
1397 } else {
1398 new->offsetspages = (UINT4 *) Access_allocate_private(&new->offsetspages_access,&new->offsetspages_len,&seconds,
1399 filenames->pages_filename,sizeof(UINT4));
1400 }
1401 }
1402 #endif
1403
1404 return;
1405 }
1406
1407
1408 #if defined(LARGE_GENOMES) || defined(UTILITYP)
1409 static void
load_positions_high(T new,Indexdb_filenames_T filenames,char * idx_filesuffix,char * snps_root,Access_mode_T positions_access,bool sharedp,bool multiple_sequences_p,bool preload_shared_memory_p,bool unload_shared_memory_p)1410 load_positions_high (T new, Indexdb_filenames_T filenames, char *idx_filesuffix,
1411 char *snps_root, Access_mode_T positions_access, bool sharedp,
1412 bool multiple_sequences_p, bool preload_shared_memory_p,
1413 bool unload_shared_memory_p) {
1414 char *comma;
1415 double seconds;
1416 #ifdef HAVE_MMAP
1417 int npages;
1418 #endif
1419
1420 /* (4) Positions high */
1421 if (positions_access == USE_ALLOCATE) {
1422 if (snps_root) {
1423 fprintf(stderr,"Allocating memory for %s (%s) positions high, kmer %d, interval %d...",
1424 idx_filesuffix,snps_root,new->index1part,new->index1interval);
1425 } else {
1426 fprintf(stderr,"Allocating memory for %s positions high, kmer %d, interval %d...",
1427 idx_filesuffix,new->index1part,new->index1interval);
1428 }
1429
1430 if (filenames->positions_high_filename == NULL) {
1431 new->positions_high = (unsigned char *) NULL;
1432
1433 } else if (
1434 #ifndef HAVE_MMAP
1435 0 &&
1436 #endif
1437 multiple_sequences_p == false && preload_shared_memory_p == false && unload_shared_memory_p == false) {
1438 #ifdef HAVE_MMAP
1439 new->positions_high = (unsigned char *) Access_mmap(&new->positions_high_fd,&new->positions_high_len,&seconds,
1440 filenames->positions_high_filename,/*randomp*/false);
1441 new->positions_high_access = MMAPPED;
1442 #endif
1443
1444 } else if (sharedp == true) {
1445 new->positions_high = (unsigned char *) Access_allocate_shared(&new->positions_high_access,&new->positions_high_shmid,&new->positions_high_key,
1446 &new->positions_high_fd,&new->positions_high_len,&seconds,
1447 filenames->positions_high_filename,sizeof(unsigned char));
1448
1449 } else {
1450 new->positions_high = (unsigned char *) Access_allocate_private(&new->positions_high_access,&new->positions_high_len,&seconds,
1451 filenames->positions_high_filename,sizeof(unsigned char));
1452 }
1453
1454 if (filenames->positions_high_filename == NULL) {
1455 new->positions_high_access = NOT_USED;
1456 } else if (new->positions_high == NULL) {
1457 fprintf(stderr,"insufficient memory (need to use a lower batch mode (-B)\n");
1458 exit(9);
1459 } else {
1460 comma = Genomicpos_commafmt(new->positions_high_len);
1461 fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1462 FREE(comma);
1463 /* new->positions_high_access set by Access_allocate */
1464 }
1465
1466 #ifdef HAVE_MMAP
1467 } else if (positions_access == USE_MMAP_PRELOAD) {
1468 if (snps_root) {
1469 fprintf(stderr,"Pre-loading %s (%s) positions high, kmer %d, interval %d...",
1470 idx_filesuffix,snps_root,new->index1part,new->index1interval);
1471 } else {
1472 fprintf(stderr,"Pre-loading %s positions high, kmer %d, interval %d...",
1473 idx_filesuffix,new->index1part,new->index1interval);
1474 }
1475
1476 if (filenames->positions_high_filename == NULL) {
1477 new->positions_high = (unsigned char *) NULL;
1478 } else {
1479 new->positions_high = (unsigned char *) Access_mmap_and_preload(&new->positions_high_fd,&new->positions_high_len,&npages,&seconds,
1480 filenames->positions_high_filename,sizeof(unsigned char));
1481 }
1482
1483 if (filenames->positions_high_filename == NULL) {
1484 new->positions_high_access = NOT_USED;
1485 } else if (new->positions_high == NULL) {
1486 fprintf(stderr,"insufficient memory\n");
1487 exit(9);
1488 } else {
1489 comma = Genomicpos_commafmt(new->positions_high_len);
1490 fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1491 FREE(comma);
1492 new->positions_high_access = MMAPPED;
1493 }
1494 #endif
1495
1496 #ifdef HAVE_MMAP
1497 } else if (positions_access == USE_MMAP_ONLY) {
1498 if (snps_root) {
1499 fprintf(stderr,"Memory mapping %s (%s) positions high, kmer %d, interval %d...",
1500 idx_filesuffix,snps_root,new->index1part,new->index1interval);
1501 } else {
1502 fprintf(stderr,"Memory mapping %s positions high, kmer %d, interval %d...",
1503 idx_filesuffix,new->index1part,new->index1interval);
1504 }
1505
1506 if (filenames->positions_high_filename == NULL) {
1507 new->positions_high = (unsigned char *) NULL;
1508 } else {
1509 new->positions_high = (unsigned char *) Access_mmap(&new->positions_high_fd,&new->positions_high_len,&seconds,
1510 filenames->positions_high_filename,/*randomp*/true);
1511 }
1512
1513 if (filenames->positions_high_filename == NULL) {
1514 new->positions_high_access = NOT_USED;
1515 } else if (new->positions_high == NULL) {
1516 fprintf(stderr,"Insufficient memory for memory mapping %s\n",
1517 filenames->positions_high_filename);
1518 exit(9);
1519 } else {
1520 comma = Genomicpos_commafmt(new->positions_high_len);
1521 fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1522 FREE(comma);
1523 new->positions_high_access = MMAPPED;
1524 }
1525 #endif
1526
1527 } else if (positions_access == USE_FILEIO) {
1528 fprintf(stderr,"Positions high file I/O access of %s not allowed\n",
1529 filenames->positions_high_filename);
1530 exit(9);
1531
1532 } else {
1533 fprintf(stderr,"Don't recognize positions_access type %d\n",positions_access);
1534 abort();
1535 }
1536
1537 return;
1538 }
1539 #endif
1540
1541
1542 static void
load_positions(T new,Indexdb_filenames_T filenames,char * idx_filesuffix,char * snps_root,Access_mode_T positions_access,bool sharedp,bool multiple_sequences_p,bool preload_shared_memory_p,bool unload_shared_memory_p)1543 load_positions (T new, Indexdb_filenames_T filenames, char *idx_filesuffix,
1544 char *snps_root, Access_mode_T positions_access, bool sharedp,
1545 bool multiple_sequences_p, bool preload_shared_memory_p,
1546 bool unload_shared_memory_p) {
1547 char *comma;
1548 double seconds;
1549 #ifdef HAVE_MMAP
1550 int npages;
1551 #endif
1552
1553 /* (5) Positions */
1554 if (positions_access == USE_ALLOCATE) {
1555 if (snps_root) {
1556 fprintf(stderr,"Allocating memory for %s (%s) positions, kmer %d, interval %d...",
1557 idx_filesuffix,snps_root,new->index1part,new->index1interval);
1558 } else {
1559 fprintf(stderr,"Allocating memory for %s positions, kmer %d, interval %d...",
1560 idx_filesuffix,new->index1part,new->index1interval);
1561 }
1562
1563 if (
1564 #ifndef HAVE_MMAP
1565 0 &&
1566 #endif
1567 multiple_sequences_p == false && preload_shared_memory_p == false && unload_shared_memory_p == false) {
1568 #ifdef HAVE_MMAP
1569 new->positions = (UINT4 *) Access_mmap(&new->positions_fd,&new->positions_len,&seconds,
1570 filenames->positions_filename,/*randomp*/false);
1571 new->positions_access = MMAPPED;
1572 #endif
1573
1574 } else if (sharedp == true) {
1575 new->positions = (UINT4 *) Access_allocate_shared(&new->positions_access,&new->positions_shmid,&new->positions_key,
1576 &new->positions_fd,&new->positions_len,&seconds,
1577 filenames->positions_filename,sizeof(UINT4));
1578
1579 } else {
1580 new->positions = (UINT4 *) Access_allocate_private(&new->positions_access,&new->positions_len,&seconds,
1581 filenames->positions_filename,sizeof(UINT4));
1582 }
1583
1584 if (new->positions == NULL) {
1585 fprintf(stderr,"insufficient memory (need to use a lower batch mode (-B)\n");
1586 exit(9);
1587 } else {
1588 comma = Genomicpos_commafmt(new->positions_len);
1589 fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1590 FREE(comma);
1591 /* new->positions_access set by Access_allocate */
1592 }
1593
1594 #ifdef HAVE_MMAP
1595 } else if (positions_access == USE_MMAP_PRELOAD) {
1596 if (snps_root) {
1597 fprintf(stderr,"Pre-loading %s (%s) positions, kmer %d, interval %d...",
1598 idx_filesuffix,snps_root,new->index1part,new->index1interval);
1599 } else {
1600 fprintf(stderr,"Pre-loading %s positions, kmer %d, interval %d...",
1601 idx_filesuffix,new->index1part,new->index1interval);
1602 }
1603
1604 new->positions = (UINT4 *) Access_mmap_and_preload(&new->positions_fd,&new->positions_len,&npages,&seconds,
1605 filenames->positions_filename,sizeof(UINT4));
1606
1607 if (new->positions == NULL) {
1608 fprintf(stderr,"insufficient memory\n");
1609 exit(9);
1610 } else {
1611 comma = Genomicpos_commafmt(new->positions_len);
1612 fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1613 FREE(comma);
1614 new->positions_access = MMAPPED;
1615 }
1616 #endif
1617
1618 #ifdef HAVE_MMAP
1619 } else if (positions_access == USE_MMAP_ONLY) {
1620 if (snps_root) {
1621 fprintf(stderr,"Memory mapping %s (%s) positions, kmer %d, interval %d...",
1622 idx_filesuffix,snps_root,new->index1part,new->index1interval);
1623 } else {
1624 fprintf(stderr,"Memory mapping %s positions, kmer %d, interval %d...",
1625 idx_filesuffix,new->index1part,new->index1interval);
1626 }
1627
1628 new->positions = (UINT4 *) Access_mmap(&new->positions_fd,&new->positions_len,&seconds,
1629 filenames->positions_filename,/*randomp*/true);
1630
1631 if (new->positions == NULL) {
1632 fprintf(stderr,"Insufficient memory for memory mapping %s\n",
1633 filenames->positions_filename);
1634 exit(9);
1635 } else {
1636 comma = Genomicpos_commafmt(new->positions_len);
1637 fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1638 FREE(comma);
1639 new->positions_access = MMAPPED;
1640 }
1641 #endif
1642
1643 } else if (positions_access == USE_FILEIO) {
1644 fprintf(stderr,"Positions file I/O access of %s not allowed\n",
1645 filenames->positions_filename);
1646 exit(9);
1647
1648 } else {
1649 fprintf(stderr,"Don't recognize positions_access %d\n",positions_access);
1650 abort();
1651 }
1652
1653 #ifdef HAVE_PTHREAD
1654 #if defined(LARGE_GENOMES) || defined(UTILITYP)
1655 if (new->positions_high_access == FILEIO || new->positions_access == FILEIO) {
1656 pthread_mutex_init(&new->positions_read_mutex,NULL);
1657 }
1658 #else
1659 if (new->positions_access == FILEIO) {
1660 pthread_mutex_init(&new->positions_read_mutex,NULL);
1661 }
1662 #endif
1663 #endif
1664
1665 return;
1666 }
1667
1668
1669 T
Indexdb_new_genome(Width_T * index1part,Width_T * index1interval,char * genomesubdir,char * fileroot,char * idx_filesuffix,char * snps_root,Width_T required_index1part,Width_T required_interval,Access_mode_T offsetsstrm_access,Access_mode_T positions_access,bool sharedp,bool multiple_sequences_p,bool preload_shared_memory_p,bool unload_shared_memory_p)1670 Indexdb_new_genome (Width_T *index1part, Width_T *index1interval,
1671 char *genomesubdir, char *fileroot, char *idx_filesuffix, char *snps_root,
1672 Width_T required_index1part, Width_T required_interval,
1673 Access_mode_T offsetsstrm_access, Access_mode_T positions_access, bool sharedp,
1674 bool multiple_sequences_p, bool preload_shared_memory_p, bool unload_shared_memory_p) {
1675 T new = (T) MALLOC(sizeof(*new));
1676 Indexdb_filenames_T filenames;
1677
1678 /* Oligospace_T poly_T; */
1679 /* Positionsptr_T ptr0; -- UINT8 or UINT4 */
1680 /* Positionsptr_T end0; -- UINT8 or UINT4 */
1681
1682 if ((filenames = Indexdb_get_filenames_bitpack(&new->index1part,&new->index1interval,
1683 genomesubdir,fileroot,idx_filesuffix,snps_root,
1684 required_index1part,required_interval,
1685 /*blocksize*/64,/*offsets_only_p*/false)) == NULL) {
1686 fprintf(stderr,"Cannot find genomic index files in either current or old format. Looking for files containing %s\n",
1687 idx_filesuffix);
1688 if (required_index1part != 0) {
1689 fprintf(stderr,"Required kmer is %d\n",required_index1part);
1690 }
1691 if (required_interval != 0) {
1692 fprintf(stderr,"Required interval is %d\n",required_interval);
1693 }
1694 exit(9);
1695 }
1696
1697
1698 *index1part = new->index1part;
1699 *index1interval = new->index1interval;
1700 load_offsets(new,filenames,idx_filesuffix,snps_root,offsetsstrm_access,sharedp,
1701 multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p);
1702
1703 new->total_npositions = Access_filesize(filenames->positions_filename)/sizeof(UINT4);
1704
1705 #if defined(LARGE_GENOMES) || defined(UTILITYP)
1706 load_positions_high(new,filenames,idx_filesuffix,snps_root,positions_access,sharedp,
1707 multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p);
1708 #endif
1709 load_positions(new,filenames,idx_filesuffix,snps_root,positions_access,sharedp,
1710 multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p);
1711
1712 Indexdb_filenames_free(&filenames);
1713
1714 return new;
1715 }
1716
1717
1718 /* Assume that transcriptome is less than 4 billion bp, even for GSNAPL */
1719 T
Indexdb_new_transcriptome(Width_T * index1part,Width_T * index1interval,char * genomesubdir,char * fileroot,char * idx_filesuffix,char * snps_root,Alphabet_T * alphabet,int * alphabet_size,Alphabet_T required_alphabet,Width_T required_index1part,Width_T required_interval,Access_mode_T offsetsstrm_access,Access_mode_T positions_access,bool sharedp,bool multiple_sequences_p,bool preload_shared_memory_p,bool unload_shared_memory_p)1720 Indexdb_new_transcriptome (Width_T *index1part, Width_T *index1interval,
1721 char *genomesubdir, char *fileroot, char *idx_filesuffix, char *snps_root,
1722 #ifdef PMAP
1723 Alphabet_T *alphabet, int *alphabet_size, Alphabet_T required_alphabet,
1724 #endif
1725 Width_T required_index1part, Width_T required_interval,
1726 Access_mode_T offsetsstrm_access, Access_mode_T positions_access, bool sharedp,
1727 bool multiple_sequences_p, bool preload_shared_memory_p, bool unload_shared_memory_p) {
1728 T new = (T) MALLOC(sizeof(*new));
1729 Indexdb_filenames_T filenames;
1730
1731 /* Oligospace_T poly_T; */
1732 /* Positionsptr_T ptr0; -- UINT8 or UINT4 */
1733 /* Positionsptr_T end0; -- UINT8 or UINT4 */
1734
1735 if ((filenames = Indexdb_get_filenames_bitpack(
1736 #ifdef PMAP
1737 &(*alphabet),required_alphabet,
1738 #endif
1739 &new->index1part,&new->index1interval,
1740 genomesubdir,fileroot,idx_filesuffix,snps_root,
1741 required_index1part,required_interval,
1742 /*blocksize*/64,/*offsets_only_p*/false)) == NULL) {
1743 fprintf(stderr,"Cannot find genomic index files in a recent format. Looking for files containing %s\n",idx_filesuffix);
1744 exit(9);
1745 } else {
1746 /* Bitpack compression */
1747 *index1part = new->index1part;
1748 *index1interval = new->index1interval;
1749 }
1750
1751 load_offsets(new,filenames,idx_filesuffix,snps_root,offsetsstrm_access,sharedp,
1752 multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p);
1753 load_positions(new,filenames,idx_filesuffix,snps_root,positions_access,sharedp,
1754 multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p);
1755
1756 Indexdb_filenames_free(&filenames);
1757
1758 return new;
1759 }
1760
1761
1762 /************************************************************************
1763 * Debugging procedures
1764 ************************************************************************/
1765
1766 #ifndef PMAP
1767
1768 /* 87654321 */
1769 #define LOW_TWO_BITS 0x00000003
1770
1771 #if (defined(DEBUG0) || defined(DEBUG1) || defined(DEBUG2))
1772 static char *
shortoligo_nt(Oligospace_T oligo,Width_T oligosize)1773 shortoligo_nt (Oligospace_T oligo, Width_T oligosize) {
1774 char *nt;
1775 Width_T i, j;
1776 Oligospace_T lowbits;
1777
1778 nt = (char *) CALLOC(oligosize+1,sizeof(char));
1779 j = oligosize-1;
1780 for (i = 0; i < oligosize; i++) {
1781 lowbits = oligo & LOW_TWO_BITS;
1782 switch (lowbits) {
1783 case RIGHT_A: nt[j] = 'A'; break;
1784 case RIGHT_C: nt[j] = 'C'; break;
1785 case RIGHT_G: nt[j] = 'G'; break;
1786 case RIGHT_T: nt[j] = 'T'; break;
1787 }
1788 oligo >>= 2;
1789 j--;
1790 }
1791
1792 return nt;
1793 }
1794 #endif
1795
1796 #endif
1797
1798
1799 /************************************************************************
1800 * Read procedures
1801 ************************************************************************/
1802
1803 #ifdef LARGE_GENOMES
1804 static void
positions_move_absolute_1(int positions_fd,size_t ptr)1805 positions_move_absolute_1 (int positions_fd, size_t ptr) {
1806 off_t offset = ptr*sizeof(unsigned char);
1807
1808 if (lseek(positions_fd,offset,SEEK_SET) < 0) {
1809 fprintf(stderr,"Attempted to do lseek on offset %zu*%zu=%zu\n",
1810 ptr,sizeof(unsigned char),(size_t) offset);
1811 perror("Error in indexdb.c, positions_move_absolute");
1812 exit(9);
1813 }
1814 return;
1815 }
1816 #endif
1817
1818
1819 static void
positions_move_absolute_4(int positions_fd,size_t ptr)1820 positions_move_absolute_4 (int positions_fd, size_t ptr) {
1821 off_t offset = ptr*sizeof(UINT4);
1822
1823 if (lseek(positions_fd,offset,SEEK_SET) < 0) {
1824 fprintf(stderr,"Attempted to do lseek on offset %zu*%zu=%zu\n",
1825 ptr,sizeof(UINT4),(size_t) offset);
1826 perror("Error in indexdb.c, positions_move_absolute");
1827 exit(9);
1828 }
1829 return;
1830 }
1831
1832 #if 0
1833 static Univcoord_T
1834 positions_read_forward (int positions_fd) {
1835 Univcoord_T value;
1836 char buffer[4];
1837
1838 read(positions_fd,buffer,4);
1839
1840 value = (buffer[3] & 0xff);
1841 value <<= 8;
1842 value |= (buffer[2] & 0xff);
1843 value <<= 8;
1844 value |= (buffer[1] & 0xff);
1845 value <<= 8;
1846 value |= (buffer[0] & 0xff);
1847
1848 return value;
1849 }
1850 #endif
1851
1852 #ifdef LARGE_GENOMES
1853 static void
positions_read_multiple_large(int positions_high_fd,int positions_fd,Univcoord_T * values,int n)1854 positions_read_multiple_large (int positions_high_fd, int positions_fd, Univcoord_T *values, int n) {
1855 int i;
1856 Univcoord_T value;
1857 unsigned char buffer[4];
1858
1859 #ifdef WORDS_BIGENDIAN
1860 /* Need to keep in bigendian format */
1861 for (i = 0; i < n; i++) {
1862 read(positions_high_fd,buffer,1);
1863 value = (buffer[0] & 0xff);
1864 value <<= 8;
1865
1866 read(positions_fd,buffer,4);
1867 value |= (buffer[0] & 0xff);
1868 value <<= 8;
1869 value |= (buffer[1] & 0xff);
1870 value <<= 8;
1871 value |= (buffer[2] & 0xff);
1872 value <<= 8;
1873 value |= (buffer[3] & 0xff);
1874
1875 values[i] = value;
1876 }
1877 #else
1878 for (i = 0; i < n; i++) {
1879 read(positions_high_fd,buffer,1);
1880 value = (buffer[0] & 0xff);
1881 value <<= 8;
1882
1883 read(positions_fd,buffer,4);
1884 value |= (buffer[3] & 0xff);
1885 value <<= 8;
1886 value |= (buffer[2] & 0xff);
1887 value <<= 8;
1888 value |= (buffer[1] & 0xff);
1889 value <<= 8;
1890 value |= (buffer[0] & 0xff);
1891
1892 values[i] = value;
1893 }
1894 #endif
1895
1896 return;
1897 }
1898
1899 static void
positions_copy_multiple_large(Univcoord_T * positions,unsigned char * positions_high,UINT4 * positions_low,int n)1900 positions_copy_multiple_large (Univcoord_T *positions, unsigned char *positions_high, UINT4 *positions_low, int n) {
1901 int i;
1902
1903 for (i = 0; i < n; i++) {
1904 positions[i] = ((Univcoord_T) positions_high[i] << 32) + positions_low[i];
1905 }
1906
1907 return;
1908 }
1909
1910 #else
1911
1912 static void
positions_read_multiple(int positions_fd,Univcoord_T * values,int n)1913 positions_read_multiple (int positions_fd, Univcoord_T *values, int n) {
1914 int i;
1915 Univcoord_T value;
1916 unsigned char buffer[4];
1917
1918 #ifdef WORDS_BIGENDIAN
1919 /* Need to keep in bigendian format */
1920 for (i = 0; i < n; i++) {
1921 read(positions_fd,buffer,4);
1922
1923 value = (buffer[0] & 0xff);
1924 value <<= 8;
1925 value |= (buffer[1] & 0xff);
1926 value <<= 8;
1927 value |= (buffer[2] & 0xff);
1928 value <<= 8;
1929 value |= (buffer[3] & 0xff);
1930
1931 values[i] = value;
1932 }
1933 #else
1934 for (i = 0; i < n; i++) {
1935 read(positions_fd,buffer,4);
1936
1937 value = (buffer[3] & 0xff);
1938 value <<= 8;
1939 value |= (buffer[2] & 0xff);
1940 value <<= 8;
1941 value |= (buffer[1] & 0xff);
1942 value <<= 8;
1943 value |= (buffer[0] & 0xff);
1944
1945 values[i] = value;
1946 }
1947 #endif
1948
1949 return;
1950 }
1951 #endif
1952
1953
1954
1955
1956 #if 0
1957 static Univcoord_T
1958 positions_read_backward (int positions_fd) {
1959 Univcoord_T value;
1960 char buffer[4];
1961 off_t reloffset = -2*((off_t) sizeof(Univcoord_T)); /* 1 to undo the effect of read */
1962
1963 read(positions_fd,buffer,4);
1964
1965 value = (buffer[3] & 0xff);
1966 value <<= 8;
1967 value |= (buffer[2] & 0xff);
1968 value <<= 8;
1969 value |= (buffer[1] & 0xff);
1970 value <<= 8;
1971 value |= (buffer[0] & 0xff);
1972
1973 if (lseek(positions_fd,reloffset,SEEK_CUR) < 0) {
1974 fprintf(stderr,"Attempted to do lseek on relative offset %ld\n",(long int) reloffset);
1975 perror("Error in indexdb.c, positions_read_backward");
1976 exit(9);
1977 }
1978
1979 return value;
1980 }
1981 #endif
1982
1983
1984 /* Used by non-utility programs */
1985 Positionsptr_T *
Indexdb_offsets_from_bitpack(char * offsetsmetafile,char * offsetsstrmfile,int alphabet_size,Width_T index1part_aa)1986 Indexdb_offsets_from_bitpack (char *offsetsmetafile, char *offsetsstrmfile,
1987 #ifdef PMAP
1988 int alphabet_size, Width_T index1part_aa
1989 #else
1990 Width_T index1part
1991 #endif
1992 ) {
1993 UINT4 *offsetsmeta;
1994 UINT4 *offsetsstrm;
1995 size_t offsetsmeta_len, offsetsstrm_len;
1996 Positionsptr_T *offsets = NULL;
1997 Oligospace_T oligospace;
1998 #ifdef PMAP
1999 Oligospace_T oligoi;
2000 #elif defined(LARGE_GENOMES)
2001 #else
2002 Oligospace_T oligoi;
2003 Blocksize_T blocksize;
2004 #endif
2005 #ifdef HAVE_MMAP
2006 int offsetsmeta_fd, offsetsstrm_fd;
2007 #else
2008 Access_T offsetsmeta_access, offsetsstrm_access;
2009 #endif
2010 double seconds;
2011
2012
2013 #ifdef PMAP
2014 oligospace = power(alphabet_size,index1part_aa);
2015 #else
2016 oligospace = power(4,index1part);
2017 #endif
2018 #if !defined(PMAP) && !defined(LARGE_GENOMES)
2019 blocksize = 64; /* Used to be determined by 4^(kmer - basesize), but now fixed at 64 */
2020 #endif
2021
2022
2023 #ifdef HAVE_MMAP
2024 offsetsmeta = (UINT4 *) Access_mmap(&offsetsmeta_fd,&offsetsmeta_len,&seconds,offsetsmetafile,/*randomp*/false);
2025 offsetsstrm = (UINT4 *) Access_mmap(&offsetsstrm_fd,&offsetsstrm_len,&seconds,offsetsstrmfile,/*randomp*/false);
2026 #else
2027 offsetsmeta = (UINT4 *) Access_allocate_private(&offsetsmeta_access,&offsetsmeta_len,&seconds,offsetsmetafile,sizeof(UINT4));
2028 offsetsstrm = (UINT4 *) Access_allocate_private(&offsetsstrm_access,&offsetsstrm_len,&seconds,offsetsstrmfile,sizeof(UINT4));
2029 #endif
2030
2031 #ifdef OLIGOSPACE_NOT_LONG
2032 fprintf(stderr,"Allocating memory (%u 4-byte words) for offsets, kmer %d...",oligospace+1U,
2033 #ifdef PMAP
2034 index1part_aa
2035 #else
2036 index1part
2037 #endif
2038 );
2039 #else
2040 fprintf(stderr,"Allocating memory (%llu 4-byte words) for offsets, kmer %d...",(unsigned long long) oligospace+1UL,
2041 #ifdef PMAP
2042 index1part_aa
2043 #else
2044 index1part
2045 #endif
2046 );
2047 #endif
2048
2049 /* Bitpack procedures start from offsets[1], so we need to print offsets[0] as a special case */
2050 offsets = (Positionsptr_T *) MALLOC((oligospace+1) * sizeof(Positionsptr_T));
2051
2052 if (offsets == NULL) {
2053 fprintf(stderr,"cannot allocated requested memory. Cannot run expand offsets on this machine.\n");
2054 exit(9);
2055 } else {
2056 fprintf(stderr,"done\n");
2057 }
2058
2059 fprintf(stderr,"Expanding offsetsstrm into offsets...");
2060 #ifdef PMAP
2061 for (oligoi = 0UL; oligoi <= oligospace; oligoi += 1) {
2062 offsets[oligoi] = Bitpack64_read_one(oligoi,offsetsmeta,offsetsstrm);
2063 }
2064 #elif defined(LARGE_GENOMES)
2065 fprintf(stderr,"Cannot do expand offsets on large genomes\n");
2066 exit(9);
2067 #else
2068 for (oligoi = 0UL; oligoi < oligospace; oligoi += blocksize) {
2069 Bitpack64_block_offsets(&(offsets[oligoi]),oligoi,offsetsmeta,offsetsstrm);
2070 }
2071 #endif
2072
2073 fprintf(stderr,"done\n");
2074
2075 #ifdef HAVE_MMAP
2076 munmap((void *) offsetsstrm,offsetsstrm_len);
2077 munmap((void *) offsetsmeta,offsetsmeta_len);
2078 #else
2079 FREE(offsetsstrm);
2080 FREE(offsetsmeta);
2081 #endif
2082
2083 return offsets;
2084 }
2085
2086
2087 #if defined(HAVE_64_BIT) && defined(UTILITYP)
2088 /* Used by utility programs for writing indexdb */
2089 Hugepositionsptr_T *
Indexdb_offsets_from_bitpack_huge(char * offsetspagesfile,char * offsetsmetafile,char * offsetsstrmfile,int alphabet_size,Width_T index1part_aa)2090 Indexdb_offsets_from_bitpack_huge (char *offsetspagesfile, char *offsetsmetafile, char *offsetsstrmfile,
2091 #ifdef PMAP
2092 int alphabet_size, Width_T index1part_aa
2093 #else
2094 Width_T index1part
2095 #endif
2096 ) {
2097 UINT4 *offsetspages;
2098 UINT4 *offsetsmeta;
2099 UINT4 *offsetsstrm;
2100
2101 size_t offsetspages_len, offsetsmeta_len, offsetsstrm_len;
2102 Hugepositionsptr_T *offsets = NULL;
2103 Oligospace_T oligospace, oligoi;
2104 Blocksize_T blocksize;
2105 #ifdef HAVE_MMAP
2106 int offsetsmeta_fd, offsetsstrm_fd;
2107 #else
2108 Access_T offsetsmeta_access;
2109 #endif
2110 Access_T offsetspages_access, offsetsstrm_access;
2111 double seconds;
2112
2113
2114 #ifdef PMAP
2115 oligospace = power(alphabet_size,index1part_aa);
2116 #else
2117 oligospace = power(4,index1part);
2118 #endif
2119 blocksize = 64; /* Used to be determined by 4^(kmer - basesize), but now fixed at 64 */
2120
2121 if (blocksize == 1) {
2122 return (Hugepositionsptr_T *) Access_allocate_private(&offsetsstrm_access,&offsetsstrm_len,&seconds,
2123 offsetsstrmfile,sizeof(Hugepositionsptr_T));
2124
2125 } else {
2126 if (offsetspagesfile == NULL) {
2127 offsetspages_access = NOT_USED;
2128 offsetspages_len = 0;
2129 offsetspages = (UINT4 *) MALLOC(1*sizeof(UINT4));
2130 offsetspages[0] = (UINT4) -1;
2131 } else {
2132 offsetspages = (UINT4 *) Access_allocate_private(&offsetspages_access,&offsetspages_len,&seconds,offsetspagesfile,sizeof(UINT4));
2133 }
2134 #ifdef HAVE_MMAP
2135 offsetsmeta = (UINT4 *) Access_mmap(&offsetsmeta_fd,&offsetsmeta_len,&seconds,offsetsmetafile,/*randomp*/false);
2136 offsetsstrm = (UINT4 *) Access_mmap(&offsetsstrm_fd,&offsetsstrm_len,&seconds,offsetsstrmfile,/*randomp*/false);
2137 #else
2138 offsetsmeta = (UINT4 *) Access_allocate_private(&offsetsmeta_access,&offsetsmeta_len,&seconds,offsetsmetafile,sizeof(UINT4));
2139 offsetsstrm = (UINT4 *) Access_allocate_private(&offsetsstrm_access,&offsetsstrm_len,&seconds,offsetsstrmfile,sizeof(UINT4));
2140 #endif
2141
2142 #ifdef OLIGOSPACE_NOT_LONG
2143 fprintf(stderr,"Allocating memory (%u 8-byte words) for offsets, kmer %d...",oligospace+1U,
2144 #ifdef PMAP
2145 index1part_aa
2146 #else
2147 index1part
2148 #endif
2149 );
2150 #else
2151 fprintf(stderr,"Allocating memory (%llu 8-byte words) for offsets, kmer %d...",(unsigned long long) oligospace+1UL,
2152 #ifdef PMAP
2153 index1part_aa
2154 #else
2155 index1part
2156 #endif
2157 );
2158 #endif
2159
2160 /* Bitpack procedures start from offsets[1], so we need to print offsets[0] as a special case */
2161 offsets = (Hugepositionsptr_T *) MALLOC((oligospace+1) * sizeof(Hugepositionsptr_T));
2162
2163 if (offsets == NULL) {
2164 fprintf(stderr,"cannot allocated requested memory. Cannot run expand offsets on this machine.\n");
2165 exit(9);
2166 } else {
2167 fprintf(stderr,"done\n");
2168 }
2169
2170 fprintf(stderr,"Expanding offsetsstrm into offsets...");
2171 #ifdef PMAP
2172 for (oligoi = 0UL; oligoi <= oligospace; oligoi += 1) {
2173 offsets[oligoi] = Bitpack64_read_one_huge(oligoi,offsetspages,offsetsmeta,offsetsstrm);
2174 }
2175 #else
2176 for (oligoi = 0UL; oligoi < oligospace; oligoi += blocksize) {
2177 Bitpack64_block_offsets_huge(&(offsets[oligoi]),oligoi,offsetspages,offsetsmeta,offsetsstrm);
2178 }
2179 #endif
2180
2181 fprintf(stderr,"done\n");
2182
2183 #ifdef HAVE_MMAP
2184 munmap((void *) offsetsstrm,offsetsstrm_len);
2185 munmap((void *) offsetsmeta,offsetsmeta_len);
2186 #else
2187 FREE(offsetsstrm);
2188 FREE(offsetsmeta);
2189 #endif
2190 FREE(offsetspages);
2191
2192 return offsets;
2193 }
2194 }
2195 #endif
2196
2197
2198
2199 #ifdef PMAP
2200
2201 /* PMAP version. Doesn't mask bottom 12 nt. */
2202 Univcoord_T *
Indexdb_read(int * nentries,T this,Oligospace_T aaindex)2203 Indexdb_read (int *nentries, T this, Oligospace_T aaindex) {
2204 Univcoord_T *positions, bigendian, littleendian;
2205 Positionsptr_T ptr, ptr0, end0;
2206 int i;
2207 char byte1, byte2, byte3;
2208
2209 debug0(printf("%u (%s)\n",aaindex,Alphabet_aaindex_aa(aaindex,this->alphabet)));
2210
2211 if (this->compression_type == NO_COMPRESSION) {
2212 #ifdef WORDS_BIGENDIAN
2213 /* Also holds for ALLOCATED_PRIVATE and ALLOCATED_SHARED */
2214 ptr0 = Bigendian_convert_uint(this->offsetsstrm[aaindex]);
2215 end0 = Bigendian_convert_uint(this->offsetsstrm[aaindex+1]);
2216 #else
2217 ptr0 = this->offsetsstrm[aaindex];
2218 end0 = this->offsetsstrm[aaindex+1];
2219 #endif
2220
2221 } else if (this->compression_type == BITPACK64_COMPRESSION) {
2222 ptr0 = Bitpack64_read_two(&end0,aaindex,this->offsetsmeta,this->offsetsstrm);
2223
2224 } else {
2225 abort();
2226 }
2227
2228
2229 debug0(printf("offset pointers are %u and %u\n",ptr0,end0));
2230
2231 #ifdef ALLOW_DUPLICATES
2232 /* Not used */
2233 /* Skip backward over bad values, due to duplicates */
2234 if (this->positions_access == FILEIO) {
2235 #ifdef HAVE_PTHREAD
2236 pthread_mutex_lock(&this->positions_read_mutex);
2237 #endif
2238 positions_move_absolute(this->positions_fd,end0-1);
2239 while (end0 > ptr0 && positions_read_backward(this->positions_fd) == BADVAL) {
2240 end0--;
2241 }
2242 #ifdef HAVE_PTHREAD
2243 pthread_mutex_unlock(&this->positions_read_mutex);
2244 #endif
2245 } else {
2246 while (end0 > ptr0 && this->positions[end0-1] == BADVAL) {
2247 end0--;
2248 }
2249 }
2250 #endif /* ALLOW_DUPLICATES */
2251
2252 if ((*nentries = end0 - ptr0) == 0) {
2253 return NULL;
2254 } else {
2255 debug0(printf("Indexdb_read: offset pointers are %u and %u\n",ptr0,end0));
2256
2257 positions = (Univcoord_T *) MALLOC(*nentries*sizeof(Univcoord_T));
2258 if (this->positions_access == FILEIO) {
2259 #ifdef HAVE_PTHREAD
2260 pthread_mutex_lock(&this->positions_read_mutex);
2261 #endif
2262 #ifdef LARGE_GENOMES
2263 positions_move_absolute_1(this->positions_high_fd,ptr0);
2264 positions_move_absolute_4(this->positions_fd,ptr0);
2265 positions_read_multiple_large(this->positions_high_fd,this->positions_fd,positions,*nentries);
2266 #else
2267 positions_move_absolute_4(this->positions_fd,ptr0);
2268 positions_read_multiple(this->positions_fd,positions,*nentries);
2269 #endif
2270 #ifdef HAVE_PTHREAD
2271 pthread_mutex_unlock(&this->positions_read_mutex);
2272 #endif
2273
2274 } else if (this->positions_access == ALLOCATED_PRIVATE || this->positions_access == ALLOCATED_SHARED) {
2275 #ifdef LARGE_GENOMES
2276 positions_copy_multiple_large(positions,&(this->positions_high[ptr0]),&(this->positions[ptr0]),*nentries);
2277 #else
2278 memcpy(positions,&(this->positions[ptr0]),(*nentries)*sizeof(Univcoord_T));
2279 #endif
2280
2281 } else {
2282 #ifdef WORDS_BIGENDIAN
2283 #ifdef LARGE_GENOMES
2284 for (ptr = ptr0, i = 0; ptr < end0; ptr++, i++) {
2285 bigendian = (Univcoord_T) this->positions_high[ptr];
2286 bigendian <<= 8;
2287
2288 littleendian = this->positions[ptr];
2289 bigendian |= littleendian & 0xff; /* 0 */
2290 bigendian <<= 8;
2291 bigendian |= ((littleendian >>= 8) & 0xff); /* 1 */
2292 bigendian <<= 8;
2293 bigendian |= ((littleendian >>= 8) & 0xff); /* 2 */
2294 bigendian <<= 8;
2295 bigendian |= ((littleendian >>= 8) & 0xff); /* 3 */
2296 positions[i] = bigendian;
2297 }
2298 #else
2299 for (ptr = ptr0, i = 0; ptr < end0; ptr++, i++) {
2300 littleendian = this->positions[ptr];
2301 bigendian = littleendian & 0xff; /* 0 */
2302 bigendian <<= 8;
2303 bigendian |= ((littleendian >>= 8) & 0xff); /* 1 */
2304 bigendian <<= 8;
2305 bigendian |= ((littleendian >>= 8) & 0xff); /* 2 */
2306 bigendian <<= 8;
2307 bigendian |= ((littleendian >>= 8) & 0xff); /* 3 */
2308 positions[i] = bigendian;
2309 }
2310 #endif
2311
2312 #else /* littleendian */
2313
2314 #ifdef LARGE_GENOMES
2315 positions_copy_multiple_large(positions,&(this->positions_high[ptr0]),&(this->positions[ptr0]),*nentries);
2316 #else
2317 memcpy(positions,&(this->positions[ptr0]),(*nentries)*sizeof(Univcoord_T));
2318 #endif
2319 #endif
2320 }
2321 debug0(
2322 printf("%d entries:",*nentries);
2323 for (i = 0; i < *nentries; i++) {
2324 printf(" %u",positions[i]);
2325 }
2326 printf("\n");
2327 );
2328
2329 return positions;
2330 }
2331 }
2332
2333 #else
2334
2335 /* GMAP version: Allocates memory */
2336 Univcoord_T *
Indexdb_read(int * nentries,T this,Oligospace_T oligo)2337 Indexdb_read (int *nentries, T this, Oligospace_T oligo) {
2338 Univcoord_T *positions;
2339 Positionsptr_T ptr0, end0;
2340 Oligospace_T part0;
2341 #ifdef WORDS_BIGENDIAN
2342 int i;
2343 Positionsptr_T ptr;
2344 Univcoord_T bigendian, littleendian;
2345 char byte1, byte2, byte3;
2346 #endif
2347 #ifdef DEBUG0
2348 int j;
2349 #endif
2350
2351
2352 #if 0
2353 debug0(printf("%06X (%s)\n",oligo,shortoligo_nt(oligo,index1part)));
2354 #endif
2355 part0 = oligo & poly_T;
2356
2357 /* Ignore poly A and poly T on stage 1 */
2358 /* Was commented out */
2359 if (part0 == poly_A || part0 == poly_T) {
2360 *nentries = 0;
2361 return NULL;
2362 }
2363
2364 if (this->compression_type == NO_COMPRESSION) {
2365 #ifdef WORDS_BIGENDIAN
2366 /* Also holds for ALLOCATED_PRIVATE and ALLOCATED_SHARED */
2367 ptr0 = Bigendian_convert_uint(this->offsetsstrm[part0]);
2368 end0 = Bigendian_convert_uint(this->offsetsstrm[part0+1]);
2369 #else
2370 ptr0 = this->offsetsstrm[part0];
2371 end0 = this->offsetsstrm[part0+1];
2372 #endif
2373
2374 } else if (this->compression_type == BITPACK64_COMPRESSION) {
2375 #ifdef LARGE_GENOMES
2376 ptr0 = Bitpack64_read_two_huge(&end0,part0,this->offsetspages,this->offsetsmeta,this->offsetsstrm);
2377 #else
2378 ptr0 = Bitpack64_read_two(&end0,part0,this->offsetsmeta,this->offsetsstrm);
2379 #endif
2380
2381 } else {
2382 abort();
2383 }
2384
2385
2386 #ifdef ALLOW_DUPLICATES
2387 /* Not used */
2388 /* Skip backward over bad values, due to duplicates */
2389 if (this->positions_access == FILEIO) {
2390 #ifdef HAVE_PTHREAD
2391 pthread_mutex_lock(&this->positions_read_mutex);
2392 #endif
2393 positions_move_absolute(this->positions_fd,end0-1);
2394 while (end0 > ptr0 && positions_read_backward(this->positions_fd) == BADVAL) {
2395 end0--;
2396 }
2397 #ifdef HAVE_PTHREAD
2398 pthread_mutex_unlock(&this->positions_read_mutex);
2399 #endif
2400 } else {
2401 while (end0 > ptr0 && this->positions[end0-1] == BADVAL) {
2402 end0--;
2403 }
2404 }
2405 #endif
2406
2407 if ((*nentries = end0 - ptr0) == 0) {
2408 return NULL;
2409 } else {
2410 debug0(printf("Indexdb_read: offset pointers are %llu and %llu\n",ptr0,end0));
2411
2412 positions = (Univcoord_T *) MALLOC(*nentries*sizeof(Univcoord_T));
2413 #ifdef LARGE_GENOMES
2414 if (this->positions_high_access == FILEIO || this->positions_access == FILEIO) {
2415 #ifdef HAVE_PTHREAD
2416 pthread_mutex_lock(&this->positions_read_mutex);
2417 #endif
2418 positions_move_absolute_1(this->positions_high_fd,ptr0);
2419 positions_move_absolute_4(this->positions_fd,ptr0);
2420 positions_read_multiple_large(this->positions_high_fd,this->positions_fd,positions,*nentries);
2421 #ifdef HAVE_PTHREAD
2422 pthread_mutex_unlock(&this->positions_read_mutex);
2423 #endif
2424
2425 } else if (this->positions_high_access == ALLOCATED_PRIVATE || this->positions_high_access == ALLOCATED_SHARED ||
2426 this->positions_access == ALLOCATED_PRIVATE || this->positions_access == ALLOCATED_SHARED) {
2427 positions_copy_multiple_large(positions,&(this->positions_high[ptr0]),&(this->positions[ptr0]),*nentries);
2428
2429 #else
2430
2431 if (this->positions_access == FILEIO) {
2432 #ifdef HAVE_PTHREAD
2433 pthread_mutex_lock(&this->positions_read_mutex);
2434 #endif
2435 positions_move_absolute_4(this->positions_fd,ptr0);
2436 positions_read_multiple(this->positions_fd,positions,*nentries);
2437 #ifdef HAVE_PTHREAD
2438 pthread_mutex_unlock(&this->positions_read_mutex);
2439 #endif
2440
2441 } else if (this->positions_access == ALLOCATED_PRIVATE || this->positions_access == ALLOCATED_SHARED) {
2442 memcpy(positions,&(this->positions[ptr0]),(*nentries)*sizeof(Univcoord_T));
2443
2444 #endif
2445
2446 } else {
2447 #ifdef WORDS_BIGENDIAN
2448 #ifdef LARGE_GENOMES
2449 for (ptr = ptr0, i = 0; ptr < end0; ptr++, i++) {
2450 bigendian = (Univcoord_T) this->positions_high[ptr];
2451 bigendian <<= 8;
2452
2453 littleendian = this->positions[ptr];
2454 bigendian |= littleendian & 0xff; /* 0 */
2455 bigendian <<= 8;
2456 bigendian |= ((littleendian >>= 8) & 0xff); /* 1 */
2457 bigendian <<= 8;
2458 bigendian |= ((littleendian >>= 8) & 0xff); /* 2 */
2459 bigendian <<= 8;
2460 bigendian |= ((littleendian >>= 8) & 0xff); /* 3 */
2461 positions[i] = bigendian;
2462 }
2463 #else
2464 for (ptr = ptr0, i = 0; ptr < end0; ptr++, i++) {
2465 littleendian = this->positions[ptr];
2466 bigendian = littleendian & 0xff; /* 0 */
2467 bigendian <<= 8;
2468 bigendian |= ((littleendian >>= 8) & 0xff); /* 1 */
2469 bigendian <<= 8;
2470 bigendian |= ((littleendian >>= 8) & 0xff); /* 2 */
2471 bigendian <<= 8;
2472 bigendian |= ((littleendian >>= 8) & 0xff); /* 3 */
2473 positions[i] = bigendian;
2474 }
2475 #endif
2476
2477 #else /* littleendian */
2478
2479 #ifdef LARGE_GENOMES
2480 positions_copy_multiple_large(positions,&(this->positions_high[ptr0]),&(this->positions[ptr0]),*nentries);
2481 #else
2482 memcpy(positions,&(this->positions[ptr0]),(*nentries)*sizeof(Univcoord_T));
2483 #endif
2484 #endif
2485 }
2486
2487 debug0(
2488 printf("%d entries:",*nentries);
2489 for (j = 0; j < *nentries; j++) {
2490 printf(" %llu",positions[j]);
2491 }
2492 printf("\n");
2493 );
2494
2495 return positions;
2496 }
2497 }
2498
2499 #endif /* ifdef PMAP */
2500
2501
2502
2503 /* Does not allocate memory, but points to positions in Indexdb_T.
2504 diagterm is needed to skip any positions that extend past genomicpos 0 */
2505 #ifdef UTILITYP
2506
2507 /* Utility programs do not distinguish between Univcoord_T */
2508
2509 #elif !defined(GSNAP)
2510 /* GMAP reads instead of using pointers */
2511
2512 #elif defined(LARGE_GENOMES)
2513 int
2514 Indexdb_largeptr (unsigned char **positions_high, UINT4 **positions,
2515 T this, Oligospace_T oligo) {
2516 int nentries;
2517 Positionsptr_T ptr0, end0;
2518 #ifdef DEBUG0
2519 unsigned char *p, *r;
2520 UINT4 *q;
2521 #endif
2522
2523 ptr0 = Bitpack64_read_two_huge(&end0,oligo,this->offsetspages,this->offsetsmeta,this->offsetsstrm);
2524
2525 debug0(printf("Indexdb_largeptr: oligo = %06llX, offset pointers are %llu and %llu\n",
2526 oligo,ptr0,end0));
2527
2528 if ((nentries = end0 - ptr0) == 0) {
2529 *positions_high = (unsigned char *) NULL;
2530 *positions = (UINT4 *) NULL;
2531 return 0;
2532
2533 } else {
2534 *positions_high = &(this->positions_high[ptr0]);
2535 *positions = &(this->positions[ptr0]);
2536
2537 debug0(
2538 printf("%d entries:",nentries);
2539 r = &(this->positions_high[end0]);
2540 p = *positions_high;
2541 q = *positions;
2542 while (p < r) {
2543 printf(" %u %u",*p,*q);
2544 p++; q++;
2545 }
2546 printf("\n");
2547 );
2548
2549 return nentries;
2550 }
2551 }
2552 #endif
2553
2554
2555 #ifdef GSNAP
2556 int
2557 Indexdb_ptr (UINT4 **positions, T this, Oligospace_T oligo) {
2558 int nentries;
2559 UINT4 ptr0, end0;
2560 #ifdef DEBUG0
2561 UINT4 *p, *r;
2562 #endif
2563
2564 ptr0 = Bitpack64_read_two(&end0,oligo,this->offsetsmeta,this->offsetsstrm);
2565
2566 debug0(printf("Indexdb_ptr: oligo = %06llX, offset pointers are %u and %u\n",
2567 oligo,ptr0,end0));
2568
2569 if ((nentries = end0 - ptr0) == 0) {
2570 *positions = (UINT4 *) NULL;
2571 return 0;
2572
2573 } else {
2574 *positions = &(this->positions[ptr0]);
2575
2576 debug0(
2577 printf("%d entries",nentries);
2578 r = &(this->positions[end0]);
2579 p = *positions;
2580 while (p < r) {
2581 printf(" %u",*p);
2582 p++;
2583 }
2584 printf("\n");
2585 );
2586
2587 return nentries;
2588 }
2589 }
2590 #endif
2591
2592
2593
2594 #if defined(UTILITYP)
2595
2596 #elif defined(GSNAP)
2597 /* GSNAP uses pointers instead of reading */
2598
2599 #else
2600 /* Analogous to Indexdb_read, except this includes diagterm. Always allocates memory. */
2601 Univcoord_T *
2602 Indexdb_read_with_diagterm (int *nentries, T this, Oligospace_T oligo, int diagterm) {
2603 Univcoord_T *positions;
2604 Positionsptr_T ptr0, end0;
2605
2606 #ifdef LARGE_GENOMES
2607 Positionsptr_T ptr;
2608 int i;
2609 #ifdef HAVE_AVX2
2610 __m256i _high, _low, _diagterm;
2611 Univcoord_T *out;
2612 unsigned char *in_high;
2613 UINT4 *in_low;
2614 #endif
2615
2616 #elif defined(HAVE_AVX2)
2617 __m256i _diagterm;
2618 Univcoord_T *out, *in;
2619 #elif defined(HAVE_SSE2)
2620 __m128i _diagterm;
2621 Univcoord_T *out, *in;
2622 #else
2623 Positionsptr_T ptr;
2624 int i;
2625 #endif
2626
2627 #ifdef DEBUG0
2628 int k;
2629 #endif
2630
2631 assert(diagterm >= 0); /* For GMAP */
2632
2633 #ifdef LARGE_GENOMES
2634 ptr0 = Bitpack64_read_two_huge(&end0,oligo,this->offsetspages,this->offsetsmeta,this->offsetsstrm);
2635 #else
2636 ptr0 = Bitpack64_read_two(&end0,oligo,this->offsetsmeta,this->offsetsstrm);
2637 #endif
2638
2639 debug0(printf("Indexdb_read_with_diagterm: oligo = %06llX, offset pointers are %llu and %llu, diagterm %d\n",
2640 oligo,ptr0,end0,diagterm));
2641
2642 if ((*nentries = end0 - ptr0) == 0) {
2643 return (Univcoord_T *) NULL;
2644
2645 #if 0
2646 } else {
2647 /* Should not be necessary if diagterm >= 0 */
2648 /* Skip any positions that extend past genomicpos 0 */
2649 #ifdef LARGE_GENOMES
2650 while (ptr0 < end0 && ((Univcoord_T) this->positions_high[ptr0] << 32) + this->positions[ptr0] < (Univcoord_T) -diagterm) {
2651 ptr0++;
2652 }
2653 #else
2654 while (ptr0 < end0 && this->positions[ptr0] < (Univcoord_T) -diagterm) {
2655 ptr0++;
2656 }
2657 #endif
2658 if ((*nentries = end0 - ptr0) == 0) {
2659 return (Univcoord_T *) NULL;
2660 }
2661 #endif
2662 }
2663
2664 positions = (Univcoord_T *) MALLOC((*nentries)*sizeof(Univcoord_T));
2665
2666 #ifdef LARGE_GENOMES
2667 /* Previously checked for (this->positions_high_access == ALLOCATED_PRIVATE || this->positions_high_access == ALLOCATED_SHARED ||
2668 this->positions_access == ALLOCATED_PRIVATE || this->positions_access == ALLOCATED_SHARED) */
2669 #ifdef HAVE_AVX2
2670 _diagterm = _mm256_set1_epi64x(diagterm);
2671 in_high = &(this->positions_high[ptr0]);
2672 in_low = &(this->positions[ptr0]);
2673 out = &(positions[0]);
2674 while (in_low + 4 < &(this->positions[end0])) {
2675 _high = _mm256_slli_epi64(_mm256_cvtepu8_epi64(_mm_loadu_si128((__m128i *) in_high)), 32);
2676 _low = _mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i *) in_low));
2677 _mm256_storeu_si256((__m256i *) out, _mm256_add_epi64(_mm256_add_epi64(_high,_low), _diagterm));
2678 in_high += 4;
2679 in_low += 4;
2680 out += 4;
2681 }
2682 while (in_low < &(this->positions[end0])) {
2683 *out++ = ((Univcoord_T) *in_high++ << 32) + (*in_low++) + diagterm;
2684 }
2685
2686 #else
2687 for (ptr = ptr0, i = 0; ptr < end0; ptr++) {
2688 positions[i++] = ((Univcoord_T) this->positions_high[ptr] << 32) + this->positions[ptr] + diagterm;
2689 }
2690 #endif
2691
2692 #elif defined(HAVE_AVX2)
2693 _diagterm = _mm256_set1_epi32(diagterm);
2694 in = &(this->positions[ptr0]);
2695 out = &(positions[0]);
2696 while (in + 8 < &(this->positions[end0])) {
2697 _mm256_storeu_si256((__m256i *) out, _mm256_add_epi32(_mm256_loadu_si256((__m256i *) in), _diagterm));
2698 in += 8;
2699 out += 8;
2700 }
2701 while (in < &(this->positions[end0])) {
2702 *out++ = *in++ + diagterm;
2703 }
2704
2705 #elif defined(HAVE_SSE2)
2706 _diagterm = _mm_set1_epi32(diagterm);
2707 in = &(this->positions[ptr0]);
2708 out = &(positions[0]);
2709 while (in + 4 < &(this->positions[end0])) {
2710 _mm_storeu_si128((__m128i *) out, _mm_add_epi32(_mm_loadu_si128((__m128i *) in), _diagterm));
2711 in += 4;
2712 out += 4;
2713 }
2714 while (in < &(this->positions[end0])) {
2715 *out++ = *in++ + diagterm;
2716 }
2717
2718 #else
2719 for (ptr = ptr0, i = 0; ptr < end0; ptr++) {
2720 positions[i++] = this->positions[ptr] + diagterm;
2721 }
2722 #endif
2723
2724 debug0(
2725 printf("%d entries (vs diagterm %d):",*nentries,diagterm);
2726 for (k = 0; k < *nentries; k++) {
2727 printf(" %u",positions[k]);
2728 }
2729 printf("\n");
2730 );
2731
2732 return positions;
2733 }
2734 #endif
2735
2736
2737
2738 /************************************************************************
2739 * Create procedure -- for user-provided genomic segment
2740 ************************************************************************/
2741
2742 #if defined(UTILITYP) || defined(LARGE_GENOMES)
2743 #else
2744 T
2745 Indexdb_new_segment (char *genomicseg,
2746 #ifdef PMAP
2747 int alphabet_size, Width_T index1part_aa, bool watsonp,
2748 #else
2749 Width_T index1part,
2750 #endif
2751 Width_T index1interval) {
2752 T new = (T) MALLOC(sizeof(*new));
2753 char *uppercaseCode;
2754 Positionsptr_T *work_offsets; /* Working set for use in calculating positions */
2755 int totalcounts = 0;
2756 int c;
2757 Oligospace_T oligospace, oligoi;
2758 char *p;
2759 Univcoord_T position = 0UL;
2760
2761 #ifdef HAVE_64_BIT
2762 Oligospace_T oligo = 0U, masked, mask;
2763 #else
2764 Shortoligomer_T high = 0U, low = 0U, carry;
2765 #endif
2766
2767 #ifdef PMAP
2768 int frame = -1, between_counter[3], in_counter[3];
2769 Oligospace_T aaindex;
2770 int index1part_nt = 3*index1part_aa;
2771 #else
2772 int between_counter, in_counter;
2773 #endif
2774
2775 uppercaseCode = UPPERCASE_U2T;
2776
2777 #ifdef PMAP
2778 oligospace = power(alphabet_size,index1part_aa);
2779 between_counter[0] = between_counter[1] = between_counter[2] = 0;
2780 in_counter[0] = in_counter[1] = in_counter[2] = 0;
2781 new->index1part = index1part_aa;
2782
2783 #else
2784 #ifdef HAVE_64_BIT
2785 mask = ~(~0ULL << 2*index1part);
2786 #else
2787 mask = ~(~0U << 2*index1part);
2788 #endif
2789 oligospace = power(4,index1part);
2790 between_counter = in_counter = 0;
2791 new->index1part = index1part;
2792 #endif
2793
2794 new->index1interval = 1;
2795 new->compression_type = NO_COMPRESSION;
2796
2797 new->offsetsmeta = (UINT4 *) CALLOC(oligospace+1,sizeof(UINT4));
2798 for (oligoi = 0; oligoi <= oligospace; oligoi++) {
2799 new->offsetsmeta[oligoi] = oligoi;
2800 }
2801 new->offsetsmeta_access = ALLOCATED_PRIVATE;
2802
2803 new->offsetsstrm = (UINT4 *) CALLOC(oligospace+1,sizeof(UINT4));
2804 new->offsetsstrm_access = ALLOCATED_PRIVATE;
2805
2806 p = genomicseg;
2807 while ((c = *(p++)) != '\0') {
2808 #ifdef PMAP
2809 if (++frame == 3) {
2810 frame = 0;
2811 }
2812 between_counter[frame] += 1;
2813 in_counter[frame] += 1;
2814 #else
2815 between_counter++;
2816 in_counter++;
2817 #endif
2818
2819 #ifdef HAVE_64_BIT
2820 switch (uppercaseCode[c]) {
2821 case 'A': oligo = (oligo << 2); break;
2822 case 'C': oligo = (oligo << 2) | 1U; break;
2823 case 'G': oligo = (oligo << 2) | 2U; break;
2824 case 'T': oligo = (oligo << 2) | 3U; break;
2825 default:
2826 oligo = 0U;
2827 #ifdef PMAP
2828 in_counter[0] = in_counter[1] = in_counter[2] = 0;
2829 #else
2830 in_counter = 0;
2831 #endif
2832 break;
2833 }
2834
2835 #else
2836 carry = (low >> 30);
2837 switch (uppercaseCode[c]) {
2838 case 'A': low = (low << 2); break;
2839 case 'C': low = (low << 2) | 1U; break;
2840 case 'G': low = (low << 2) | 2U; break;
2841 case 'T': low = (low << 2) | 3U; break;
2842 default:
2843 high = low = carry = 0U;
2844 #ifdef PMAP
2845 in_counter[0] = in_counter[1] = in_counter[2] = 0;
2846 #else
2847 in_counter = 0;
2848 #endif
2849 break;
2850 }
2851 high = (high << 2) | carry;
2852 #endif
2853
2854 #ifdef PMAP
2855 if (in_counter[frame] > 0) {
2856 if (watsonp == true) {
2857 #ifdef HAVE_64_BIT
2858 if (Alphabet_get_codon_fwd(oligo) == AA_STOP) {
2859 in_counter[frame] = 0;
2860 }
2861 #else
2862 if (Alphabet_get_codon_fwd(low) == AA_STOP) {
2863 in_counter[frame] = 0;
2864 }
2865 #endif
2866 } else {
2867 #ifdef HAVE_64_BIT
2868 if (Alphabet_get_codon_rev(oligo) == AA_STOP) {
2869 in_counter[frame] = 0;
2870 }
2871 #else
2872 if (Alphabet_get_codon_rev(low) == AA_STOP) {
2873 in_counter[frame] = 0;
2874 }
2875 #endif
2876 }
2877 }
2878 if (in_counter[frame] == index1part_aa + 1) {
2879 if (between_counter[frame] >= index1interval) {
2880 #ifdef HAVE_64_BIT
2881 aaindex = Alphabet_get_aa_index(oligo,watsonp,index1part_nt);
2882 #else
2883 aaindex = Alphabet_get_aa_index(high,low,watsonp,index1part_nt);
2884 #endif
2885 #ifdef OLIGOSPACE_NOT_LONG
2886 oligoi = (Oligospace_T) aaindex + 1U;
2887 #else
2888 oligoi = (Oligospace_T) aaindex + 1UL;
2889 #endif
2890 new->offsetsstrm[oligoi] += 1;
2891 between_counter[frame] = 0;
2892 }
2893 in_counter[frame] -= 1;
2894 }
2895 #else
2896 if (in_counter == index1part) {
2897 if (
2898 #ifdef NONMODULAR
2899 between_counter >= index1interval
2900 #else
2901 /* Actually, modular condition not needed for user-supplied genomic segment */
2902 (position-index1part+1U) % index1interval == 0
2903 #endif
2904 ) {
2905 masked = oligo & mask;
2906 #ifdef OLIGOSPACE_NOT_LONG
2907 oligoi = (Oligospace_T) masked + 1U;
2908 #else
2909 oligoi = (Oligospace_T) masked + 1UL;
2910 #endif
2911 new->offsetsstrm[oligoi] += 1;
2912 debug(printf("Found oligo %06llX. Incremented offsets for %llu to be %u\n",
2913 masked,(unsigned long long) oligoi,new->offsetsstrm[oligoi]));
2914 between_counter = 0;
2915 }
2916 in_counter--;
2917 }
2918 #endif
2919
2920 position++;
2921 }
2922
2923 #ifdef ADDSENTINEL
2924 for (oligoi = 1; oligoi <= oligospace; oligoi++) {
2925 new->offsetsstrm[oligoi] = new->offsetsstrm[oligoi] + new->offsetsstrm[oligoi-1] + 1;
2926 debug(printf("Offset for %06llX: %u\n",oligoi,new->offsetsstrm[oligoi]));
2927 }
2928 #else
2929 for (oligoi = 1; oligoi <= oligospace; oligoi++) {
2930 new->offsetsstrm[oligoi] = new->offsetsstrm[oligoi] + new->offsetsstrm[oligoi-1];
2931 debug(printf("Offset for %06llX: %u\n",oligoi,new->offsetsstrm[oligoi]));
2932 }
2933 #endif
2934
2935
2936 /* Create positions */
2937 position = 0U;
2938 #ifdef HAVE_64_BIT
2939 oligo = 0ULL;
2940 #else
2941 high = low = 0U;
2942 #endif
2943
2944 #ifdef PMAP
2945 frame = -1;
2946 between_counter[0] = between_counter[1] = between_counter[2] = 0;
2947 in_counter[0] = in_counter[1] = in_counter[2] = 0;
2948 #else
2949 between_counter = in_counter = 0;
2950 #endif
2951
2952 work_offsets = (Positionsptr_T *) CALLOC(oligospace+1,sizeof(Positionsptr_T));
2953 for (oligoi = 0; oligoi <= oligospace; oligoi++) {
2954 work_offsets[oligoi] = new->offsetsstrm[oligoi];
2955 }
2956
2957 totalcounts = new->offsetsstrm[oligospace];
2958 if (totalcounts == 0) {
2959 #ifdef PMAP
2960 fprintf(stderr,"Error: user-provided genomic segment has no valid oligomers of size %d\n",index1part_nt);
2961 #else
2962 fprintf(stderr,"Error: user-provided genomic segment has no valid oligomers of size %d\n",index1part);
2963 #endif
2964 exit(9);
2965 }
2966 new->positions = (Univcoord_T *) CALLOC(totalcounts,sizeof(Univcoord_T));
2967 new->positions_access = ALLOCATED_PRIVATE;
2968
2969 p = genomicseg;
2970 while ((c = *(p++)) != '\0') {
2971 #ifdef PMAP
2972 if (++frame == 3) {
2973 frame = 0;
2974 }
2975 between_counter[frame] += 1;
2976 in_counter[frame] += 1;
2977 #else
2978 between_counter++;
2979 in_counter++;
2980 #endif
2981
2982 #ifdef HAVE_64_BIT
2983 switch (uppercaseCode[c]) {
2984 case 'A': oligo = (oligo << 2); break;
2985 case 'C': oligo = (oligo << 2) | 1U; break;
2986 case 'G': oligo = (oligo << 2) | 2U; break;
2987 case 'T': oligo = (oligo << 2) | 3U; break;
2988 default:
2989 oligo = 0U;
2990 #ifdef PMAP
2991 in_counter[0] = in_counter[1] = in_counter[2] = 0;
2992 #else
2993 in_counter = 0;
2994 #endif
2995 break;
2996 }
2997
2998 #else
2999 carry = (low >> 30);
3000 switch (uppercaseCode[c]) {
3001 case 'A': low = (low << 2); break;
3002 case 'C': low = (low << 2) | 1U; break;
3003 case 'G': low = (low << 2) | 2U; break;
3004 case 'T': low = (low << 2) | 3U; break;
3005 default:
3006 high = low = carry = 0U;
3007 #ifdef PMAP
3008 in_counter[0] = in_counter[1] = in_counter[2] = 0;
3009 #else
3010 in_counter = 0;
3011 #endif
3012 break;
3013 }
3014 high = (high << 2) | carry;
3015 #endif
3016
3017 #ifdef PMAP
3018 if (in_counter[frame] > 0) {
3019 if (watsonp == true) {
3020 #ifdef HAVE_64_BIT
3021 if (Alphabet_get_codon_fwd(oligo) == AA_STOP) {
3022 in_counter[frame] = 0;
3023 }
3024 #else
3025 if (Alphabet_get_codon_fwd(low) == AA_STOP) {
3026 in_counter[frame] = 0;
3027 }
3028 #endif
3029 } else {
3030 #ifdef HAVE_64_BIT
3031 if (Alphabet_get_codon_rev(oligo) == AA_STOP) {
3032 in_counter[frame] = 0;
3033 }
3034 #else
3035 if (Alphabet_get_codon_rev(low) == AA_STOP) {
3036 in_counter[frame] = 0;
3037 }
3038 #endif
3039 }
3040 }
3041
3042 if (in_counter[frame] == index1part_aa + 1) {
3043 if (between_counter[frame] >= index1interval) {
3044 #ifdef HAVE_64_BIT
3045 aaindex = Alphabet_get_aa_index(oligo,watsonp,index1part_nt);
3046 #else
3047 aaindex = Alphabet_get_aa_index(high,low,watsonp,index1part_nt);
3048 #endif
3049 if (watsonp == true) {
3050 new->positions[work_offsets[aaindex]++] = position-index1part_nt+1;
3051 } else {
3052 new->positions[work_offsets[aaindex]++] = position;
3053 }
3054 between_counter[frame] = 0;
3055 }
3056 in_counter[frame] -= 1;
3057 }
3058 #else
3059 if (in_counter == index1part) {
3060 if (
3061 #ifdef NONMODULAR
3062 between_counter >= index1interval
3063 #else
3064 /* Actually, modular condition not needed for user-supplied genomic segment */
3065 (position-index1part+1) % index1interval == 0
3066 #endif
3067 ) {
3068 masked = oligo & mask;
3069 new->positions[work_offsets[masked]++] = position-index1part+1;
3070 between_counter = 0;
3071 }
3072 in_counter--;
3073 }
3074 #endif
3075
3076 position++;
3077 }
3078
3079 #ifdef ADDSENTINEL
3080 for (oligoi = 0; oligoi < oligospace; oligoi++) {
3081 new->positions[work_offsets[oligoi]] = (Univcoord_T) -1;
3082 }
3083 #endif
3084
3085 FREE(work_offsets);
3086
3087 return new;
3088 }
3089 #endif
3090
3091
3092