1 static char rcsid[] = "$Id: indexdb.c 222798 2020-06-03 21:55:53Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11 
12 #include "indexdb.h"
13 #include "indexdbdef.h"
14 
15 
16 #ifdef WORDS_BIGENDIAN
17 #include "bigendian.h"
18 #else
19 #include "littleendian.h"
20 #endif
21 
22 #include <stdio.h>
23 #include <stddef.h>
24 #include <stdlib.h>
25 #include <string.h>		/* For memset */
26 #include <ctype.h>		/* For toupper */
27 #include <sys/mman.h>		/* For munmap */
28 
29 #ifdef HAVE_UNISTD_H
30 #include <unistd.h>		/* For lseek and close */
31 #endif
32 #ifdef HAVE_SYS_TYPES_H
33 #include <sys/types.h>		/* For off_t */
34 #endif
35 #if HAVE_DIRENT_H
36 # include <dirent.h>
37 # define NAMLEN(dirent) strlen((dirent)->d_name)
38 #else
39 # define dirent direct
40 # define NAMLEN(dirent) (dirent)->d_namlen
41 # if HAVE_SYS_NDIR_H
42 #  include <sys/ndir.h>
43 # endif
44 # if HAVE_SYS_DIR_H
45 #  include <sys/dir.h>
46 # endif
47 # if HAVE_NDIR_H
48 #  include <ndir.h>
49 # endif
50 #endif
51 
52 #include "mem.h"
53 #include "assert.h"
54 #include "fopen.h"
55 #include "types.h"		/* For Oligospace_T */
56 #include "filesuffix.h"
57 
58 #include "compress.h"
59 #include "interval.h"
60 #include "complement.h"
61 #include "bitpack64-read.h"	/* For Bitpack64_block_offsets */
62 #include "bitpack64-readtwo.h"
63 
64 #ifdef HAVE_PTHREAD
65 #include <pthread.h>		/* sys/types.h already included above */
66 #endif
67 
68 #if defined(HAVE_SSE2)
69 #include <emmintrin.h>
70 #endif
71 #ifdef HAVE_SSE4_1
72 #include <smmintrin.h>
73 #endif
74 #ifdef HAVE_AVX2
75 #include <immintrin.h>
76 #endif
77 #ifdef HAVE_AVX512
78 #include <immintrin.h>
79 #endif
80 
81 
82 
83 
84 #define MAXENTRIES 20
85 
86 /* Note: NONMODULAR is the old behavior.  Now we store only when
87    startposition % index1interval == 0 */
88 
89 
90 /* Low-level codon hacking */
91 #ifdef DEBUG
92 #define debug(x) x
93 #else
94 #define debug(x)
95 #endif
96 
97 /* Calls to Indexdb_read */
98 #ifdef DEBUG0
99 #define debug0(x) x
100 #else
101 #define debug0(x)
102 #endif
103 
104 /* Shifting of high word to low word for PMAP */
105 #ifdef DEBUG2
106 #define debug2(x) x
107 #else
108 #define debug2(x)
109 #endif
110 
111 
112 #ifdef PMAP
113 #if (defined(DEBUG) || defined(DEBUG0))
114 static Width_T index1part_aa;
115 #endif
116 
117 void
Indexdb_setup(Width_T index1part_aa_in)118 Indexdb_setup (Width_T index1part_aa_in) {
119 #if (defined(DEBUG) || defined(DEBUG0))
120   index1part_aa = index1part_aa_in;
121 #endif
122   return;
123 }
124 
125 #else
126 
127 #define poly_A 0U
128 static Oligospace_T poly_T;  /* Was LOW12MER 0x00FFFFFF */
129 
130 #if (defined(DEBUG) || defined(DEBUG0))
131 static Width_T index1part;
132 #endif
133 
134 void
Indexdb_setup(Width_T index1part_in)135 Indexdb_setup (Width_T index1part_in) {
136 #if (defined(DEBUG) || defined(DEBUG0))
137   index1part = index1part_in;
138 #endif
139 
140 #ifdef HAVE_64_BIT
141   poly_T = ~(~0ULL << 2*index1part_in);
142 #else
143   poly_T = ~(~0U << 2*index1part_in);
144 #endif
145 
146   return;
147 }
148 #endif
149 
150 
151 #define T Indexdb_T
152 
153 
154 #ifdef LARGE_GENOMES
155 void
Indexdb_free(T * old)156 Indexdb_free (T *old) {
157   if (*old) {
158 
159     if ((*old)->positions_high_access == NOT_USED) {
160       /* Skip */
161 
162     } else if ((*old)->positions_high_access == ALLOCATED_PRIVATE) {
163       FREE((*old)->positions_high);
164 
165     } else if ((*old)->positions_high_access == ALLOCATED_SHARED) {
166       Access_deallocate((*old)->positions_high,(*old)->positions_high_shmid,(*old)->positions_high_key);
167 
168 #ifdef HAVE_MMAP
169     } else if ((*old)->positions_high_access == MMAPPED) {
170       munmap((void *) (*old)->positions_high,(*old)->positions_high_len);
171       close((*old)->positions_high_fd);
172 #endif
173     } else {
174       close((*old)->positions_high_fd);
175     }
176 
177     if ((*old)->positions_access == ALLOCATED_PRIVATE) {
178       FREE((*old)->positions);
179 
180     } else if ((*old)->positions_access == ALLOCATED_SHARED) {
181       Access_deallocate((*old)->positions,(*old)->positions_shmid,(*old)->positions_key);
182 
183 #ifdef HAVE_MMAP
184     } else if ((*old)->positions_access == MMAPPED) {
185       munmap((void *) (*old)->positions,(*old)->positions_len);
186       close((*old)->positions_fd);
187 #endif
188     } else {
189       close((*old)->positions_fd);
190     }
191 
192 #ifdef HAVE_PTHREAD
193     if ((*old)->positions_high_access == FILEIO || (*old)->positions_access == FILEIO) {
194       pthread_mutex_destroy(&(*old)->positions_read_mutex);
195     }
196 #endif
197 
198 
199     if ((*old)->offsetsstrm_access == ALLOCATED_PRIVATE) {
200       FREE((*old)->offsetsstrm);
201 
202     } else if ((*old)->offsetsstrm_access == ALLOCATED_SHARED) {
203       Access_deallocate((*old)->offsetsstrm,(*old)->offsetsstrm_shmid,(*old)->offsetsstrm_key);
204 
205 #ifdef HAVE_MMAP
206     } else if ((*old)->offsetsstrm_access == MMAPPED) {
207       munmap((void *) (*old)->offsetsstrm,(*old)->offsetsstrm_len);
208       close((*old)->offsetsstrm_fd);
209 #endif
210     }
211 
212     if ((*old)->offsetsmeta_access == ALLOCATED_PRIVATE) {
213       FREE((*old)->offsetsmeta);
214     } else if ((*old)->offsetsmeta_access == ALLOCATED_SHARED) {
215       Access_deallocate((*old)->offsetsmeta,(*old)->offsetsmeta_shmid,(*old)->offsetsmeta_key);
216 #ifdef HAVE_MMAP
217     } else if ((*old)->offsetsmeta_access == MMAPPED) {
218       munmap((void *) (*old)->offsetsmeta,(*old)->offsetsmeta_len);
219       close((*old)->offsetsmeta_fd);
220 #endif
221     }
222 
223     if ((*old)->offsetspages_access == ALLOCATED_PRIVATE) {
224       FREE((*old)->offsetspages);
225     } else if ((*old)->offsetspages_access == ALLOCATED_SHARED) {
226       Access_deallocate((*old)->offsetspages,(*old)->offsetspages_shmid,(*old)->offsetspages_key);
227 #ifdef HAVE_MMAP
228     } else if ((*old)->offsetspages_access == MMAPPED) {
229       munmap((void *) (*old)->offsetspages,(*old)->offsetspages_len);
230       close((*old)->offsetspages_fd);
231 #endif
232     }
233 
234     FREE(*old);
235   }
236   return;
237 }
238 
239 #else
240 
241 void
Indexdb_free(T * old)242 Indexdb_free (T *old) {
243   if (*old) {
244 
245     if ((*old)->positions_access == ALLOCATED_PRIVATE) {
246       FREE((*old)->positions);
247 
248     } else if ((*old)->positions_access == ALLOCATED_SHARED) {
249       Access_deallocate((*old)->positions,(*old)->positions_shmid,(*old)->positions_key);
250 
251 #ifdef HAVE_MMAP
252     } else if ((*old)->positions_access == MMAPPED) {
253       munmap((void *) (*old)->positions,(*old)->positions_len);
254       close((*old)->positions_fd);
255 #endif
256     } else if ((*old)->positions_access == FILEIO) {
257 #ifdef HAVE_PTHREAD
258       pthread_mutex_destroy(&(*old)->positions_read_mutex);
259 #endif
260       close((*old)->positions_fd);
261     }
262 
263     if ((*old)->offsetsstrm_access == ALLOCATED_PRIVATE) {
264       FREE((*old)->offsetsstrm);
265 
266     } else if ((*old)->offsetsstrm_access == ALLOCATED_SHARED) {
267       Access_deallocate((*old)->offsetsstrm,(*old)->offsetsstrm_shmid,(*old)->offsetsstrm_key);
268 
269 #ifdef HAVE_MMAP
270     } else if ((*old)->offsetsstrm_access == MMAPPED) {
271       munmap((void *) (*old)->offsetsstrm,(*old)->offsetsstrm_len);
272       close((*old)->offsetsstrm_fd);
273 #endif
274     }
275 
276     if ((*old)->offsetsmeta_access == ALLOCATED_PRIVATE) {
277       FREE((*old)->offsetsmeta);
278     } else if ((*old)->offsetsmeta_access == ALLOCATED_SHARED) {
279       Access_deallocate((*old)->offsetsmeta,(*old)->offsetsmeta_shmid,(*old)->offsetsmeta_key);
280 #ifdef HAVE_MMAP
281     } else if ((*old)->offsetsmeta_access == MMAPPED) {
282       munmap((void *) (*old)->offsetsmeta,(*old)->offsetsmeta_len);
283       close((*old)->offsetsmeta_fd);
284 #endif
285     }
286 
287     FREE(*old);
288   }
289   return;
290 }
291 
292 #endif
293 
294 
295 Width_T
Indexdb_interval(T this)296 Indexdb_interval (T this) {
297   return this->index1interval;
298 }
299 
300 
301 #ifdef LARGE_GENOMES
302 bool
Indexdb_positions_fileio_p(T this)303 Indexdb_positions_fileio_p (T this) {
304   if (this->positions_high_access == FILEIO || this->positions_access == FILEIO) {
305     return true;
306   } else {
307     return false;
308   }
309 }
310 #else
311 bool
Indexdb_positions_fileio_p(T this)312 Indexdb_positions_fileio_p (T this) {
313   if (this->positions_access == FILEIO) {
314     return true;
315   } else {
316     return false;
317   }
318 }
319 #endif
320 
321 
322 
323 static Oligospace_T
power(int base,Width_T exponent)324 power (int base, Width_T exponent) {
325 #ifdef OLIGOSPACE_NOT_LONG
326   Oligospace_T result = 1U;
327 #else
328   Oligospace_T result = 1UL;
329 #endif
330   int i;
331 
332   for (i = 0; i < exponent; i++) {
333     result *= base;
334   }
335   return result;
336 }
337 
338 #if 0
339 double
340 Indexdb_mean_size_old (T this, Mode_T mode, Width_T index1part) {
341   Oligospace_T oligospace, n;
342 
343 #ifdef PMAP
344   /* index1part should be in aa */
345   n = oligospace = power(this->alphabet_size,index1part);
346 #else
347   n = oligospace = power(4,index1part);
348   if (mode != STANDARD) {
349     n = power(3,index1part);
350   }
351 #endif
352 
353 #ifdef WORDS_BIGENDIAN
354   /* Also holds for ALLOCATED_PRIVATE and ALLOCATED_SHARED */
355   return (double) Bigendian_convert_uint(this->offsetsstrm[Bigendian_convert_uint(this->offsetsmeta[oligospace/this->blocksize])])/(double) n;
356 #else
357   printf("index1part %d, blocksize %d, oligospace %u\n",index1part,this->blocksize,oligospace);
358   printf("%u\n",this->offsetsmeta[oligospace/this->blocksize]);
359   printf("%u\n",this->offsetsstrm[this->offsetsmeta[oligospace/this->blocksize]]);
360 
361   return (double) this->offsetsstrm[this->offsetsmeta[oligospace/this->blocksize]]/(double) n;
362 #endif
363 }
364 #endif
365 
366 
367 double
Indexdb_mean_size(T this,Mode_T mode,Width_T index1part)368 Indexdb_mean_size (T this, Mode_T mode, Width_T index1part) {
369   Oligospace_T oligospace, n;
370 
371 #ifdef PMAP
372   /* index1part should be in aa */
373   n = oligospace = power(this->alphabet_size,index1part);
374 #else
375   n = oligospace = power(4,index1part);
376   if (mode != STANDARD) {
377     n = power(3,index1part);
378   }
379 #endif
380 
381   return (double) this->total_npositions/(double) n;
382 }
383 
384 
385 
386 
387 static Indexdb_filenames_T
Indexdb_filenames_new(char * pages_filename,char * pointers_filename,char * offsets_filename,char * positions_high_filename,char * positions_filename,char * pointers_basename_ptr,char * offsets_basename_ptr,char * positions_high_basename_ptr,char * positions_basename_ptr,char * pointers_index1info_ptr,char * offsets_index1info_ptr,char * positions_high_index1info_ptr,char * positions_index1info_ptr)388 Indexdb_filenames_new (char *pages_filename, char *pointers_filename, char *offsets_filename,
389 		       char *positions_high_filename, char *positions_filename,
390 		       char *pointers_basename_ptr, char *offsets_basename_ptr,
391 		       char *positions_high_basename_ptr, char *positions_basename_ptr,
392 		       char *pointers_index1info_ptr, char *offsets_index1info_ptr,
393 		       char *positions_high_index1info_ptr, char *positions_index1info_ptr) {
394   Indexdb_filenames_T new = (Indexdb_filenames_T) MALLOC(sizeof(*new));
395 
396   new->pages_filename = pages_filename;
397   new->pointers_filename = pointers_filename;
398   new->offsets_filename = offsets_filename;
399   new->positions_high_filename = positions_high_filename;
400   new->positions_filename = positions_filename;
401 
402   new->pointers_basename_ptr = pointers_basename_ptr;
403   new->offsets_basename_ptr = offsets_basename_ptr;
404   new->positions_high_basename_ptr = positions_high_basename_ptr;
405   new->positions_basename_ptr = positions_basename_ptr;
406 
407   new->pointers_index1info_ptr = pointers_index1info_ptr;
408   new->offsets_index1info_ptr = offsets_index1info_ptr;
409   new->positions_high_index1info_ptr = positions_high_index1info_ptr;
410   new->positions_index1info_ptr = positions_index1info_ptr;
411 
412   return new;
413 }
414 
415 void
Indexdb_filenames_free(Indexdb_filenames_T * old)416 Indexdb_filenames_free (Indexdb_filenames_T *old) {
417 
418   FREE((*old)->pages_filename);
419   FREE((*old)->pointers_filename);
420   FREE((*old)->offsets_filename);
421   FREE((*old)->positions_high_filename);
422   FREE((*old)->positions_filename);
423 
424   FREE(*old);
425 
426   return;
427 }
428 
429 
430 Indexdb_filenames_T
Indexdb_get_filenames_no_compression(Width_T * index1part,Width_T * index1interval,char * genomesubdir,char * fileroot,char * idx_filesuffix,char * snps_root,Width_T required_interval,bool offsets_only_p)431 Indexdb_get_filenames_no_compression (Width_T *index1part, Width_T *index1interval,
432 				      char *genomesubdir, char *fileroot, char *idx_filesuffix, char *snps_root,
433 				      Width_T required_interval, bool offsets_only_p) {
434   char *offsets_filename, *positions_high_filename, *positions_filename,
435     *offsets_basename_ptr, *positions_high_basename_ptr, *positions_basename_ptr,
436     *offsets_index1info_ptr, *positions_high_index1info_ptr, *positions_index1info_ptr;
437 
438   char *base_filename, *filename;
439   char *pattern, interval_char, digit_string[2], *p, *q;
440 #ifndef PMAP
441   char tens;
442 #endif
443   char ones;
444   Width_T found_index1part, found_interval;
445   int rootlength, patternlength;
446 
447   char *locoffsets_suffix, *offsets_suffix, *positions_high_suffix, *positions_suffix;
448   struct dirent *entry;
449   DIR *dp;
450 
451 
452   if (snps_root == NULL) {
453     locoffsets_suffix = "locoffsets";
454     offsets_suffix = "offsets";
455     positions_high_suffix = POSITIONS_HIGH_FILESUFFIX;
456     positions_suffix = POSITIONS_FILESUFFIX;
457   } else {
458     locoffsets_suffix = (char *) CALLOC(strlen("locoffsets")+strlen(".")+strlen(snps_root)+1,sizeof(char));
459     sprintf(locoffsets_suffix,"%s.%s","locoffsets",snps_root);
460     offsets_suffix = (char *) CALLOC(strlen("offsets")+strlen(".")+strlen(snps_root)+1,sizeof(char));
461     sprintf(offsets_suffix,"%s.%s","offsets",snps_root);
462     positions_high_suffix = (char *) CALLOC(strlen(POSITIONS_HIGH_FILESUFFIX)+strlen(".")+strlen(snps_root)+1,sizeof(char));
463     sprintf(positions_high_suffix,"%s.%s",POSITIONS_HIGH_FILESUFFIX,snps_root);
464     positions_suffix = (char *) CALLOC(strlen(POSITIONS_FILESUFFIX)+strlen(".")+strlen(snps_root)+1,sizeof(char));
465     sprintf(positions_suffix,"%s.%s",POSITIONS_FILESUFFIX,snps_root);
466   }
467 
468   *index1part = 0;
469   *index1interval = 1000;
470   base_filename = (char *) NULL;
471 
472   if ((dp = opendir(genomesubdir)) == NULL) {
473     fprintf(stderr,"Unable to open directory %s\n",genomesubdir);
474     return (Indexdb_filenames_T) NULL;
475   }
476 
477   pattern = (char *) CALLOC(strlen(fileroot)+strlen(".")+strlen(idx_filesuffix)+1,sizeof(char));
478   sprintf(pattern,"%s.%s",fileroot,idx_filesuffix);
479   patternlength = strlen(pattern);
480 
481   digit_string[1] = '\0';	/* Needed for atoi */
482   while ((entry = readdir(dp)) != NULL) {
483     filename = entry->d_name;
484     if (!strncmp(filename,pattern,patternlength)) {
485       p = &(filename[strlen(pattern)]); /* Points after idx_filesuffix, e.g., "ref" */
486       if ((q = strstr(p,locoffsets_suffix)) != NULL && !strcmp(q,locoffsets_suffix)) {
487 	/* Skip refXXXlocoffsets */
488       } else if ((q = strstr(p,offsets_suffix)) != NULL && !strcmp(q,offsets_suffix)) {
489 	if (q - p == 3) {
490 #ifdef PMAP
491 	  /* e.g., pf67offsets */
492 	  if (sscanf(p,"%c%c",&ones,&interval_char) == 2) {
493 	    digit_string[0] = ones;
494 	    found_index1part = atoi(digit_string);
495 
496 	    digit_string[0] = interval_char;
497 	    found_interval = atoi(digit_string);
498 	  } else {
499 	    abort();
500 	  }
501 #else
502 	  /* e.g., ref123offsets */
503 	  if (sscanf(p,"%c%c%c",&tens,&ones,&interval_char) == 3) {
504 	    digit_string[0] = tens;
505 	    found_index1part = 10*atoi(digit_string);
506 	    digit_string[0] = ones;
507 	    found_index1part += atoi(digit_string);
508 
509 	    digit_string[0] = interval_char;
510 	    found_interval = atoi(digit_string);
511 	  } else {
512 	    abort();
513 	  }
514 #endif
515 
516 	} else if (q - p == 1) {
517 	  /* Old style, e.g, idx or ref3 */
518 	  if (sscanf(p,"%c",&interval_char) == 1) {
519 	    if (interval_char == 'x') {
520 	      found_interval = 6;
521 	    } else {
522 	      digit_string[0] = interval_char;
523 	      found_interval = atoi(digit_string);
524 	    }
525 	  } else {
526 	    abort();
527 	  }
528 #ifdef PMAP
529 	  found_index1part = found_interval;
530 #else
531 	  found_index1part = 12;
532 #endif
533 
534 	} else {
535 	  fprintf(stderr,"Cannot parse part between %s and offsets in filename %s\n",idx_filesuffix,filename);
536 	  abort();
537 	  if (snps_root != NULL) {
538 	    FREE(locoffsets_suffix);
539 	    FREE(offsets_suffix);
540 	    FREE(positions_high_suffix);
541 	    FREE(positions_suffix);
542 	  }
543 	  return (Indexdb_filenames_T) NULL;
544 	}
545 
546 	if (required_interval != 0) {
547 	  if (found_interval == required_interval) {
548 	    *index1part = found_index1part;
549 	    *index1interval = found_interval;
550 	    FREE(base_filename);
551 	    base_filename = (char *) CALLOC(strlen(filename)+1,sizeof(char));
552 	    strcpy(base_filename,filename);
553 	  }
554 	} else {
555 	  if (found_interval < *index1interval) {
556 	    *index1part = found_index1part;
557 	    *index1interval = found_interval;
558 	    FREE(base_filename);
559 	    base_filename = (char *) CALLOC(strlen(filename)+1,sizeof(char));
560 	    strcpy(base_filename,filename);
561 	  }
562 	}
563       }
564     }
565   }
566 
567   FREE(pattern);
568 
569   if (closedir(dp) < 0) {
570     fprintf(stderr,"Unable to close directory %s\n",genomesubdir);
571   }
572 
573   /* Construct full filenames */
574   if (base_filename == NULL) {
575 #if 0
576     fprintf(stderr,"Cannot find offsets file containing %s and %s",idx_filesuffix,offsets_suffix);
577     if (required_interval > 0) {
578       fprintf(stderr," and having sampling interval of %d",required_interval);
579     }
580     fprintf(stderr,"\n");
581 #endif
582 
583     /* offsets_filename = (char *) NULL; */
584     /* positions_high_filename = (char *) NULL; */
585     /* positions_filename = (char *) NULL; */
586     if (snps_root != NULL) {
587       FREE(locoffsets_suffix);
588       FREE(offsets_suffix);
589       FREE(positions_high_suffix);
590       FREE(positions_suffix);
591     }
592     return (Indexdb_filenames_T) NULL;
593 
594   } else {
595     offsets_filename = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+strlen(base_filename)+1,sizeof(char));
596     offsets_basename_ptr = &(offsets_filename[strlen(genomesubdir)+strlen("/")]);
597     offsets_index1info_ptr = &(offsets_basename_ptr[patternlength]);
598 
599     sprintf(offsets_filename,"%s/%s",genomesubdir,base_filename);
600     if (Access_file_exists_p(offsets_filename) == false) {
601       fprintf(stderr,"Offsets filename %s does not exist\n",offsets_filename);
602       FREE(offsets_filename);
603       /* offsets_filename = (char *) NULL; */
604       /* positions_high_filename = (char *) NULL; */
605       /* positions_filename = (char *) NULL; */
606       FREE(base_filename);
607       if (snps_root != NULL) {
608 	FREE(locoffsets_suffix);
609 	FREE(offsets_suffix);
610 	FREE(positions_high_suffix);
611 	FREE(positions_suffix);
612       }
613       return (Indexdb_filenames_T) NULL;
614     }
615 
616 
617     if ((q = strstr(base_filename,offsets_suffix)) == NULL) {
618       abort();
619     } else {
620       rootlength = q - base_filename;
621     }
622 
623     positions_high_filename = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+rootlength+strlen(positions_high_suffix)+1,sizeof(char));
624     positions_high_basename_ptr = &(positions_high_filename[strlen(genomesubdir)+strlen("/")]);
625     positions_high_index1info_ptr = &(positions_high_basename_ptr[patternlength]);
626 
627     positions_filename = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+rootlength+strlen(positions_suffix)+1,sizeof(char));
628     positions_basename_ptr = &(positions_filename[strlen(genomesubdir)+strlen("/")]);
629     positions_index1info_ptr = &(positions_basename_ptr[patternlength]);
630 
631     sprintf(positions_high_filename,"%s/",genomesubdir);
632     strncpy(positions_high_basename_ptr,base_filename,rootlength);
633     strcpy(&(positions_high_basename_ptr[rootlength]),positions_high_suffix);
634 
635     sprintf(positions_filename,"%s/",genomesubdir);
636     strncpy(positions_basename_ptr,base_filename,rootlength);
637     strcpy(&(positions_basename_ptr[rootlength]),positions_suffix);
638 
639     if (offsets_only_p == true) {
640       /* Do not look for a positions file */
641     } else if (Access_file_exists_p(positions_filename) == false) {
642       fprintf(stderr,"Positions filename %s does not exist\n",positions_filename);
643       FREE(offsets_filename);
644       FREE(positions_high_filename);
645       FREE(positions_filename);
646       /* offsets_filename = (char *) NULL; */
647       /* positions_high_filename = (char *) NULL; */
648       /* positions_filename = (char *) NULL; */
649       FREE(base_filename);
650       if (snps_root != NULL) {
651 	FREE(offsets_suffix);
652 	FREE(positions_high_suffix);
653 	FREE(positions_suffix);
654       }
655       return (Indexdb_filenames_T) NULL;
656     }
657 
658     if (Access_file_exists_p(positions_high_filename) == false) {
659       /* Not a large genome */
660       FREE(positions_high_filename);
661       positions_high_filename = (char *) NULL;
662     }
663 
664     if (snps_root != NULL) {
665       FREE(offsets_suffix);
666       FREE(positions_high_suffix);
667       FREE(positions_suffix);
668     }
669 
670     FREE(base_filename);
671     fprintf(stderr,"Looking for index files in directory %s (offsets not compressed)\n",genomesubdir);
672     fprintf(stderr,"  Offsets file is %s\n",offsets_basename_ptr);
673     if (positions_high_filename == NULL) {
674       fprintf(stderr,"  Positions file is %s\n",positions_basename_ptr);
675     } else {
676       fprintf(stderr,"  Positions files are %s and %s\n",positions_high_basename_ptr,positions_basename_ptr);
677     }
678     return Indexdb_filenames_new(/*pages_filename*/NULL,/*pointers_filename*/NULL,offsets_filename,
679 				 positions_high_filename,positions_filename,
680 				 /*pointers_basename_ptr*/NULL,offsets_basename_ptr,
681 				 positions_high_basename_ptr,positions_basename_ptr,
682 				 /*pointers_index1info_ptr*/NULL,offsets_index1info_ptr,
683 				 positions_high_index1info_ptr,positions_index1info_ptr);
684   }
685 }
686 
687 
688 
689 #ifdef PMAP
690 #define BASE_KMER_SAMPLING 3   /* e.g., 677 */
691 #define KMER_SAMPLING 2   /* e.g., 77 */
692 #else
693 #define BASE_KMER_SAMPLING 3   /* e.g., 153 */
694 #define KMER_SAMPLING 1   /* e.g., 3 */
695 #endif
696 
697 
698 Indexdb_filenames_T
Indexdb_get_filenames_bitpack(Alphabet_T * alphabet,Alphabet_T required_alphabet,Width_T * index1part,Width_T * index1interval,char * genomesubdir,char * fileroot,char * idx_filesuffix,char * snps_root,Width_T required_index1part,Width_T required_interval,Blocksize_T blocksize,bool offsets_only_p)699 Indexdb_get_filenames_bitpack (
700 #ifdef PMAP
701 			       Alphabet_T *alphabet, Alphabet_T required_alphabet,
702 #endif
703 			       Width_T *index1part, Width_T *index1interval,
704 			       char *genomesubdir, char *fileroot, char *idx_filesuffix, char *snps_root,
705 			       Width_T required_index1part, Width_T required_interval,
706 			       Blocksize_T blocksize, bool offsets_only_p) {
707   char *offsetspages_filename, *offsetsmeta_filename, *offsetsstrm_filename,
708     *positions_high_filename, *positions_filename,
709     *offsetspages_basename_ptr, *offsetsmeta_basename_ptr, *offsetsstrm_basename_ptr,
710     *positions_high_basename_ptr, *positions_basename_ptr,
711     *offsetsmeta_index1info_ptr, *offsetsstrm_index1info_ptr,
712     *positions_high_index1info_ptr, *positions_index1info_ptr;
713   /* char *offsetspages_index1info_ptr; */
714 
715   char *base_filename, *filename;
716 #ifdef PMAP
717   char *pattern1, *pattern2, *a;
718   int patternlength1, patternlength2, alphabet_strlen;
719   Alphabet_T found_alphabet;
720 #else
721   char *pattern;
722   char tens;
723 #endif
724   char interval_char, digit_string[2], *p, *q;
725   Width_T found_index1part = 0, found_interval = 0;
726   int rootlength, patternlength;
727 
728   char ones;
729   char *locoffsetsstrm_suffix, *offsetspages_suffix, *offsetsmeta_suffix, *offsetsstrm_suffix,
730     *positions_high_suffix, *positions_suffix;
731   struct dirent *entry;
732   DIR *dp;
733 
734 
735   if (snps_root == NULL) {
736     if (blocksize == 32) {
737       offsetspages_suffix = "offsets32pages";
738       offsetsmeta_suffix = "offsets32meta";
739       locoffsetsstrm_suffix = "locoffsets32strm";
740       offsetsstrm_suffix = "offsets32strm";
741       positions_high_suffix = POSITIONS_HIGH_FILESUFFIX;
742       positions_suffix = POSITIONS_FILESUFFIX;
743     } else if (blocksize == 64) {
744       offsetspages_suffix = "offsets64pages";
745       offsetsmeta_suffix = "offsets64meta";
746       locoffsetsstrm_suffix = "locoffsets64strm";
747       offsetsstrm_suffix = "offsets64strm";
748       positions_high_suffix = POSITIONS_HIGH_FILESUFFIX;
749       positions_suffix = POSITIONS_FILESUFFIX;
750     } else {
751       fprintf(stderr,"Unexpected blocksize %d\n",blocksize);
752       abort();
753     }
754   } else {
755     if (blocksize == 32) {
756       offsetspages_suffix = (char *) CALLOC(strlen("offsets32pages.")+strlen(snps_root)+1,sizeof(char));
757       offsetsmeta_suffix = (char *) CALLOC(strlen("offsets32meta.")+strlen(snps_root)+1,sizeof(char));
758       locoffsetsstrm_suffix = (char *) CALLOC(strlen("locoffsets32strm.")+strlen(snps_root)+1,sizeof(char));
759       offsetsstrm_suffix = (char *) CALLOC(strlen("offsets32strm.")+strlen(snps_root)+1,sizeof(char));
760       positions_high_suffix = (char *) CALLOC(strlen(POSITIONS_HIGH_FILESUFFIX)+strlen(".")+strlen(snps_root)+1,sizeof(char));
761       positions_suffix = (char *) CALLOC(strlen(POSITIONS_FILESUFFIX)+strlen(".")+strlen(snps_root)+1,sizeof(char));
762 
763       sprintf(offsetspages_suffix,"offsets32pages.%s",snps_root);
764       sprintf(offsetsmeta_suffix,"offsets32meta.%s",snps_root);
765       sprintf(locoffsetsstrm_suffix,"locoffsets32strm.%s",snps_root);
766       sprintf(offsetsstrm_suffix,"offsets32strm.%s",snps_root);
767       sprintf(positions_high_suffix,"%s.%s",POSITIONS_HIGH_FILESUFFIX,snps_root);
768       sprintf(positions_suffix,"%s.%s",POSITIONS_FILESUFFIX,snps_root);
769 
770     } else if (blocksize == 64) {
771       offsetspages_suffix = (char *) CALLOC(strlen("offsets64pages.")+strlen(snps_root)+1,sizeof(char));
772       offsetsmeta_suffix = (char *) CALLOC(strlen("offsets64meta.")+strlen(snps_root)+1,sizeof(char));
773       locoffsetsstrm_suffix = (char *) CALLOC(strlen("locoffsets64strm.")+strlen(snps_root)+1,sizeof(char));
774       offsetsstrm_suffix = (char *) CALLOC(strlen("offsets64strm.")+strlen(snps_root)+1,sizeof(char));
775       positions_high_suffix = (char *) CALLOC(strlen(POSITIONS_HIGH_FILESUFFIX)+strlen(".")+strlen(snps_root)+1,sizeof(char));
776       positions_suffix = (char *) CALLOC(strlen(POSITIONS_FILESUFFIX)+strlen(".")+strlen(snps_root)+1,sizeof(char));
777 
778       sprintf(offsetspages_suffix,"offsets64pages.%s",snps_root);
779       sprintf(offsetsmeta_suffix,"offsets64meta.%s",snps_root);
780       sprintf(locoffsetsstrm_suffix,"locoffsets64strm.%s",snps_root);
781       sprintf(offsetsstrm_suffix,"offsets64strm.%s",snps_root);
782       sprintf(positions_high_suffix,"%s.%s",POSITIONS_HIGH_FILESUFFIX,snps_root);
783       sprintf(positions_suffix,"%s.%s",POSITIONS_FILESUFFIX,snps_root);
784 
785     } else {
786       fprintf(stderr,"Unexpected blocksize %d\n",blocksize);
787       abort();
788     }
789   }
790 
791 
792 #ifdef PMAP
793   *alphabet = NALPHABETS + 1;
794 #endif
795   *index1part = 0;
796   *index1interval = 1000;
797   base_filename = (char *) NULL;
798 
799   if ((dp = opendir(genomesubdir)) == NULL) {
800     fprintf(stderr,"Unable to open directory %s\n",genomesubdir);
801     return (Indexdb_filenames_T) NULL;
802   } else {
803     fprintf(stderr,"Looking for genome %s in directory %s\n",fileroot,genomesubdir);
804   }
805 
806 #ifdef PMAP
807   pattern1 = (char *) CALLOC(strlen(fileroot)+strlen(".")+1,sizeof(char)); /* e.g., "hg19." */
808   sprintf(pattern1,"%s.",fileroot);
809   patternlength1 = strlen(pattern1);
810 
811   pattern2 = (char *) CALLOC(strlen(".")+strlen(idx_filesuffix)+1,sizeof(char)); /* e.g., ".pr" */
812   sprintf(pattern2,".%s",idx_filesuffix);
813   patternlength2 = strlen(pattern2);
814 
815   digit_string[1] = '\0';	/* Needed for atoi */
816   while ((entry = readdir(dp)) != NULL) {
817     filename = entry->d_name;
818     if (!strncmp(filename,pattern1,patternlength1)) {
819       a = &(filename[strlen(pattern1)]); /* Points after fileroot, e.g., "hg19." */
820       if ((p = strstr(a,pattern2)) != NULL && (q = strstr(p,locoffsetsstrm_suffix)) != NULL && !strcmp(q,locoffsetsstrm_suffix)) {
821 	/* Skip */
822       } else if ((p = strstr(a,pattern2)) != NULL && (q = strstr(p,offsetsstrm_suffix)) != NULL && !strcmp(q,offsetsstrm_suffix)) {
823 	if ((found_alphabet = Alphabet_find(a)) != AA0) {
824 	  alphabet_strlen = p - a;
825 	  p += patternlength2;
826 
827 	  if (q - p == KMER_SAMPLING) {
828 	    /* Latest style, e.g., pf77 */
829 	    if (sscanf(p,"%c%c",&ones,&interval_char) == 2) {
830 	      digit_string[0] = ones;
831 	      found_index1part = atoi(digit_string);
832 
833 	      digit_string[0] = interval_char;
834 	      found_interval = atoi(digit_string);
835 	    } else {
836 	      abort();
837 	    }
838 
839 #if 0
840 	  } else if (q - p == BASE_KMER_SAMPLING) {
841 	    /* Previous style, e.g., pf677.  No longer supported. */
842 	    if (sscanf(p,"%c%c%c",&ones0,&ones,&interval_char) == 3) {
843 	      digit_string[0] = ones0;
844 	      found_basesize = atoi(digit_string);
845 
846 	      digit_string[0] = ones;
847 	      found_index1part = atoi(digit_string);
848 
849 	      digit_string[0] = interval_char;
850 	      found_interval = atoi(digit_string);
851 	    } else {
852 	      abort();
853 	    }
854 #endif
855 
856 	  } else {
857 	    /* fprintf(stderr,"Cannot parse part between %s and offsets in filename %s\n",idx_filesuffix,filename); */
858 	    if (snps_root != NULL) {
859 	      FREE(locoffsetsstrm_suffix);
860 	      FREE(offsetsstrm_suffix);
861 	      FREE(offsetsmeta_suffix);
862 	      FREE(offsetspages_suffix);
863 	      FREE(positions_high_suffix);
864 	      FREE(positions_suffix);
865 	    }
866 	    return (Indexdb_filenames_T) NULL;
867 	  }
868 
869 	  if ((required_alphabet == AA0 || found_alphabet == required_alphabet) &&
870 	      (required_index1part == 0 || found_index1part == required_index1part) &&
871 	      (required_interval == 0 || found_interval == required_interval)) {
872 	    if (required_alphabet == AA0 && found_alphabet > *alphabet) {
873 	      /* Skip, since we have already found an earlier alphabet */
874 	    } else if (required_index1part == 0 && found_index1part < *index1part) {
875 	      /* Skip, since we have already found a larger index1part */
876 	    } else if (required_interval == 0 && found_interval > *index1interval) {
877 	      /* Skip, since we have already found a smaller interval */
878 	    } else {
879 	      patternlength = patternlength1 + alphabet_strlen + patternlength2;
880 	      /* *basesize = found_basesize; */
881 	      *index1part = found_index1part;
882 	      *index1interval = found_interval;
883 	      *alphabet = found_alphabet;
884 	      FREE(base_filename);
885 	      base_filename = (char *) CALLOC(strlen(filename)+1,sizeof(char));
886 	      strcpy(base_filename,filename);
887 	    }
888 	  }
889 	}
890       }
891     }
892   }
893 
894   FREE(pattern2);
895   FREE(pattern1);
896 
897 #else
898 
899   pattern = (char *) CALLOC(strlen(fileroot)+strlen(".")+strlen(idx_filesuffix)+1,sizeof(char));
900   sprintf(pattern,"%s.%s",fileroot,idx_filesuffix);
901   patternlength = strlen(pattern);
902 
903   digit_string[1] = '\0';	/* Needed for atoi */
904   while ((entry = readdir(dp)) != NULL) {
905     filename = entry->d_name;
906     if (!strncmp(filename,pattern,patternlength)) {
907       p = &(filename[strlen(pattern)]); /* Points after idx_filesuffix, e.g., "ref" */
908       if ((q = strstr(p,locoffsetsstrm_suffix)) != NULL && !strcmp(q,locoffsetsstrm_suffix)) {
909 	/* Skip */
910 
911       } else if ((q = strstr(p,offsetsstrm_suffix)) != NULL && !strcmp(q,offsetsstrm_suffix)) {
912 
913 	if (q - p == BASE_KMER_SAMPLING) {
914 	  /* New style, e.g., ref153 */
915 	  if (sscanf(p,"%c%c%c",&tens,&ones,&interval_char) == 3) {
916 	    digit_string[0] = tens;
917 	    found_index1part = 10*atoi(digit_string);
918 	    digit_string[0] = ones;
919 	    found_index1part += atoi(digit_string);
920 
921 	    digit_string[0] = interval_char;
922 	    found_interval = atoi(digit_string);
923 	  } else {
924 	    abort();
925 	  }
926 
927 	} else {
928 	  fprintf(stderr,"Cannot parse part between %s and offsets in filename %s: found %ld characters, expecting %d\n",
929 		  idx_filesuffix,filename,q-p,BASE_KMER_SAMPLING);
930 	  abort();
931 	  if (snps_root != NULL) {
932 	    FREE(locoffsetsstrm_suffix);
933 	    FREE(offsetsstrm_suffix);
934 	    FREE(offsetsmeta_suffix);
935 	    FREE(offsetspages_suffix);
936 	    FREE(positions_high_suffix);
937 	    FREE(positions_suffix);
938 	  }
939 	  return (Indexdb_filenames_T) NULL;
940 	}
941 
942 	if ((required_index1part == 0 || found_index1part == required_index1part) &&
943 	    (required_interval == 0 || found_interval == required_interval)) {
944 	  if (required_index1part == 0 && found_index1part < *index1part) {
945 	    /* Skip, since we have already found a larger index1part */
946 	  } else if (required_interval == 0 && found_interval > *index1interval) {
947 	    /* Skip, since we have already found a smaller interval */
948 	  } else {
949 	    *index1part = found_index1part;
950 	    *index1interval = found_interval;
951 	    FREE(base_filename);
952 	    base_filename = (char *) CALLOC(strlen(filename)+1,sizeof(char));
953 	    strcpy(base_filename,filename);
954 	  }
955 	}
956       }
957     }
958   }
959 
960   FREE(pattern);
961 #endif
962 
963 
964   if (closedir(dp) < 0) {
965     fprintf(stderr,"Unable to close directory %s\n",genomesubdir);
966   }
967 
968   /* Construct full filenames */
969   if (base_filename == NULL) {
970     /* offsetspages_filename = (char *) NULL; */
971     /* offsetsmeta_filename = (char *) NULL; */
972     /* offsetsstrm_filename = (char *) NULL; */
973     /* positions_high_filename = (char *) NULL; */
974     /* positions_filename = (char *) NULL; */
975     if (snps_root != NULL) {
976       FREE(locoffsetsstrm_suffix);
977       FREE(offsetsstrm_suffix);
978       FREE(offsetsmeta_suffix);
979       FREE(offsetspages_suffix);
980       FREE(positions_high_suffix);
981       FREE(positions_suffix);
982     }
983     return (Indexdb_filenames_T) NULL;
984 
985   } else {
986     offsetsstrm_filename = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+strlen(base_filename)+1,sizeof(char));
987     offsetsstrm_basename_ptr = &(offsetsstrm_filename[strlen(genomesubdir)+strlen("/")]);
988     offsetsstrm_index1info_ptr = &(offsetsstrm_basename_ptr[patternlength]);
989 
990     sprintf(offsetsstrm_filename,"%s/%s",genomesubdir,base_filename);
991     if (Access_file_exists_p(offsetsstrm_filename) == false) {
992       fprintf(stderr,"Offsets filename %s does not exist\n",offsetsstrm_filename);
993       FREE(offsetsstrm_filename);
994       /* offsetsstrm_filename = (char *) NULL; */
995       /* positions_high_filename = (char *) NULL; */
996       /* positions_filename = (char *) NULL; */
997       FREE(base_filename);
998       if (snps_root != NULL) {
999 	FREE(locoffsetsstrm_suffix);
1000 	FREE(offsetsstrm_suffix);
1001 	FREE(offsetsmeta_suffix);
1002 	FREE(offsetspages_suffix);
1003 	FREE(positions_high_suffix);
1004 	FREE(positions_suffix);
1005       }
1006       return (Indexdb_filenames_T) NULL;
1007     }
1008 
1009 
1010     if ((q = strstr(base_filename,offsetsstrm_suffix)) == NULL) {
1011       abort();
1012     } else {
1013       rootlength = q - base_filename;
1014     }
1015 
1016     offsetsmeta_filename = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+rootlength+strlen(offsetsmeta_suffix)+1,sizeof(char));
1017     offsetsmeta_basename_ptr = &(offsetsmeta_filename[strlen(genomesubdir)+strlen("/")]);
1018     offsetsmeta_index1info_ptr = &(offsetsmeta_basename_ptr[patternlength]);
1019 
1020     sprintf(offsetsmeta_filename,"%s/",genomesubdir);
1021     strncpy(offsetsmeta_basename_ptr,base_filename,rootlength);
1022     strcpy(&(offsetsmeta_basename_ptr[rootlength]),offsetsmeta_suffix);
1023 
1024     if (Access_file_exists_p(offsetsmeta_filename) == false) {
1025       fprintf(stderr,"Offsetsmeta filename %s does not exist\n",offsetsmeta_filename);
1026       FREE(offsetsstrm_filename);
1027       /* offsetsstrm_filename = (char *) NULL; */
1028       FREE(base_filename);
1029       if (snps_root != NULL) {
1030 	FREE(locoffsetsstrm_suffix);
1031 	FREE(offsetsstrm_suffix);
1032 	FREE(offsetsmeta_suffix);
1033 	FREE(positions_high_suffix);
1034 	FREE(positions_suffix);
1035       }
1036       return (Indexdb_filenames_T) NULL;
1037     }
1038 
1039 
1040     offsetspages_filename = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+rootlength+strlen(offsetspages_suffix)+1,sizeof(char));
1041     offsetspages_basename_ptr = &(offsetspages_filename[strlen(genomesubdir)+strlen("/")]);
1042     /* offsetspages_index1info_ptr = &(offsetspages_basename_ptr[patternlength]); */
1043 
1044     sprintf(offsetspages_filename,"%s/",genomesubdir);
1045     strncpy(offsetspages_basename_ptr,base_filename,rootlength);
1046     strcpy(&(offsetspages_basename_ptr[rootlength]),offsetspages_suffix);
1047     if (Access_file_exists_p(offsetspages_filename) == false) {
1048       /* Not a huge genome */
1049       FREE(offsetspages_filename);
1050       offsetspages_filename = (char *) NULL;
1051     }
1052 
1053     positions_high_filename = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+rootlength+strlen(positions_high_suffix)+1,sizeof(char));
1054     positions_high_basename_ptr = &(positions_high_filename[strlen(genomesubdir)+strlen("/")]);
1055     positions_high_index1info_ptr = &(positions_high_basename_ptr[patternlength]);
1056 
1057     positions_filename = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+rootlength+strlen(positions_suffix)+1,sizeof(char));
1058     positions_basename_ptr = &(positions_filename[strlen(genomesubdir)+strlen("/")]);
1059     positions_index1info_ptr = &(positions_basename_ptr[patternlength]);
1060 
1061     sprintf(positions_high_filename,"%s/",genomesubdir);
1062     strncpy(positions_high_basename_ptr,base_filename,rootlength);
1063     strcpy(&(positions_high_basename_ptr[rootlength]),positions_high_suffix);
1064 
1065     sprintf(positions_filename,"%s/",genomesubdir);
1066     strncpy(positions_basename_ptr,base_filename,rootlength);
1067     strcpy(&(positions_basename_ptr[rootlength]),positions_suffix);
1068 
1069     if (offsets_only_p == true) {
1070       /* Do not look for a positions file */
1071     } else if (Access_file_exists_p(positions_filename) == false) {
1072       /* Try newer naming scheme: ref153positions instead of ref12153positions */
1073       sprintf(positions_high_filename,"%s/",genomesubdir);
1074       strncpy(positions_high_basename_ptr,base_filename,rootlength-BASE_KMER_SAMPLING); /* e.g., skip "153" */
1075       strncpy(&(positions_high_basename_ptr[rootlength-BASE_KMER_SAMPLING]),&(base_filename[rootlength-BASE_KMER_SAMPLING]),BASE_KMER_SAMPLING);
1076       strcpy(&(positions_high_basename_ptr[rootlength]),positions_high_suffix);
1077 
1078       sprintf(positions_filename,"%s/",genomesubdir);
1079       strncpy(positions_basename_ptr,base_filename,rootlength-BASE_KMER_SAMPLING); /* e.g., skip "153" */
1080       strncpy(&(positions_basename_ptr[rootlength-BASE_KMER_SAMPLING]),&(base_filename[rootlength-BASE_KMER_SAMPLING]),BASE_KMER_SAMPLING);
1081       strcpy(&(positions_basename_ptr[rootlength]),positions_suffix);
1082 
1083       if (Access_file_exists_p(positions_filename) == false) {
1084 	fprintf(stderr,"Positions filename %s does not exist\n",positions_filename);
1085 	FREE(offsetspages_filename);
1086 	FREE(offsetsmeta_filename);
1087 	FREE(offsetsstrm_filename);
1088 	FREE(positions_high_filename);
1089 	FREE(positions_filename);
1090 	/* offsetsmeta_filename = (char *) NULL; */
1091 	/* offsetsstrm_filename = (char *) NULL; */
1092 	/* positions_high_filename = (char *) NULL; */
1093 	/* positions_filename = (char *) NULL; */
1094 	FREE(base_filename);
1095 	if (snps_root != NULL) {
1096 	  FREE(locoffsetsstrm_suffix);
1097 	  FREE(offsetsstrm_suffix);
1098 	  FREE(offsetsmeta_suffix);
1099 	  FREE(offsetspages_suffix);
1100 	  FREE(positions_high_suffix);
1101 	  FREE(positions_suffix);
1102 	}
1103 	return (Indexdb_filenames_T) NULL;
1104       }
1105     }
1106 
1107     if (Access_file_exists_p(positions_high_filename) == false) {
1108       /* Not a large genome */
1109       FREE(positions_high_filename);
1110       positions_high_filename = (char *) NULL;
1111     }
1112 
1113     if (snps_root != NULL) {
1114       FREE(locoffsetsstrm_suffix);
1115       FREE(offsetsstrm_suffix);
1116       FREE(offsetsmeta_suffix);
1117       FREE(offsetspages_suffix);
1118       FREE(positions_high_suffix);
1119       FREE(positions_suffix);
1120     }
1121 
1122     FREE(base_filename);
1123 
1124     fprintf(stderr,"Looking for index files in directory %s\n",genomesubdir);
1125     if (offsetspages_filename != NULL)  {
1126       fprintf(stderr,"  Pages file is %s\n",offsetspages_basename_ptr);
1127     }
1128     fprintf(stderr,"  Pointers file is %s\n",offsetsmeta_basename_ptr);
1129     fprintf(stderr,"  Offsets file is %s\n",offsetsstrm_basename_ptr);
1130     if (positions_high_filename == NULL) {
1131       fprintf(stderr,"  Positions file is %s\n",positions_basename_ptr);
1132     } else {
1133       fprintf(stderr,"  Positions files are %s and %s\n",positions_high_basename_ptr,positions_basename_ptr);
1134     }
1135     return Indexdb_filenames_new(offsetspages_filename,offsetsmeta_filename,offsetsstrm_filename,
1136 				 positions_high_filename,positions_filename,
1137 				 offsetsmeta_basename_ptr,offsetsstrm_basename_ptr,
1138 				 positions_high_basename_ptr,positions_basename_ptr,
1139 				 offsetsmeta_index1info_ptr,offsetsstrm_index1info_ptr,
1140 				 positions_high_index1info_ptr,positions_index1info_ptr);
1141 
1142   }
1143 }
1144 
1145 
1146 Indexdb_filenames_T
Indexdb_get_filenames(int * compression_type,Alphabet_T * alphabet,Alphabet_T required_alphabet,Width_T * index1part,Width_T * index1interval,char * genomesubdir,char * fileroot,char * idx_filesuffix,char * snps_root,Width_T required_index1part,Width_T required_interval,bool offsets_only_p)1147 Indexdb_get_filenames (int *compression_type,
1148 #ifdef PMAP
1149 		       Alphabet_T *alphabet, Alphabet_T required_alphabet,
1150 #endif
1151 		       Width_T *index1part, Width_T *index1interval, char *genomesubdir,
1152 		       char *fileroot, char *idx_filesuffix, char *snps_root,
1153 		       Width_T required_index1part, Width_T required_interval,
1154 		       bool offsets_only_p) {
1155   Indexdb_filenames_T filenames;
1156 
1157   if ((filenames = Indexdb_get_filenames_no_compression(&(*index1part),&(*index1interval),
1158 							genomesubdir,fileroot,idx_filesuffix,snps_root,
1159 							required_interval,offsets_only_p)) != NULL) {
1160     *compression_type = NO_COMPRESSION;
1161     return filenames;
1162 
1163 
1164   } else if ((filenames = Indexdb_get_filenames_bitpack(
1165 #ifdef PMAP
1166 							&(*alphabet),required_alphabet,
1167 #endif
1168 							&(*index1part),&(*index1interval),
1169 							genomesubdir,fileroot,idx_filesuffix,snps_root,
1170 							required_index1part,required_interval,
1171 							/*blocksize*/64,offsets_only_p)) != NULL) {
1172     *compression_type = BITPACK64_COMPRESSION;
1173     return filenames;
1174 
1175   } else {
1176     return (Indexdb_filenames_T) NULL;
1177   }
1178 }
1179 
1180 
1181 #if 0
1182 void
1183 Indexdb_shmem_remove (char *genomesubdir, char *fileroot, char *idx_filesuffix, char *snps_root,
1184 #ifdef PMAP
1185 		      Alphabet_T *alphabet, int *alphabet_size, Alphabet_T required_alphabet,
1186 #endif
1187 		      Width_T required_index1part, Width_T required_interval, bool expand_offsets_p) {
1188   Indexdb_filenames_T filenames;
1189   int index1part, index1interval;
1190 
1191   if ((filenames = Indexdb_get_filenames_no_compression(&index1part,&index1interval,
1192 							genomesubdir,fileroot,idx_filesuffix,snps_root,
1193 							required_interval,/*offsets_only_p*/false)) != NULL) {
1194     /* Try non-compressed files */
1195     Access_shmem_remove(filenames->offsets_filename);
1196 
1197   } else if ((filenames = Indexdb_get_filenames_bitpack(
1198 #ifdef PMAP
1199 							&(*alphabet),required_alphabet,
1200 #endif
1201 							&index1part,&index1interval,
1202 							genomesubdir,fileroot,idx_filesuffix,snps_root,
1203 							required_index1part,required_interval,
1204 							/*blocksize*/64,/*offsets_only_p*/false)) != NULL) {
1205     if (expand_offsets_p == true) {
1206       /* ALLOCATED_PRIVATE */
1207 
1208     } else {
1209       Access_shmem_remove(filenames->pointers_filename);
1210       Access_shmem_remove(filenames->offsets_filename);
1211 #ifdef LARGE_GENOMES
1212       if (filenames->pages_filename != NULL) {
1213 	Access_shmem_remove(filenames->pages_filename);
1214       }
1215 #endif
1216     }
1217   }
1218 
1219 #ifdef LARGE_GENOMES
1220   Access_shmem_remove(filenames->positions_high_filename);
1221   Access_shmem_remove(filenames->positions_filename);
1222 #else
1223   Access_shmem_remove(filenames->positions_filename);
1224 #endif
1225 
1226   Indexdb_filenames_free(&filenames);
1227 
1228   return;
1229 }
1230 #endif
1231 
1232 
1233 static void
load_offsets(T new,Indexdb_filenames_T filenames,char * idx_filesuffix,char * snps_root,Access_mode_T offsetsstrm_access,bool sharedp,bool multiple_sequences_p,bool preload_shared_memory_p,bool unload_shared_memory_p)1234 load_offsets (T new, Indexdb_filenames_T filenames, char *idx_filesuffix, char *snps_root,
1235 	      Access_mode_T offsetsstrm_access, bool sharedp,
1236 	      bool multiple_sequences_p, bool preload_shared_memory_p,
1237 	      bool unload_shared_memory_p) {
1238   char *comma;
1239   double seconds;
1240 #ifdef HAVE_MMAP
1241   int npages;
1242 #endif
1243 
1244   fprintf(stderr,"Offsets compression type: bitpack64\n");
1245   new->compression_type = BITPACK64_COMPRESSION;
1246   new->blocksize = 64;  /* Used to be determined by 4^(kmer - basesize), but now fixed at 64 */
1247 
1248   /* (1) Offsetsmeta: always ALLOCATED */
1249   if (snps_root) {
1250     fprintf(stderr,"Allocating memory for %s (%s) offset pointers, kmer %d, interval %d...",
1251 	    idx_filesuffix,snps_root,new->index1part,new->index1interval);
1252   } else {
1253     fprintf(stderr,"Allocating memory for %s offset pointers, kmer %d, interval %d...",
1254 	    idx_filesuffix,new->index1part,new->index1interval);
1255   }
1256 
1257   if (
1258 #ifndef HAVE_MMAP
1259       0 &&
1260 #endif
1261       multiple_sequences_p == false && preload_shared_memory_p == false && unload_shared_memory_p == false) {
1262 #ifdef HAVE_MMAP
1263     new->offsetsmeta = (UINT4 *) Access_mmap(&new->offsetsmeta_fd,&new->offsetsmeta_len,&seconds,
1264 					     filenames->pointers_filename,/*randomp*/false);
1265     new->offsetsmeta_access = MMAPPED;
1266 
1267 #endif
1268   } else if (sharedp == true) {
1269     new->offsetsmeta = (UINT4 *) Access_allocate_shared(&new->offsetsmeta_access,&new->offsetsmeta_shmid,&new->offsetsmeta_key,
1270 							&new->offsetsmeta_fd,&new->offsetsmeta_len,&seconds,
1271 							filenames->pointers_filename,sizeof(UINT4));
1272   } else {
1273     new->offsetsmeta = (UINT4 *) Access_allocate_private(&new->offsetsmeta_access,&new->offsetsmeta_len,&seconds,
1274 							 filenames->pointers_filename,sizeof(UINT4));
1275   }
1276 
1277   if (new->offsetsmeta == NULL) {
1278     fprintf(stderr,"insufficient memory for allocating %s\n",filenames->pointers_filename);
1279     exit(9);
1280   } else {
1281     comma = Genomicpos_commafmt(new->offsetsmeta_len);
1282     fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1283     FREE(comma);
1284     /* new->offsetsmeta_access set by Access_allocate */
1285   }
1286 
1287 
1288   /* (2) Offsetsstrm: could be ALLOCATED or MMAPPED +/- PRELOAD */
1289   if (offsetsstrm_access == USE_ALLOCATE) {
1290     if (snps_root) {
1291       fprintf(stderr,"Allocating memory for %s (%s) offsets, kmer %d, interval %d...",
1292 	      idx_filesuffix,snps_root,new->index1part,new->index1interval);
1293     } else {
1294       fprintf(stderr,"Allocating memory for %s offsets, kmer %d, interval %d...",
1295 	      idx_filesuffix,new->index1part,new->index1interval);
1296     }
1297 
1298     if (sharedp == true) {
1299       new->offsetsstrm = (UINT4 *) Access_allocate_shared(&new->offsetsstrm_access,&new->offsetsstrm_shmid,&new->offsetsstrm_key,
1300 							  &new->offsetsstrm_fd,&new->offsetsstrm_len,&seconds,
1301 							  filenames->offsets_filename,sizeof(UINT4));
1302     } else {
1303       new->offsetsstrm = (UINT4 *) Access_allocate_private(&new->offsetsstrm_access,&new->offsetsstrm_len,&seconds,
1304 							   filenames->offsets_filename,sizeof(UINT4));
1305     }
1306 
1307     if (new->offsetsstrm == NULL) {
1308       fprintf(stderr,"insufficient memory for allocating %s\n",filenames->offsets_filename);
1309       exit(9);
1310     } else {
1311       comma = Genomicpos_commafmt(new->offsetsstrm_len);
1312       fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1313       FREE(comma);
1314     }
1315 
1316 #ifdef HAVE_MMAP
1317   } else if (offsetsstrm_access == USE_MMAP_PRELOAD) {
1318     if (snps_root) {
1319       fprintf(stderr,"Pre-loading %s (%s) offsets, kmer %d, interval %d...",
1320 	      idx_filesuffix,snps_root,new->index1part,new->index1interval);
1321     } else {
1322       fprintf(stderr,"Pre-loading %s offsets, kmer %d, interval %d...",
1323 	      idx_filesuffix,new->index1part,new->index1interval);
1324     }
1325 
1326     new->offsetsstrm = (UINT4 *) Access_mmap_and_preload(&new->offsetsstrm_fd,&new->offsetsstrm_len,&npages,&seconds,
1327 							 filenames->offsets_filename,sizeof(UINT4));
1328     if (new->offsetsstrm == NULL) {
1329       fprintf(stderr,"insufficient memory\n");
1330       exit(9);
1331     } else {
1332       comma = Genomicpos_commafmt(new->offsetsstrm_len);
1333       fprintf(stderr,"done (%s bytes, %d pages, %.2f sec)\n",comma,npages,seconds);
1334       FREE(comma);
1335       new->offsetsstrm_access = MMAPPED;
1336     }
1337 #endif
1338 
1339 #ifdef HAVE_MMAP
1340   } else if (offsetsstrm_access == USE_MMAP_ONLY) {
1341     if (snps_root) {
1342       fprintf(stderr,"Memory mapping %s (%s) offsets, kmer %d, interval %d...",
1343 	      idx_filesuffix,snps_root,new->index1part,new->index1interval);
1344     } else {
1345       fprintf(stderr,"Memory mapping %s offsets, kmer %d, interval %d...",
1346 	      idx_filesuffix,new->index1part,new->index1interval);
1347     }
1348 
1349     new->offsetsstrm = (UINT4 *) Access_mmap(&new->offsetsstrm_fd,&new->offsetsstrm_len,&seconds,
1350 					     filenames->offsets_filename,/*randomp*/false);
1351     if (new->offsetsstrm == NULL) {
1352       fprintf(stderr,"Insufficient memory for memory mapping %s\n",filenames->offsets_filename);
1353       exit(9);
1354     } else {
1355       comma = Genomicpos_commafmt(new->offsetsstrm_len);
1356       fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1357       FREE(comma);
1358       new->offsetsstrm_access = MMAPPED;
1359     }
1360 #endif
1361 
1362   } else if (offsetsstrm_access == USE_FILEIO) {
1363     fprintf(stderr,"Offsetsstrm file I/O access of %s not allowed\n",filenames->offsets_filename);
1364     exit(9);
1365 
1366   } else {
1367     fprintf(stderr,"Don't recognize offsetsstrm_access type %d\n",offsetsstrm_access);
1368     abort();
1369   }
1370 
1371 
1372 #if defined(LARGE_GENOMES) || defined(UTILITYP)
1373   /* (3) Pages: always ALLOCATED */
1374   if (filenames->pages_filename == NULL) {
1375     new->hugep = false;
1376     new->offsetspages_access = NOT_USED;
1377     new->offsetspages_len = 0;
1378     new->offsetspages = (UINT4 *) MALLOC(1*sizeof(UINT4));
1379     new->offsetspages[0] = (UINT4) -1;
1380 
1381   } else {
1382     new->hugep = true;
1383     if (
1384 #ifndef HAVE_MMAP
1385 	0 &&
1386 #endif
1387 	multiple_sequences_p == false && preload_shared_memory_p == false && unload_shared_memory_p == false) {
1388 #ifdef HAVE_MMAP
1389       new->offsetspages = (UINT4 *) Access_mmap(&new->offsetspages_fd,&new->offsetspages_len,&seconds,
1390 						filenames->pages_filename,/*randomp*/false);
1391       new->offsetspages_access = MMAPPED;
1392 #endif
1393     } else if (sharedp == true) {
1394       new->offsetspages = (UINT4 *) Access_allocate_shared(&new->offsetspages_access,&new->offsetspages_shmid,&new->offsetspages_key,
1395 							   &new->offsetspages_fd,&new->offsetspages_len,&seconds,
1396 							   filenames->pages_filename,sizeof(UINT4));
1397     } else {
1398       new->offsetspages = (UINT4 *) Access_allocate_private(&new->offsetspages_access,&new->offsetspages_len,&seconds,
1399 							    filenames->pages_filename,sizeof(UINT4));
1400     }
1401   }
1402 #endif
1403 
1404   return;
1405 }
1406 
1407 
1408 #if defined(LARGE_GENOMES) || defined(UTILITYP)
1409 static void
load_positions_high(T new,Indexdb_filenames_T filenames,char * idx_filesuffix,char * snps_root,Access_mode_T positions_access,bool sharedp,bool multiple_sequences_p,bool preload_shared_memory_p,bool unload_shared_memory_p)1410 load_positions_high (T new, Indexdb_filenames_T filenames, char *idx_filesuffix,
1411 		     char *snps_root, Access_mode_T positions_access, bool sharedp,
1412 		     bool multiple_sequences_p, bool preload_shared_memory_p,
1413 		     bool unload_shared_memory_p) {
1414   char *comma;
1415   double seconds;
1416 #ifdef HAVE_MMAP
1417   int npages;
1418 #endif
1419 
1420   /* (4) Positions high */
1421   if (positions_access == USE_ALLOCATE) {
1422     if (snps_root) {
1423       fprintf(stderr,"Allocating memory for %s (%s) positions high, kmer %d, interval %d...",
1424 	      idx_filesuffix,snps_root,new->index1part,new->index1interval);
1425     } else {
1426       fprintf(stderr,"Allocating memory for %s positions high, kmer %d, interval %d...",
1427 	      idx_filesuffix,new->index1part,new->index1interval);
1428     }
1429 
1430     if (filenames->positions_high_filename == NULL) {
1431       new->positions_high = (unsigned char *) NULL;
1432 
1433     } else if (
1434 #ifndef HAVE_MMAP
1435 	       0 &&
1436 #endif
1437 	       multiple_sequences_p == false && preload_shared_memory_p == false && unload_shared_memory_p == false) {
1438 #ifdef HAVE_MMAP
1439       new->positions_high = (unsigned char *) Access_mmap(&new->positions_high_fd,&new->positions_high_len,&seconds,
1440 							  filenames->positions_high_filename,/*randomp*/false);
1441       new->positions_high_access = MMAPPED;
1442 #endif
1443 
1444     } else if (sharedp == true) {
1445       new->positions_high = (unsigned char *) Access_allocate_shared(&new->positions_high_access,&new->positions_high_shmid,&new->positions_high_key,
1446 								     &new->positions_high_fd,&new->positions_high_len,&seconds,
1447 								     filenames->positions_high_filename,sizeof(unsigned char));
1448 
1449     } else {
1450       new->positions_high = (unsigned char *) Access_allocate_private(&new->positions_high_access,&new->positions_high_len,&seconds,
1451 								      filenames->positions_high_filename,sizeof(unsigned char));
1452     }
1453 
1454     if (filenames->positions_high_filename == NULL) {
1455       new->positions_high_access = NOT_USED;
1456     } else if (new->positions_high == NULL) {
1457       fprintf(stderr,"insufficient memory (need to use a lower batch mode (-B)\n");
1458       exit(9);
1459     } else {
1460       comma = Genomicpos_commafmt(new->positions_high_len);
1461       fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1462       FREE(comma);
1463       /* new->positions_high_access set by Access_allocate */
1464     }
1465 
1466 #ifdef HAVE_MMAP
1467   } else if (positions_access == USE_MMAP_PRELOAD) {
1468     if (snps_root) {
1469       fprintf(stderr,"Pre-loading %s (%s) positions high, kmer %d, interval %d...",
1470 	      idx_filesuffix,snps_root,new->index1part,new->index1interval);
1471     } else {
1472       fprintf(stderr,"Pre-loading %s positions high, kmer %d, interval %d...",
1473 	      idx_filesuffix,new->index1part,new->index1interval);
1474     }
1475 
1476     if (filenames->positions_high_filename == NULL) {
1477       new->positions_high = (unsigned char *) NULL;
1478     } else {
1479       new->positions_high = (unsigned char *) Access_mmap_and_preload(&new->positions_high_fd,&new->positions_high_len,&npages,&seconds,
1480 								      filenames->positions_high_filename,sizeof(unsigned char));
1481     }
1482 
1483     if (filenames->positions_high_filename == NULL) {
1484       new->positions_high_access = NOT_USED;
1485     } else if (new->positions_high == NULL) {
1486       fprintf(stderr,"insufficient memory\n");
1487       exit(9);
1488     } else {
1489       comma = Genomicpos_commafmt(new->positions_high_len);
1490       fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1491       FREE(comma);
1492       new->positions_high_access = MMAPPED;
1493     }
1494 #endif
1495 
1496 #ifdef HAVE_MMAP
1497   } else if (positions_access == USE_MMAP_ONLY) {
1498     if (snps_root) {
1499       fprintf(stderr,"Memory mapping %s (%s) positions high, kmer %d, interval %d...",
1500 	      idx_filesuffix,snps_root,new->index1part,new->index1interval);
1501     } else {
1502       fprintf(stderr,"Memory mapping %s positions high, kmer %d, interval %d...",
1503 	      idx_filesuffix,new->index1part,new->index1interval);
1504     }
1505 
1506     if (filenames->positions_high_filename == NULL) {
1507       new->positions_high = (unsigned char *) NULL;
1508     } else {
1509       new->positions_high = (unsigned char *) Access_mmap(&new->positions_high_fd,&new->positions_high_len,&seconds,
1510 							  filenames->positions_high_filename,/*randomp*/true);
1511     }
1512 
1513     if (filenames->positions_high_filename == NULL) {
1514       new->positions_high_access = NOT_USED;
1515     } else if (new->positions_high == NULL) {
1516       fprintf(stderr,"Insufficient memory for memory mapping %s\n",
1517 	      filenames->positions_high_filename);
1518       exit(9);
1519     } else {
1520       comma = Genomicpos_commafmt(new->positions_high_len);
1521       fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1522       FREE(comma);
1523       new->positions_high_access = MMAPPED;
1524     }
1525 #endif
1526 
1527   } else if (positions_access == USE_FILEIO) {
1528     fprintf(stderr,"Positions high file I/O access of %s not allowed\n",
1529 	    filenames->positions_high_filename);
1530     exit(9);
1531 
1532   } else {
1533     fprintf(stderr,"Don't recognize positions_access type %d\n",positions_access);
1534     abort();
1535   }
1536 
1537   return;
1538 }
1539 #endif
1540 
1541 
1542 static void
load_positions(T new,Indexdb_filenames_T filenames,char * idx_filesuffix,char * snps_root,Access_mode_T positions_access,bool sharedp,bool multiple_sequences_p,bool preload_shared_memory_p,bool unload_shared_memory_p)1543 load_positions (T new, Indexdb_filenames_T filenames, char *idx_filesuffix,
1544 		char *snps_root, Access_mode_T positions_access, bool sharedp,
1545 		bool multiple_sequences_p, bool preload_shared_memory_p,
1546 		bool unload_shared_memory_p) {
1547   char *comma;
1548   double seconds;
1549 #ifdef HAVE_MMAP
1550   int npages;
1551 #endif
1552 
1553   /* (5) Positions */
1554   if (positions_access == USE_ALLOCATE) {
1555     if (snps_root) {
1556       fprintf(stderr,"Allocating memory for %s (%s) positions, kmer %d, interval %d...",
1557 	      idx_filesuffix,snps_root,new->index1part,new->index1interval);
1558     } else {
1559       fprintf(stderr,"Allocating memory for %s positions, kmer %d, interval %d...",
1560 	      idx_filesuffix,new->index1part,new->index1interval);
1561     }
1562 
1563     if (
1564 #ifndef HAVE_MMAP
1565 	0 &&
1566 #endif
1567 	multiple_sequences_p == false && preload_shared_memory_p == false && unload_shared_memory_p == false) {
1568 #ifdef HAVE_MMAP
1569       new->positions = (UINT4 *) Access_mmap(&new->positions_fd,&new->positions_len,&seconds,
1570 					     filenames->positions_filename,/*randomp*/false);
1571       new->positions_access = MMAPPED;
1572 #endif
1573 
1574     } else if (sharedp == true) {
1575       new->positions = (UINT4 *) Access_allocate_shared(&new->positions_access,&new->positions_shmid,&new->positions_key,
1576 							&new->positions_fd,&new->positions_len,&seconds,
1577 							filenames->positions_filename,sizeof(UINT4));
1578 
1579     } else {
1580       new->positions = (UINT4 *) Access_allocate_private(&new->positions_access,&new->positions_len,&seconds,
1581 							 filenames->positions_filename,sizeof(UINT4));
1582     }
1583 
1584     if (new->positions == NULL) {
1585       fprintf(stderr,"insufficient memory (need to use a lower batch mode (-B)\n");
1586       exit(9);
1587     } else {
1588       comma = Genomicpos_commafmt(new->positions_len);
1589       fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1590       FREE(comma);
1591       /* new->positions_access set by Access_allocate */
1592     }
1593 
1594 #ifdef HAVE_MMAP
1595   } else if (positions_access == USE_MMAP_PRELOAD) {
1596     if (snps_root) {
1597       fprintf(stderr,"Pre-loading %s (%s) positions, kmer %d, interval %d...",
1598 	      idx_filesuffix,snps_root,new->index1part,new->index1interval);
1599     } else {
1600       fprintf(stderr,"Pre-loading %s positions, kmer %d, interval %d...",
1601 	      idx_filesuffix,new->index1part,new->index1interval);
1602     }
1603 
1604     new->positions = (UINT4 *) Access_mmap_and_preload(&new->positions_fd,&new->positions_len,&npages,&seconds,
1605 						       filenames->positions_filename,sizeof(UINT4));
1606 
1607     if (new->positions == NULL) {
1608       fprintf(stderr,"insufficient memory\n");
1609       exit(9);
1610     } else {
1611       comma = Genomicpos_commafmt(new->positions_len);
1612       fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1613       FREE(comma);
1614       new->positions_access = MMAPPED;
1615     }
1616 #endif
1617 
1618 #ifdef HAVE_MMAP
1619   } else if (positions_access == USE_MMAP_ONLY) {
1620     if (snps_root) {
1621       fprintf(stderr,"Memory mapping %s (%s) positions, kmer %d, interval %d...",
1622 	      idx_filesuffix,snps_root,new->index1part,new->index1interval);
1623     } else {
1624       fprintf(stderr,"Memory mapping %s positions, kmer %d, interval %d...",
1625 	      idx_filesuffix,new->index1part,new->index1interval);
1626     }
1627 
1628     new->positions = (UINT4 *) Access_mmap(&new->positions_fd,&new->positions_len,&seconds,
1629 					   filenames->positions_filename,/*randomp*/true);
1630 
1631     if (new->positions == NULL) {
1632       fprintf(stderr,"Insufficient memory for memory mapping %s\n",
1633 		filenames->positions_filename);
1634       exit(9);
1635     } else {
1636       comma = Genomicpos_commafmt(new->positions_len);
1637       fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
1638       FREE(comma);
1639       new->positions_access = MMAPPED;
1640     }
1641 #endif
1642 
1643   } else if (positions_access == USE_FILEIO) {
1644     fprintf(stderr,"Positions file I/O access of %s not allowed\n",
1645 	    filenames->positions_filename);
1646     exit(9);
1647 
1648   } else {
1649     fprintf(stderr,"Don't recognize positions_access %d\n",positions_access);
1650     abort();
1651   }
1652 
1653 #ifdef HAVE_PTHREAD
1654 #if defined(LARGE_GENOMES) || defined(UTILITYP)
1655   if (new->positions_high_access == FILEIO || new->positions_access == FILEIO) {
1656     pthread_mutex_init(&new->positions_read_mutex,NULL);
1657   }
1658 #else
1659   if (new->positions_access == FILEIO) {
1660     pthread_mutex_init(&new->positions_read_mutex,NULL);
1661   }
1662 #endif
1663 #endif
1664 
1665   return;
1666 }
1667 
1668 
1669 T
Indexdb_new_genome(Width_T * index1part,Width_T * index1interval,char * genomesubdir,char * fileroot,char * idx_filesuffix,char * snps_root,Width_T required_index1part,Width_T required_interval,Access_mode_T offsetsstrm_access,Access_mode_T positions_access,bool sharedp,bool multiple_sequences_p,bool preload_shared_memory_p,bool unload_shared_memory_p)1670 Indexdb_new_genome (Width_T *index1part, Width_T *index1interval,
1671 		    char *genomesubdir, char *fileroot, char *idx_filesuffix, char *snps_root,
1672 		    Width_T required_index1part, Width_T required_interval,
1673 		    Access_mode_T offsetsstrm_access, Access_mode_T positions_access, bool sharedp,
1674 		    bool multiple_sequences_p, bool preload_shared_memory_p, bool unload_shared_memory_p) {
1675   T new = (T) MALLOC(sizeof(*new));
1676   Indexdb_filenames_T filenames;
1677 
1678   /* Oligospace_T poly_T; */
1679   /* Positionsptr_T ptr0; -- UINT8 or UINT4 */
1680   /* Positionsptr_T end0; -- UINT8 or UINT4 */
1681 
1682   if ((filenames = Indexdb_get_filenames_bitpack(&new->index1part,&new->index1interval,
1683 						 genomesubdir,fileroot,idx_filesuffix,snps_root,
1684 						 required_index1part,required_interval,
1685 						 /*blocksize*/64,/*offsets_only_p*/false)) == NULL) {
1686     fprintf(stderr,"Cannot find genomic index files in either current or old format.  Looking for files containing %s\n",
1687 	    idx_filesuffix);
1688     if (required_index1part != 0) {
1689       fprintf(stderr,"Required kmer is %d\n",required_index1part);
1690     }
1691     if (required_interval != 0) {
1692       fprintf(stderr,"Required interval is %d\n",required_interval);
1693     }
1694     exit(9);
1695   }
1696 
1697 
1698   *index1part = new->index1part;
1699   *index1interval = new->index1interval;
1700   load_offsets(new,filenames,idx_filesuffix,snps_root,offsetsstrm_access,sharedp,
1701 	       multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p);
1702 
1703   new->total_npositions = Access_filesize(filenames->positions_filename)/sizeof(UINT4);
1704 
1705 #if defined(LARGE_GENOMES) || defined(UTILITYP)
1706   load_positions_high(new,filenames,idx_filesuffix,snps_root,positions_access,sharedp,
1707 		      multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p);
1708 #endif
1709   load_positions(new,filenames,idx_filesuffix,snps_root,positions_access,sharedp,
1710 		 multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p);
1711 
1712   Indexdb_filenames_free(&filenames);
1713 
1714   return new;
1715 }
1716 
1717 
1718 /* Assume that transcriptome is less than 4 billion bp, even for GSNAPL */
1719 T
Indexdb_new_transcriptome(Width_T * index1part,Width_T * index1interval,char * genomesubdir,char * fileroot,char * idx_filesuffix,char * snps_root,Alphabet_T * alphabet,int * alphabet_size,Alphabet_T required_alphabet,Width_T required_index1part,Width_T required_interval,Access_mode_T offsetsstrm_access,Access_mode_T positions_access,bool sharedp,bool multiple_sequences_p,bool preload_shared_memory_p,bool unload_shared_memory_p)1720 Indexdb_new_transcriptome (Width_T *index1part, Width_T *index1interval,
1721 			   char *genomesubdir, char *fileroot, char *idx_filesuffix, char *snps_root,
1722 #ifdef PMAP
1723 			   Alphabet_T *alphabet, int *alphabet_size, Alphabet_T required_alphabet,
1724 #endif
1725 			   Width_T required_index1part, Width_T required_interval,
1726 			   Access_mode_T offsetsstrm_access, Access_mode_T positions_access, bool sharedp,
1727 			   bool multiple_sequences_p, bool preload_shared_memory_p, bool unload_shared_memory_p) {
1728   T new = (T) MALLOC(sizeof(*new));
1729   Indexdb_filenames_T filenames;
1730 
1731   /* Oligospace_T poly_T; */
1732   /* Positionsptr_T ptr0; -- UINT8 or UINT4 */
1733   /* Positionsptr_T end0; -- UINT8 or UINT4 */
1734 
1735   if ((filenames = Indexdb_get_filenames_bitpack(
1736 #ifdef PMAP
1737 						 &(*alphabet),required_alphabet,
1738 #endif
1739 						 &new->index1part,&new->index1interval,
1740 						 genomesubdir,fileroot,idx_filesuffix,snps_root,
1741 						 required_index1part,required_interval,
1742 						 /*blocksize*/64,/*offsets_only_p*/false)) == NULL) {
1743     fprintf(stderr,"Cannot find genomic index files in a recent format.  Looking for files containing %s\n",idx_filesuffix);
1744     exit(9);
1745   } else {
1746     /* Bitpack compression  */
1747     *index1part = new->index1part;
1748     *index1interval = new->index1interval;
1749   }
1750 
1751   load_offsets(new,filenames,idx_filesuffix,snps_root,offsetsstrm_access,sharedp,
1752 	       multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p);
1753   load_positions(new,filenames,idx_filesuffix,snps_root,positions_access,sharedp,
1754 		 multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p);
1755 
1756   Indexdb_filenames_free(&filenames);
1757 
1758   return new;
1759 }
1760 
1761 
1762 /************************************************************************
1763  *   Debugging procedures
1764  ************************************************************************/
1765 
1766 #ifndef PMAP
1767 
1768 /*                      87654321 */
1769 #define LOW_TWO_BITS  0x00000003
1770 
1771 #if (defined(DEBUG0) || defined(DEBUG1) || defined(DEBUG2))
1772 static char *
shortoligo_nt(Oligospace_T oligo,Width_T oligosize)1773 shortoligo_nt (Oligospace_T oligo, Width_T oligosize) {
1774   char *nt;
1775   Width_T i, j;
1776   Oligospace_T lowbits;
1777 
1778   nt = (char *) CALLOC(oligosize+1,sizeof(char));
1779   j = oligosize-1;
1780   for (i = 0; i < oligosize; i++) {
1781     lowbits = oligo & LOW_TWO_BITS;
1782     switch (lowbits) {
1783     case RIGHT_A: nt[j] = 'A'; break;
1784     case RIGHT_C: nt[j] = 'C'; break;
1785     case RIGHT_G: nt[j] = 'G'; break;
1786     case RIGHT_T: nt[j] = 'T'; break;
1787     }
1788     oligo >>= 2;
1789     j--;
1790   }
1791 
1792   return nt;
1793 }
1794 #endif
1795 
1796 #endif
1797 
1798 
1799 /************************************************************************
1800  *   Read procedures
1801  ************************************************************************/
1802 
1803 #ifdef LARGE_GENOMES
1804 static void
positions_move_absolute_1(int positions_fd,size_t ptr)1805 positions_move_absolute_1 (int positions_fd, size_t ptr) {
1806   off_t offset = ptr*sizeof(unsigned char);
1807 
1808   if (lseek(positions_fd,offset,SEEK_SET) < 0) {
1809     fprintf(stderr,"Attempted to do lseek on offset %zu*%zu=%zu\n",
1810 	    ptr,sizeof(unsigned char),(size_t) offset);
1811     perror("Error in indexdb.c, positions_move_absolute");
1812     exit(9);
1813   }
1814   return;
1815 }
1816 #endif
1817 
1818 
1819 static void
positions_move_absolute_4(int positions_fd,size_t ptr)1820 positions_move_absolute_4 (int positions_fd, size_t ptr) {
1821   off_t offset = ptr*sizeof(UINT4);
1822 
1823   if (lseek(positions_fd,offset,SEEK_SET) < 0) {
1824     fprintf(stderr,"Attempted to do lseek on offset %zu*%zu=%zu\n",
1825 	    ptr,sizeof(UINT4),(size_t) offset);
1826     perror("Error in indexdb.c, positions_move_absolute");
1827     exit(9);
1828   }
1829   return;
1830 }
1831 
1832 #if 0
1833 static Univcoord_T
1834 positions_read_forward (int positions_fd) {
1835   Univcoord_T value;
1836   char buffer[4];
1837 
1838   read(positions_fd,buffer,4);
1839 
1840   value = (buffer[3] & 0xff);
1841   value <<= 8;
1842   value |= (buffer[2] & 0xff);
1843   value <<= 8;
1844   value |= (buffer[1] & 0xff);
1845   value <<= 8;
1846   value |= (buffer[0] & 0xff);
1847 
1848   return value;
1849 }
1850 #endif
1851 
1852 #ifdef LARGE_GENOMES
1853 static void
positions_read_multiple_large(int positions_high_fd,int positions_fd,Univcoord_T * values,int n)1854 positions_read_multiple_large (int positions_high_fd, int positions_fd, Univcoord_T *values, int n) {
1855   int i;
1856   Univcoord_T value;
1857   unsigned char buffer[4];
1858 
1859 #ifdef WORDS_BIGENDIAN
1860   /* Need to keep in bigendian format */
1861   for (i = 0; i < n; i++) {
1862     read(positions_high_fd,buffer,1);
1863     value = (buffer[0] & 0xff);
1864     value <<= 8;
1865 
1866     read(positions_fd,buffer,4);
1867     value |= (buffer[0] & 0xff);
1868     value <<= 8;
1869     value |= (buffer[1] & 0xff);
1870     value <<= 8;
1871     value |= (buffer[2] & 0xff);
1872     value <<= 8;
1873     value |= (buffer[3] & 0xff);
1874 
1875     values[i] = value;
1876   }
1877 #else
1878   for (i = 0; i < n; i++) {
1879     read(positions_high_fd,buffer,1);
1880     value = (buffer[0] & 0xff);
1881     value <<= 8;
1882 
1883     read(positions_fd,buffer,4);
1884     value |= (buffer[3] & 0xff);
1885     value <<= 8;
1886     value |= (buffer[2] & 0xff);
1887     value <<= 8;
1888     value |= (buffer[1] & 0xff);
1889     value <<= 8;
1890     value |= (buffer[0] & 0xff);
1891 
1892     values[i] = value;
1893   }
1894 #endif
1895 
1896   return;
1897 }
1898 
1899 static void
positions_copy_multiple_large(Univcoord_T * positions,unsigned char * positions_high,UINT4 * positions_low,int n)1900 positions_copy_multiple_large (Univcoord_T *positions, unsigned char *positions_high, UINT4 *positions_low, int n) {
1901   int i;
1902 
1903   for (i = 0; i < n; i++) {
1904     positions[i] = ((Univcoord_T) positions_high[i] << 32) + positions_low[i];
1905   }
1906 
1907   return;
1908 }
1909 
1910 #else
1911 
1912 static void
positions_read_multiple(int positions_fd,Univcoord_T * values,int n)1913 positions_read_multiple (int positions_fd, Univcoord_T *values, int n) {
1914   int i;
1915   Univcoord_T value;
1916   unsigned char buffer[4];
1917 
1918 #ifdef WORDS_BIGENDIAN
1919   /* Need to keep in bigendian format */
1920   for (i = 0; i < n; i++) {
1921     read(positions_fd,buffer,4);
1922 
1923     value = (buffer[0] & 0xff);
1924     value <<= 8;
1925     value |= (buffer[1] & 0xff);
1926     value <<= 8;
1927     value |= (buffer[2] & 0xff);
1928     value <<= 8;
1929     value |= (buffer[3] & 0xff);
1930 
1931     values[i] = value;
1932   }
1933 #else
1934   for (i = 0; i < n; i++) {
1935     read(positions_fd,buffer,4);
1936 
1937     value = (buffer[3] & 0xff);
1938     value <<= 8;
1939     value |= (buffer[2] & 0xff);
1940     value <<= 8;
1941     value |= (buffer[1] & 0xff);
1942     value <<= 8;
1943     value |= (buffer[0] & 0xff);
1944 
1945     values[i] = value;
1946   }
1947 #endif
1948 
1949   return;
1950 }
1951 #endif
1952 
1953 
1954 
1955 
1956 #if 0
1957 static Univcoord_T
1958 positions_read_backward (int positions_fd) {
1959   Univcoord_T value;
1960   char buffer[4];
1961   off_t reloffset = -2*((off_t) sizeof(Univcoord_T)); /* 1 to undo the effect of read */
1962 
1963   read(positions_fd,buffer,4);
1964 
1965   value = (buffer[3] & 0xff);
1966   value <<= 8;
1967   value |= (buffer[2] & 0xff);
1968   value <<= 8;
1969   value |= (buffer[1] & 0xff);
1970   value <<= 8;
1971   value |= (buffer[0] & 0xff);
1972 
1973   if (lseek(positions_fd,reloffset,SEEK_CUR) < 0) {
1974     fprintf(stderr,"Attempted to do lseek on relative offset %ld\n",(long int) reloffset);
1975     perror("Error in indexdb.c, positions_read_backward");
1976     exit(9);
1977   }
1978 
1979   return value;
1980 }
1981 #endif
1982 
1983 
1984 /* Used by non-utility programs */
1985 Positionsptr_T *
Indexdb_offsets_from_bitpack(char * offsetsmetafile,char * offsetsstrmfile,int alphabet_size,Width_T index1part_aa)1986 Indexdb_offsets_from_bitpack (char *offsetsmetafile, char *offsetsstrmfile,
1987 #ifdef PMAP
1988 			      int alphabet_size, Width_T index1part_aa
1989 #else
1990 			      Width_T index1part
1991 #endif
1992 			      ) {
1993   UINT4 *offsetsmeta;
1994   UINT4 *offsetsstrm;
1995   size_t offsetsmeta_len, offsetsstrm_len;
1996   Positionsptr_T *offsets = NULL;
1997   Oligospace_T oligospace;
1998 #ifdef PMAP
1999   Oligospace_T oligoi;
2000 #elif defined(LARGE_GENOMES)
2001 #else
2002   Oligospace_T oligoi;
2003   Blocksize_T blocksize;
2004 #endif
2005 #ifdef HAVE_MMAP
2006   int offsetsmeta_fd, offsetsstrm_fd;
2007 #else
2008   Access_T offsetsmeta_access, offsetsstrm_access;
2009 #endif
2010   double seconds;
2011 
2012 
2013 #ifdef PMAP
2014   oligospace = power(alphabet_size,index1part_aa);
2015 #else
2016   oligospace = power(4,index1part);
2017 #endif
2018 #if !defined(PMAP) && !defined(LARGE_GENOMES)
2019   blocksize = 64; /* Used to be determined by 4^(kmer - basesize), but now fixed at 64 */
2020 #endif
2021 
2022 
2023 #ifdef HAVE_MMAP
2024   offsetsmeta = (UINT4 *) Access_mmap(&offsetsmeta_fd,&offsetsmeta_len,&seconds,offsetsmetafile,/*randomp*/false);
2025   offsetsstrm = (UINT4 *) Access_mmap(&offsetsstrm_fd,&offsetsstrm_len,&seconds,offsetsstrmfile,/*randomp*/false);
2026 #else
2027   offsetsmeta = (UINT4 *) Access_allocate_private(&offsetsmeta_access,&offsetsmeta_len,&seconds,offsetsmetafile,sizeof(UINT4));
2028   offsetsstrm = (UINT4 *) Access_allocate_private(&offsetsstrm_access,&offsetsstrm_len,&seconds,offsetsstrmfile,sizeof(UINT4));
2029 #endif
2030 
2031 #ifdef OLIGOSPACE_NOT_LONG
2032   fprintf(stderr,"Allocating memory (%u 4-byte words) for offsets, kmer %d...",oligospace+1U,
2033 #ifdef PMAP
2034 	  index1part_aa
2035 #else
2036 	  index1part
2037 #endif
2038 	  );
2039 #else
2040   fprintf(stderr,"Allocating memory (%llu 4-byte words) for offsets, kmer %d...",(unsigned long long) oligospace+1UL,
2041 #ifdef PMAP
2042 	  index1part_aa
2043 #else
2044 	  index1part
2045 #endif
2046 	  );
2047 #endif
2048 
2049   /* Bitpack procedures start from offsets[1], so we need to print offsets[0] as a special case */
2050   offsets = (Positionsptr_T *) MALLOC((oligospace+1) * sizeof(Positionsptr_T));
2051 
2052   if (offsets == NULL) {
2053     fprintf(stderr,"cannot allocated requested memory.  Cannot run expand offsets on this machine.\n");
2054     exit(9);
2055   } else {
2056     fprintf(stderr,"done\n");
2057   }
2058 
2059   fprintf(stderr,"Expanding offsetsstrm into offsets...");
2060 #ifdef PMAP
2061   for (oligoi = 0UL; oligoi <= oligospace; oligoi += 1) {
2062     offsets[oligoi] = Bitpack64_read_one(oligoi,offsetsmeta,offsetsstrm);
2063   }
2064 #elif defined(LARGE_GENOMES)
2065   fprintf(stderr,"Cannot do expand offsets on large genomes\n");
2066   exit(9);
2067 #else
2068   for (oligoi = 0UL; oligoi < oligospace; oligoi += blocksize) {
2069     Bitpack64_block_offsets(&(offsets[oligoi]),oligoi,offsetsmeta,offsetsstrm);
2070   }
2071 #endif
2072 
2073   fprintf(stderr,"done\n");
2074 
2075 #ifdef HAVE_MMAP
2076   munmap((void *) offsetsstrm,offsetsstrm_len);
2077   munmap((void *) offsetsmeta,offsetsmeta_len);
2078 #else
2079   FREE(offsetsstrm);
2080   FREE(offsetsmeta);
2081 #endif
2082 
2083   return offsets;
2084 }
2085 
2086 
2087 #if defined(HAVE_64_BIT) && defined(UTILITYP)
2088 /* Used by utility programs for writing indexdb */
2089 Hugepositionsptr_T *
Indexdb_offsets_from_bitpack_huge(char * offsetspagesfile,char * offsetsmetafile,char * offsetsstrmfile,int alphabet_size,Width_T index1part_aa)2090 Indexdb_offsets_from_bitpack_huge (char *offsetspagesfile, char *offsetsmetafile, char *offsetsstrmfile,
2091 #ifdef PMAP
2092 				   int alphabet_size, Width_T index1part_aa
2093 #else
2094 				   Width_T index1part
2095 #endif
2096 				   ) {
2097   UINT4 *offsetspages;
2098   UINT4 *offsetsmeta;
2099   UINT4 *offsetsstrm;
2100 
2101   size_t offsetspages_len, offsetsmeta_len, offsetsstrm_len;
2102   Hugepositionsptr_T *offsets = NULL;
2103   Oligospace_T oligospace, oligoi;
2104   Blocksize_T blocksize;
2105 #ifdef HAVE_MMAP
2106   int offsetsmeta_fd, offsetsstrm_fd;
2107 #else
2108   Access_T offsetsmeta_access;
2109 #endif
2110   Access_T offsetspages_access, offsetsstrm_access;
2111   double seconds;
2112 
2113 
2114 #ifdef PMAP
2115   oligospace = power(alphabet_size,index1part_aa);
2116 #else
2117   oligospace = power(4,index1part);
2118 #endif
2119   blocksize = 64; /* Used to be determined by 4^(kmer - basesize), but now fixed at 64 */
2120 
2121   if (blocksize == 1) {
2122     return (Hugepositionsptr_T *) Access_allocate_private(&offsetsstrm_access,&offsetsstrm_len,&seconds,
2123 							  offsetsstrmfile,sizeof(Hugepositionsptr_T));
2124 
2125   } else {
2126     if (offsetspagesfile == NULL) {
2127       offsetspages_access = NOT_USED;
2128       offsetspages_len = 0;
2129       offsetspages = (UINT4 *) MALLOC(1*sizeof(UINT4));
2130       offsetspages[0] = (UINT4) -1;
2131     } else {
2132       offsetspages = (UINT4 *) Access_allocate_private(&offsetspages_access,&offsetspages_len,&seconds,offsetspagesfile,sizeof(UINT4));
2133     }
2134 #ifdef HAVE_MMAP
2135     offsetsmeta = (UINT4 *) Access_mmap(&offsetsmeta_fd,&offsetsmeta_len,&seconds,offsetsmetafile,/*randomp*/false);
2136     offsetsstrm = (UINT4 *) Access_mmap(&offsetsstrm_fd,&offsetsstrm_len,&seconds,offsetsstrmfile,/*randomp*/false);
2137 #else
2138     offsetsmeta = (UINT4 *) Access_allocate_private(&offsetsmeta_access,&offsetsmeta_len,&seconds,offsetsmetafile,sizeof(UINT4));
2139     offsetsstrm = (UINT4 *) Access_allocate_private(&offsetsstrm_access,&offsetsstrm_len,&seconds,offsetsstrmfile,sizeof(UINT4));
2140 #endif
2141 
2142 #ifdef OLIGOSPACE_NOT_LONG
2143     fprintf(stderr,"Allocating memory (%u 8-byte words) for offsets, kmer %d...",oligospace+1U,
2144 #ifdef PMAP
2145 	    index1part_aa
2146 #else
2147 	    index1part
2148 #endif
2149 	    );
2150 #else
2151     fprintf(stderr,"Allocating memory (%llu 8-byte words) for offsets, kmer %d...",(unsigned long long) oligospace+1UL,
2152 #ifdef PMAP
2153 	    index1part_aa
2154 #else
2155 	    index1part
2156 #endif
2157 	    );
2158 #endif
2159 
2160     /* Bitpack procedures start from offsets[1], so we need to print offsets[0] as a special case */
2161     offsets = (Hugepositionsptr_T *) MALLOC((oligospace+1) * sizeof(Hugepositionsptr_T));
2162 
2163     if (offsets == NULL) {
2164       fprintf(stderr,"cannot allocated requested memory.  Cannot run expand offsets on this machine.\n");
2165       exit(9);
2166     } else {
2167       fprintf(stderr,"done\n");
2168     }
2169 
2170     fprintf(stderr,"Expanding offsetsstrm into offsets...");
2171 #ifdef PMAP
2172     for (oligoi = 0UL; oligoi <= oligospace; oligoi += 1) {
2173       offsets[oligoi] = Bitpack64_read_one_huge(oligoi,offsetspages,offsetsmeta,offsetsstrm);
2174     }
2175 #else
2176     for (oligoi = 0UL; oligoi < oligospace; oligoi += blocksize) {
2177       Bitpack64_block_offsets_huge(&(offsets[oligoi]),oligoi,offsetspages,offsetsmeta,offsetsstrm);
2178     }
2179 #endif
2180 
2181     fprintf(stderr,"done\n");
2182 
2183 #ifdef HAVE_MMAP
2184     munmap((void *) offsetsstrm,offsetsstrm_len);
2185     munmap((void *) offsetsmeta,offsetsmeta_len);
2186 #else
2187     FREE(offsetsstrm);
2188     FREE(offsetsmeta);
2189 #endif
2190     FREE(offsetspages);
2191 
2192     return offsets;
2193   }
2194 }
2195 #endif
2196 
2197 
2198 
2199 #ifdef PMAP
2200 
2201 /* PMAP version.  Doesn't mask bottom 12 nt. */
2202 Univcoord_T *
Indexdb_read(int * nentries,T this,Oligospace_T aaindex)2203 Indexdb_read (int *nentries, T this, Oligospace_T aaindex) {
2204   Univcoord_T *positions, bigendian, littleendian;
2205   Positionsptr_T ptr, ptr0, end0;
2206   int i;
2207   char byte1, byte2, byte3;
2208 
2209   debug0(printf("%u (%s)\n",aaindex,Alphabet_aaindex_aa(aaindex,this->alphabet)));
2210 
2211   if (this->compression_type == NO_COMPRESSION) {
2212 #ifdef WORDS_BIGENDIAN
2213     /* Also holds for ALLOCATED_PRIVATE and ALLOCATED_SHARED */
2214     ptr0 = Bigendian_convert_uint(this->offsetsstrm[aaindex]);
2215     end0 = Bigendian_convert_uint(this->offsetsstrm[aaindex+1]);
2216 #else
2217     ptr0 = this->offsetsstrm[aaindex];
2218     end0 = this->offsetsstrm[aaindex+1];
2219 #endif
2220 
2221   } else if (this->compression_type == BITPACK64_COMPRESSION) {
2222     ptr0 = Bitpack64_read_two(&end0,aaindex,this->offsetsmeta,this->offsetsstrm);
2223 
2224   } else {
2225     abort();
2226   }
2227 
2228 
2229   debug0(printf("offset pointers are %u and %u\n",ptr0,end0));
2230 
2231 #ifdef ALLOW_DUPLICATES
2232   /* Not used */
2233   /* Skip backward over bad values, due to duplicates */
2234   if (this->positions_access == FILEIO) {
2235 #ifdef HAVE_PTHREAD
2236     pthread_mutex_lock(&this->positions_read_mutex);
2237 #endif
2238     positions_move_absolute(this->positions_fd,end0-1);
2239     while (end0 > ptr0 && positions_read_backward(this->positions_fd) == BADVAL) {
2240       end0--;
2241     }
2242 #ifdef HAVE_PTHREAD
2243     pthread_mutex_unlock(&this->positions_read_mutex);
2244 #endif
2245   } else {
2246     while (end0 > ptr0 && this->positions[end0-1] == BADVAL) {
2247       end0--;
2248     }
2249   }
2250 #endif	/* ALLOW_DUPLICATES */
2251 
2252   if ((*nentries = end0 - ptr0) == 0) {
2253     return NULL;
2254   } else {
2255     debug0(printf("Indexdb_read: offset pointers are %u and %u\n",ptr0,end0));
2256 
2257     positions = (Univcoord_T *) MALLOC(*nentries*sizeof(Univcoord_T));
2258     if (this->positions_access == FILEIO) {
2259 #ifdef HAVE_PTHREAD
2260       pthread_mutex_lock(&this->positions_read_mutex);
2261 #endif
2262 #ifdef LARGE_GENOMES
2263       positions_move_absolute_1(this->positions_high_fd,ptr0);
2264       positions_move_absolute_4(this->positions_fd,ptr0);
2265       positions_read_multiple_large(this->positions_high_fd,this->positions_fd,positions,*nentries);
2266 #else
2267       positions_move_absolute_4(this->positions_fd,ptr0);
2268       positions_read_multiple(this->positions_fd,positions,*nentries);
2269 #endif
2270 #ifdef HAVE_PTHREAD
2271       pthread_mutex_unlock(&this->positions_read_mutex);
2272 #endif
2273 
2274     } else if (this->positions_access == ALLOCATED_PRIVATE || this->positions_access == ALLOCATED_SHARED) {
2275 #ifdef LARGE_GENOMES
2276       positions_copy_multiple_large(positions,&(this->positions_high[ptr0]),&(this->positions[ptr0]),*nentries);
2277 #else
2278       memcpy(positions,&(this->positions[ptr0]),(*nentries)*sizeof(Univcoord_T));
2279 #endif
2280 
2281     } else {
2282 #ifdef WORDS_BIGENDIAN
2283 #ifdef LARGE_GENOMES
2284       for (ptr = ptr0, i = 0; ptr < end0; ptr++, i++) {
2285 	bigendian = (Univcoord_T) this->positions_high[ptr];
2286 	bigendian <<= 8;
2287 
2288 	littleendian = this->positions[ptr];
2289 	bigendian |= littleendian & 0xff; /* 0 */
2290 	bigendian <<= 8;
2291 	bigendian |= ((littleendian >>= 8) & 0xff); /* 1 */
2292 	bigendian <<= 8;
2293 	bigendian |= ((littleendian >>= 8) & 0xff); /* 2 */
2294 	bigendian <<= 8;
2295 	bigendian |= ((littleendian >>= 8) & 0xff); /* 3 */
2296 	positions[i] = bigendian;
2297       }
2298 #else
2299       for (ptr = ptr0, i = 0; ptr < end0; ptr++, i++) {
2300 	littleendian = this->positions[ptr];
2301 	bigendian = littleendian & 0xff; /* 0 */
2302 	bigendian <<= 8;
2303 	bigendian |= ((littleendian >>= 8) & 0xff); /* 1 */
2304 	bigendian <<= 8;
2305 	bigendian |= ((littleendian >>= 8) & 0xff); /* 2 */
2306 	bigendian <<= 8;
2307 	bigendian |= ((littleendian >>= 8) & 0xff); /* 3 */
2308 	positions[i] = bigendian;
2309       }
2310 #endif
2311 
2312 #else  /* littleendian */
2313 
2314 #ifdef LARGE_GENOMES
2315       positions_copy_multiple_large(positions,&(this->positions_high[ptr0]),&(this->positions[ptr0]),*nentries);
2316 #else
2317       memcpy(positions,&(this->positions[ptr0]),(*nentries)*sizeof(Univcoord_T));
2318 #endif
2319 #endif
2320     }
2321     debug0(
2322 	   printf("%d entries:",*nentries);
2323 	   for (i = 0; i < *nentries; i++) {
2324 	     printf(" %u",positions[i]);
2325 	   }
2326 	   printf("\n");
2327 	   );
2328 
2329     return positions;
2330   }
2331 }
2332 
2333 #else
2334 
2335 /* GMAP version: Allocates memory */
2336 Univcoord_T *
Indexdb_read(int * nentries,T this,Oligospace_T oligo)2337 Indexdb_read (int *nentries, T this, Oligospace_T oligo) {
2338   Univcoord_T *positions;
2339   Positionsptr_T ptr0, end0;
2340   Oligospace_T part0;
2341 #ifdef WORDS_BIGENDIAN
2342   int i;
2343   Positionsptr_T ptr;
2344   Univcoord_T bigendian, littleendian;
2345   char byte1, byte2, byte3;
2346 #endif
2347 #ifdef DEBUG0
2348   int j;
2349 #endif
2350 
2351 
2352 #if 0
2353   debug0(printf("%06X (%s)\n",oligo,shortoligo_nt(oligo,index1part)));
2354 #endif
2355   part0 = oligo & poly_T;
2356 
2357   /* Ignore poly A and poly T on stage 1 */
2358   /* Was commented out */
2359   if (part0 == poly_A || part0 == poly_T) {
2360     *nentries = 0;
2361     return NULL;
2362   }
2363 
2364   if (this->compression_type == NO_COMPRESSION) {
2365 #ifdef WORDS_BIGENDIAN
2366     /* Also holds for ALLOCATED_PRIVATE and ALLOCATED_SHARED */
2367     ptr0 = Bigendian_convert_uint(this->offsetsstrm[part0]);
2368     end0 = Bigendian_convert_uint(this->offsetsstrm[part0+1]);
2369 #else
2370     ptr0 = this->offsetsstrm[part0];
2371     end0 = this->offsetsstrm[part0+1];
2372 #endif
2373 
2374   } else if (this->compression_type == BITPACK64_COMPRESSION) {
2375 #ifdef LARGE_GENOMES
2376     ptr0 = Bitpack64_read_two_huge(&end0,part0,this->offsetspages,this->offsetsmeta,this->offsetsstrm);
2377 #else
2378     ptr0 = Bitpack64_read_two(&end0,part0,this->offsetsmeta,this->offsetsstrm);
2379 #endif
2380 
2381   } else {
2382     abort();
2383   }
2384 
2385 
2386 #ifdef ALLOW_DUPLICATES
2387   /* Not used */
2388   /* Skip backward over bad values, due to duplicates */
2389   if (this->positions_access == FILEIO) {
2390 #ifdef HAVE_PTHREAD
2391     pthread_mutex_lock(&this->positions_read_mutex);
2392 #endif
2393     positions_move_absolute(this->positions_fd,end0-1);
2394     while (end0 > ptr0 && positions_read_backward(this->positions_fd) == BADVAL) {
2395       end0--;
2396     }
2397 #ifdef HAVE_PTHREAD
2398     pthread_mutex_unlock(&this->positions_read_mutex);
2399 #endif
2400   } else {
2401     while (end0 > ptr0 && this->positions[end0-1] == BADVAL) {
2402       end0--;
2403     }
2404   }
2405 #endif
2406 
2407   if ((*nentries = end0 - ptr0) == 0) {
2408     return NULL;
2409   } else {
2410     debug0(printf("Indexdb_read: offset pointers are %llu and %llu\n",ptr0,end0));
2411 
2412     positions = (Univcoord_T *) MALLOC(*nentries*sizeof(Univcoord_T));
2413 #ifdef LARGE_GENOMES
2414     if (this->positions_high_access == FILEIO || this->positions_access == FILEIO) {
2415 #ifdef HAVE_PTHREAD
2416       pthread_mutex_lock(&this->positions_read_mutex);
2417 #endif
2418       positions_move_absolute_1(this->positions_high_fd,ptr0);
2419       positions_move_absolute_4(this->positions_fd,ptr0);
2420       positions_read_multiple_large(this->positions_high_fd,this->positions_fd,positions,*nentries);
2421 #ifdef HAVE_PTHREAD
2422       pthread_mutex_unlock(&this->positions_read_mutex);
2423 #endif
2424 
2425     } else if (this->positions_high_access == ALLOCATED_PRIVATE || this->positions_high_access == ALLOCATED_SHARED ||
2426 	       this->positions_access == ALLOCATED_PRIVATE || this->positions_access == ALLOCATED_SHARED) {
2427       positions_copy_multiple_large(positions,&(this->positions_high[ptr0]),&(this->positions[ptr0]),*nentries);
2428 
2429 #else
2430 
2431     if (this->positions_access == FILEIO) {
2432 #ifdef HAVE_PTHREAD
2433       pthread_mutex_lock(&this->positions_read_mutex);
2434 #endif
2435       positions_move_absolute_4(this->positions_fd,ptr0);
2436       positions_read_multiple(this->positions_fd,positions,*nentries);
2437 #ifdef HAVE_PTHREAD
2438       pthread_mutex_unlock(&this->positions_read_mutex);
2439 #endif
2440 
2441     } else if (this->positions_access == ALLOCATED_PRIVATE || this->positions_access == ALLOCATED_SHARED) {
2442       memcpy(positions,&(this->positions[ptr0]),(*nentries)*sizeof(Univcoord_T));
2443 
2444 #endif
2445 
2446     } else {
2447 #ifdef WORDS_BIGENDIAN
2448 #ifdef LARGE_GENOMES
2449       for (ptr = ptr0, i = 0; ptr < end0; ptr++, i++) {
2450 	bigendian = (Univcoord_T) this->positions_high[ptr];
2451 	bigendian <<= 8;
2452 
2453 	littleendian = this->positions[ptr];
2454 	bigendian |= littleendian & 0xff; /* 0 */
2455 	bigendian <<= 8;
2456 	bigendian |= ((littleendian >>= 8) & 0xff); /* 1 */
2457 	bigendian <<= 8;
2458 	bigendian |= ((littleendian >>= 8) & 0xff); /* 2 */
2459 	bigendian <<= 8;
2460 	bigendian |= ((littleendian >>= 8) & 0xff); /* 3 */
2461 	positions[i] = bigendian;
2462       }
2463 #else
2464       for (ptr = ptr0, i = 0; ptr < end0; ptr++, i++) {
2465 	littleendian = this->positions[ptr];
2466 	bigendian = littleendian & 0xff; /* 0 */
2467 	bigendian <<= 8;
2468 	bigendian |= ((littleendian >>= 8) & 0xff); /* 1 */
2469 	bigendian <<= 8;
2470 	bigendian |= ((littleendian >>= 8) & 0xff); /* 2 */
2471 	bigendian <<= 8;
2472 	bigendian |= ((littleendian >>= 8) & 0xff); /* 3 */
2473 	positions[i] = bigendian;
2474       }
2475 #endif
2476 
2477 #else  /* littleendian */
2478 
2479 #ifdef LARGE_GENOMES
2480       positions_copy_multiple_large(positions,&(this->positions_high[ptr0]),&(this->positions[ptr0]),*nentries);
2481 #else
2482       memcpy(positions,&(this->positions[ptr0]),(*nentries)*sizeof(Univcoord_T));
2483 #endif
2484 #endif
2485     }
2486 
2487     debug0(
2488 	   printf("%d entries:",*nentries);
2489 	   for (j = 0; j < *nentries; j++) {
2490 	     printf(" %llu",positions[j]);
2491 	   }
2492 	   printf("\n");
2493 	   );
2494 
2495     return positions;
2496   }
2497 }
2498 
2499 #endif	/* ifdef PMAP */
2500 
2501 
2502 
2503 /* Does not allocate memory, but points to positions in Indexdb_T.
2504    diagterm is needed to skip any positions that extend past genomicpos 0 */
2505 #ifdef UTILITYP
2506 
2507 /* Utility programs do not distinguish between Univcoord_T */
2508 
2509 #elif !defined(GSNAP)
2510 /* GMAP reads instead of using pointers */
2511 
2512 #elif defined(LARGE_GENOMES)
2513 int
2514 Indexdb_largeptr (unsigned char **positions_high, UINT4 **positions,
2515 		  T this, Oligospace_T oligo) {
2516   int nentries;
2517   Positionsptr_T ptr0, end0;
2518 #ifdef DEBUG0
2519   unsigned char *p, *r;
2520   UINT4 *q;
2521 #endif
2522 
2523   ptr0 = Bitpack64_read_two_huge(&end0,oligo,this->offsetspages,this->offsetsmeta,this->offsetsstrm);
2524 
2525   debug0(printf("Indexdb_largeptr: oligo = %06llX, offset pointers are %llu and %llu\n",
2526 		oligo,ptr0,end0));
2527 
2528   if ((nentries = end0 - ptr0) == 0) {
2529     *positions_high = (unsigned char *) NULL;
2530     *positions = (UINT4 *) NULL;
2531     return 0;
2532 
2533   } else {
2534     *positions_high = &(this->positions_high[ptr0]);
2535     *positions = &(this->positions[ptr0]);
2536 
2537     debug0(
2538 	   printf("%d entries:",nentries);
2539 	   r = &(this->positions_high[end0]);
2540 	   p = *positions_high;
2541 	   q = *positions;
2542 	   while (p < r) {
2543 	     printf(" %u %u",*p,*q);
2544 	     p++; q++;
2545 	   }
2546 	   printf("\n");
2547 	   );
2548 
2549     return nentries;
2550   }
2551 }
2552 #endif
2553 
2554 
2555 #ifdef GSNAP
2556 int
2557 Indexdb_ptr (UINT4 **positions, T this, Oligospace_T oligo) {
2558   int nentries;
2559   UINT4 ptr0, end0;
2560 #ifdef DEBUG0
2561   UINT4 *p, *r;
2562 #endif
2563 
2564   ptr0 = Bitpack64_read_two(&end0,oligo,this->offsetsmeta,this->offsetsstrm);
2565 
2566   debug0(printf("Indexdb_ptr: oligo = %06llX, offset pointers are %u and %u\n",
2567 		oligo,ptr0,end0));
2568 
2569   if ((nentries = end0 - ptr0) == 0) {
2570     *positions = (UINT4 *) NULL;
2571     return 0;
2572 
2573   } else {
2574     *positions = &(this->positions[ptr0]);
2575 
2576     debug0(
2577 	   printf("%d entries",nentries);
2578 	   r = &(this->positions[end0]);
2579 	   p = *positions;
2580 	   while (p < r) {
2581 	     printf(" %u",*p);
2582 	     p++;
2583 	   }
2584 	   printf("\n");
2585 	   );
2586 
2587     return nentries;
2588   }
2589 }
2590 #endif
2591 
2592 
2593 
2594 #if defined(UTILITYP)
2595 
2596 #elif defined(GSNAP)
2597 /* GSNAP uses pointers instead of reading */
2598 
2599 #else
2600 /* Analogous to Indexdb_read, except this includes diagterm.  Always allocates memory. */
2601 Univcoord_T *
2602 Indexdb_read_with_diagterm (int *nentries, T this, Oligospace_T oligo, int diagterm) {
2603   Univcoord_T *positions;
2604   Positionsptr_T ptr0, end0;
2605 
2606 #ifdef LARGE_GENOMES
2607   Positionsptr_T ptr;
2608   int i;
2609 #ifdef HAVE_AVX2
2610   __m256i _high, _low, _diagterm;
2611   Univcoord_T *out;
2612   unsigned char *in_high;
2613   UINT4 *in_low;
2614 #endif
2615 
2616 #elif defined(HAVE_AVX2)
2617   __m256i _diagterm;
2618   Univcoord_T *out, *in;
2619 #elif defined(HAVE_SSE2)
2620   __m128i _diagterm;
2621   Univcoord_T *out, *in;
2622 #else
2623   Positionsptr_T ptr;
2624   int i;
2625 #endif
2626 
2627 #ifdef DEBUG0
2628   int k;
2629 #endif
2630 
2631   assert(diagterm >= 0);	/* For GMAP */
2632 
2633 #ifdef LARGE_GENOMES
2634   ptr0 = Bitpack64_read_two_huge(&end0,oligo,this->offsetspages,this->offsetsmeta,this->offsetsstrm);
2635 #else
2636   ptr0 = Bitpack64_read_two(&end0,oligo,this->offsetsmeta,this->offsetsstrm);
2637 #endif
2638 
2639   debug0(printf("Indexdb_read_with_diagterm: oligo = %06llX, offset pointers are %llu and %llu, diagterm %d\n",
2640 		oligo,ptr0,end0,diagterm));
2641 
2642   if ((*nentries = end0 - ptr0) == 0) {
2643     return (Univcoord_T *) NULL;
2644 
2645 #if 0
2646   } else {
2647     /* Should not be necessary if diagterm >= 0 */
2648     /* Skip any positions that extend past genomicpos 0 */
2649 #ifdef LARGE_GENOMES
2650     while (ptr0 < end0 && ((Univcoord_T) this->positions_high[ptr0] << 32) + this->positions[ptr0] < (Univcoord_T) -diagterm) {
2651       ptr0++;
2652     }
2653 #else
2654     while (ptr0 < end0 && this->positions[ptr0] < (Univcoord_T) -diagterm) {
2655       ptr0++;
2656     }
2657 #endif
2658     if ((*nentries = end0 - ptr0) == 0) {
2659       return (Univcoord_T *) NULL;
2660     }
2661 #endif
2662   }
2663 
2664   positions = (Univcoord_T *) MALLOC((*nentries)*sizeof(Univcoord_T));
2665 
2666 #ifdef LARGE_GENOMES
2667   /* Previously checked for (this->positions_high_access == ALLOCATED_PRIVATE || this->positions_high_access == ALLOCATED_SHARED ||
2668      this->positions_access == ALLOCATED_PRIVATE || this->positions_access == ALLOCATED_SHARED) */
2669 #ifdef HAVE_AVX2
2670   _diagterm = _mm256_set1_epi64x(diagterm);
2671   in_high = &(this->positions_high[ptr0]);
2672   in_low = &(this->positions[ptr0]);
2673   out = &(positions[0]);
2674   while (in_low + 4 < &(this->positions[end0])) {
2675     _high = _mm256_slli_epi64(_mm256_cvtepu8_epi64(_mm_loadu_si128((__m128i *) in_high)), 32);
2676     _low = _mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i *) in_low));
2677     _mm256_storeu_si256((__m256i *) out, _mm256_add_epi64(_mm256_add_epi64(_high,_low), _diagterm));
2678     in_high += 4;
2679     in_low += 4;
2680     out += 4;
2681   }
2682   while (in_low < &(this->positions[end0])) {
2683     *out++ = ((Univcoord_T) *in_high++ << 32) + (*in_low++) + diagterm;
2684   }
2685 
2686 #else
2687   for (ptr = ptr0, i = 0; ptr < end0; ptr++) {
2688     positions[i++] = ((Univcoord_T) this->positions_high[ptr] << 32) + this->positions[ptr] + diagterm;
2689   }
2690 #endif
2691 
2692 #elif defined(HAVE_AVX2)
2693   _diagterm = _mm256_set1_epi32(diagterm);
2694   in = &(this->positions[ptr0]);
2695   out = &(positions[0]);
2696   while (in + 8 < &(this->positions[end0])) {
2697     _mm256_storeu_si256((__m256i *) out, _mm256_add_epi32(_mm256_loadu_si256((__m256i *) in), _diagterm));
2698     in += 8;
2699     out += 8;
2700   }
2701   while (in < &(this->positions[end0])) {
2702     *out++ = *in++ + diagterm;
2703   }
2704 
2705 #elif defined(HAVE_SSE2)
2706   _diagterm = _mm_set1_epi32(diagterm);
2707   in = &(this->positions[ptr0]);
2708   out = &(positions[0]);
2709   while (in + 4 < &(this->positions[end0])) {
2710     _mm_storeu_si128((__m128i *) out, _mm_add_epi32(_mm_loadu_si128((__m128i *) in), _diagterm));
2711     in += 4;
2712     out += 4;
2713   }
2714   while (in < &(this->positions[end0])) {
2715     *out++ = *in++ + diagterm;
2716   }
2717 
2718 #else
2719   for (ptr = ptr0, i = 0; ptr < end0; ptr++) {
2720     positions[i++] = this->positions[ptr] + diagterm;
2721   }
2722 #endif
2723 
2724   debug0(
2725 	 printf("%d entries (vs diagterm %d):",*nentries,diagterm);
2726 	for (k = 0; k < *nentries; k++) {
2727 	  printf(" %u",positions[k]);
2728 	}
2729 	printf("\n");
2730 	);
2731 
2732   return positions;
2733 }
2734 #endif
2735 
2736 
2737 
2738 /************************************************************************
2739  *   Create procedure -- for user-provided genomic segment
2740  ************************************************************************/
2741 
2742 #if defined(UTILITYP) || defined(LARGE_GENOMES)
2743 #else
2744 T
2745 Indexdb_new_segment (char *genomicseg,
2746 #ifdef PMAP
2747 		     int alphabet_size, Width_T index1part_aa, bool watsonp,
2748 #else
2749 		     Width_T index1part,
2750 #endif
2751 		     Width_T index1interval) {
2752   T new = (T) MALLOC(sizeof(*new));
2753   char *uppercaseCode;
2754   Positionsptr_T *work_offsets;	/* Working set for use in calculating positions */
2755   int totalcounts = 0;
2756   int c;
2757   Oligospace_T oligospace, oligoi;
2758   char *p;
2759   Univcoord_T position = 0UL;
2760 
2761 #ifdef HAVE_64_BIT
2762   Oligospace_T oligo = 0U, masked, mask;
2763 #else
2764   Shortoligomer_T high = 0U, low = 0U, carry;
2765 #endif
2766 
2767 #ifdef PMAP
2768   int frame = -1, between_counter[3], in_counter[3];
2769   Oligospace_T aaindex;
2770   int index1part_nt = 3*index1part_aa;
2771 #else
2772   int between_counter, in_counter;
2773 #endif
2774 
2775   uppercaseCode = UPPERCASE_U2T;
2776 
2777 #ifdef PMAP
2778   oligospace = power(alphabet_size,index1part_aa);
2779   between_counter[0] = between_counter[1] = between_counter[2] = 0;
2780   in_counter[0] = in_counter[1] = in_counter[2] = 0;
2781   new->index1part = index1part_aa;
2782 
2783 #else
2784 #ifdef HAVE_64_BIT
2785   mask = ~(~0ULL << 2*index1part);
2786 #else
2787   mask = ~(~0U << 2*index1part);
2788 #endif
2789   oligospace = power(4,index1part);
2790   between_counter = in_counter = 0;
2791   new->index1part = index1part;
2792 #endif
2793 
2794   new->index1interval = 1;
2795   new->compression_type = NO_COMPRESSION;
2796 
2797   new->offsetsmeta = (UINT4 *) CALLOC(oligospace+1,sizeof(UINT4));
2798   for (oligoi = 0; oligoi <= oligospace; oligoi++) {
2799     new->offsetsmeta[oligoi] = oligoi;
2800   }
2801   new->offsetsmeta_access = ALLOCATED_PRIVATE;
2802 
2803   new->offsetsstrm = (UINT4 *) CALLOC(oligospace+1,sizeof(UINT4));
2804   new->offsetsstrm_access = ALLOCATED_PRIVATE;
2805 
2806   p = genomicseg;
2807   while ((c = *(p++)) != '\0') {
2808 #ifdef PMAP
2809     if (++frame == 3) {
2810       frame = 0;
2811     }
2812     between_counter[frame] += 1;
2813     in_counter[frame] += 1;
2814 #else
2815     between_counter++;
2816     in_counter++;
2817 #endif
2818 
2819 #ifdef HAVE_64_BIT
2820     switch (uppercaseCode[c]) {
2821     case 'A': oligo = (oligo << 2); break;
2822     case 'C': oligo = (oligo << 2) | 1U; break;
2823     case 'G': oligo = (oligo << 2) | 2U; break;
2824     case 'T': oligo = (oligo << 2) | 3U; break;
2825     default:
2826       oligo = 0U;
2827 #ifdef PMAP
2828       in_counter[0] = in_counter[1] = in_counter[2] = 0;
2829 #else
2830       in_counter = 0;
2831 #endif
2832       break;
2833     }
2834 
2835 #else
2836     carry = (low >> 30);
2837     switch (uppercaseCode[c]) {
2838     case 'A': low = (low << 2); break;
2839     case 'C': low = (low << 2) | 1U; break;
2840     case 'G': low = (low << 2) | 2U; break;
2841     case 'T': low = (low << 2) | 3U; break;
2842     default:
2843       high = low = carry = 0U;
2844 #ifdef PMAP
2845       in_counter[0] = in_counter[1] = in_counter[2] = 0;
2846 #else
2847       in_counter = 0;
2848 #endif
2849       break;
2850     }
2851     high = (high << 2) | carry;
2852 #endif
2853 
2854 #ifdef PMAP
2855     if (in_counter[frame] > 0) {
2856       if (watsonp == true) {
2857 #ifdef HAVE_64_BIT
2858 	if (Alphabet_get_codon_fwd(oligo) == AA_STOP) {
2859 	  in_counter[frame] = 0;
2860 	}
2861 #else
2862 	if (Alphabet_get_codon_fwd(low) == AA_STOP) {
2863 	  in_counter[frame] = 0;
2864 	}
2865 #endif
2866       } else {
2867 #ifdef HAVE_64_BIT
2868 	if (Alphabet_get_codon_rev(oligo) == AA_STOP) {
2869 	  in_counter[frame] = 0;
2870 	}
2871 #else
2872 	if (Alphabet_get_codon_rev(low) == AA_STOP) {
2873 	  in_counter[frame] = 0;
2874 	}
2875 #endif
2876       }
2877     }
2878     if (in_counter[frame] == index1part_aa + 1) {
2879       if (between_counter[frame] >= index1interval) {
2880 #ifdef HAVE_64_BIT
2881 	aaindex = Alphabet_get_aa_index(oligo,watsonp,index1part_nt);
2882 #else
2883 	aaindex = Alphabet_get_aa_index(high,low,watsonp,index1part_nt);
2884 #endif
2885 #ifdef OLIGOSPACE_NOT_LONG
2886 	oligoi = (Oligospace_T) aaindex + 1U;
2887 #else
2888 	oligoi = (Oligospace_T) aaindex + 1UL;
2889 #endif
2890 	new->offsetsstrm[oligoi] += 1;
2891 	between_counter[frame] = 0;
2892       }
2893       in_counter[frame] -= 1;
2894     }
2895 #else
2896     if (in_counter == index1part) {
2897       if (
2898 #ifdef NONMODULAR
2899 	  between_counter >= index1interval
2900 #else
2901 	  /* Actually, modular condition not needed for user-supplied genomic segment */
2902 	  (position-index1part+1U) % index1interval == 0
2903 #endif
2904 	  ) {
2905 	masked = oligo & mask;
2906 #ifdef OLIGOSPACE_NOT_LONG
2907 	oligoi = (Oligospace_T) masked + 1U;
2908 #else
2909 	oligoi = (Oligospace_T) masked + 1UL;
2910 #endif
2911 	new->offsetsstrm[oligoi] += 1;
2912 	debug(printf("Found oligo %06llX.  Incremented offsets for %llu to be %u\n",
2913 		     masked,(unsigned long long) oligoi,new->offsetsstrm[oligoi]));
2914 	between_counter = 0;
2915       }
2916       in_counter--;
2917     }
2918 #endif
2919 
2920     position++;
2921   }
2922 
2923 #ifdef ADDSENTINEL
2924   for (oligoi = 1; oligoi <= oligospace; oligoi++) {
2925     new->offsetsstrm[oligoi] = new->offsetsstrm[oligoi] + new->offsetsstrm[oligoi-1] + 1;
2926     debug(printf("Offset for %06llX: %u\n",oligoi,new->offsetsstrm[oligoi]));
2927   }
2928 #else
2929   for (oligoi = 1; oligoi <= oligospace; oligoi++) {
2930     new->offsetsstrm[oligoi] = new->offsetsstrm[oligoi] + new->offsetsstrm[oligoi-1];
2931     debug(printf("Offset for %06llX: %u\n",oligoi,new->offsetsstrm[oligoi]));
2932   }
2933 #endif
2934 
2935 
2936   /* Create positions */
2937   position = 0U;
2938 #ifdef HAVE_64_BIT
2939   oligo = 0ULL;
2940 #else
2941   high = low = 0U;
2942 #endif
2943 
2944 #ifdef PMAP
2945   frame = -1;
2946   between_counter[0] = between_counter[1] = between_counter[2] = 0;
2947   in_counter[0] = in_counter[1] = in_counter[2] = 0;
2948 #else
2949   between_counter = in_counter = 0;
2950 #endif
2951 
2952   work_offsets = (Positionsptr_T *) CALLOC(oligospace+1,sizeof(Positionsptr_T));
2953   for (oligoi = 0; oligoi <= oligospace; oligoi++) {
2954     work_offsets[oligoi] = new->offsetsstrm[oligoi];
2955   }
2956 
2957   totalcounts = new->offsetsstrm[oligospace];
2958   if (totalcounts == 0) {
2959 #ifdef PMAP
2960     fprintf(stderr,"Error: user-provided genomic segment has no valid oligomers of size %d\n",index1part_nt);
2961 #else
2962     fprintf(stderr,"Error: user-provided genomic segment has no valid oligomers of size %d\n",index1part);
2963 #endif
2964     exit(9);
2965   }
2966   new->positions = (Univcoord_T *) CALLOC(totalcounts,sizeof(Univcoord_T));
2967   new->positions_access = ALLOCATED_PRIVATE;
2968 
2969   p = genomicseg;
2970   while ((c = *(p++)) != '\0') {
2971 #ifdef PMAP
2972     if (++frame == 3) {
2973       frame = 0;
2974     }
2975     between_counter[frame] += 1;
2976     in_counter[frame] += 1;
2977 #else
2978     between_counter++;
2979     in_counter++;
2980 #endif
2981 
2982 #ifdef HAVE_64_BIT
2983     switch (uppercaseCode[c]) {
2984     case 'A': oligo = (oligo << 2); break;
2985     case 'C': oligo = (oligo << 2) | 1U; break;
2986     case 'G': oligo = (oligo << 2) | 2U; break;
2987     case 'T': oligo = (oligo << 2) | 3U; break;
2988     default:
2989       oligo = 0U;
2990 #ifdef PMAP
2991       in_counter[0] = in_counter[1] = in_counter[2] = 0;
2992 #else
2993       in_counter = 0;
2994 #endif
2995       break;
2996     }
2997 
2998 #else
2999     carry = (low >> 30);
3000     switch (uppercaseCode[c]) {
3001     case 'A': low = (low << 2); break;
3002     case 'C': low = (low << 2) | 1U; break;
3003     case 'G': low = (low << 2) | 2U; break;
3004     case 'T': low = (low << 2) | 3U; break;
3005     default:
3006       high = low = carry = 0U;
3007 #ifdef PMAP
3008       in_counter[0] = in_counter[1] = in_counter[2] = 0;
3009 #else
3010       in_counter = 0;
3011 #endif
3012       break;
3013     }
3014     high = (high << 2) | carry;
3015 #endif
3016 
3017 #ifdef PMAP
3018     if (in_counter[frame] > 0) {
3019       if (watsonp == true) {
3020 #ifdef HAVE_64_BIT
3021 	if (Alphabet_get_codon_fwd(oligo) == AA_STOP) {
3022 	  in_counter[frame] = 0;
3023 	}
3024 #else
3025 	if (Alphabet_get_codon_fwd(low) == AA_STOP) {
3026 	  in_counter[frame] = 0;
3027 	}
3028 #endif
3029       } else {
3030 #ifdef HAVE_64_BIT
3031 	if (Alphabet_get_codon_rev(oligo) == AA_STOP) {
3032 	  in_counter[frame] = 0;
3033 	}
3034 #else
3035 	if (Alphabet_get_codon_rev(low) == AA_STOP) {
3036 	  in_counter[frame] = 0;
3037 	}
3038 #endif
3039       }
3040     }
3041 
3042     if (in_counter[frame] == index1part_aa + 1) {
3043       if (between_counter[frame] >= index1interval) {
3044 #ifdef HAVE_64_BIT
3045 	aaindex = Alphabet_get_aa_index(oligo,watsonp,index1part_nt);
3046 #else
3047 	aaindex = Alphabet_get_aa_index(high,low,watsonp,index1part_nt);
3048 #endif
3049 	if (watsonp == true) {
3050 	  new->positions[work_offsets[aaindex]++] = position-index1part_nt+1;
3051 	} else {
3052 	  new->positions[work_offsets[aaindex]++] = position;
3053 	}
3054 	between_counter[frame] = 0;
3055       }
3056       in_counter[frame] -= 1;
3057     }
3058 #else
3059     if (in_counter == index1part) {
3060       if (
3061 #ifdef NONMODULAR
3062 	  between_counter >= index1interval
3063 #else
3064 	  /* Actually, modular condition not needed for user-supplied genomic segment */
3065 	  (position-index1part+1) % index1interval == 0
3066 #endif
3067 	  ) {
3068 	masked = oligo & mask;
3069 	new->positions[work_offsets[masked]++] = position-index1part+1;
3070 	between_counter = 0;
3071       }
3072       in_counter--;
3073     }
3074 #endif
3075 
3076     position++;
3077   }
3078 
3079 #ifdef ADDSENTINEL
3080   for (oligoi = 0; oligoi < oligospace; oligoi++) {
3081     new->positions[work_offsets[oligoi]] = (Univcoord_T) -1;
3082   }
3083 #endif
3084 
3085   FREE(work_offsets);
3086 
3087   return new;
3088 }
3089 #endif
3090 
3091 
3092