1 static char rcsid[] = "$Id: atoiindex.c 222799 2020-06-03 21:56:34Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11 
12 #ifdef WORDS_BIGENDIAN
13 #include "bigendian.h"
14 #else
15 #include "littleendian.h"
16 #endif
17 
18 #include <stdio.h>
19 #include <stddef.h>
20 #include <stdlib.h>
21 #include <string.h>		/* For memset */
22 #include <ctype.h>		/* For toupper */
23 #include <sys/mman.h>		/* For munmap */
24 #include <math.h>		/* For qsort */
25 #ifdef HAVE_UNISTD_H
26 #include <unistd.h>		/* For lseek and close */
27 #endif
28 #ifdef HAVE_SYS_TYPES_H
29 #include <sys/types.h>		/* For off_t */
30 #endif
31 #if HAVE_DIRENT_H
32 # include <dirent.h>
33 # define NAMLEN(dirent) strlen((dirent)->d_name)
34 #else
35 # define dirent direct
36 # define NAMLEN(dirent) (dirent)->d_namlen
37 # if HAVE_SYS_NDIR_H
38 #  include <sys/ndir.h>
39 # endif
40 # if HAVE_SYS_DIR_H
41 #  include <sys/dir.h>
42 # endif
43 # if HAVE_NDIR_H
44 #  include <ndir.h>
45 # endif
46 #endif
47 
48 #include "mem.h"
49 #include "fopen.h"
50 #include "access.h"
51 #include "types.h"		/* For Positionsptr_T and Oligospace_T */
52 #include "mode.h"
53 #include "filesuffix.h"
54 
55 #include "atoi.h"
56 
57 #include "assert.h"
58 #include "bool.h"
59 #include "genomicpos.h"
60 #include "iit-read-univ.h"
61 #include "indexdb.h"
62 #include "indexdbdef.h"
63 #include "indexdb-write.h"
64 #include "genome.h"
65 #include "genome128_hr.h"
66 
67 #include "bitpack64-write.h"
68 #include "bitpack64-read.h"
69 #include "bitpack64-readtwo.h"
70 #include "bitpack64-access.h"
71 #include "bitpack64-incr.h"
72 
73 #include "regiondb-write.h"
74 #include "datadir.h"
75 #include "getopt.h"
76 
77 
78 #define POSITIONS8_HIGH_SHIFT 32
79 #define POSITIONS8_LOW_MASK 0xFFFFFFFF
80 
81 #define LOCTABLE_SIZE 2
82 #define DIFFERENTIAL_METAINFO_SIZE 2
83 #define BLOCKSIZE 64
84 #define BUFFER_SIZE 1000000
85 
86 #define MONITOR_INTERVAL 10000000
87 #define MONITOR_INTERVAL_REGION 100000
88 
89 
90 #ifdef DEBUG
91 #define debug(x) x
92 #else
93 #define debug(x)
94 #endif
95 
96 
97 
98 static char *user_sourcedir = NULL;
99 static char *user_destdir = NULL;
100 static char *dbroot = NULL;
101 static char *dbversion = NULL;
102 static int compression_type;
103 
104 static int index1part = 15;
105 static int required_index1part = 0;
106 static int index1interval;
107 static int required_index1interval = 0;
108 
109 static int region1part = 6;
110 static int region1interval = 1;
111 
112 static char *snps_root = NULL;
113 
114 
115 static struct option long_options[] = {
116   /* Input options */
117   {"sourcedir", required_argument, 0, 'F'},	/* user_sourcedir */
118   {"destdir", required_argument, 0, 'D'},	/* user_destdir */
119   {"kmer", required_argument, 0, 'k'}, /* required_index1part */
120   {"sampling", required_argument, 0, 'q'}, /* required_interval */
121   {"db", required_argument, 0, 'd'}, /* dbroot */
122   {"usesnps", required_argument, 0, 'v'}, /* snps_root */
123 
124   /* Help options */
125   {"version", no_argument, 0, 0}, /* print_program_version */
126   {"help", no_argument, 0, 0}, /* print_program_usage */
127   {0, 0, 0, 0}
128 };
129 
130 
131 static void
print_program_version()132 print_program_version () {
133   fprintf(stdout,"\n");
134   fprintf(stdout,"ATOIINDEX: Builds GMAP index files for A-to-I RNA editing\n");
135   fprintf(stdout,"Part of GMAP package, version %s\n",PACKAGE_VERSION);
136   fprintf(stdout,"Default gmap directory: %s\n",GMAPDB);
137   fprintf(stdout,"Thomas D. Wu, Genentech, Inc.\n");
138   fprintf(stdout,"Contact: twu@gene.com\n");
139   fprintf(stdout,"\n");
140   return;
141 }
142 
143 static void
144 print_program_usage ();
145 
146 
147 static Oligospace_T
power(int base,int exponent)148 power (int base, int exponent) {
149   Oligospace_T result = 1UL;
150   int i;
151 
152   for (i = 0; i < exponent; i++) {
153     result *= base;
154   }
155   return result;
156 }
157 
158 
159 /*                      87654321 */
160 #define LOW_TWO_BITS  0x00000003
161 
162 
163 #if 0
164 static Oligospace_T
165 reduce_oligo_old (Oligospace_T oligo, int oligosize) {
166   Oligospace_T reduced = 0U, lowbits;
167   int i;
168 
169   for (i = 0; i < oligosize; i++) {
170     lowbits = oligo & LOW_TWO_BITS;
171     reduced >>= 2;
172     switch (lowbits) {
173     case RIGHT_A: break;
174     case RIGHT_C: reduced |= LEFT_T; break;
175     case RIGHT_G: reduced |= LEFT_G; break;
176     case RIGHT_T: reduced |= LEFT_T; break;
177     }
178     oligo >>= 2;
179   }
180 
181   for ( ; i < 16; i++) {
182     reduced >>= 2;
183   }
184 
185   return reduced;
186 }
187 #endif
188 
189 
190 #if 0
191 static char *
192 shortoligo_nt (Oligospace_T oligo, int oligosize) {
193   char *nt;
194   int i, j;
195   Oligospace_T lowbits;
196 
197   nt = (char *) CALLOC(oligosize+1,sizeof(char));
198   j = oligosize-1;
199   for (i = 0; i < oligosize; i++) {
200     lowbits = oligo & LOW_TWO_BITS;
201     switch (lowbits) {
202     case RIGHT_A: nt[j] = 'A'; break;
203     case RIGHT_C: nt[j] = 'C'; break;
204     case RIGHT_G: nt[j] = 'G'; break;
205     case RIGHT_T: nt[j] = 'T'; break;
206     }
207     oligo >>= 2;
208     j--;
209   }
210 
211   return nt;
212 }
213 #endif
214 
215 
216 #if 0
217 /* Old method */
218 static Positionsptr_T *
219 compute_offsets_ag_using_array (Positionsptr_T *oldoffsets, Oligospace_T oligospace, Oligospace_T mask) {
220   Positionsptr_T *offsets;
221   Oligospace_T oligoi, reduced;
222 #ifdef DEBUG
223   char *nt1, *nt2;
224 #endif
225 
226   /* Fill with sizes */
227   offsets = (Positionsptr_T *) CALLOC(oligospace+1,sizeof(Positionsptr_T));
228 
229   for (oligoi = 0; oligoi < oligospace; oligoi++) {
230 #if 0
231     if (reduce_oligo(oligoi) != reduce_oligo_old(oligoi,index1part)) {
232       abort();
233     }
234 #endif
235     reduced = Atoi_reduce_ag(oligoi) & mask;
236     debug(
237 	  nt1 = shortoligo_nt(oligoi,index1part);
238 	  nt2 = shortoligo_nt(reduced,index1part);
239 	  printf("For oligo %s, updating sizes for %s from %u",nt1,nt2,offsets[reduced+1]);
240 	  );
241 #ifdef WORDS_BIGENDIAN
242     /*size*/offsets[reduced+1] += (Bigendian_convert_uint(oldoffsets[oligoi+1]) - Bigendian_convert_uint(oldoffsets[oligoi]));
243 #else
244     /*size*/offsets[reduced+1] += (oldoffsets[oligoi+1] - oldoffsets[oligoi]);
245 #endif
246     debug(
247 	  printf(" to %u\n",offsets[reduced+1]);
248 	  FREE(nt2);
249 	  FREE(nt1);
250 	  );
251   }
252 
253   offsets[0] = 0U;
254   for (oligoi = 1; oligoi <= oligospace; oligoi++) {
255     offsets[oligoi] = offsets[oligoi-1] + /*size*/offsets[oligoi];
256     debug(if (offsets[oligoi] != offsets[oligoi-1]) {
257 	    printf("Offset for %06X: %u\n",oligoi,offsets[oligoi]);
258 	  });
259   }
260 
261   return offsets;
262 }
263 #endif
264 
265 
266 #if 0
267 /* Old method */
268 static Positionsptr_T *
269 compute_offsets_tc (Positionsptr_T *oldoffsets, Oligospace_T oligospace, Oligospace_T mask) {
270   Positionsptr_T *offsets;
271   Oligospace_T oligoi, reduced;
272 #ifdef DEBUG
273   char *nt1, *nt2;
274 #endif
275 
276   /* Fill with sizes */
277   offsets = (Positionsptr_T *) CALLOC(oligospace+1,sizeof(Positionsptr_T));
278 
279   for (oligoi = 0; oligoi < oligospace; oligoi++) {
280     reduced = Atoi_reduce_tc(oligoi) & mask;
281     debug(
282 	  nt1 = shortoligo_nt(oligoi,index1part);
283 	  nt2 = shortoligo_nt(reduced,index1part);
284 	  printf("For oligo %s, updating sizes for %s from %u",nt1,nt2,offsets[reduced+1]);
285 	  );
286 #ifdef WORDS_BIGENDIAN
287     /*size*/offsets[reduced+1] += (Bigendian_convert_uint(oldoffsets[oligoi+1]) - Bigendian_convert_uint(oldoffsets[oligoi]));
288 #else
289     /*size*/offsets[reduced+1] += (oldoffsets[oligoi+1] - oldoffsets[oligoi]);
290 #endif
291     debug(
292 	  printf(" to %u\n",offsets[reduced+1]);
293 	  FREE(nt2);
294 	  FREE(nt1);
295 	  );
296   }
297 
298   offsets[0] = 0U;
299   for (oligoi = 1; oligoi <= oligospace; oligoi++) {
300     offsets[oligoi] = offsets[oligoi-1] + /*size*/offsets[oligoi];
301     debug(if (offsets[oligoi] != offsets[oligoi-1]) {
302 	    printf("Offset for %06X: %u\n",oligoi,offsets[oligoi]);
303 	  });
304   }
305 
306   return offsets;
307 }
308 #endif
309 
310 
311 static void
index_offsets_ag_using_bitpack(char * new_pointers_filename,char * new_offsets_filename,UINT4 * oldoffsetsmeta,UINT4 * oldoffsetsstrm,Oligospace_T oligospace,Oligospace_T mask)312 index_offsets_ag_using_bitpack (char *new_pointers_filename, char *new_offsets_filename,
313 				UINT4 *oldoffsetsmeta, UINT4 *oldoffsetsstrm,
314 				Oligospace_T oligospace, Oligospace_T mask) {
315   Oligospace_T bmerspace;
316   char *packsizes;
317   UINT4 **bitpacks;
318   UINT4 increment;
319   int new_packsize;
320   UINT4 offsets[BLOCKSIZE+1];
321   Oligospace_T oligoi, reduced, bmer;
322   int ii;
323 #ifdef DEBUG
324   char *nt1, *nt2;
325 #endif
326 
327   /* Fill with sizes */
328   bmerspace = oligospace/BLOCKSIZE;
329   packsizes = (char *) CALLOC(bmerspace,sizeof(char));
330   bitpacks = (UINT4 **) CALLOC(bmerspace,sizeof(UINT4 *));
331 
332   for (oligoi = 0; oligoi < oligospace; oligoi += BLOCKSIZE) {
333     Bitpack64_block_offsets(offsets,oligoi,oldoffsetsmeta,oldoffsetsstrm);
334     for (ii = 0; ii < BLOCKSIZE; ii++) {
335       if ((increment = offsets[ii+1] - offsets[ii]) > 0) {
336 	reduced = Atoi_reduce_ag(oligoi + ii) & mask;
337 	bmer = reduced/BLOCKSIZE;
338 	if ((new_packsize = Bitpack64_access_new_packsize(reduced,/*old_packsize*/(int) packsizes[bmer],
339 							  bitpacks[bmer],increment)) != (int) packsizes[bmer]) {
340 	  bitpacks[bmer] = Bitpack64_realloc_multiple((int) packsizes[bmer],new_packsize,bitpacks[bmer]);
341 	  packsizes[bmer] = (char) new_packsize;
342 	}
343 	Bitpack64_add_bitpack(reduced,(int) packsizes[bmer],bitpacks[bmer],increment);
344       }
345     }
346   }
347 
348   Bitpack64_write_differential_bitpacks(new_pointers_filename,new_offsets_filename,packsizes,bitpacks,oligospace);
349 
350   for (bmer = 0; bmer < bmerspace; bmer++) {
351     FREE(bitpacks[bmer]);
352   }
353   FREE(bitpacks);
354   FREE(packsizes);
355 
356   return;
357 }
358 
359 
360 static void
index_offsets_tc_using_bitpack(char * new_pointers_filename,char * new_offsets_filename,UINT4 * oldoffsetsmeta,UINT4 * oldoffsetsstrm,Oligospace_T oligospace,Oligospace_T mask)361 index_offsets_tc_using_bitpack (char *new_pointers_filename, char *new_offsets_filename,
362 				UINT4 *oldoffsetsmeta, UINT4 *oldoffsetsstrm,
363 				Oligospace_T oligospace, Oligospace_T mask) {
364   Oligospace_T bmerspace;
365   char *packsizes;
366   UINT4 **bitpacks;
367   UINT4 increment;
368   int new_packsize;
369   UINT4 offsets[BLOCKSIZE+1];
370   Oligospace_T oligoi, reduced, bmer;
371   int ii;
372 #ifdef DEBUG
373   char *nt1, *nt2;
374 #endif
375 
376   /* Fill with sizes */
377   bmerspace = oligospace/BLOCKSIZE;
378   packsizes = (char *) CALLOC(bmerspace,sizeof(char));
379   bitpacks = (UINT4 **) CALLOC(bmerspace,sizeof(UINT4 *));
380 
381   for (oligoi = 0; oligoi < oligospace; oligoi += BLOCKSIZE) {
382     Bitpack64_block_offsets(offsets,oligoi,oldoffsetsmeta,oldoffsetsstrm);
383     for (ii = 0; ii < BLOCKSIZE; ii++) {
384       if ((increment = offsets[ii+1] - offsets[ii]) > 0) {
385 	reduced = Atoi_reduce_tc(oligoi + ii) & mask;
386 	bmer = reduced/BLOCKSIZE;
387 	if ((new_packsize = Bitpack64_access_new_packsize(reduced,/*old_packsize*/(int) packsizes[bmer],
388 							  bitpacks[bmer],increment)) != (int) packsizes[bmer]) {
389 	  bitpacks[bmer] = Bitpack64_realloc_multiple((int) packsizes[bmer],new_packsize,bitpacks[bmer]);
390 	  packsizes[bmer] = (char) new_packsize;
391 	}
392 	Bitpack64_add_bitpack(reduced,(int) packsizes[bmer],bitpacks[bmer],increment);
393       }
394     }
395   }
396 
397   Bitpack64_write_differential_bitpacks(new_pointers_filename,new_offsets_filename,packsizes,bitpacks,oligospace);
398 
399   for (bmer = 0; bmer < bmerspace; bmer++) {
400     FREE(bitpacks[bmer]);
401   }
402   FREE(bitpacks);
403   FREE(packsizes);
404 
405   return;
406 }
407 
408 
409 #if 0
410 /* Needs work.  See cmetindex.c for preliminary version */
411 static void
412 sort_positions_local_snps (FILE *positions_fp, UINT2 *positions2,
413 			   UINT2 *newoffsetsmeta, UINT2 *newoffsetsstrm, Localspace_T localspace) {
414 }
415 #endif
416 
417 
418 static void
sort_8mers(unsigned char * positions8_high,UINT4 * positions8_low,Positionsptr_T npositions)419 sort_8mers (unsigned char *positions8_high, UINT4 *positions8_low, Positionsptr_T npositions) {
420   UINT8 *positions8;
421   Positionsptr_T i;
422 
423   positions8 = (UINT8 *) MALLOC(npositions*sizeof(UINT8));
424   for (i = 0; i < npositions; i++) {
425     positions8[i] = ((UINT8) positions8_high[i] << 32) + positions8_low[i];
426   }
427   qsort(positions8,npositions,sizeof(UINT8),UINT8_compare);
428   for (i = 0; i < npositions; i++) {
429     positions8_high[i] = positions8[i] >> POSITIONS8_HIGH_SHIFT;
430     positions8_low[i] = positions8[i] & POSITIONS8_LOW_MASK;
431   }
432   return;
433 }
434 
435 
436 static void
sort_positions(char * ptrsfile,char * compfile,FILE * positions_high_fp,FILE * positions_fp,unsigned char * positions8_high,UINT4 * positions8_low,UINT4 * positions4,UINT4 * newoffsetsmeta,UINT4 * newoffsetsstrm,Oligospace_T oligospace,bool coord_values_8p)437 sort_positions (char *ptrsfile, char *compfile, FILE *positions_high_fp, FILE *positions_fp,
438 		unsigned char *positions8_high, UINT4 *positions8_low, UINT4 *positions4,
439 		UINT4 *newoffsetsmeta, UINT4 *newoffsetsstrm, Oligospace_T oligospace, bool coord_values_8p) {
440   Oligospace_T oligoi;
441   Positionsptr_T block_start, block_end, j, npositions;
442 
443   FILE *ptrs_fp, *comp_fp;
444   UINT4 *ptrs, nregisters;
445   UINT4 ptri;
446   int ii;
447 
448   /* Buffer is used to avoid frequent writes to the file */
449   UINT4 *buffer;
450   int buffer_size = BUFFER_SIZE;
451   int buffer_i;
452 
453   UINT4 newoffsets[BLOCKSIZE+1];
454   UINT4 diffs[BLOCKSIZE], ascending[BLOCKSIZE+1];
455   UINT4 totalcount;
456   int packsize;
457 
458 
459   /* Sort positions in each block.  For snps_root, use code from Bitpack64_write_differential */
460   if (snps_root) {
461     ptrs = (UINT4 *) CALLOC(((oligospace + BLOCKSIZE)/BLOCKSIZE + 1) * DIFFERENTIAL_METAINFO_SIZE,sizeof(UINT4));
462     ptri = 0;
463 
464     if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
465       fprintf(stderr,"Can't write to file %s\n",compfile);
466       exit(9);
467     }
468     buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
469     buffer_i = 0;
470     nregisters = 0;
471   }
472 
473   totalcount = 0;
474   for (oligoi = 0; oligoi + BLOCKSIZE <= oligospace; oligoi += BLOCKSIZE) {
475     if (snps_root) {
476       ptrs[ptri++] = nregisters;
477       ptrs[ptri++] = totalcount;
478     }
479     ascending[0] = totalcount;
480     Bitpack64_block_offsets(newoffsets,oligoi,newoffsetsmeta,newoffsetsstrm);
481     for (ii = 0; ii < BLOCKSIZE; ii++) {
482       if ((oligoi + ii) % MONITOR_INTERVAL == 0) {
483 	fprintf(stderr,".");
484       }
485       block_start = newoffsets[ii];
486       block_end = newoffsets[ii+1];
487       if ((npositions = block_end - block_start) > 0) {
488 	if (coord_values_8p == true) {
489 	  sort_8mers(&(positions8_high[block_start]),&(positions8_low[block_start]),npositions);
490 	  if (snps_root == NULL) {
491 	    /* FWRITE_UINT8S(&(positions8[block_start]),npositions,positions_fp); */
492 	    FWRITE_CHARS(&(positions8_high[block_start]),npositions,positions_high_fp);
493 	    FWRITE_UINTS(&(positions8_low[block_start]),npositions,positions_fp);
494 	  } else {
495 	    /* FWRITE_UINT8(positions8[block_start],positions_fp); */
496 	    FWRITE_CHAR(positions8_high[block_start],positions_high_fp);
497 	    FWRITE_UINT(positions8_low[block_start],positions_fp);
498 	    for (j = block_start+1; j < block_end; j++) {
499 	      if (positions8_high[j] == positions8_high[j-1] && positions8_low[j] == positions8_low[j-1]) {
500 		npositions--;
501 	      } else {
502 		/* FWRITE_UINT8(positions8[j],positions_fp); */
503 		FWRITE_CHAR(positions8_high[j],positions_high_fp);
504 		FWRITE_UINT(positions8_low[j],positions_fp);
505 	      }
506 	    }
507 	  }
508 
509 	} else {
510 	  qsort(&(positions4[block_start]),npositions,sizeof(UINT4),UINT4_compare);
511 	  if (snps_root == NULL) {
512 	    FWRITE_UINTS(&(positions4[block_start]),npositions,positions_fp);
513 	  } else {
514 	    FWRITE_UINT(positions4[block_start],positions_fp);
515 	    for (j = block_start+1; j < block_end; j++) {
516 	      if (positions4[j] == positions4[j-1]) {
517 		npositions--;
518 	      } else {
519 		FWRITE_UINT(positions4[j],positions_fp);
520 	      }
521 	    }
522 	  }
523 	}
524 	totalcount += npositions;
525       }
526       ascending[ii+1] = totalcount;
527     }
528     if (snps_root) {
529       packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending);
530       buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
531       nregisters += packsize / 2;
532     }
533   }
534 
535   if (oligoi <= oligospace) {
536     /* Finish last block (containing 1 entry, but expand to 64) */
537     if (snps_root) {
538       ptrs[ptri++] = nregisters;
539       ptrs[ptri++] = totalcount;
540     }
541     ascending[0] = totalcount;
542     for (ii = 0; ii <= (int) (oligospace - oligoi); ii++) {
543       block_start = Bitpack64_read_two(&block_end,oligoi+ii,newoffsetsmeta,newoffsetsstrm);
544       if ((npositions = block_end - block_start) > 0) {
545 	if (coord_values_8p == true) {
546 	  sort_8mers(&(positions8_high[block_start]),&(positions8_low[block_start]),npositions);
547 	  if (snps_root == NULL) {
548 	    /* FWRITE_UINT8S(&(positions8[block_start]),npositions,positions_fp); */
549 	    FWRITE_CHARS(&(positions8_high[block_start]),npositions,positions_high_fp);
550 	    FWRITE_UINTS(&(positions8_low[block_start]),npositions,positions_fp);
551 	  } else {
552 	    /* FWRITE_UINT8(positions8[block_start],positions_fp); */
553 	    FWRITE_CHAR(positions8_high[block_start],positions_high_fp);
554 	    FWRITE_UINT(positions8_low[block_start],positions_fp);
555 	    for (j = block_start+1; j < block_end; j++) {
556 	      if (positions8_high[j] == positions8_high[j-1] && positions8_low[j] == positions8_low[j-1]) {
557 		npositions--;
558 	      } else {
559 		/* FWRITE_UINT8(positions8[j],positions_fp); */
560 		FWRITE_CHAR(positions8_high[j],positions_high_fp);
561 		FWRITE_UINT(positions8_low[j],positions_fp);
562 	      }
563 	    }
564 	  }
565 
566 	} else {
567 	  qsort(&(positions4[block_start]),npositions,sizeof(UINT4),UINT4_compare);
568 	  if (snps_root == NULL) {
569 	    FWRITE_UINTS(&(positions4[block_start]),npositions,positions_fp);
570 	  } else {
571 	    FWRITE_UINT(positions4[block_start],positions_fp);
572 	    for (j = block_start+1; j < block_end; j++) {
573 	      if (positions4[j] == positions4[j-1]) {
574 		npositions--;
575 	      } else {
576 		FWRITE_UINT(positions4[j],positions_fp);
577 	      }
578 	    }
579 	  }
580 	}
581 	totalcount += npositions;
582       }
583       ascending[ii+1] = totalcount;
584     }
585     if (snps_root) {
586       for ( ; ii < BLOCKSIZE; ii++) {
587 	ascending[ii+1] = totalcount;
588       }
589       packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending);
590       buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
591       nregisters += packsize / 2;
592     }
593   }
594 
595   if (snps_root) {
596     /* Write the final pointer, which will point after the end of the file */
597     ptrs[ptri++] = nregisters;	/* In 128-bit registers */
598 
599     /* Value for end of block */
600     ptrs[ptri++] = totalcount;
601 
602     if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
603       fprintf(stderr,"Can't write to file %s\n",ptrsfile);
604       exit(9);
605     } else {
606       FWRITE_UINTS(ptrs,ptri,ptrs_fp);
607       FREE(ptrs);
608       fclose(ptrs_fp);
609     }
610 
611     /* Empty buffer */
612     if (buffer_i > 0) {
613       FWRITE_UINTS(buffer,buffer_i,comp_fp);
614       buffer_i = 0;
615     }
616     FREE(buffer);
617     fclose(comp_fp);
618   }
619 
620   return;
621 }
622 
623 
624 static void
compute_ag(char * new_pointers_filename,char * new_offsets_filename,char * new_positions_high_filename,char * new_positions_filename,Indexdb_T indexdb,Oligospace_T oligospace,Oligospace_T mask,bool coord_values_8p)625 compute_ag (char *new_pointers_filename, char *new_offsets_filename,
626 	    char *new_positions_high_filename, char *new_positions_filename,
627 	    Indexdb_T indexdb, Oligospace_T oligospace, Oligospace_T mask,
628 	    bool coord_values_8p) {
629   char *ptrsfile, *compfile;
630   FILE *positions_high_fp, *positions_fp;
631 
632   unsigned char *positions8_high;
633   UINT4 *positions8_low;
634   UINT4 *positions4;
635 
636   Oligospace_T oligoi, reduced;
637   UINT4 oldoffsets[BLOCKSIZE+1];
638 
639   UINT4 *newoffsetsmeta, *newoffsetsstrm, *countermeta, *counterstrm;
640   Positionsptr_T preunique_totalcounts, block_start, block_end, j, offset;
641   size_t offsetsmeta_len, offsetsstrm_len;
642   int ii;
643 #ifdef HAVE_MMAP
644   int offsetsmeta_fd, offsetsstrm_fd;
645 #else
646   Access_T offsetsmeta_access, offsetsstrm_access;
647 #endif
648   double seconds;
649 #ifdef DEBUG
650   char *nt1, *nt2;
651 #endif
652 
653 
654   /* offsets = compute_offsets_ag_using_array(oldoffsets,oligospace,mask); */
655   index_offsets_ag_using_bitpack(new_pointers_filename,new_offsets_filename,
656 				 indexdb->offsetsmeta,indexdb->offsetsstrm,
657 				 oligospace,mask);
658 
659   preunique_totalcounts = Bitpack64_read_one(oligospace,indexdb->offsetsmeta,indexdb->offsetsstrm);
660   if (preunique_totalcounts == 0) {
661     fprintf(stderr,"Something is wrong with the offsets.  Total counts is zero.\n");
662     exit(9);
663 
664   } else if (coord_values_8p == true) {
665     fprintf(stderr,"Trying to allocate %u*(%d+%d) bytes of memory...",
666 	    preunique_totalcounts,(int) sizeof(unsigned char),(int) sizeof(UINT4));
667     positions4 = (UINT4 *) NULL;
668     positions8_high = (unsigned char *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(unsigned char));
669     positions8_low = (UINT4 *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(UINT4));
670     if (positions8_high == NULL || positions8_low == NULL) {
671       fprintf(stderr,"failed.  Need a computer with sufficient memory.\n");
672       exit(9);
673     } else {
674       fprintf(stderr,"done\n");
675     }
676 
677   } else {
678     fprintf(stderr,"Trying to allocate %u*%d bytes of memory...",preunique_totalcounts,(int) sizeof(UINT4));
679     positions8_high = (unsigned char *) NULL;
680     positions8_low = (UINT4 *) NULL;
681     positions4 = (UINT4 *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(UINT4));
682     if (positions4 == NULL) {
683       fprintf(stderr,"failed.  Need a computer with sufficient memory.\n");
684       exit(9);
685     } else {
686       fprintf(stderr,"done\n");
687     }
688   }
689 
690   /* Point to offsets and revise */
691   fprintf(stderr,"Rearranging AG offsets for indexdb...");
692 #ifdef HAVE_MMAP
693   newoffsetsmeta = (UINT4 *) Access_mmap(&offsetsmeta_fd,&offsetsmeta_len,&seconds,new_pointers_filename,/*randomp*/false);
694   newoffsetsstrm = (UINT4 *) Access_mmap(&offsetsstrm_fd,&offsetsstrm_len,&seconds,new_offsets_filename,/*randomp*/false);
695 #else
696   newoffsetsmeta = (UINT4 *) Access_allocate_private(&offsetsmeta_access,&offsetsmeta_len,&seconds,new_pointers_filename,sizeof(UINT4));
697   newoffsetsstrm = (UINT4 *) Access_allocate_private(&offsetsstrm_access,&offsetsstrm_len,&seconds,new_offsets_filename,sizeof(UINT4));
698 #endif
699   countermeta = Indexdb_bitpack_counter(&counterstrm,newoffsetsmeta,index1part);
700 
701   for (oligoi = 0; oligoi < oligospace; oligoi += BLOCKSIZE) {
702     Bitpack64_block_offsets(oldoffsets,oligoi,indexdb->offsetsmeta,indexdb->offsetsstrm);
703     for (ii = 0; ii < BLOCKSIZE; ii++) {
704       if ((oligoi + ii) % MONITOR_INTERVAL == 0) {
705 	fprintf(stderr,".");
706       }
707       block_start = oldoffsets[ii];
708       block_end = oldoffsets[ii+1];
709 
710       if (block_end > block_start) {
711 	reduced = Atoi_reduce_ag(oligoi + ii) & mask;
712 	offset = Bitpack64_read_one(reduced,newoffsetsmeta,newoffsetsstrm) + Bitpack64_access(reduced,countermeta,counterstrm);
713 	if (coord_values_8p == true) {
714 	  for (j = block_start; j < block_end; j++) {
715 	    positions8_high[offset] = indexdb->positions_high[j];
716 	    positions8_low[offset] = indexdb->positions[j];
717 	    offset++;
718 	  }
719 	} else {
720 	  for (j = block_start; j < block_end; j++) {
721 	    positions4[offset] = indexdb->positions[j];
722 	    offset++;
723 	  }
724 	}
725 	Bitpack64_add(reduced,countermeta,counterstrm,block_end - block_start);
726       }
727     }
728   }
729 
730   FREE(counterstrm);
731   FREE(countermeta);
732   fprintf(stderr,"done\n");
733 
734 
735   fprintf(stderr,"Sorting AG positions for indexdb...");
736   if (snps_root == NULL) {
737     ptrsfile = compfile = (char *) NULL;
738   } else {
739     ptrsfile = (char *) MALLOC((strlen(new_pointers_filename) + strlen(".temp") + 1) * sizeof(char));
740     sprintf(ptrsfile,"%s.temp",new_pointers_filename);
741     compfile = (char *) MALLOC((strlen(new_offsets_filename) + strlen(".temp") + 1) * sizeof(char));
742     sprintf(compfile,"%s.temp",new_offsets_filename);
743   }
744 
745   if (new_positions_high_filename != NULL) {
746     if ((positions_high_fp = FOPEN_WRITE_BINARY(new_positions_high_filename)) == NULL) {
747       fprintf(stderr,"Can't write to file %s\n",new_positions_high_filename);
748       exit(9);
749     }
750   }
751 
752   if ((positions_fp = FOPEN_WRITE_BINARY(new_positions_filename)) == NULL) {
753     fprintf(stderr,"Can't write to file %s\n",new_positions_filename);
754     exit(9);
755   }
756 
757   sort_positions(ptrsfile,compfile,positions_high_fp,positions_fp,
758 		 positions8_high,positions8_low,positions4,
759 		 newoffsetsmeta,newoffsetsstrm,oligospace,coord_values_8p);
760 
761   /* Clean up */
762 #ifdef HAVE_MMAP
763   munmap((void *) newoffsetsmeta,offsetsmeta_len);
764   munmap((void *) newoffsetsstrm,offsetsstrm_len);
765   close(offsetsmeta_fd);
766   close(offsetsstrm_fd);
767 #else
768   FREE(newoffsetsmeta);
769   FREE(newoffsetsstrm);
770 #endif
771 
772   if (coord_values_8p == true) {
773     FREE(positions8_high);
774     FREE(positions8_low);
775     fclose(positions_high_fp);
776     fclose(positions_fp);
777   } else {
778     FREE(positions4);
779     fclose(positions_fp);
780   }
781 
782   if (snps_root) {
783     rename(ptrsfile,new_pointers_filename);
784     rename(compfile,new_offsets_filename);
785     FREE(ptrsfile);
786     FREE(compfile);
787   }
788 
789   fprintf(stderr,"done\n");
790 
791   return;
792 }
793 
794 
795 static void
compute_tc(char * new_pointers_filename,char * new_offsets_filename,char * new_positions_high_filename,char * new_positions_filename,Indexdb_T indexdb,Oligospace_T oligospace,Oligospace_T mask,bool coord_values_8p)796 compute_tc (char *new_pointers_filename, char *new_offsets_filename,
797 	    char *new_positions_high_filename, char *new_positions_filename,
798 	    Indexdb_T indexdb, Oligospace_T oligospace, Oligospace_T mask,
799 	    bool coord_values_8p) {
800   char *ptrsfile, *compfile;
801   FILE *positions_high_fp, *positions_fp;
802 
803   unsigned char *positions8_high;
804   UINT4 *positions8_low;
805   UINT4 *positions4;
806 
807   Oligospace_T oligoi, reduced;
808   UINT4 oldoffsets[BLOCKSIZE+1];
809 
810   UINT4 *newoffsetsmeta, *newoffsetsstrm, *countermeta, *counterstrm;
811   Positionsptr_T preunique_totalcounts, block_start, block_end, j, offset;
812   size_t offsetsmeta_len, offsetsstrm_len;
813   int ii;
814 #ifdef HAVE_MMAP
815   int offsetsmeta_fd, offsetsstrm_fd;
816 #else
817   Access_T offsetsmeta_access, offsetsstrm_access;
818 #endif
819   double seconds;
820 #ifdef DEBUG
821   char *nt1, *nt2;
822 #endif
823 
824 
825   /* offsets = compute_offsets_tc_using_array(oldoffsets,oligospace,mask); */
826   index_offsets_tc_using_bitpack(new_pointers_filename,new_offsets_filename,
827 				 indexdb->offsetsmeta,indexdb->offsetsstrm,
828 				 oligospace,mask);
829 
830   preunique_totalcounts = Bitpack64_read_one(oligospace,indexdb->offsetsmeta,indexdb->offsetsstrm);
831   if (preunique_totalcounts == 0) {
832     fprintf(stderr,"Something is wrong with the offsets.  Total counts is zero.\n");
833     exit(9);
834 
835   } else if (coord_values_8p == true) {
836     fprintf(stderr,"Trying to allocate %u*(%d+%d) bytes of memory...",
837 	    preunique_totalcounts,(int) sizeof(unsigned char),(int) sizeof(UINT4));
838     positions4 = (UINT4 *) NULL;
839     positions8_high = (unsigned char *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(unsigned char));
840     positions8_low = (UINT4 *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(UINT4));
841     if (positions8_high == NULL || positions8_low == NULL) {
842       fprintf(stderr,"failed.  Need a computer with sufficient memory.\n");
843       exit(9);
844     } else {
845       fprintf(stderr,"done\n");
846     }
847 
848   } else {
849     fprintf(stderr,"Trying to allocate %u*%d bytes of memory...",preunique_totalcounts,(int) sizeof(UINT4));
850     positions8_high = (unsigned char *) NULL;
851     positions8_low = (UINT4 *) NULL;
852     positions4 = (UINT4 *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(UINT4));
853     if (positions4 == NULL) {
854       fprintf(stderr,"failed.  Need a computer with sufficient memory.\n");
855       exit(9);
856     } else {
857       fprintf(stderr,"done\n");
858     }
859   }
860 
861   /* Point to offsets and revise */
862   fprintf(stderr,"Rearranging TC offsets for indexdb...");
863 #ifdef HAVE_MMAP
864   newoffsetsmeta = (UINT4 *) Access_mmap(&offsetsmeta_fd,&offsetsmeta_len,&seconds,new_pointers_filename,/*randomp*/false);
865   newoffsetsstrm = (UINT4 *) Access_mmap(&offsetsstrm_fd,&offsetsstrm_len,&seconds,new_offsets_filename,/*randomp*/false);
866 #else
867   newoffsetsmeta = (UINT4 *) Access_allocate_private(&offsetsmeta_access,&offsetsmeta_len,&seconds,new_pointers_filename,sizeof(UINT4));
868   newoffsetsstrm = (UINT4 *) Access_allocate_private(&offsetsstrm_access,&offsetsstrm_len,&seconds,new_offsets_filename,sizeof(UINT4));
869 #endif
870   countermeta = Indexdb_bitpack_counter(&counterstrm,newoffsetsmeta,index1part);
871 
872   for (oligoi = 0; oligoi < oligospace; oligoi += BLOCKSIZE) {
873     Bitpack64_block_offsets(oldoffsets,oligoi,indexdb->offsetsmeta,indexdb->offsetsstrm);
874     for (ii = 0; ii < BLOCKSIZE; ii++) {
875       if ((oligoi + ii) % MONITOR_INTERVAL == 0) {
876 	fprintf(stderr,".");
877       }
878       block_start = oldoffsets[ii];
879       block_end = oldoffsets[ii+1];
880 
881       if (block_end > block_start) {
882 	reduced = Atoi_reduce_tc(oligoi + ii) & mask;
883 	offset = Bitpack64_read_one(reduced,newoffsetsmeta,newoffsetsstrm) + Bitpack64_access(reduced,countermeta,counterstrm);
884 	if (coord_values_8p == true) {
885 	  for (j = block_start; j < block_end; j++) {
886 	    positions8_high[offset] = indexdb->positions_high[j];
887 	    positions8_low[offset] = indexdb->positions[j];
888 	    offset++;
889 	  }
890 	} else {
891 	  for (j = block_start; j < block_end; j++) {
892 	    positions4[offset] = indexdb->positions[j];
893 	    offset++;
894 	  }
895 	}
896 	Bitpack64_add(reduced,countermeta,counterstrm,block_end - block_start);
897       }
898     }
899   }
900 
901   FREE(counterstrm);
902   FREE(countermeta);
903   fprintf(stderr,"done\n");
904 
905 
906   fprintf(stderr,"Sorting TC positions for indexdb...");
907   if (snps_root == NULL) {
908     ptrsfile = compfile = (char *) NULL;
909   } else {
910     ptrsfile = (char *) MALLOC((strlen(new_pointers_filename) + strlen(".temp") + 1) * sizeof(char));
911     sprintf(ptrsfile,"%s.temp",new_pointers_filename);
912     compfile = (char *) MALLOC((strlen(new_offsets_filename) + strlen(".temp") + 1) * sizeof(char));
913     sprintf(compfile,"%s.temp",new_offsets_filename);
914   }
915 
916   if (new_positions_high_filename != NULL) {
917     if ((positions_high_fp = FOPEN_WRITE_BINARY(new_positions_high_filename)) == NULL) {
918       fprintf(stderr,"Can't write to file %s\n",new_positions_high_filename);
919       exit(9);
920     }
921   }
922 
923   if ((positions_fp = FOPEN_WRITE_BINARY(new_positions_filename)) == NULL) {
924     fprintf(stderr,"Can't write to file %s\n",new_positions_filename);
925     exit(9);
926   }
927 
928   sort_positions(ptrsfile,compfile,positions_high_fp,positions_fp,
929 		 positions8_high,positions8_low,positions4,
930 		 newoffsetsmeta,newoffsetsstrm,oligospace,coord_values_8p);
931 
932 /* Clean up */
933 #ifdef HAVE_MMAP
934   munmap((void *) newoffsetsmeta,offsetsmeta_len);
935   munmap((void *) newoffsetsstrm,offsetsstrm_len);
936   close(offsetsmeta_fd);
937   close(offsetsstrm_fd);
938 #else
939   FREE(newoffsetsmeta);
940   FREE(newoffsetsstrm);
941 #endif
942 
943   if (coord_values_8p == true) {
944     FREE(positions8_high);
945     FREE(positions8_low);
946     fclose(positions_high_fp);
947     fclose(positions_fp);
948   } else {
949     FREE(positions4);
950     fclose(positions_fp);
951   }
952 
953   if (snps_root) {
954     rename(ptrsfile,new_pointers_filename);
955     rename(compfile,new_offsets_filename);
956     FREE(ptrsfile);
957     FREE(compfile);
958   }
959 
960   fprintf(stderr,"done\n");
961 
962   return;
963 }
964 
965 
966 /* Usage: atoiindex -d <genome> */
967 
968 
969 /* Note: Depends on having gmapindex sampled on mod 3 bounds */
970 int
main(int argc,char * argv[])971 main (int argc, char *argv[]) {
972   char *sourcedir = NULL, *destdir = NULL, *filename, *fileroot;
973   Indexdb_filenames_T ifilenames;
974   char *new_pointers_filename = NULL, *new_offsets_filename = NULL,
975     *new_positions_high_filename = NULL, *new_positions_filename = NULL;
976   FILE *sequence_fp;
977 
978   Univ_IIT_T chromosome_iit;
979   /* size_t totalcounts, i; */
980   Oligospace_T oligospace, index_mask;
981   bool coord_values_8p;
982 
983   Indexdb_T indexdb;
984 
985   int opt;
986   extern int optind;
987   extern char *optarg;
988   int long_option_index = 0;
989   const char *long_name;
990 
991   while ((opt = getopt_long(argc,argv,"F:D:d:k:q:v:",
992 			    long_options,&long_option_index)) != -1) {
993     switch (opt) {
994     case 0:
995       long_name = long_options[long_option_index].name;
996       if (!strcmp(long_name,"version")) {
997 	print_program_version();
998 	exit(0);
999       } else if (!strcmp(long_name,"help")) {
1000 	print_program_usage();
1001 	exit(0);
1002 
1003       } else {
1004 	/* Shouldn't reach here */
1005 	fprintf(stderr,"Don't recognize option %s.  For usage, run 'atoiindex --help'",long_name);
1006 	exit(9);
1007       }
1008       break;
1009 
1010     case 'F': user_sourcedir = optarg; break;
1011     case 'D': user_destdir = optarg; break;
1012     case 'd': dbroot = optarg; break;
1013     case 'k': required_index1part = atoi(optarg); break;
1014     case 'q': required_index1interval = atoi(optarg); break;
1015     case 'v':
1016       snps_root = optarg;
1017       fprintf(stderr,"Combination of cmetindex and snps is not yet supported in 2018 versions\n");
1018       exit(9);
1019       break;
1020     default: fprintf(stderr,"Do not recognize flag %c\n",opt); exit(9);
1021     }
1022   }
1023   argc -= (optind - 1);
1024   argv += (optind - 1);
1025 
1026   if (dbroot == NULL) {
1027     fprintf(stderr,"Missing name of genome database.  Must specify with -d flag.\n");
1028     fprintf(stderr,"Usage: atoiindex -d <genome>\n");
1029     exit(9);
1030   } else {
1031     sourcedir = Datadir_find_genomesubdir(&fileroot,&dbversion,user_sourcedir,dbroot);
1032     fprintf(stderr,"Reading source files from %s\n",sourcedir);
1033   }
1034 
1035   if (user_destdir == NULL) {
1036     destdir = (char *) MALLOC((strlen(sourcedir)+strlen("/")+strlen(dbroot)+1)*sizeof(char));
1037     sprintf(destdir,"%s/%s",sourcedir,dbroot);
1038   } else {
1039     destdir = (char *) MALLOC((strlen(user_destdir)+strlen("/")+strlen(dbroot)+1)*sizeof(char));
1040     sprintf(destdir,"%s/%s",user_destdir,dbroot);
1041   }
1042   fprintf(stderr,"Writing atoi index files to %s\n",destdir);
1043 
1044 
1045   /* Chromosome IIT file.  Need to determine coord_values_8p and genomelength */
1046   filename = (char *) CALLOC(strlen(sourcedir)+strlen("/")+
1047 			     strlen(fileroot)+strlen(".chromosome.iit")+1,sizeof(char));
1048   sprintf(filename,"%s/%s.chromosome.iit",sourcedir,fileroot);
1049   if ((chromosome_iit = Univ_IIT_read(filename,/*readonlyp*/true,/*add_iit_p*/false)) == NULL) {
1050     fprintf(stderr,"IIT file %s is not valid\n",filename);
1051     exit(9);
1052   } else {
1053     coord_values_8p = Univ_IIT_coord_values_8p(chromosome_iit);
1054   }
1055   FREE(filename);
1056 
1057 
1058   ifilenames = Indexdb_get_filenames(&compression_type,&index1part,&index1interval,
1059 				     sourcedir,fileroot,IDX_FILESUFFIX,snps_root,
1060 				     required_index1part,required_index1interval,
1061 				     /*offsets_only_p*/false);
1062   if ((indexdb = Indexdb_new_genome(&index1part,&index1interval,
1063 				    sourcedir,fileroot,IDX_FILESUFFIX,/*snps_root*/NULL,
1064 				    required_index1part,required_index1interval,
1065 				    /*offsetsstrm_access*/USE_ALLOCATE,
1066 				    /*positions_access*/USE_ALLOCATE,/*sharedp*/false,
1067 				    /*multiple_sequences_p*/false,/*preload_shared_memory_p*/false,/*unload_shared_memory_p*/false)) != NULL) {
1068 
1069 #ifdef HAVE_64_BIT
1070     index_mask = ~(~0ULL << 2*index1part);
1071 #else
1072     index_mask = ~(~0U << 2*index1part);
1073 #endif
1074     oligospace = power(4,index1part);
1075 
1076 
1077     /* Compute and write AG files */
1078     new_pointers_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1079 					    strlen(".")+strlen("a2iag")+strlen(ifilenames->pointers_index1info_ptr)+1,sizeof(char));
1080     sprintf(new_pointers_filename,"%s/%s.%s%s",destdir,fileroot,"a2iag",ifilenames->pointers_index1info_ptr);
1081 
1082     new_offsets_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1083 					   strlen(".")+strlen("a2iag")+strlen(ifilenames->offsets_index1info_ptr)+1,sizeof(char));
1084     sprintf(new_offsets_filename,"%s/%s.%s%s",destdir,fileroot,"a2iag",ifilenames->offsets_index1info_ptr);
1085 
1086     new_positions_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1087 					     strlen(".")+strlen("a2iag")+strlen(ifilenames->positions_index1info_ptr)+1,sizeof(char));
1088     sprintf(new_positions_filename,"%s/%s.%s%s",destdir,fileroot,"a2iag",ifilenames->positions_index1info_ptr);
1089 
1090     if (coord_values_8p == true) {
1091       new_positions_high_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1092 						    strlen(".")+strlen("a2iag")+strlen(ifilenames->positions_high_index1info_ptr)+1,sizeof(char));
1093       sprintf(new_positions_high_filename,"%s/%s.%s%s",destdir,fileroot,"a2iag",ifilenames->positions_high_index1info_ptr);
1094     }
1095 
1096     compute_ag(new_pointers_filename,new_offsets_filename,
1097 	       new_positions_high_filename,new_positions_filename,
1098 	       indexdb,oligospace,index_mask,coord_values_8p);
1099 
1100     if (coord_values_8p == true) {
1101       FREE(new_positions_high_filename);
1102     }
1103     FREE(new_positions_filename);
1104     FREE(new_offsets_filename);
1105     FREE(new_pointers_filename);
1106 
1107     /* regiondb */
1108     filename = (char *) CALLOC(strlen(sourcedir)+strlen("/")+strlen(fileroot)+
1109 			       strlen(".genomecomp")+1,sizeof(char));
1110     sprintf(filename,"%s/%s.genomecomp",sourcedir,fileroot);
1111     if ((sequence_fp = fopen(filename,"rb")) == NULL) {
1112       fprintf(stderr,"Could not open file %s\n",filename);
1113       exit(9);
1114     }
1115     FREE(filename);
1116 
1117     Regiondb_write(destdir,/*region_interval_char*/'1',sequence_fp,chromosome_iit,
1118 		   region1part,region1interval,
1119 		   /*genome_lc_p*/false,fileroot,/*mask_lowercase_p*/false,
1120 		   Atoi_reduce_ag);
1121     fclose(sequence_fp);
1122 
1123 
1124     /* Compute and write TC files */
1125     new_pointers_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1126 					    strlen(".")+strlen("a2itc")+strlen(ifilenames->pointers_index1info_ptr)+1,sizeof(char));
1127     sprintf(new_pointers_filename,"%s/%s.%s%s",destdir,fileroot,"a2itc",ifilenames->pointers_index1info_ptr);
1128 
1129     new_offsets_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1130 					   strlen(".")+strlen("a2itc")+strlen(ifilenames->offsets_index1info_ptr)+1,sizeof(char));
1131     sprintf(new_offsets_filename,"%s/%s.%s%s",destdir,fileroot,"a2itc",ifilenames->offsets_index1info_ptr);
1132 
1133     new_positions_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1134 					     strlen(".")+strlen("a2itc")+strlen(ifilenames->positions_index1info_ptr)+1,sizeof(char));
1135     sprintf(new_positions_filename,"%s/%s.%s%s",destdir,fileroot,"a2itc",ifilenames->positions_index1info_ptr);
1136 
1137     if (coord_values_8p == true) {
1138       new_positions_high_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1139 						    strlen(".")+strlen("a2itc")+strlen(ifilenames->positions_high_index1info_ptr)+1,sizeof(char));
1140       sprintf(new_positions_high_filename,"%s/%s.%s%s",destdir,fileroot,"a2itc",ifilenames->positions_high_index1info_ptr);
1141     }
1142 
1143     compute_tc(new_pointers_filename,new_offsets_filename,
1144 	       new_positions_high_filename,new_positions_filename,
1145 	       indexdb,oligospace,index_mask,coord_values_8p);
1146     if (coord_values_8p == true) {
1147       FREE(new_positions_high_filename);
1148     }
1149     FREE(new_positions_filename);
1150     FREE(new_offsets_filename);
1151     FREE(new_pointers_filename);
1152 
1153     /* regiondb */
1154     filename = (char *) CALLOC(strlen(sourcedir)+strlen("/")+strlen(fileroot)+
1155 			       strlen(".genomecomp")+1,sizeof(char));
1156     sprintf(filename,"%s/%s.genomecomp",sourcedir,fileroot);
1157     if ((sequence_fp = fopen(filename,"rb")) == NULL) {
1158       fprintf(stderr,"Could not open file %s\n",filename);
1159       exit(9);
1160     }
1161     FREE(filename);
1162 
1163     Regiondb_write(destdir,/*region_interval_char*/'1',sequence_fp,chromosome_iit,
1164 		   region1part,region1interval,
1165 		   /*genome_lc_p*/false,fileroot,/*mask_lowercase_p*/false,
1166 		   Atoi_reduce_tc);
1167     fclose(sequence_fp);
1168 
1169     Indexdb_free(&indexdb);
1170     Indexdb_filenames_free(&ifilenames);
1171   }
1172 
1173   Univ_IIT_free(&chromosome_iit);
1174 
1175   FREE(destdir);
1176   FREE(dbversion);
1177   FREE(fileroot);
1178   FREE(sourcedir);
1179 
1180   return 0;
1181 }
1182 
1183 
1184 
1185 static void
print_program_usage()1186 print_program_usage () {
1187   fprintf(stdout,"\
1188 Usage: atoiindex [OPTIONS...] -d <genome>\n\
1189 \n\
1190 ");
1191 
1192   /* Input options */
1193   fprintf(stdout,"Options (must include -d)\n");
1194   fprintf(stdout,"\
1195   -F, --sourcedir=directory      Directory where to read cmet index files (default is\n\
1196                                    GMAP genome directory specified at compile time)\n\
1197   -D, --destdir=directory        Directory where to write cmet index files (default is\n\
1198                                    value of -F, if provided; otherwise the value of the\n\
1199                                    GMAP genome directory specified at compile time)\n\
1200   -d, --db=STRING                Genome database\n\
1201   -k, --kmer=INT                 kmer size to use in genome database (allowed values: 16 or less).\n\
1202                                    If not specified, the program will find the highest available\n\
1203                                    kmer size in the genome database\n\
1204   -q, --sampling=INT             Sampling to use in genome database.  If not specified, the program\n\
1205                                    will find the smallest available sampling value in the genome database\n\
1206                                    within selected basesize and k-mer size\n\
1207   -v, --use-snps=STRING          Use database containing known SNPs (in <STRING>.iit, built\n\
1208                                    previously using snpindex) for tolerance to SNPs\n\
1209 \n\
1210   --version                      Show version\n\
1211   --help                         Show this help message\n\
1212 ");
1213   return;
1214 }
1215 
1216