1 static char rcsid[] = "$Id: cmetindex.c 222799 2020-06-03 21:56:34Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11 
12 #ifdef WORDS_BIGENDIAN
13 #include "bigendian.h"
14 #else
15 #include "littleendian.h"
16 #endif
17 
18 #include <stdio.h>
19 #include <stddef.h>
20 #include <stdlib.h>
21 #include <string.h>		/* For memset */
22 #include <ctype.h>		/* For toupper */
23 #include <sys/mman.h>		/* For munmap */
24 #include <math.h>		/* For qsort */
25 #ifdef HAVE_UNISTD_H
26 #include <unistd.h>		/* For lseek and close */
27 #endif
28 #ifdef HAVE_SYS_TYPES_H
29 #include <sys/types.h>		/* For off_t */
30 #endif
31 #if HAVE_DIRENT_H
32 # include <dirent.h>
33 # define NAMLEN(dirent) strlen((dirent)->d_name)
34 #else
35 # define dirent direct
36 # define NAMLEN(dirent) (dirent)->d_namlen
37 # if HAVE_SYS_NDIR_H
38 #  include <sys/ndir.h>
39 # endif
40 # if HAVE_SYS_DIR_H
41 #  include <sys/dir.h>
42 # endif
43 # if HAVE_NDIR_H
44 #  include <ndir.h>
45 # endif
46 #endif
47 
48 #include "mem.h"
49 #include "fopen.h"
50 #include "access.h"
51 #include "types.h"		/* For Positionsptr_T and Oligospace_T */
52 #include "mode.h"
53 #include "filesuffix.h"
54 
55 #include "cmet.h"
56 
57 #include "assert.h"
58 #include "bool.h"
59 #include "genomicpos.h"
60 #include "iit-read-univ.h"
61 #include "indexdb.h"
62 #include "indexdbdef.h"
63 #include "indexdb-write.h"
64 #include "genome.h"
65 #include "genome128_hr.h"
66 
67 #include "bitpack64-write.h"
68 #include "bitpack64-read.h"
69 #include "bitpack64-readtwo.h"
70 #include "bitpack64-access.h"
71 #include "bitpack64-incr.h"
72 
73 #include "regiondb-write.h"
74 #include "datadir.h"
75 #include "getopt.h"
76 
77 
78 #define POSITIONS8_HIGH_SHIFT 32
79 #define POSITIONS8_LOW_MASK 0xFFFFFFFF
80 
81 #define LOCTABLE_SIZE 2
82 #define DIFFERENTIAL_METAINFO_SIZE 2
83 #define BLOCKSIZE 64
84 #define BUFFER_SIZE 1000000
85 
86 #define MONITOR_INTERVAL 10000000
87 #define MONITOR_INTERVAL_REGION 100000
88 
89 
90 #ifdef DEBUG
91 #define debug(x) x
92 #else
93 #define debug(x)
94 #endif
95 
96 
97 
98 static char *user_sourcedir = NULL;
99 static char *user_destdir = NULL;
100 static char *dbroot = NULL;
101 static char *dbversion = NULL;
102 static int compression_type;
103 
104 static int index1part = 15;
105 static int required_index1part = 0;
106 static int index1interval;
107 static int required_index1interval = 0;
108 
109 static int region1part = 6;
110 static int region1interval = 1;
111 
112 static char *snps_root = NULL;
113 
114 
115 static struct option long_options[] = {
116   /* Input options */
117   {"sourcedir", required_argument, 0, 'F'},	/* user_sourcedir */
118   {"destdir", required_argument, 0, 'D'},	/* user_destdir */
119   {"kmer", required_argument, 0, 'k'}, /* required_index1part */
120   {"sampling", required_argument, 0, 'q'}, /* required_interval */
121   {"db", required_argument, 0, 'd'}, /* dbroot */
122   {"usesnps", required_argument, 0, 'v'}, /* snps_root */
123 
124   /* Help options */
125   {"version", no_argument, 0, 0}, /* print_program_version */
126   {"help", no_argument, 0, 0}, /* print_program_usage */
127   {0, 0, 0, 0}
128 };
129 
130 
131 static void
print_program_version()132 print_program_version () {
133   fprintf(stdout,"\n");
134   fprintf(stdout,"CMETINDEX: Builds GMAP index files for methylated genomes\n");
135   fprintf(stdout,"Part of GMAP package, version %s\n",PACKAGE_VERSION);
136   fprintf(stdout,"Default gmap directory: %s\n",GMAPDB);
137   fprintf(stdout,"Thomas D. Wu, Genentech, Inc.\n");
138   fprintf(stdout,"Contact: twu@gene.com\n");
139   fprintf(stdout,"\n");
140   return;
141 }
142 
143 static void
144 print_program_usage ();
145 
146 
147 static Oligospace_T
power(int base,int exponent)148 power (int base, int exponent) {
149   Oligospace_T result = 1UL;
150   int i;
151 
152   for (i = 0; i < exponent; i++) {
153     result *= base;
154   }
155   return result;
156 }
157 
158 
159 /*                      87654321 */
160 #define LOW_TWO_BITS  0x00000003
161 
162 
163 #if 0
164 static Oligospace_T
165 reduce_oligo_old (Oligospace_T oligo, int oligosize) {
166   Oligospace_T reduced = 0U, lowbits;
167   int i;
168 
169   for (i = 0; i < oligosize; i++) {
170     lowbits = oligo & LOW_TWO_BITS;
171     reduced >>= 2;
172     switch (lowbits) {
173     case RIGHT_A: break;
174     case RIGHT_C: reduced |= LEFT_T; break;
175     case RIGHT_G: reduced |= LEFT_G; break;
176     case RIGHT_T: reduced |= LEFT_T; break;
177     }
178     oligo >>= 2;
179   }
180 
181   for ( ; i < 16; i++) {
182     reduced >>= 2;
183   }
184 
185   return reduced;
186 }
187 #endif
188 
189 
190 #if 0
191 static char *
192 shortoligo_nt (Oligospace_T oligo, int oligosize) {
193   char *nt;
194   int i, j;
195   Oligospace_T lowbits;
196 
197   nt = (char *) CALLOC(oligosize+1,sizeof(char));
198   j = oligosize-1;
199   for (i = 0; i < oligosize; i++) {
200     lowbits = oligo & LOW_TWO_BITS;
201     switch (lowbits) {
202     case RIGHT_A: nt[j] = 'A'; break;
203     case RIGHT_C: nt[j] = 'C'; break;
204     case RIGHT_G: nt[j] = 'G'; break;
205     case RIGHT_T: nt[j] = 'T'; break;
206     }
207     oligo >>= 2;
208     j--;
209   }
210 
211   return nt;
212 }
213 #endif
214 
215 
216 #if 0
217 /* Old method */
218 static Positionsptr_T *
219 compute_offsets_ct_using_array (Positionsptr_T *oldoffsets, Oligospace_T oligospace, Oligospace_T mask) {
220   Positionsptr_T *offsets;
221   Oligospace_T oligoi, reduced;
222 #ifdef DEBUG
223   char *nt1, *nt2;
224 #endif
225 
226   /* Fill with sizes */
227   offsets = (Positionsptr_T *) CALLOC(oligospace+1,sizeof(Positionsptr_T));
228 
229   for (oligoi = 0; oligoi < oligospace; oligoi++) {
230 #if 0
231     if (reduce_oligo(oligoi) != reduce_oligo_old(oligoi,index1part)) {
232       abort();
233     }
234 #endif
235     reduced = Cmet_reduce_ct(oligoi) & mask;
236     debug(
237 	  nt1 = shortoligo_nt(oligoi,index1part);
238 	  nt2 = shortoligo_nt(reduced,index1part);
239 	  printf("For oligo %s, updating sizes for %s from %u",nt1,nt2,offsets[reduced+1]);
240 	  );
241 #ifdef WORDS_BIGENDIAN
242     /*size*/offsets[reduced+1] += (Bigendian_convert_uint(oldoffsets[oligoi+1]) - Bigendian_convert_uint(oldoffsets[oligoi]));
243 #else
244     /*size*/offsets[reduced+1] += (oldoffsets[oligoi+1] - oldoffsets[oligoi]);
245 #endif
246     debug(
247 	  printf(" to %u\n",offsets[reduced+1]);
248 	  FREE(nt2);
249 	  FREE(nt1);
250 	  );
251   }
252 
253   offsets[0] = 0U;
254   for (oligoi = 1; oligoi <= oligospace; oligoi++) {
255     offsets[oligoi] = offsets[oligoi-1] + /*size*/offsets[oligoi];
256     debug(if (offsets[oligoi] != offsets[oligoi-1]) {
257 	    printf("Offset for %06X: %u\n",oligoi,offsets[oligoi]);
258 	  });
259   }
260 
261   return offsets;
262 }
263 #endif
264 
265 
266 #if 0
267 /* Old method */
268 static Positionsptr_T *
269 compute_offsets_ga_using_array (Positionsptr_T *oldoffsets, Oligospace_T oligospace, Oligospace_T mask) {
270   Positionsptr_T *offsets;
271   Oligospace_T oligoi, reduced;
272 #ifdef DEBUG
273   char *nt1, *nt2;
274 #endif
275 
276   /* Fill with sizes */
277   offsets = (Positionsptr_T *) CALLOC(oligospace+1,sizeof(Positionsptr_T));
278 
279   for (oligoi = 0; oligoi < oligospace; oligoi++) {
280     reduced = Cmet_reduce_ga(oligoi) & mask;
281     debug(
282 	  nt1 = shortoligo_nt(oligoi,index1part);
283 	  nt2 = shortoligo_nt(reduced,index1part);
284 	  printf("For oligo %s, updating sizes for %s from %u",nt1,nt2,offsets[reduced+1]);
285 	  );
286 #ifdef WORDS_BIGENDIAN
287     /*size*/offsets[reduced+1] += (Bigendian_convert_uint(oldoffsets[oligoi+1]) - Bigendian_convert_uint(oldoffsets[oligoi]));
288 #else
289     /*size*/offsets[reduced+1] += (oldoffsets[oligoi+1] - oldoffsets[oligoi]);
290 #endif
291     debug(
292 	  printf(" to %u\n",offsets[reduced+1]);
293 	  FREE(nt2);
294 	  FREE(nt1);
295 	  );
296   }
297 
298   offsets[0] = 0U;
299   for (oligoi = 1; oligoi <= oligospace; oligoi++) {
300     offsets[oligoi] = offsets[oligoi-1] + /*size*/offsets[oligoi];
301     debug(if (offsets[oligoi] != offsets[oligoi-1]) {
302 	    printf("Offset for %06X: %u\n",oligoi,offsets[oligoi]);
303 	  });
304   }
305 
306   return offsets;
307 }
308 #endif
309 
310 
311 static void
index_offsets_ct_using_bitpack(char * new_pointers_filename,char * new_offsets_filename,UINT4 * oldoffsetsmeta,UINT4 * oldoffsetsstrm,Oligospace_T oligospace,Oligospace_T mask)312 index_offsets_ct_using_bitpack (char *new_pointers_filename, char *new_offsets_filename,
313 				UINT4 *oldoffsetsmeta, UINT4 *oldoffsetsstrm,
314 				Oligospace_T oligospace, Oligospace_T mask) {
315   Oligospace_T bmerspace;
316   char *packsizes;
317   UINT4 **bitpacks;
318   UINT4 increment;
319   int new_packsize;
320   UINT4 offsets[BLOCKSIZE+1];
321   Oligospace_T oligoi, reduced, bmer;
322   int ii;
323 #ifdef DEBUG
324   char *nt1, *nt2;
325 #endif
326 
327   /* Fill with sizes */
328   bmerspace = oligospace/BLOCKSIZE;
329   packsizes = (char *) CALLOC(bmerspace,sizeof(char));
330   bitpacks = (UINT4 **) CALLOC(bmerspace,sizeof(UINT4 *));
331 
332   for (oligoi = 0; oligoi < oligospace; oligoi += BLOCKSIZE) {
333     Bitpack64_block_offsets(offsets,oligoi,oldoffsetsmeta,oldoffsetsstrm);
334     for (ii = 0; ii < BLOCKSIZE; ii++) {
335       if ((increment = offsets[ii+1] - offsets[ii]) > 0) {
336 	reduced = Cmet_reduce_ct(oligoi + ii) & mask;
337 	bmer = reduced/BLOCKSIZE;
338 	if ((new_packsize = Bitpack64_access_new_packsize(reduced,/*old_packsize*/(int) packsizes[bmer],
339 							  bitpacks[bmer],increment)) != (int) packsizes[bmer]) {
340 	  bitpacks[bmer] = Bitpack64_realloc_multiple((int) packsizes[bmer],new_packsize,bitpacks[bmer]);
341 	  packsizes[bmer] = (char) new_packsize;
342 	}
343 	Bitpack64_add_bitpack(reduced,(int) packsizes[bmer],bitpacks[bmer],increment);
344       }
345     }
346   }
347 
348   Bitpack64_write_differential_bitpacks(new_pointers_filename,new_offsets_filename,packsizes,bitpacks,oligospace);
349 
350   for (bmer = 0; bmer < bmerspace; bmer++) {
351     FREE(bitpacks[bmer]);
352   }
353   FREE(bitpacks);
354   FREE(packsizes);
355 
356   return;
357 }
358 
359 
360 static void
index_offsets_ga_using_bitpack(char * new_pointers_filename,char * new_offsets_filename,UINT4 * oldoffsetsmeta,UINT4 * oldoffsetsstrm,Oligospace_T oligospace,Oligospace_T mask)361 index_offsets_ga_using_bitpack (char *new_pointers_filename, char *new_offsets_filename,
362 				UINT4 *oldoffsetsmeta, UINT4 *oldoffsetsstrm,
363 				Oligospace_T oligospace, Oligospace_T mask) {
364   Oligospace_T bmerspace;
365   char *packsizes;
366   UINT4 **bitpacks;
367   UINT4 increment;
368   int new_packsize;
369   UINT4 offsets[BLOCKSIZE+1];
370   Oligospace_T oligoi, reduced, bmer;
371   int ii;
372 #ifdef DEBUG
373   char *nt1, *nt2;
374 #endif
375 
376   /* Fill with sizes */
377   bmerspace = oligospace/BLOCKSIZE;
378   packsizes = (char *) CALLOC(bmerspace,sizeof(char));
379   bitpacks = (UINT4 **) CALLOC(bmerspace,sizeof(UINT4 *));
380 
381   for (oligoi = 0; oligoi < oligospace; oligoi += BLOCKSIZE) {
382     Bitpack64_block_offsets(offsets,oligoi,oldoffsetsmeta,oldoffsetsstrm);
383     for (ii = 0; ii < BLOCKSIZE; ii++) {
384       if ((increment = offsets[ii+1] - offsets[ii]) > 0) {
385 	reduced = Cmet_reduce_ga(oligoi + ii) & mask;
386 	bmer = reduced/BLOCKSIZE;
387 	if ((new_packsize = Bitpack64_access_new_packsize(reduced,/*old_packsize*/(int) packsizes[bmer],
388 							  bitpacks[bmer],increment)) != (int) packsizes[bmer]) {
389 	  bitpacks[bmer] = Bitpack64_realloc_multiple((int) packsizes[bmer],new_packsize,bitpacks[bmer]);
390 	  packsizes[bmer] = (char) new_packsize;
391 	}
392 	Bitpack64_add_bitpack(reduced,(int) packsizes[bmer],bitpacks[bmer],increment);
393       }
394     }
395   }
396 
397   Bitpack64_write_differential_bitpacks(new_pointers_filename,new_offsets_filename,packsizes,bitpacks,oligospace);
398 
399   for (bmer = 0; bmer < bmerspace; bmer++) {
400     FREE(bitpacks[bmer]);
401   }
402   FREE(bitpacks);
403   FREE(packsizes);
404 
405   return;
406 }
407 
408 
409 #if 0
410 /* Needs work */
411 static void
412 sort_positions_local_snps (FILE *positions_fp, UINT2 *positions2,
413 			   UINT2 *newoffsetsmeta, UINT2 *newoffsetsstrm, Localspace_T localspace) {
414   Localspace_T oligoi;
415   UINT2 block_start, block_end, j, npositions;
416   int ii;
417 
418   UINT4 newoffsets[BLOCKSIZE+1];
419 
420 #if 0
421   /* For snps_root */
422   FILE *ptrs_fp, *comp_fp;
423   UINT4 *ptrs;
424   UINT4 ptri;
425   int nregisters;
426   UINT4 ascending[BLOCKSIZE+1];
427   UINT2 diffs[BLOCKSIZE];
428   UINT4 totalcount;
429   int packsize;
430 
431   /* Buffer is used to avoid frequent writes to the file */
432   UINT4 *buffer;
433   int buffer_size = BUFFER_SIZE;
434   int buffer_i;
435 #endif
436 
437 
438   /* Sort positions in each block.  For snps_root, use code from Epu16_bitpack64_append_differential */
439   if (snps_root) {
440     fprintf(stderr,"Not supported\n");
441     abort();
442 #if 0
443     ptrs = (UINT4 *) CALLOC(((oligospace + BLOCKSIZE)/BLOCKSIZE + 1) * DIFFERENTIAL_METAINFO_SIZE,sizeof(UINT4));
444     ptri = 0;
445 
446     if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
447       fprintf(stderr,"Can't write to file %s\n",compfile);
448       exit(9);
449     }
450     buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
451     buffer_i = 0;
452     nregisters = 0;
453 #endif
454   }
455 
456   /* totalcount = 0; -- For snps_root */
457   for (oligoi = 0; oligoi + BLOCKSIZE <= oligospace; oligoi += BLOCKSIZE) {
458     if (snps_root) {
459 #if 0
460       ptrs[ptri++] = nregisters;
461       ptrs[ptri++] = totalcount;
462 #endif
463     }
464     /* ascending[0] = totalcount; -- For snps_root */
465     Epu16_bitpack64_block_offsets(newoffsets,oligoi,newoffsetsmeta,newoffsetsstrm);
466     for (ii = 0; ii < BLOCKSIZE; ii++) {
467       block_start = newoffsets[ii];
468       block_end = newoffsets[ii+1];
469       if ((npositions = block_end - block_start) > 0) {
470 	qsort(&(positions2[block_start]),npositions,sizeof(UINT2),UINT2_compare);
471 	if (snps_root == NULL) {
472 	  FWRITE_UINTS(&(positions2[block_start]),npositions,positions_fp);
473 	} else {
474 #if 0
475 	  FWRITE_UINT(positions4[block_start],positions_fp);
476 	  for (j = block_start+1; j < block_end; j++) {
477 	    if (positions4[j] == positions4[j-1]) {
478 	      npositions--;
479 	    } else {
480 	      FWRITE_UINT(positions4[j],positions_fp);
481 	    }
482 	  }
483 #endif
484 	}
485 	/* totalcount += npositions; -- For snps_root */
486       }
487       /* ascending[ii+1] = totalcount; -- For snps_root */
488     }
489     if (snps_root) {
490 #if 0
491       packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending);
492       buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
493       nregisters += packsize / 2;
494 #endif
495     }
496   }
497 
498   if (snps_root) {
499 #if 0
500     /* Write the final pointer, which will point after the end of the file */
501     ptrs[ptri++] = nregisters;	/* In 128-bit registers */
502 
503     /* Value for end of block */
504     ptrs[ptri++] = totalcount;
505 
506     if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
507       fprintf(stderr,"Can't write to file %s\n",ptrsfile);
508       exit(9);
509     } else {
510       FWRITE_UINTS(ptrs,ptri,ptrs_fp);
511       FREE(ptrs);
512       fclose(ptrs_fp);
513     }
514 
515     /* Empty buffer */
516     if (buffer_i > 0) {
517       FWRITE_UINTS(buffer,buffer_i,comp_fp);
518       buffer_i = 0;
519     }
520     FREE(buffer);
521     fclose(comp_fp);
522 #endif
523   }
524 
525   return;
526 }
527 #endif
528 
529 
530 #if 0
531 /* Does not support SNPs.  Written in-line into compute_ct_local and compute_ag_local */
532 static void
533 sort_positions_local (FILE *positions_fp, UINT2 *positions2,
534 		      UINT2 *newoffsetsmeta, UINT2 *newoffsetsstrm, Localspace_T localspace) {
535   Localspace_T oligoi;
536   UINT2 block_start, block_end, j, npositions;
537   UINT4 newoffsets[BLOCKSIZE+1];
538   int ii;
539 
540   /* Sort positions in each block */
541   for (oligoi = 0; oligoi + BLOCKSIZE <= localspace; oligoi += BLOCKSIZE) {
542     Epu16_bitpack64_block_offsets(newoffsets,oligoi,newoffsetsmeta,newoffsetsstrm);
543     for (ii = 0; ii < BLOCKSIZE; ii++) {
544       block_start = newoffsets[ii];
545       block_end = newoffsets[ii+1];
546       if ((npositions = block_end - block_start) > 0) {
547 	qsort(&(positions2[block_start]),npositions,sizeof(UINT2),UINT2_compare);
548 	FWRITE_UINTS(&(positions2[block_start]),npositions,positions_fp);
549       }
550     }
551   }
552 
553   return;
554 }
555 #endif
556 
557 
558 static void
sort_8mers(unsigned char * positions8_high,UINT4 * positions8_low,Positionsptr_T npositions)559 sort_8mers (unsigned char *positions8_high, UINT4 *positions8_low, Positionsptr_T npositions) {
560   UINT8 *positions8;
561   Positionsptr_T i;
562 
563   positions8 = (UINT8 *) MALLOC(npositions*sizeof(UINT8));
564   for (i = 0; i < npositions; i++) {
565     positions8[i] = ((UINT8) positions8_high[i] << 32) + positions8_low[i];
566   }
567   qsort(positions8,npositions,sizeof(UINT8),UINT8_compare);
568   for (i = 0; i < npositions; i++) {
569     positions8_high[i] = positions8[i] >> POSITIONS8_HIGH_SHIFT;
570     positions8_low[i] = positions8[i] & POSITIONS8_LOW_MASK;
571   }
572   return;
573 }
574 
575 
576 static void
sort_positions(char * ptrsfile,char * compfile,FILE * positions_high_fp,FILE * positions_fp,unsigned char * positions8_high,UINT4 * positions8_low,UINT4 * positions4,UINT4 * newoffsetsmeta,UINT4 * newoffsetsstrm,Oligospace_T oligospace,bool coord_values_8p)577 sort_positions (char *ptrsfile, char *compfile, FILE *positions_high_fp, FILE *positions_fp,
578 		unsigned char *positions8_high, UINT4 *positions8_low, UINT4 *positions4,
579 		UINT4 *newoffsetsmeta, UINT4 *newoffsetsstrm, Oligospace_T oligospace, bool coord_values_8p) {
580   Oligospace_T oligoi;
581   Positionsptr_T block_start, block_end, j, npositions;
582 
583   FILE *ptrs_fp, *comp_fp;
584   UINT4 *ptrs, nregisters;
585   UINT4 ptri;
586   int ii;
587 
588   /* Buffer is used to avoid frequent writes to the file */
589   UINT4 *buffer;
590   int buffer_size = BUFFER_SIZE;
591   int buffer_i;
592 
593   UINT4 newoffsets[BLOCKSIZE+1];
594   UINT4 diffs[BLOCKSIZE], ascending[BLOCKSIZE+1];
595   UINT4 totalcount;
596   int packsize;
597 
598 
599   /* Sort positions in each block.  For snps_root, use code from Bitpack64_write_differential */
600   if (snps_root) {
601     ptrs = (UINT4 *) CALLOC(((oligospace + BLOCKSIZE)/BLOCKSIZE + 1) * DIFFERENTIAL_METAINFO_SIZE,sizeof(UINT4));
602     ptri = 0;
603 
604     if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
605       fprintf(stderr,"Can't write to file %s\n",compfile);
606       exit(9);
607     }
608     buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
609     buffer_i = 0;
610     nregisters = 0;
611   }
612 
613   totalcount = 0;
614   for (oligoi = 0; oligoi + BLOCKSIZE <= oligospace; oligoi += BLOCKSIZE) {
615     if (snps_root) {
616       ptrs[ptri++] = nregisters;
617       ptrs[ptri++] = totalcount;
618     }
619     ascending[0] = totalcount;
620     Bitpack64_block_offsets(newoffsets,oligoi,newoffsetsmeta,newoffsetsstrm);
621     for (ii = 0; ii < BLOCKSIZE; ii++) {
622       if ((oligoi + ii) % MONITOR_INTERVAL == 0) {
623 	fprintf(stderr,".");
624       }
625       block_start = newoffsets[ii];
626       block_end = newoffsets[ii+1];
627       if ((npositions = block_end - block_start) > 0) {
628 	if (coord_values_8p == true) {
629 	  sort_8mers(&(positions8_high[block_start]),&(positions8_low[block_start]),npositions);
630 	  if (snps_root == NULL) {
631 	    /* FWRITE_UINT8S(&(positions8[block_start]),npositions,positions_fp); */
632 	    FWRITE_CHARS(&(positions8_high[block_start]),npositions,positions_high_fp);
633 	    FWRITE_UINTS(&(positions8_low[block_start]),npositions,positions_fp);
634 	  } else {
635 	    /* FWRITE_UINT8(positions8[block_start],positions_fp); */
636 	    FWRITE_CHAR(positions8_high[block_start],positions_high_fp);
637 	    FWRITE_UINT(positions8_low[block_start],positions_fp);
638 	    for (j = block_start+1; j < block_end; j++) {
639 	      if (positions8_high[j] == positions8_high[j-1] && positions8_low[j] == positions8_low[j-1]) {
640 		npositions--;
641 	      } else {
642 		/* FWRITE_UINT8(positions8[j],positions_fp); */
643 		FWRITE_CHAR(positions8_high[j],positions_high_fp);
644 		FWRITE_UINT(positions8_low[j],positions_fp);
645 	      }
646 	    }
647 	  }
648 
649 	} else {
650 	  qsort(&(positions4[block_start]),npositions,sizeof(UINT4),UINT4_compare);
651 	  if (snps_root == NULL) {
652 	    FWRITE_UINTS(&(positions4[block_start]),npositions,positions_fp);
653 	  } else {
654 	    FWRITE_UINT(positions4[block_start],positions_fp);
655 	    for (j = block_start+1; j < block_end; j++) {
656 	      if (positions4[j] == positions4[j-1]) {
657 		npositions--;
658 	      } else {
659 		FWRITE_UINT(positions4[j],positions_fp);
660 	      }
661 	    }
662 	  }
663 	}
664 	totalcount += npositions;
665       }
666       ascending[ii+1] = totalcount;
667     }
668     if (snps_root) {
669       packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending);
670       buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
671       nregisters += packsize / 2;
672     }
673   }
674 
675   if (oligoi <= oligospace) {
676     /* Finish last block (containing 1 entry, but expand to 64) */
677     if (snps_root) {
678       ptrs[ptri++] = nregisters;
679       ptrs[ptri++] = totalcount;
680     }
681     ascending[0] = totalcount;
682     for (ii = 0; ii <= (int) (oligospace - oligoi); ii++) {
683       block_start = Bitpack64_read_two(&block_end,oligoi+ii,newoffsetsmeta,newoffsetsstrm);
684       if ((npositions = block_end - block_start) > 0) {
685 	if (coord_values_8p == true) {
686 	  sort_8mers(&(positions8_high[block_start]),&(positions8_low[block_start]),npositions);
687 	  if (snps_root == NULL) {
688 	    /* FWRITE_UINT8S(&(positions8[block_start]),npositions,positions_fp); */
689 	    FWRITE_CHARS(&(positions8_high[block_start]),npositions,positions_high_fp);
690 	    FWRITE_UINTS(&(positions8_low[block_start]),npositions,positions_fp);
691 	  } else {
692 	    /* FWRITE_UINT8(positions8[block_start],positions_fp); */
693 	    FWRITE_CHAR(positions8_high[block_start],positions_high_fp);
694 	    FWRITE_UINT(positions8_low[block_start],positions_fp);
695 	    for (j = block_start+1; j < block_end; j++) {
696 	      if (positions8_high[j] == positions8_high[j-1] && positions8_low[j] == positions8_low[j-1]) {
697 		npositions--;
698 	      } else {
699 		/* FWRITE_UINT8(positions8[j],positions_fp); */
700 		FWRITE_CHAR(positions8_high[j],positions_high_fp);
701 		FWRITE_UINT(positions8_low[j],positions_fp);
702 	      }
703 	    }
704 	  }
705 
706 	} else {
707 	  qsort(&(positions4[block_start]),npositions,sizeof(UINT4),UINT4_compare);
708 	  if (snps_root == NULL) {
709 	    FWRITE_UINTS(&(positions4[block_start]),npositions,positions_fp);
710 	  } else {
711 	    FWRITE_UINT(positions4[block_start],positions_fp);
712 	    for (j = block_start+1; j < block_end; j++) {
713 	      if (positions4[j] == positions4[j-1]) {
714 		npositions--;
715 	      } else {
716 		FWRITE_UINT(positions4[j],positions_fp);
717 	      }
718 	    }
719 	  }
720 	}
721 	totalcount += npositions;
722       }
723       ascending[ii+1] = totalcount;
724     }
725     if (snps_root) {
726       for ( ; ii < BLOCKSIZE; ii++) {
727 	ascending[ii+1] = totalcount;
728       }
729       packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending);
730       buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
731       nregisters += packsize / 2;
732     }
733   }
734 
735   if (snps_root) {
736     /* Write the final pointer, which will point after the end of the file */
737     ptrs[ptri++] = nregisters;	/* In 128-bit registers */
738 
739     /* Value for end of block */
740     ptrs[ptri++] = totalcount;
741 
742     if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
743       fprintf(stderr,"Can't write to file %s\n",ptrsfile);
744       exit(9);
745     } else {
746       FWRITE_UINTS(ptrs,ptri,ptrs_fp);
747       FREE(ptrs);
748       fclose(ptrs_fp);
749     }
750 
751     /* Empty buffer */
752     if (buffer_i > 0) {
753       FWRITE_UINTS(buffer,buffer_i,comp_fp);
754       buffer_i = 0;
755     }
756     FREE(buffer);
757     fclose(comp_fp);
758   }
759 
760   return;
761 }
762 
763 
764 static void
compute_ct(char * new_pointers_filename,char * new_offsets_filename,char * new_positions_high_filename,char * new_positions_filename,Indexdb_T indexdb,Oligospace_T oligospace,Oligospace_T mask,bool coord_values_8p)765 compute_ct (char *new_pointers_filename, char *new_offsets_filename,
766 	    char *new_positions_high_filename, char *new_positions_filename,
767 	    Indexdb_T indexdb, Oligospace_T oligospace, Oligospace_T mask,
768 	    bool coord_values_8p) {
769   char *ptrsfile, *compfile;
770   FILE *positions_high_fp, *positions_fp;
771 
772   unsigned char *positions8_high;
773   UINT4 *positions8_low;
774   UINT4 *positions4;
775 
776   Oligospace_T oligoi, reduced;
777   UINT4 oldoffsets[BLOCKSIZE+1];
778 
779   UINT4 *newoffsetsmeta, *newoffsetsstrm, *countermeta, *counterstrm;
780   Positionsptr_T preunique_totalcounts, block_start, block_end, j, offset;
781   size_t offsetsmeta_len, offsetsstrm_len;
782   int ii;
783 #ifdef HAVE_MMAP
784   int offsetsmeta_fd, offsetsstrm_fd;
785 #else
786   Access_T offsetsmeta_access, offsetsstrm_access;
787 #endif
788   double seconds;
789 #ifdef DEBUG
790   char *nt1, *nt2;
791 #endif
792 
793 
794   /* offsets = compute_offsets_ct_using_array(oldoffsets,oligospace,mask); */
795   index_offsets_ct_using_bitpack(new_pointers_filename,new_offsets_filename,
796 				 indexdb->offsetsmeta,indexdb->offsetsstrm,
797 				 oligospace,mask);
798 
799   preunique_totalcounts = Bitpack64_read_one(oligospace,indexdb->offsetsmeta,indexdb->offsetsstrm);
800   if (preunique_totalcounts == 0) {
801     fprintf(stderr,"Something is wrong with the offsets.  Total counts is zero.\n");
802     exit(9);
803 
804   } else if (coord_values_8p == true) {
805     fprintf(stderr,"Trying to allocate %u*(%d+%d) bytes of memory...",
806 	    preunique_totalcounts,(int) sizeof(unsigned char),(int) sizeof(UINT4));
807     positions4 = (UINT4 *) NULL;
808     positions8_high = (unsigned char *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(unsigned char));
809     positions8_low = (UINT4 *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(UINT4));
810     if (positions8_high == NULL || positions8_low == NULL) {
811       fprintf(stderr,"failed.  Need a computer with sufficient memory.\n");
812       exit(9);
813     } else {
814       fprintf(stderr,"done\n");
815     }
816 
817   } else {
818     fprintf(stderr,"Trying to allocate %u*%d bytes of memory...",preunique_totalcounts,(int) sizeof(UINT4));
819     positions8_high = (unsigned char *) NULL;
820     positions8_low = (UINT4 *) NULL;
821     positions4 = (UINT4 *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(UINT4));
822     if (positions4 == NULL) {
823       fprintf(stderr,"failed.  Need a computer with sufficient memory.\n");
824       exit(9);
825     } else {
826       fprintf(stderr,"done\n");
827     }
828   }
829 
830   /* Point to offsets and revise */
831   fprintf(stderr,"Rearranging CT offsets for indexdb...");
832 #ifdef HAVE_MMAP
833   newoffsetsmeta = (UINT4 *) Access_mmap(&offsetsmeta_fd,&offsetsmeta_len,&seconds,new_pointers_filename,/*randomp*/false);
834   newoffsetsstrm = (UINT4 *) Access_mmap(&offsetsstrm_fd,&offsetsstrm_len,&seconds,new_offsets_filename,/*randomp*/false);
835 #else
836   newoffsetsmeta = (UINT4 *) Access_allocate_private(&offsetsmeta_access,&offsetsmeta_len,&seconds,new_pointers_filename,sizeof(UINT4));
837   newoffsetsstrm = (UINT4 *) Access_allocate_private(&offsetsstrm_access,&offsetsstrm_len,&seconds,new_offsets_filename,sizeof(UINT4));
838 #endif
839   countermeta = Indexdb_bitpack_counter(&counterstrm,newoffsetsmeta,index1part);
840 
841   for (oligoi = 0; oligoi < oligospace; oligoi += BLOCKSIZE) {
842     Bitpack64_block_offsets(oldoffsets,oligoi,indexdb->offsetsmeta,indexdb->offsetsstrm);
843     for (ii = 0; ii < BLOCKSIZE; ii++) {
844       if ((oligoi + ii) % MONITOR_INTERVAL == 0) {
845 	fprintf(stderr,".");
846       }
847       block_start = oldoffsets[ii];
848       block_end = oldoffsets[ii+1];
849 
850       if (block_end > block_start) {
851 	reduced = Cmet_reduce_ct(oligoi + ii) & mask;
852 	offset = Bitpack64_read_one(reduced,newoffsetsmeta,newoffsetsstrm) + Bitpack64_access(reduced,countermeta,counterstrm);
853 	if (coord_values_8p == true) {
854 	  for (j = block_start; j < block_end; j++) {
855 	    positions8_high[offset] = indexdb->positions_high[j];
856 	    positions8_low[offset] = indexdb->positions[j];
857 	    offset++;
858 	  }
859 	} else {
860 	  for (j = block_start; j < block_end; j++) {
861 	    positions4[offset] = indexdb->positions[j];
862 	    offset++;
863 	  }
864 	}
865 	Bitpack64_add(reduced,countermeta,counterstrm,block_end - block_start);
866       }
867     }
868   }
869 
870   FREE(counterstrm);
871   FREE(countermeta);
872   fprintf(stderr,"done\n");
873 
874 
875   fprintf(stderr,"Sorting CT positions for indexdb...");
876   if (snps_root == NULL) {
877     ptrsfile = compfile = (char *) NULL;
878   } else {
879     ptrsfile = (char *) MALLOC((strlen(new_pointers_filename) + strlen(".temp") + 1) * sizeof(char));
880     sprintf(ptrsfile,"%s.temp",new_pointers_filename);
881     compfile = (char *) MALLOC((strlen(new_offsets_filename) + strlen(".temp") + 1) * sizeof(char));
882     sprintf(compfile,"%s.temp",new_offsets_filename);
883   }
884 
885   if (new_positions_high_filename != NULL) {
886     if ((positions_high_fp = FOPEN_WRITE_BINARY(new_positions_high_filename)) == NULL) {
887       fprintf(stderr,"Can't write to file %s\n",new_positions_high_filename);
888       exit(9);
889     }
890   }
891 
892   if ((positions_fp = FOPEN_WRITE_BINARY(new_positions_filename)) == NULL) {
893     fprintf(stderr,"Can't write to file %s\n",new_positions_filename);
894     exit(9);
895   }
896 
897   sort_positions(ptrsfile,compfile,positions_high_fp,positions_fp,
898 		 positions8_high,positions8_low,positions4,
899 		 newoffsetsmeta,newoffsetsstrm,oligospace,coord_values_8p);
900 
901   /* Clean up */
902 #ifdef HAVE_MMAP
903   munmap((void *) newoffsetsmeta,offsetsmeta_len);
904   munmap((void *) newoffsetsstrm,offsetsstrm_len);
905   close(offsetsmeta_fd);
906   close(offsetsstrm_fd);
907 #else
908   FREE(newoffsetsmeta);
909   FREE(newoffsetsstrm);
910 #endif
911 
912   if (coord_values_8p == true) {
913     FREE(positions8_high);
914     FREE(positions8_low);
915     fclose(positions_high_fp);
916     fclose(positions_fp);
917   } else {
918     FREE(positions4);
919     fclose(positions_fp);
920   }
921 
922   if (snps_root) {
923     rename(ptrsfile,new_pointers_filename);
924     rename(compfile,new_offsets_filename);
925     FREE(ptrsfile);
926     FREE(compfile);
927   }
928 
929   fprintf(stderr,"done\n");
930 
931   return;
932 }
933 
934 
935 static void
compute_ga(char * new_pointers_filename,char * new_offsets_filename,char * new_positions_high_filename,char * new_positions_filename,Indexdb_T indexdb,Oligospace_T oligospace,Oligospace_T mask,bool coord_values_8p)936 compute_ga (char *new_pointers_filename, char *new_offsets_filename,
937 	    char *new_positions_high_filename, char *new_positions_filename,
938 	    Indexdb_T indexdb, Oligospace_T oligospace, Oligospace_T mask,
939 	    bool coord_values_8p) {
940   char *ptrsfile, *compfile;
941   FILE *positions_high_fp, *positions_fp;
942 
943   unsigned char *positions8_high;
944   UINT4 *positions8_low;
945   UINT4 *positions4;
946 
947   Oligospace_T oligoi, reduced;
948   UINT4 oldoffsets[BLOCKSIZE+1];
949 
950   UINT4 *newoffsetsmeta, *newoffsetsstrm, *countermeta, *counterstrm;
951   Positionsptr_T preunique_totalcounts, block_start, block_end, j, offset;
952   size_t offsetsmeta_len, offsetsstrm_len;
953   int ii;
954 #ifdef HAVE_MMAP
955   int offsetsmeta_fd, offsetsstrm_fd;
956 #else
957   Access_T offsetsmeta_access, offsetsstrm_access;
958 #endif
959   double seconds;
960 #ifdef DEBUG
961   char *nt1, *nt2;
962 #endif
963 
964 
965   /* offsets = compute_offsets_ga_using_array(oldoffsets,oligospace,mask); */
966   index_offsets_ga_using_bitpack(new_pointers_filename,new_offsets_filename,
967 				 indexdb->offsetsmeta,indexdb->offsetsstrm,
968 				 oligospace,mask);
969 
970   preunique_totalcounts = Bitpack64_read_one(oligospace,indexdb->offsetsmeta,indexdb->offsetsstrm);
971   if (preunique_totalcounts == 0) {
972     fprintf(stderr,"Something is wrong with the offsets.  Total counts is zero.\n");
973     exit(9);
974 
975   } else if (coord_values_8p == true) {
976     fprintf(stderr,"Trying to allocate %u*(%d+%d) bytes of memory...",
977 	    preunique_totalcounts,(int) sizeof(unsigned char),(int) sizeof(UINT4));
978     positions4 = (UINT4 *) NULL;
979     positions8_high = (unsigned char *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(unsigned char));
980     positions8_low = (UINT4 *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(UINT4));
981     if (positions8_high == NULL || positions8_low == NULL) {
982       fprintf(stderr,"failed.  Need a computer with sufficient memory.\n");
983       exit(9);
984     } else {
985       fprintf(stderr,"done\n");
986     }
987 
988   } else {
989     fprintf(stderr,"Trying to allocate %u*%d bytes of memory...",preunique_totalcounts,(int) sizeof(UINT4));
990     positions8_high = (unsigned char *) NULL;
991     positions8_low = (UINT4 *) NULL;
992     positions4 = (UINT4 *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(UINT4));
993     if (positions4 == NULL) {
994       fprintf(stderr,"failed.  Need a computer with sufficient memory.\n");
995       exit(9);
996     } else {
997       fprintf(stderr,"done\n");
998     }
999   }
1000 
1001   /* Point to offsets and revise */
1002   fprintf(stderr,"Rearranging GA offsets for indexdb...");
1003 #ifdef HAVE_MMAP
1004   newoffsetsmeta = (UINT4 *) Access_mmap(&offsetsmeta_fd,&offsetsmeta_len,&seconds,new_pointers_filename,/*randomp*/false);
1005   newoffsetsstrm = (UINT4 *) Access_mmap(&offsetsstrm_fd,&offsetsstrm_len,&seconds,new_offsets_filename,/*randomp*/false);
1006 #else
1007   newoffsetsmeta = (UINT4 *) Access_allocate_private(&offsetsmeta_access,&offsetsmeta_len,&seconds,new_pointers_filename,sizeof(UINT4));
1008   newoffsetsstrm = (UINT4 *) Access_allocate_private(&offsetsstrm_access,&offsetsstrm_len,&seconds,new_offsets_filename,sizeof(UINT4));
1009 #endif
1010   countermeta = Indexdb_bitpack_counter(&counterstrm,newoffsetsmeta,index1part);
1011 
1012   for (oligoi = 0; oligoi < oligospace; oligoi += BLOCKSIZE) {
1013     Bitpack64_block_offsets(oldoffsets,oligoi,indexdb->offsetsmeta,indexdb->offsetsstrm);
1014     for (ii = 0; ii < BLOCKSIZE; ii++) {
1015       if ((oligoi + ii) % MONITOR_INTERVAL == 0) {
1016 	fprintf(stderr,".");
1017       }
1018       block_start = oldoffsets[ii];
1019       block_end = oldoffsets[ii+1];
1020 
1021       if (block_end > block_start) {
1022 	reduced = Cmet_reduce_ga(oligoi + ii) & mask;
1023 	offset = Bitpack64_read_one(reduced,newoffsetsmeta,newoffsetsstrm) + Bitpack64_access(reduced,countermeta,counterstrm);
1024 	if (coord_values_8p == true) {
1025 	  for (j = block_start; j < block_end; j++) {
1026 	    positions8_high[offset] = indexdb->positions_high[j];
1027 	    positions8_low[offset] = indexdb->positions[j];
1028 	    offset++;
1029 	  }
1030 	} else {
1031 	  for (j = block_start; j < block_end; j++) {
1032 	    positions4[offset] = indexdb->positions[j];
1033 	    offset++;
1034 	  }
1035 	}
1036 	Bitpack64_add(reduced,countermeta,counterstrm,block_end - block_start);
1037       }
1038     }
1039   }
1040 
1041   FREE(counterstrm);
1042   FREE(countermeta);
1043   fprintf(stderr,"done\n");
1044 
1045 
1046   fprintf(stderr,"Sorting GA positions for indexdb...");
1047   if (snps_root == NULL) {
1048     ptrsfile = compfile = (char *) NULL;
1049   } else {
1050     ptrsfile = (char *) MALLOC((strlen(new_pointers_filename) + strlen(".temp") + 1) * sizeof(char));
1051     sprintf(ptrsfile,"%s.temp",new_pointers_filename);
1052     compfile = (char *) MALLOC((strlen(new_offsets_filename) + strlen(".temp") + 1) * sizeof(char));
1053     sprintf(compfile,"%s.temp",new_offsets_filename);
1054   }
1055 
1056   if (new_positions_high_filename != NULL) {
1057     if ((positions_high_fp = FOPEN_WRITE_BINARY(new_positions_high_filename)) == NULL) {
1058       fprintf(stderr,"Can't write to file %s\n",new_positions_high_filename);
1059       exit(9);
1060     }
1061   }
1062 
1063   if ((positions_fp = FOPEN_WRITE_BINARY(new_positions_filename)) == NULL) {
1064     fprintf(stderr,"Can't write to file %s\n",new_positions_filename);
1065     exit(9);
1066   }
1067 
1068   sort_positions(ptrsfile,compfile,positions_high_fp,positions_fp,
1069 		 positions8_high,positions8_low,positions4,
1070 		 newoffsetsmeta,newoffsetsstrm,oligospace,coord_values_8p);
1071 
1072 /* Clean up */
1073 #ifdef HAVE_MMAP
1074   munmap((void *) newoffsetsmeta,offsetsmeta_len);
1075   munmap((void *) newoffsetsstrm,offsetsstrm_len);
1076   close(offsetsmeta_fd);
1077   close(offsetsstrm_fd);
1078 #else
1079   FREE(newoffsetsmeta);
1080   FREE(newoffsetsstrm);
1081 #endif
1082 
1083   if (coord_values_8p == true) {
1084     FREE(positions8_high);
1085     FREE(positions8_low);
1086     fclose(positions_high_fp);
1087     fclose(positions_fp);
1088   } else {
1089     FREE(positions4);
1090     fclose(positions_fp);
1091   }
1092 
1093   if (snps_root) {
1094     rename(ptrsfile,new_pointers_filename);
1095     rename(compfile,new_offsets_filename);
1096     FREE(ptrsfile);
1097     FREE(compfile);
1098   }
1099 
1100   fprintf(stderr,"done\n");
1101 
1102   return;
1103 }
1104 
1105 
1106 /* Usage: cmetindex -d <genome> */
1107 
1108 
1109 /* Note: Depends on having gmapindex sampled on mod 3 bounds */
1110 int
main(int argc,char * argv[])1111 main (int argc, char *argv[]) {
1112   char *sourcedir = NULL, *destdir = NULL, *filename, *fileroot;
1113   Indexdb_filenames_T ifilenames;
1114   char *new_pointers_filename = NULL, *new_offsets_filename = NULL,
1115     *new_positions_high_filename = NULL, *new_positions_filename = NULL;
1116   FILE *sequence_fp;
1117 
1118   Univ_IIT_T chromosome_iit;
1119   /* size_t totalcounts, i; */
1120   Oligospace_T oligospace, index_mask;
1121   bool coord_values_8p;
1122 
1123   Indexdb_T indexdb;
1124 
1125   int opt;
1126   extern int optind;
1127   extern char *optarg;
1128   int long_option_index = 0;
1129   const char *long_name;
1130 
1131   while ((opt = getopt_long(argc,argv,"F:D:d:k:q:v:",
1132 			    long_options,&long_option_index)) != -1) {
1133     switch (opt) {
1134     case 0:
1135       long_name = long_options[long_option_index].name;
1136       if (!strcmp(long_name,"version")) {
1137 	print_program_version();
1138 	exit(0);
1139       } else if (!strcmp(long_name,"help")) {
1140 	print_program_usage();
1141 	exit(0);
1142 
1143       } else {
1144 	/* Shouldn't reach here */
1145 	fprintf(stderr,"Don't recognize option %s.  For usage, run 'cmetindex --help'",long_name);
1146 	exit(9);
1147       }
1148       break;
1149 
1150     case 'F': user_sourcedir = optarg; break;
1151     case 'D': user_destdir = optarg; break;
1152     case 'd': dbroot = optarg; break;
1153     case 'k': required_index1part = atoi(optarg); break;
1154     case 'q': required_index1interval = atoi(optarg); break;
1155     case 'v':
1156       snps_root = optarg;
1157       fprintf(stderr,"Combination of cmetindex and snps is not yet supported in 2018 versions\n");
1158       exit(9);
1159       break;
1160     default: fprintf(stderr,"Do not recognize flag %c\n",opt); exit(9);
1161     }
1162   }
1163   argc -= (optind - 1);
1164   argv += (optind - 1);
1165 
1166   if (dbroot == NULL) {
1167     fprintf(stderr,"Missing name of genome database.  Must specify with -d flag.\n");
1168     fprintf(stderr,"Usage: cmetindex -d <genome>\n");
1169     exit(9);
1170   } else {
1171     sourcedir = Datadir_find_genomesubdir(&fileroot,&dbversion,user_sourcedir,dbroot);
1172     fprintf(stderr,"Reading source files from %s\n",sourcedir);
1173   }
1174 
1175   if (user_destdir == NULL) {
1176     destdir = (char *) MALLOC((strlen(sourcedir)+strlen("/")+strlen(dbroot)+1)*sizeof(char));
1177     sprintf(destdir,"%s/%s",sourcedir,dbroot);
1178   } else {
1179     destdir = (char *) MALLOC((strlen(user_destdir)+strlen("/")+strlen(dbroot)+1)*sizeof(char));
1180     sprintf(destdir,"%s/%s",user_destdir,dbroot);
1181   }
1182   fprintf(stderr,"Writing cmet index files to %s\n",destdir);
1183 
1184 
1185   /* Chromosome IIT file.  Need to determine coord_values_8p and genomelength */
1186   filename = (char *) CALLOC(strlen(sourcedir)+strlen("/")+
1187 			     strlen(fileroot)+strlen(".chromosome.iit")+1,sizeof(char));
1188   sprintf(filename,"%s/%s.chromosome.iit",sourcedir,fileroot);
1189   if ((chromosome_iit = Univ_IIT_read(filename,/*readonlyp*/true,/*add_iit_p*/false)) == NULL) {
1190     fprintf(stderr,"IIT file %s is not valid\n",filename);
1191     exit(9);
1192   } else {
1193     coord_values_8p = Univ_IIT_coord_values_8p(chromosome_iit);
1194   }
1195   FREE(filename);
1196 
1197 
1198   ifilenames = Indexdb_get_filenames(&compression_type,&index1part,&index1interval,
1199 				     sourcedir,fileroot,IDX_FILESUFFIX,snps_root,
1200 				     required_index1part,required_index1interval,
1201 				     /*offsets_only_p*/false);
1202   if ((indexdb = Indexdb_new_genome(&index1part,&index1interval,
1203 				    sourcedir,fileroot,IDX_FILESUFFIX,/*snps_root*/NULL,
1204 				    required_index1part,required_index1interval,
1205 				    /*offsetsstrm_access*/USE_ALLOCATE,
1206 				    /*positions_access*/USE_ALLOCATE,/*sharedp*/false,
1207 				    /*multiple_sequences_p*/false,/*preload_shared_memory_p*/false,/*unload_shared_memory_p*/false)) != NULL) {
1208 
1209 #ifdef HAVE_64_BIT
1210     index_mask = ~(~0ULL << 2*index1part);
1211 #else
1212     index_mask = ~(~0U << 2*index1part);
1213 #endif
1214     oligospace = power(4,index1part);
1215 
1216 
1217     /* Compute and write CT files */
1218     new_pointers_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1219 					    strlen(".")+strlen("metct")+strlen(ifilenames->pointers_index1info_ptr)+1,sizeof(char));
1220     sprintf(new_pointers_filename,"%s/%s.%s%s",destdir,fileroot,"metct",ifilenames->pointers_index1info_ptr);
1221 
1222     new_offsets_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1223 					   strlen(".")+strlen("metct")+strlen(ifilenames->offsets_index1info_ptr)+1,sizeof(char));
1224     sprintf(new_offsets_filename,"%s/%s.%s%s",destdir,fileroot,"metct",ifilenames->offsets_index1info_ptr);
1225 
1226     new_positions_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1227 					     strlen(".")+strlen("metct")+strlen(ifilenames->positions_index1info_ptr)+1,sizeof(char));
1228     sprintf(new_positions_filename,"%s/%s.%s%s",destdir,fileroot,"metct",ifilenames->positions_index1info_ptr);
1229 
1230     if (coord_values_8p == true) {
1231       new_positions_high_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1232 						    strlen(".")+strlen("metct")+strlen(ifilenames->positions_high_index1info_ptr)+1,sizeof(char));
1233       sprintf(new_positions_high_filename,"%s/%s.%s%s",destdir,fileroot,"metct",ifilenames->positions_high_index1info_ptr);
1234     }
1235 
1236     compute_ct(new_pointers_filename,new_offsets_filename,
1237 	       new_positions_high_filename,new_positions_filename,
1238 	       indexdb,oligospace,index_mask,coord_values_8p);
1239 
1240     if (coord_values_8p == true) {
1241       FREE(new_positions_high_filename);
1242     }
1243     FREE(new_positions_filename);
1244     FREE(new_offsets_filename);
1245     FREE(new_pointers_filename);
1246 
1247     /* regiondb */
1248     filename = (char *) CALLOC(strlen(sourcedir)+strlen("/")+strlen(fileroot)+
1249 			       strlen(".genomecomp")+1,sizeof(char));
1250     sprintf(filename,"%s/%s.genomecomp",sourcedir,fileroot);
1251     if ((sequence_fp = fopen(filename,"rb")) == NULL) {
1252       fprintf(stderr,"Could not open file %s\n",filename);
1253       exit(9);
1254     }
1255     FREE(filename);
1256 
1257     Regiondb_write(destdir,/*region_interval_char*/'1',sequence_fp,chromosome_iit,
1258 		   region1part,region1interval,
1259 		   /*genome_lc_p*/false,fileroot,/*mask_lowercase_p*/false,
1260 		   Cmet_reduce_ct);
1261     fclose(sequence_fp);
1262 
1263 
1264     /* Compute and write GA files */
1265     new_pointers_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1266 					    strlen(".")+strlen("metga")+strlen(ifilenames->pointers_index1info_ptr)+1,sizeof(char));
1267     sprintf(new_pointers_filename,"%s/%s.%s%s",destdir,fileroot,"metga",ifilenames->pointers_index1info_ptr);
1268 
1269     new_offsets_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1270 					   strlen(".")+strlen("metga")+strlen(ifilenames->offsets_index1info_ptr)+1,sizeof(char));
1271     sprintf(new_offsets_filename,"%s/%s.%s%s",destdir,fileroot,"metga",ifilenames->offsets_index1info_ptr);
1272 
1273     new_positions_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1274 					     strlen(".")+strlen("metga")+strlen(ifilenames->positions_index1info_ptr)+1,sizeof(char));
1275     sprintf(new_positions_filename,"%s/%s.%s%s",destdir,fileroot,"metga",ifilenames->positions_index1info_ptr);
1276 
1277     if (coord_values_8p == true) {
1278       new_positions_high_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1279 						    strlen(".")+strlen("metga")+strlen(ifilenames->positions_high_index1info_ptr)+1,sizeof(char));
1280       sprintf(new_positions_high_filename,"%s/%s.%s%s",destdir,fileroot,"metga",ifilenames->positions_high_index1info_ptr);
1281     }
1282 
1283     compute_ga(new_pointers_filename,new_offsets_filename,
1284 	       new_positions_high_filename,new_positions_filename,
1285 	       indexdb,oligospace,index_mask,coord_values_8p);
1286     if (coord_values_8p == true) {
1287       FREE(new_positions_high_filename);
1288     }
1289     FREE(new_positions_filename);
1290     FREE(new_offsets_filename);
1291     FREE(new_pointers_filename);
1292 
1293     /* regiondb */
1294     filename = (char *) CALLOC(strlen(sourcedir)+strlen("/")+strlen(fileroot)+
1295 			       strlen(".genomecomp")+1,sizeof(char));
1296     sprintf(filename,"%s/%s.genomecomp",sourcedir,fileroot);
1297     if ((sequence_fp = fopen(filename,"rb")) == NULL) {
1298       fprintf(stderr,"Could not open file %s\n",filename);
1299       exit(9);
1300     }
1301     FREE(filename);
1302 
1303     Regiondb_write(destdir,/*region_interval_char*/'1',sequence_fp,chromosome_iit,
1304 		   region1part,region1interval,
1305 		   /*genome_lc_p*/false,fileroot,/*mask_lowercase_p*/false,
1306 		   Cmet_reduce_ga);
1307     fclose(sequence_fp);
1308 
1309     Indexdb_free(&indexdb);
1310     Indexdb_filenames_free(&ifilenames);
1311   }
1312 
1313   Univ_IIT_free(&chromosome_iit);
1314 
1315   FREE(destdir);
1316   FREE(dbversion);
1317   FREE(fileroot);
1318   FREE(sourcedir);
1319 
1320   return 0;
1321 }
1322 
1323 
1324 
1325 static void
print_program_usage()1326 print_program_usage () {
1327   fprintf(stdout,"\
1328 Usage: cmetindex [OPTIONS...] -d <genome>\n\
1329 \n\
1330 ");
1331 
1332   /* Input options */
1333   fprintf(stdout,"Options (must include -d)\n");
1334   fprintf(stdout,"\
1335   -F, --sourcedir=directory      Directory where to read cmet index files (default is\n\
1336                                    GMAP genome directory specified at compile time)\n\
1337   -D, --destdir=directory        Directory where to write cmet index files (default is\n\
1338                                    value of -F, if provided; otherwise the value of the\n\
1339                                    GMAP genome directory specified at compile time)\n\
1340   -d, --db=STRING                Genome database\n\
1341   -k, --kmer=INT                 kmer size to use in genome database (allowed values: 16 or less).\n\
1342                                    If not specified, the program will find the highest available\n\
1343                                    kmer size in the genome database\n\
1344   -q, --sampling=INT             Sampling to use in genome database.  If not specified, the program\n\
1345                                    will find the smallest available sampling value in the genome database\n\
1346                                    within selected basesize and k-mer size\n\
1347   -v, --use-snps=STRING          Use database containing known SNPs (in <STRING>.iit, built\n\
1348                                    previously using snpindex) for tolerance to SNPs\n\
1349 \n\
1350   --version                      Show version\n\
1351   --help                         Show this help message\n\
1352 ");
1353   return;
1354 }
1355 
1356