1 static char rcsid[] = "$Id: cmetindex.c 222799 2020-06-03 21:56:34Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11
12 #ifdef WORDS_BIGENDIAN
13 #include "bigendian.h"
14 #else
15 #include "littleendian.h"
16 #endif
17
18 #include <stdio.h>
19 #include <stddef.h>
20 #include <stdlib.h>
21 #include <string.h> /* For memset */
22 #include <ctype.h> /* For toupper */
23 #include <sys/mman.h> /* For munmap */
24 #include <math.h> /* For qsort */
25 #ifdef HAVE_UNISTD_H
26 #include <unistd.h> /* For lseek and close */
27 #endif
28 #ifdef HAVE_SYS_TYPES_H
29 #include <sys/types.h> /* For off_t */
30 #endif
31 #if HAVE_DIRENT_H
32 # include <dirent.h>
33 # define NAMLEN(dirent) strlen((dirent)->d_name)
34 #else
35 # define dirent direct
36 # define NAMLEN(dirent) (dirent)->d_namlen
37 # if HAVE_SYS_NDIR_H
38 # include <sys/ndir.h>
39 # endif
40 # if HAVE_SYS_DIR_H
41 # include <sys/dir.h>
42 # endif
43 # if HAVE_NDIR_H
44 # include <ndir.h>
45 # endif
46 #endif
47
48 #include "mem.h"
49 #include "fopen.h"
50 #include "access.h"
51 #include "types.h" /* For Positionsptr_T and Oligospace_T */
52 #include "mode.h"
53 #include "filesuffix.h"
54
55 #include "cmet.h"
56
57 #include "assert.h"
58 #include "bool.h"
59 #include "genomicpos.h"
60 #include "iit-read-univ.h"
61 #include "indexdb.h"
62 #include "indexdbdef.h"
63 #include "indexdb-write.h"
64 #include "genome.h"
65 #include "genome128_hr.h"
66
67 #include "bitpack64-write.h"
68 #include "bitpack64-read.h"
69 #include "bitpack64-readtwo.h"
70 #include "bitpack64-access.h"
71 #include "bitpack64-incr.h"
72
73 #include "regiondb-write.h"
74 #include "datadir.h"
75 #include "getopt.h"
76
77
78 #define POSITIONS8_HIGH_SHIFT 32
79 #define POSITIONS8_LOW_MASK 0xFFFFFFFF
80
81 #define LOCTABLE_SIZE 2
82 #define DIFFERENTIAL_METAINFO_SIZE 2
83 #define BLOCKSIZE 64
84 #define BUFFER_SIZE 1000000
85
86 #define MONITOR_INTERVAL 10000000
87 #define MONITOR_INTERVAL_REGION 100000
88
89
90 #ifdef DEBUG
91 #define debug(x) x
92 #else
93 #define debug(x)
94 #endif
95
96
97
98 static char *user_sourcedir = NULL;
99 static char *user_destdir = NULL;
100 static char *dbroot = NULL;
101 static char *dbversion = NULL;
102 static int compression_type;
103
104 static int index1part = 15;
105 static int required_index1part = 0;
106 static int index1interval;
107 static int required_index1interval = 0;
108
109 static int region1part = 6;
110 static int region1interval = 1;
111
112 static char *snps_root = NULL;
113
114
115 static struct option long_options[] = {
116 /* Input options */
117 {"sourcedir", required_argument, 0, 'F'}, /* user_sourcedir */
118 {"destdir", required_argument, 0, 'D'}, /* user_destdir */
119 {"kmer", required_argument, 0, 'k'}, /* required_index1part */
120 {"sampling", required_argument, 0, 'q'}, /* required_interval */
121 {"db", required_argument, 0, 'd'}, /* dbroot */
122 {"usesnps", required_argument, 0, 'v'}, /* snps_root */
123
124 /* Help options */
125 {"version", no_argument, 0, 0}, /* print_program_version */
126 {"help", no_argument, 0, 0}, /* print_program_usage */
127 {0, 0, 0, 0}
128 };
129
130
131 static void
print_program_version()132 print_program_version () {
133 fprintf(stdout,"\n");
134 fprintf(stdout,"CMETINDEX: Builds GMAP index files for methylated genomes\n");
135 fprintf(stdout,"Part of GMAP package, version %s\n",PACKAGE_VERSION);
136 fprintf(stdout,"Default gmap directory: %s\n",GMAPDB);
137 fprintf(stdout,"Thomas D. Wu, Genentech, Inc.\n");
138 fprintf(stdout,"Contact: twu@gene.com\n");
139 fprintf(stdout,"\n");
140 return;
141 }
142
143 static void
144 print_program_usage ();
145
146
147 static Oligospace_T
power(int base,int exponent)148 power (int base, int exponent) {
149 Oligospace_T result = 1UL;
150 int i;
151
152 for (i = 0; i < exponent; i++) {
153 result *= base;
154 }
155 return result;
156 }
157
158
159 /* 87654321 */
160 #define LOW_TWO_BITS 0x00000003
161
162
163 #if 0
164 static Oligospace_T
165 reduce_oligo_old (Oligospace_T oligo, int oligosize) {
166 Oligospace_T reduced = 0U, lowbits;
167 int i;
168
169 for (i = 0; i < oligosize; i++) {
170 lowbits = oligo & LOW_TWO_BITS;
171 reduced >>= 2;
172 switch (lowbits) {
173 case RIGHT_A: break;
174 case RIGHT_C: reduced |= LEFT_T; break;
175 case RIGHT_G: reduced |= LEFT_G; break;
176 case RIGHT_T: reduced |= LEFT_T; break;
177 }
178 oligo >>= 2;
179 }
180
181 for ( ; i < 16; i++) {
182 reduced >>= 2;
183 }
184
185 return reduced;
186 }
187 #endif
188
189
190 #if 0
191 static char *
192 shortoligo_nt (Oligospace_T oligo, int oligosize) {
193 char *nt;
194 int i, j;
195 Oligospace_T lowbits;
196
197 nt = (char *) CALLOC(oligosize+1,sizeof(char));
198 j = oligosize-1;
199 for (i = 0; i < oligosize; i++) {
200 lowbits = oligo & LOW_TWO_BITS;
201 switch (lowbits) {
202 case RIGHT_A: nt[j] = 'A'; break;
203 case RIGHT_C: nt[j] = 'C'; break;
204 case RIGHT_G: nt[j] = 'G'; break;
205 case RIGHT_T: nt[j] = 'T'; break;
206 }
207 oligo >>= 2;
208 j--;
209 }
210
211 return nt;
212 }
213 #endif
214
215
216 #if 0
217 /* Old method */
218 static Positionsptr_T *
219 compute_offsets_ct_using_array (Positionsptr_T *oldoffsets, Oligospace_T oligospace, Oligospace_T mask) {
220 Positionsptr_T *offsets;
221 Oligospace_T oligoi, reduced;
222 #ifdef DEBUG
223 char *nt1, *nt2;
224 #endif
225
226 /* Fill with sizes */
227 offsets = (Positionsptr_T *) CALLOC(oligospace+1,sizeof(Positionsptr_T));
228
229 for (oligoi = 0; oligoi < oligospace; oligoi++) {
230 #if 0
231 if (reduce_oligo(oligoi) != reduce_oligo_old(oligoi,index1part)) {
232 abort();
233 }
234 #endif
235 reduced = Cmet_reduce_ct(oligoi) & mask;
236 debug(
237 nt1 = shortoligo_nt(oligoi,index1part);
238 nt2 = shortoligo_nt(reduced,index1part);
239 printf("For oligo %s, updating sizes for %s from %u",nt1,nt2,offsets[reduced+1]);
240 );
241 #ifdef WORDS_BIGENDIAN
242 /*size*/offsets[reduced+1] += (Bigendian_convert_uint(oldoffsets[oligoi+1]) - Bigendian_convert_uint(oldoffsets[oligoi]));
243 #else
244 /*size*/offsets[reduced+1] += (oldoffsets[oligoi+1] - oldoffsets[oligoi]);
245 #endif
246 debug(
247 printf(" to %u\n",offsets[reduced+1]);
248 FREE(nt2);
249 FREE(nt1);
250 );
251 }
252
253 offsets[0] = 0U;
254 for (oligoi = 1; oligoi <= oligospace; oligoi++) {
255 offsets[oligoi] = offsets[oligoi-1] + /*size*/offsets[oligoi];
256 debug(if (offsets[oligoi] != offsets[oligoi-1]) {
257 printf("Offset for %06X: %u\n",oligoi,offsets[oligoi]);
258 });
259 }
260
261 return offsets;
262 }
263 #endif
264
265
266 #if 0
267 /* Old method */
268 static Positionsptr_T *
269 compute_offsets_ga_using_array (Positionsptr_T *oldoffsets, Oligospace_T oligospace, Oligospace_T mask) {
270 Positionsptr_T *offsets;
271 Oligospace_T oligoi, reduced;
272 #ifdef DEBUG
273 char *nt1, *nt2;
274 #endif
275
276 /* Fill with sizes */
277 offsets = (Positionsptr_T *) CALLOC(oligospace+1,sizeof(Positionsptr_T));
278
279 for (oligoi = 0; oligoi < oligospace; oligoi++) {
280 reduced = Cmet_reduce_ga(oligoi) & mask;
281 debug(
282 nt1 = shortoligo_nt(oligoi,index1part);
283 nt2 = shortoligo_nt(reduced,index1part);
284 printf("For oligo %s, updating sizes for %s from %u",nt1,nt2,offsets[reduced+1]);
285 );
286 #ifdef WORDS_BIGENDIAN
287 /*size*/offsets[reduced+1] += (Bigendian_convert_uint(oldoffsets[oligoi+1]) - Bigendian_convert_uint(oldoffsets[oligoi]));
288 #else
289 /*size*/offsets[reduced+1] += (oldoffsets[oligoi+1] - oldoffsets[oligoi]);
290 #endif
291 debug(
292 printf(" to %u\n",offsets[reduced+1]);
293 FREE(nt2);
294 FREE(nt1);
295 );
296 }
297
298 offsets[0] = 0U;
299 for (oligoi = 1; oligoi <= oligospace; oligoi++) {
300 offsets[oligoi] = offsets[oligoi-1] + /*size*/offsets[oligoi];
301 debug(if (offsets[oligoi] != offsets[oligoi-1]) {
302 printf("Offset for %06X: %u\n",oligoi,offsets[oligoi]);
303 });
304 }
305
306 return offsets;
307 }
308 #endif
309
310
311 static void
index_offsets_ct_using_bitpack(char * new_pointers_filename,char * new_offsets_filename,UINT4 * oldoffsetsmeta,UINT4 * oldoffsetsstrm,Oligospace_T oligospace,Oligospace_T mask)312 index_offsets_ct_using_bitpack (char *new_pointers_filename, char *new_offsets_filename,
313 UINT4 *oldoffsetsmeta, UINT4 *oldoffsetsstrm,
314 Oligospace_T oligospace, Oligospace_T mask) {
315 Oligospace_T bmerspace;
316 char *packsizes;
317 UINT4 **bitpacks;
318 UINT4 increment;
319 int new_packsize;
320 UINT4 offsets[BLOCKSIZE+1];
321 Oligospace_T oligoi, reduced, bmer;
322 int ii;
323 #ifdef DEBUG
324 char *nt1, *nt2;
325 #endif
326
327 /* Fill with sizes */
328 bmerspace = oligospace/BLOCKSIZE;
329 packsizes = (char *) CALLOC(bmerspace,sizeof(char));
330 bitpacks = (UINT4 **) CALLOC(bmerspace,sizeof(UINT4 *));
331
332 for (oligoi = 0; oligoi < oligospace; oligoi += BLOCKSIZE) {
333 Bitpack64_block_offsets(offsets,oligoi,oldoffsetsmeta,oldoffsetsstrm);
334 for (ii = 0; ii < BLOCKSIZE; ii++) {
335 if ((increment = offsets[ii+1] - offsets[ii]) > 0) {
336 reduced = Cmet_reduce_ct(oligoi + ii) & mask;
337 bmer = reduced/BLOCKSIZE;
338 if ((new_packsize = Bitpack64_access_new_packsize(reduced,/*old_packsize*/(int) packsizes[bmer],
339 bitpacks[bmer],increment)) != (int) packsizes[bmer]) {
340 bitpacks[bmer] = Bitpack64_realloc_multiple((int) packsizes[bmer],new_packsize,bitpacks[bmer]);
341 packsizes[bmer] = (char) new_packsize;
342 }
343 Bitpack64_add_bitpack(reduced,(int) packsizes[bmer],bitpacks[bmer],increment);
344 }
345 }
346 }
347
348 Bitpack64_write_differential_bitpacks(new_pointers_filename,new_offsets_filename,packsizes,bitpacks,oligospace);
349
350 for (bmer = 0; bmer < bmerspace; bmer++) {
351 FREE(bitpacks[bmer]);
352 }
353 FREE(bitpacks);
354 FREE(packsizes);
355
356 return;
357 }
358
359
360 static void
index_offsets_ga_using_bitpack(char * new_pointers_filename,char * new_offsets_filename,UINT4 * oldoffsetsmeta,UINT4 * oldoffsetsstrm,Oligospace_T oligospace,Oligospace_T mask)361 index_offsets_ga_using_bitpack (char *new_pointers_filename, char *new_offsets_filename,
362 UINT4 *oldoffsetsmeta, UINT4 *oldoffsetsstrm,
363 Oligospace_T oligospace, Oligospace_T mask) {
364 Oligospace_T bmerspace;
365 char *packsizes;
366 UINT4 **bitpacks;
367 UINT4 increment;
368 int new_packsize;
369 UINT4 offsets[BLOCKSIZE+1];
370 Oligospace_T oligoi, reduced, bmer;
371 int ii;
372 #ifdef DEBUG
373 char *nt1, *nt2;
374 #endif
375
376 /* Fill with sizes */
377 bmerspace = oligospace/BLOCKSIZE;
378 packsizes = (char *) CALLOC(bmerspace,sizeof(char));
379 bitpacks = (UINT4 **) CALLOC(bmerspace,sizeof(UINT4 *));
380
381 for (oligoi = 0; oligoi < oligospace; oligoi += BLOCKSIZE) {
382 Bitpack64_block_offsets(offsets,oligoi,oldoffsetsmeta,oldoffsetsstrm);
383 for (ii = 0; ii < BLOCKSIZE; ii++) {
384 if ((increment = offsets[ii+1] - offsets[ii]) > 0) {
385 reduced = Cmet_reduce_ga(oligoi + ii) & mask;
386 bmer = reduced/BLOCKSIZE;
387 if ((new_packsize = Bitpack64_access_new_packsize(reduced,/*old_packsize*/(int) packsizes[bmer],
388 bitpacks[bmer],increment)) != (int) packsizes[bmer]) {
389 bitpacks[bmer] = Bitpack64_realloc_multiple((int) packsizes[bmer],new_packsize,bitpacks[bmer]);
390 packsizes[bmer] = (char) new_packsize;
391 }
392 Bitpack64_add_bitpack(reduced,(int) packsizes[bmer],bitpacks[bmer],increment);
393 }
394 }
395 }
396
397 Bitpack64_write_differential_bitpacks(new_pointers_filename,new_offsets_filename,packsizes,bitpacks,oligospace);
398
399 for (bmer = 0; bmer < bmerspace; bmer++) {
400 FREE(bitpacks[bmer]);
401 }
402 FREE(bitpacks);
403 FREE(packsizes);
404
405 return;
406 }
407
408
409 #if 0
410 /* Needs work */
411 static void
412 sort_positions_local_snps (FILE *positions_fp, UINT2 *positions2,
413 UINT2 *newoffsetsmeta, UINT2 *newoffsetsstrm, Localspace_T localspace) {
414 Localspace_T oligoi;
415 UINT2 block_start, block_end, j, npositions;
416 int ii;
417
418 UINT4 newoffsets[BLOCKSIZE+1];
419
420 #if 0
421 /* For snps_root */
422 FILE *ptrs_fp, *comp_fp;
423 UINT4 *ptrs;
424 UINT4 ptri;
425 int nregisters;
426 UINT4 ascending[BLOCKSIZE+1];
427 UINT2 diffs[BLOCKSIZE];
428 UINT4 totalcount;
429 int packsize;
430
431 /* Buffer is used to avoid frequent writes to the file */
432 UINT4 *buffer;
433 int buffer_size = BUFFER_SIZE;
434 int buffer_i;
435 #endif
436
437
438 /* Sort positions in each block. For snps_root, use code from Epu16_bitpack64_append_differential */
439 if (snps_root) {
440 fprintf(stderr,"Not supported\n");
441 abort();
442 #if 0
443 ptrs = (UINT4 *) CALLOC(((oligospace + BLOCKSIZE)/BLOCKSIZE + 1) * DIFFERENTIAL_METAINFO_SIZE,sizeof(UINT4));
444 ptri = 0;
445
446 if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
447 fprintf(stderr,"Can't write to file %s\n",compfile);
448 exit(9);
449 }
450 buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
451 buffer_i = 0;
452 nregisters = 0;
453 #endif
454 }
455
456 /* totalcount = 0; -- For snps_root */
457 for (oligoi = 0; oligoi + BLOCKSIZE <= oligospace; oligoi += BLOCKSIZE) {
458 if (snps_root) {
459 #if 0
460 ptrs[ptri++] = nregisters;
461 ptrs[ptri++] = totalcount;
462 #endif
463 }
464 /* ascending[0] = totalcount; -- For snps_root */
465 Epu16_bitpack64_block_offsets(newoffsets,oligoi,newoffsetsmeta,newoffsetsstrm);
466 for (ii = 0; ii < BLOCKSIZE; ii++) {
467 block_start = newoffsets[ii];
468 block_end = newoffsets[ii+1];
469 if ((npositions = block_end - block_start) > 0) {
470 qsort(&(positions2[block_start]),npositions,sizeof(UINT2),UINT2_compare);
471 if (snps_root == NULL) {
472 FWRITE_UINTS(&(positions2[block_start]),npositions,positions_fp);
473 } else {
474 #if 0
475 FWRITE_UINT(positions4[block_start],positions_fp);
476 for (j = block_start+1; j < block_end; j++) {
477 if (positions4[j] == positions4[j-1]) {
478 npositions--;
479 } else {
480 FWRITE_UINT(positions4[j],positions_fp);
481 }
482 }
483 #endif
484 }
485 /* totalcount += npositions; -- For snps_root */
486 }
487 /* ascending[ii+1] = totalcount; -- For snps_root */
488 }
489 if (snps_root) {
490 #if 0
491 packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending);
492 buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
493 nregisters += packsize / 2;
494 #endif
495 }
496 }
497
498 if (snps_root) {
499 #if 0
500 /* Write the final pointer, which will point after the end of the file */
501 ptrs[ptri++] = nregisters; /* In 128-bit registers */
502
503 /* Value for end of block */
504 ptrs[ptri++] = totalcount;
505
506 if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
507 fprintf(stderr,"Can't write to file %s\n",ptrsfile);
508 exit(9);
509 } else {
510 FWRITE_UINTS(ptrs,ptri,ptrs_fp);
511 FREE(ptrs);
512 fclose(ptrs_fp);
513 }
514
515 /* Empty buffer */
516 if (buffer_i > 0) {
517 FWRITE_UINTS(buffer,buffer_i,comp_fp);
518 buffer_i = 0;
519 }
520 FREE(buffer);
521 fclose(comp_fp);
522 #endif
523 }
524
525 return;
526 }
527 #endif
528
529
530 #if 0
531 /* Does not support SNPs. Written in-line into compute_ct_local and compute_ag_local */
532 static void
533 sort_positions_local (FILE *positions_fp, UINT2 *positions2,
534 UINT2 *newoffsetsmeta, UINT2 *newoffsetsstrm, Localspace_T localspace) {
535 Localspace_T oligoi;
536 UINT2 block_start, block_end, j, npositions;
537 UINT4 newoffsets[BLOCKSIZE+1];
538 int ii;
539
540 /* Sort positions in each block */
541 for (oligoi = 0; oligoi + BLOCKSIZE <= localspace; oligoi += BLOCKSIZE) {
542 Epu16_bitpack64_block_offsets(newoffsets,oligoi,newoffsetsmeta,newoffsetsstrm);
543 for (ii = 0; ii < BLOCKSIZE; ii++) {
544 block_start = newoffsets[ii];
545 block_end = newoffsets[ii+1];
546 if ((npositions = block_end - block_start) > 0) {
547 qsort(&(positions2[block_start]),npositions,sizeof(UINT2),UINT2_compare);
548 FWRITE_UINTS(&(positions2[block_start]),npositions,positions_fp);
549 }
550 }
551 }
552
553 return;
554 }
555 #endif
556
557
558 static void
sort_8mers(unsigned char * positions8_high,UINT4 * positions8_low,Positionsptr_T npositions)559 sort_8mers (unsigned char *positions8_high, UINT4 *positions8_low, Positionsptr_T npositions) {
560 UINT8 *positions8;
561 Positionsptr_T i;
562
563 positions8 = (UINT8 *) MALLOC(npositions*sizeof(UINT8));
564 for (i = 0; i < npositions; i++) {
565 positions8[i] = ((UINT8) positions8_high[i] << 32) + positions8_low[i];
566 }
567 qsort(positions8,npositions,sizeof(UINT8),UINT8_compare);
568 for (i = 0; i < npositions; i++) {
569 positions8_high[i] = positions8[i] >> POSITIONS8_HIGH_SHIFT;
570 positions8_low[i] = positions8[i] & POSITIONS8_LOW_MASK;
571 }
572 return;
573 }
574
575
576 static void
sort_positions(char * ptrsfile,char * compfile,FILE * positions_high_fp,FILE * positions_fp,unsigned char * positions8_high,UINT4 * positions8_low,UINT4 * positions4,UINT4 * newoffsetsmeta,UINT4 * newoffsetsstrm,Oligospace_T oligospace,bool coord_values_8p)577 sort_positions (char *ptrsfile, char *compfile, FILE *positions_high_fp, FILE *positions_fp,
578 unsigned char *positions8_high, UINT4 *positions8_low, UINT4 *positions4,
579 UINT4 *newoffsetsmeta, UINT4 *newoffsetsstrm, Oligospace_T oligospace, bool coord_values_8p) {
580 Oligospace_T oligoi;
581 Positionsptr_T block_start, block_end, j, npositions;
582
583 FILE *ptrs_fp, *comp_fp;
584 UINT4 *ptrs, nregisters;
585 UINT4 ptri;
586 int ii;
587
588 /* Buffer is used to avoid frequent writes to the file */
589 UINT4 *buffer;
590 int buffer_size = BUFFER_SIZE;
591 int buffer_i;
592
593 UINT4 newoffsets[BLOCKSIZE+1];
594 UINT4 diffs[BLOCKSIZE], ascending[BLOCKSIZE+1];
595 UINT4 totalcount;
596 int packsize;
597
598
599 /* Sort positions in each block. For snps_root, use code from Bitpack64_write_differential */
600 if (snps_root) {
601 ptrs = (UINT4 *) CALLOC(((oligospace + BLOCKSIZE)/BLOCKSIZE + 1) * DIFFERENTIAL_METAINFO_SIZE,sizeof(UINT4));
602 ptri = 0;
603
604 if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
605 fprintf(stderr,"Can't write to file %s\n",compfile);
606 exit(9);
607 }
608 buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
609 buffer_i = 0;
610 nregisters = 0;
611 }
612
613 totalcount = 0;
614 for (oligoi = 0; oligoi + BLOCKSIZE <= oligospace; oligoi += BLOCKSIZE) {
615 if (snps_root) {
616 ptrs[ptri++] = nregisters;
617 ptrs[ptri++] = totalcount;
618 }
619 ascending[0] = totalcount;
620 Bitpack64_block_offsets(newoffsets,oligoi,newoffsetsmeta,newoffsetsstrm);
621 for (ii = 0; ii < BLOCKSIZE; ii++) {
622 if ((oligoi + ii) % MONITOR_INTERVAL == 0) {
623 fprintf(stderr,".");
624 }
625 block_start = newoffsets[ii];
626 block_end = newoffsets[ii+1];
627 if ((npositions = block_end - block_start) > 0) {
628 if (coord_values_8p == true) {
629 sort_8mers(&(positions8_high[block_start]),&(positions8_low[block_start]),npositions);
630 if (snps_root == NULL) {
631 /* FWRITE_UINT8S(&(positions8[block_start]),npositions,positions_fp); */
632 FWRITE_CHARS(&(positions8_high[block_start]),npositions,positions_high_fp);
633 FWRITE_UINTS(&(positions8_low[block_start]),npositions,positions_fp);
634 } else {
635 /* FWRITE_UINT8(positions8[block_start],positions_fp); */
636 FWRITE_CHAR(positions8_high[block_start],positions_high_fp);
637 FWRITE_UINT(positions8_low[block_start],positions_fp);
638 for (j = block_start+1; j < block_end; j++) {
639 if (positions8_high[j] == positions8_high[j-1] && positions8_low[j] == positions8_low[j-1]) {
640 npositions--;
641 } else {
642 /* FWRITE_UINT8(positions8[j],positions_fp); */
643 FWRITE_CHAR(positions8_high[j],positions_high_fp);
644 FWRITE_UINT(positions8_low[j],positions_fp);
645 }
646 }
647 }
648
649 } else {
650 qsort(&(positions4[block_start]),npositions,sizeof(UINT4),UINT4_compare);
651 if (snps_root == NULL) {
652 FWRITE_UINTS(&(positions4[block_start]),npositions,positions_fp);
653 } else {
654 FWRITE_UINT(positions4[block_start],positions_fp);
655 for (j = block_start+1; j < block_end; j++) {
656 if (positions4[j] == positions4[j-1]) {
657 npositions--;
658 } else {
659 FWRITE_UINT(positions4[j],positions_fp);
660 }
661 }
662 }
663 }
664 totalcount += npositions;
665 }
666 ascending[ii+1] = totalcount;
667 }
668 if (snps_root) {
669 packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending);
670 buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
671 nregisters += packsize / 2;
672 }
673 }
674
675 if (oligoi <= oligospace) {
676 /* Finish last block (containing 1 entry, but expand to 64) */
677 if (snps_root) {
678 ptrs[ptri++] = nregisters;
679 ptrs[ptri++] = totalcount;
680 }
681 ascending[0] = totalcount;
682 for (ii = 0; ii <= (int) (oligospace - oligoi); ii++) {
683 block_start = Bitpack64_read_two(&block_end,oligoi+ii,newoffsetsmeta,newoffsetsstrm);
684 if ((npositions = block_end - block_start) > 0) {
685 if (coord_values_8p == true) {
686 sort_8mers(&(positions8_high[block_start]),&(positions8_low[block_start]),npositions);
687 if (snps_root == NULL) {
688 /* FWRITE_UINT8S(&(positions8[block_start]),npositions,positions_fp); */
689 FWRITE_CHARS(&(positions8_high[block_start]),npositions,positions_high_fp);
690 FWRITE_UINTS(&(positions8_low[block_start]),npositions,positions_fp);
691 } else {
692 /* FWRITE_UINT8(positions8[block_start],positions_fp); */
693 FWRITE_CHAR(positions8_high[block_start],positions_high_fp);
694 FWRITE_UINT(positions8_low[block_start],positions_fp);
695 for (j = block_start+1; j < block_end; j++) {
696 if (positions8_high[j] == positions8_high[j-1] && positions8_low[j] == positions8_low[j-1]) {
697 npositions--;
698 } else {
699 /* FWRITE_UINT8(positions8[j],positions_fp); */
700 FWRITE_CHAR(positions8_high[j],positions_high_fp);
701 FWRITE_UINT(positions8_low[j],positions_fp);
702 }
703 }
704 }
705
706 } else {
707 qsort(&(positions4[block_start]),npositions,sizeof(UINT4),UINT4_compare);
708 if (snps_root == NULL) {
709 FWRITE_UINTS(&(positions4[block_start]),npositions,positions_fp);
710 } else {
711 FWRITE_UINT(positions4[block_start],positions_fp);
712 for (j = block_start+1; j < block_end; j++) {
713 if (positions4[j] == positions4[j-1]) {
714 npositions--;
715 } else {
716 FWRITE_UINT(positions4[j],positions_fp);
717 }
718 }
719 }
720 }
721 totalcount += npositions;
722 }
723 ascending[ii+1] = totalcount;
724 }
725 if (snps_root) {
726 for ( ; ii < BLOCKSIZE; ii++) {
727 ascending[ii+1] = totalcount;
728 }
729 packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending);
730 buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
731 nregisters += packsize / 2;
732 }
733 }
734
735 if (snps_root) {
736 /* Write the final pointer, which will point after the end of the file */
737 ptrs[ptri++] = nregisters; /* In 128-bit registers */
738
739 /* Value for end of block */
740 ptrs[ptri++] = totalcount;
741
742 if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
743 fprintf(stderr,"Can't write to file %s\n",ptrsfile);
744 exit(9);
745 } else {
746 FWRITE_UINTS(ptrs,ptri,ptrs_fp);
747 FREE(ptrs);
748 fclose(ptrs_fp);
749 }
750
751 /* Empty buffer */
752 if (buffer_i > 0) {
753 FWRITE_UINTS(buffer,buffer_i,comp_fp);
754 buffer_i = 0;
755 }
756 FREE(buffer);
757 fclose(comp_fp);
758 }
759
760 return;
761 }
762
763
764 static void
compute_ct(char * new_pointers_filename,char * new_offsets_filename,char * new_positions_high_filename,char * new_positions_filename,Indexdb_T indexdb,Oligospace_T oligospace,Oligospace_T mask,bool coord_values_8p)765 compute_ct (char *new_pointers_filename, char *new_offsets_filename,
766 char *new_positions_high_filename, char *new_positions_filename,
767 Indexdb_T indexdb, Oligospace_T oligospace, Oligospace_T mask,
768 bool coord_values_8p) {
769 char *ptrsfile, *compfile;
770 FILE *positions_high_fp, *positions_fp;
771
772 unsigned char *positions8_high;
773 UINT4 *positions8_low;
774 UINT4 *positions4;
775
776 Oligospace_T oligoi, reduced;
777 UINT4 oldoffsets[BLOCKSIZE+1];
778
779 UINT4 *newoffsetsmeta, *newoffsetsstrm, *countermeta, *counterstrm;
780 Positionsptr_T preunique_totalcounts, block_start, block_end, j, offset;
781 size_t offsetsmeta_len, offsetsstrm_len;
782 int ii;
783 #ifdef HAVE_MMAP
784 int offsetsmeta_fd, offsetsstrm_fd;
785 #else
786 Access_T offsetsmeta_access, offsetsstrm_access;
787 #endif
788 double seconds;
789 #ifdef DEBUG
790 char *nt1, *nt2;
791 #endif
792
793
794 /* offsets = compute_offsets_ct_using_array(oldoffsets,oligospace,mask); */
795 index_offsets_ct_using_bitpack(new_pointers_filename,new_offsets_filename,
796 indexdb->offsetsmeta,indexdb->offsetsstrm,
797 oligospace,mask);
798
799 preunique_totalcounts = Bitpack64_read_one(oligospace,indexdb->offsetsmeta,indexdb->offsetsstrm);
800 if (preunique_totalcounts == 0) {
801 fprintf(stderr,"Something is wrong with the offsets. Total counts is zero.\n");
802 exit(9);
803
804 } else if (coord_values_8p == true) {
805 fprintf(stderr,"Trying to allocate %u*(%d+%d) bytes of memory...",
806 preunique_totalcounts,(int) sizeof(unsigned char),(int) sizeof(UINT4));
807 positions4 = (UINT4 *) NULL;
808 positions8_high = (unsigned char *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(unsigned char));
809 positions8_low = (UINT4 *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(UINT4));
810 if (positions8_high == NULL || positions8_low == NULL) {
811 fprintf(stderr,"failed. Need a computer with sufficient memory.\n");
812 exit(9);
813 } else {
814 fprintf(stderr,"done\n");
815 }
816
817 } else {
818 fprintf(stderr,"Trying to allocate %u*%d bytes of memory...",preunique_totalcounts,(int) sizeof(UINT4));
819 positions8_high = (unsigned char *) NULL;
820 positions8_low = (UINT4 *) NULL;
821 positions4 = (UINT4 *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(UINT4));
822 if (positions4 == NULL) {
823 fprintf(stderr,"failed. Need a computer with sufficient memory.\n");
824 exit(9);
825 } else {
826 fprintf(stderr,"done\n");
827 }
828 }
829
830 /* Point to offsets and revise */
831 fprintf(stderr,"Rearranging CT offsets for indexdb...");
832 #ifdef HAVE_MMAP
833 newoffsetsmeta = (UINT4 *) Access_mmap(&offsetsmeta_fd,&offsetsmeta_len,&seconds,new_pointers_filename,/*randomp*/false);
834 newoffsetsstrm = (UINT4 *) Access_mmap(&offsetsstrm_fd,&offsetsstrm_len,&seconds,new_offsets_filename,/*randomp*/false);
835 #else
836 newoffsetsmeta = (UINT4 *) Access_allocate_private(&offsetsmeta_access,&offsetsmeta_len,&seconds,new_pointers_filename,sizeof(UINT4));
837 newoffsetsstrm = (UINT4 *) Access_allocate_private(&offsetsstrm_access,&offsetsstrm_len,&seconds,new_offsets_filename,sizeof(UINT4));
838 #endif
839 countermeta = Indexdb_bitpack_counter(&counterstrm,newoffsetsmeta,index1part);
840
841 for (oligoi = 0; oligoi < oligospace; oligoi += BLOCKSIZE) {
842 Bitpack64_block_offsets(oldoffsets,oligoi,indexdb->offsetsmeta,indexdb->offsetsstrm);
843 for (ii = 0; ii < BLOCKSIZE; ii++) {
844 if ((oligoi + ii) % MONITOR_INTERVAL == 0) {
845 fprintf(stderr,".");
846 }
847 block_start = oldoffsets[ii];
848 block_end = oldoffsets[ii+1];
849
850 if (block_end > block_start) {
851 reduced = Cmet_reduce_ct(oligoi + ii) & mask;
852 offset = Bitpack64_read_one(reduced,newoffsetsmeta,newoffsetsstrm) + Bitpack64_access(reduced,countermeta,counterstrm);
853 if (coord_values_8p == true) {
854 for (j = block_start; j < block_end; j++) {
855 positions8_high[offset] = indexdb->positions_high[j];
856 positions8_low[offset] = indexdb->positions[j];
857 offset++;
858 }
859 } else {
860 for (j = block_start; j < block_end; j++) {
861 positions4[offset] = indexdb->positions[j];
862 offset++;
863 }
864 }
865 Bitpack64_add(reduced,countermeta,counterstrm,block_end - block_start);
866 }
867 }
868 }
869
870 FREE(counterstrm);
871 FREE(countermeta);
872 fprintf(stderr,"done\n");
873
874
875 fprintf(stderr,"Sorting CT positions for indexdb...");
876 if (snps_root == NULL) {
877 ptrsfile = compfile = (char *) NULL;
878 } else {
879 ptrsfile = (char *) MALLOC((strlen(new_pointers_filename) + strlen(".temp") + 1) * sizeof(char));
880 sprintf(ptrsfile,"%s.temp",new_pointers_filename);
881 compfile = (char *) MALLOC((strlen(new_offsets_filename) + strlen(".temp") + 1) * sizeof(char));
882 sprintf(compfile,"%s.temp",new_offsets_filename);
883 }
884
885 if (new_positions_high_filename != NULL) {
886 if ((positions_high_fp = FOPEN_WRITE_BINARY(new_positions_high_filename)) == NULL) {
887 fprintf(stderr,"Can't write to file %s\n",new_positions_high_filename);
888 exit(9);
889 }
890 }
891
892 if ((positions_fp = FOPEN_WRITE_BINARY(new_positions_filename)) == NULL) {
893 fprintf(stderr,"Can't write to file %s\n",new_positions_filename);
894 exit(9);
895 }
896
897 sort_positions(ptrsfile,compfile,positions_high_fp,positions_fp,
898 positions8_high,positions8_low,positions4,
899 newoffsetsmeta,newoffsetsstrm,oligospace,coord_values_8p);
900
901 /* Clean up */
902 #ifdef HAVE_MMAP
903 munmap((void *) newoffsetsmeta,offsetsmeta_len);
904 munmap((void *) newoffsetsstrm,offsetsstrm_len);
905 close(offsetsmeta_fd);
906 close(offsetsstrm_fd);
907 #else
908 FREE(newoffsetsmeta);
909 FREE(newoffsetsstrm);
910 #endif
911
912 if (coord_values_8p == true) {
913 FREE(positions8_high);
914 FREE(positions8_low);
915 fclose(positions_high_fp);
916 fclose(positions_fp);
917 } else {
918 FREE(positions4);
919 fclose(positions_fp);
920 }
921
922 if (snps_root) {
923 rename(ptrsfile,new_pointers_filename);
924 rename(compfile,new_offsets_filename);
925 FREE(ptrsfile);
926 FREE(compfile);
927 }
928
929 fprintf(stderr,"done\n");
930
931 return;
932 }
933
934
935 static void
compute_ga(char * new_pointers_filename,char * new_offsets_filename,char * new_positions_high_filename,char * new_positions_filename,Indexdb_T indexdb,Oligospace_T oligospace,Oligospace_T mask,bool coord_values_8p)936 compute_ga (char *new_pointers_filename, char *new_offsets_filename,
937 char *new_positions_high_filename, char *new_positions_filename,
938 Indexdb_T indexdb, Oligospace_T oligospace, Oligospace_T mask,
939 bool coord_values_8p) {
940 char *ptrsfile, *compfile;
941 FILE *positions_high_fp, *positions_fp;
942
943 unsigned char *positions8_high;
944 UINT4 *positions8_low;
945 UINT4 *positions4;
946
947 Oligospace_T oligoi, reduced;
948 UINT4 oldoffsets[BLOCKSIZE+1];
949
950 UINT4 *newoffsetsmeta, *newoffsetsstrm, *countermeta, *counterstrm;
951 Positionsptr_T preunique_totalcounts, block_start, block_end, j, offset;
952 size_t offsetsmeta_len, offsetsstrm_len;
953 int ii;
954 #ifdef HAVE_MMAP
955 int offsetsmeta_fd, offsetsstrm_fd;
956 #else
957 Access_T offsetsmeta_access, offsetsstrm_access;
958 #endif
959 double seconds;
960 #ifdef DEBUG
961 char *nt1, *nt2;
962 #endif
963
964
965 /* offsets = compute_offsets_ga_using_array(oldoffsets,oligospace,mask); */
966 index_offsets_ga_using_bitpack(new_pointers_filename,new_offsets_filename,
967 indexdb->offsetsmeta,indexdb->offsetsstrm,
968 oligospace,mask);
969
970 preunique_totalcounts = Bitpack64_read_one(oligospace,indexdb->offsetsmeta,indexdb->offsetsstrm);
971 if (preunique_totalcounts == 0) {
972 fprintf(stderr,"Something is wrong with the offsets. Total counts is zero.\n");
973 exit(9);
974
975 } else if (coord_values_8p == true) {
976 fprintf(stderr,"Trying to allocate %u*(%d+%d) bytes of memory...",
977 preunique_totalcounts,(int) sizeof(unsigned char),(int) sizeof(UINT4));
978 positions4 = (UINT4 *) NULL;
979 positions8_high = (unsigned char *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(unsigned char));
980 positions8_low = (UINT4 *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(UINT4));
981 if (positions8_high == NULL || positions8_low == NULL) {
982 fprintf(stderr,"failed. Need a computer with sufficient memory.\n");
983 exit(9);
984 } else {
985 fprintf(stderr,"done\n");
986 }
987
988 } else {
989 fprintf(stderr,"Trying to allocate %u*%d bytes of memory...",preunique_totalcounts,(int) sizeof(UINT4));
990 positions8_high = (unsigned char *) NULL;
991 positions8_low = (UINT4 *) NULL;
992 positions4 = (UINT4 *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(UINT4));
993 if (positions4 == NULL) {
994 fprintf(stderr,"failed. Need a computer with sufficient memory.\n");
995 exit(9);
996 } else {
997 fprintf(stderr,"done\n");
998 }
999 }
1000
1001 /* Point to offsets and revise */
1002 fprintf(stderr,"Rearranging GA offsets for indexdb...");
1003 #ifdef HAVE_MMAP
1004 newoffsetsmeta = (UINT4 *) Access_mmap(&offsetsmeta_fd,&offsetsmeta_len,&seconds,new_pointers_filename,/*randomp*/false);
1005 newoffsetsstrm = (UINT4 *) Access_mmap(&offsetsstrm_fd,&offsetsstrm_len,&seconds,new_offsets_filename,/*randomp*/false);
1006 #else
1007 newoffsetsmeta = (UINT4 *) Access_allocate_private(&offsetsmeta_access,&offsetsmeta_len,&seconds,new_pointers_filename,sizeof(UINT4));
1008 newoffsetsstrm = (UINT4 *) Access_allocate_private(&offsetsstrm_access,&offsetsstrm_len,&seconds,new_offsets_filename,sizeof(UINT4));
1009 #endif
1010 countermeta = Indexdb_bitpack_counter(&counterstrm,newoffsetsmeta,index1part);
1011
1012 for (oligoi = 0; oligoi < oligospace; oligoi += BLOCKSIZE) {
1013 Bitpack64_block_offsets(oldoffsets,oligoi,indexdb->offsetsmeta,indexdb->offsetsstrm);
1014 for (ii = 0; ii < BLOCKSIZE; ii++) {
1015 if ((oligoi + ii) % MONITOR_INTERVAL == 0) {
1016 fprintf(stderr,".");
1017 }
1018 block_start = oldoffsets[ii];
1019 block_end = oldoffsets[ii+1];
1020
1021 if (block_end > block_start) {
1022 reduced = Cmet_reduce_ga(oligoi + ii) & mask;
1023 offset = Bitpack64_read_one(reduced,newoffsetsmeta,newoffsetsstrm) + Bitpack64_access(reduced,countermeta,counterstrm);
1024 if (coord_values_8p == true) {
1025 for (j = block_start; j < block_end; j++) {
1026 positions8_high[offset] = indexdb->positions_high[j];
1027 positions8_low[offset] = indexdb->positions[j];
1028 offset++;
1029 }
1030 } else {
1031 for (j = block_start; j < block_end; j++) {
1032 positions4[offset] = indexdb->positions[j];
1033 offset++;
1034 }
1035 }
1036 Bitpack64_add(reduced,countermeta,counterstrm,block_end - block_start);
1037 }
1038 }
1039 }
1040
1041 FREE(counterstrm);
1042 FREE(countermeta);
1043 fprintf(stderr,"done\n");
1044
1045
1046 fprintf(stderr,"Sorting GA positions for indexdb...");
1047 if (snps_root == NULL) {
1048 ptrsfile = compfile = (char *) NULL;
1049 } else {
1050 ptrsfile = (char *) MALLOC((strlen(new_pointers_filename) + strlen(".temp") + 1) * sizeof(char));
1051 sprintf(ptrsfile,"%s.temp",new_pointers_filename);
1052 compfile = (char *) MALLOC((strlen(new_offsets_filename) + strlen(".temp") + 1) * sizeof(char));
1053 sprintf(compfile,"%s.temp",new_offsets_filename);
1054 }
1055
1056 if (new_positions_high_filename != NULL) {
1057 if ((positions_high_fp = FOPEN_WRITE_BINARY(new_positions_high_filename)) == NULL) {
1058 fprintf(stderr,"Can't write to file %s\n",new_positions_high_filename);
1059 exit(9);
1060 }
1061 }
1062
1063 if ((positions_fp = FOPEN_WRITE_BINARY(new_positions_filename)) == NULL) {
1064 fprintf(stderr,"Can't write to file %s\n",new_positions_filename);
1065 exit(9);
1066 }
1067
1068 sort_positions(ptrsfile,compfile,positions_high_fp,positions_fp,
1069 positions8_high,positions8_low,positions4,
1070 newoffsetsmeta,newoffsetsstrm,oligospace,coord_values_8p);
1071
1072 /* Clean up */
1073 #ifdef HAVE_MMAP
1074 munmap((void *) newoffsetsmeta,offsetsmeta_len);
1075 munmap((void *) newoffsetsstrm,offsetsstrm_len);
1076 close(offsetsmeta_fd);
1077 close(offsetsstrm_fd);
1078 #else
1079 FREE(newoffsetsmeta);
1080 FREE(newoffsetsstrm);
1081 #endif
1082
1083 if (coord_values_8p == true) {
1084 FREE(positions8_high);
1085 FREE(positions8_low);
1086 fclose(positions_high_fp);
1087 fclose(positions_fp);
1088 } else {
1089 FREE(positions4);
1090 fclose(positions_fp);
1091 }
1092
1093 if (snps_root) {
1094 rename(ptrsfile,new_pointers_filename);
1095 rename(compfile,new_offsets_filename);
1096 FREE(ptrsfile);
1097 FREE(compfile);
1098 }
1099
1100 fprintf(stderr,"done\n");
1101
1102 return;
1103 }
1104
1105
1106 /* Usage: cmetindex -d <genome> */
1107
1108
1109 /* Note: Depends on having gmapindex sampled on mod 3 bounds */
1110 int
main(int argc,char * argv[])1111 main (int argc, char *argv[]) {
1112 char *sourcedir = NULL, *destdir = NULL, *filename, *fileroot;
1113 Indexdb_filenames_T ifilenames;
1114 char *new_pointers_filename = NULL, *new_offsets_filename = NULL,
1115 *new_positions_high_filename = NULL, *new_positions_filename = NULL;
1116 FILE *sequence_fp;
1117
1118 Univ_IIT_T chromosome_iit;
1119 /* size_t totalcounts, i; */
1120 Oligospace_T oligospace, index_mask;
1121 bool coord_values_8p;
1122
1123 Indexdb_T indexdb;
1124
1125 int opt;
1126 extern int optind;
1127 extern char *optarg;
1128 int long_option_index = 0;
1129 const char *long_name;
1130
1131 while ((opt = getopt_long(argc,argv,"F:D:d:k:q:v:",
1132 long_options,&long_option_index)) != -1) {
1133 switch (opt) {
1134 case 0:
1135 long_name = long_options[long_option_index].name;
1136 if (!strcmp(long_name,"version")) {
1137 print_program_version();
1138 exit(0);
1139 } else if (!strcmp(long_name,"help")) {
1140 print_program_usage();
1141 exit(0);
1142
1143 } else {
1144 /* Shouldn't reach here */
1145 fprintf(stderr,"Don't recognize option %s. For usage, run 'cmetindex --help'",long_name);
1146 exit(9);
1147 }
1148 break;
1149
1150 case 'F': user_sourcedir = optarg; break;
1151 case 'D': user_destdir = optarg; break;
1152 case 'd': dbroot = optarg; break;
1153 case 'k': required_index1part = atoi(optarg); break;
1154 case 'q': required_index1interval = atoi(optarg); break;
1155 case 'v':
1156 snps_root = optarg;
1157 fprintf(stderr,"Combination of cmetindex and snps is not yet supported in 2018 versions\n");
1158 exit(9);
1159 break;
1160 default: fprintf(stderr,"Do not recognize flag %c\n",opt); exit(9);
1161 }
1162 }
1163 argc -= (optind - 1);
1164 argv += (optind - 1);
1165
1166 if (dbroot == NULL) {
1167 fprintf(stderr,"Missing name of genome database. Must specify with -d flag.\n");
1168 fprintf(stderr,"Usage: cmetindex -d <genome>\n");
1169 exit(9);
1170 } else {
1171 sourcedir = Datadir_find_genomesubdir(&fileroot,&dbversion,user_sourcedir,dbroot);
1172 fprintf(stderr,"Reading source files from %s\n",sourcedir);
1173 }
1174
1175 if (user_destdir == NULL) {
1176 destdir = (char *) MALLOC((strlen(sourcedir)+strlen("/")+strlen(dbroot)+1)*sizeof(char));
1177 sprintf(destdir,"%s/%s",sourcedir,dbroot);
1178 } else {
1179 destdir = (char *) MALLOC((strlen(user_destdir)+strlen("/")+strlen(dbroot)+1)*sizeof(char));
1180 sprintf(destdir,"%s/%s",user_destdir,dbroot);
1181 }
1182 fprintf(stderr,"Writing cmet index files to %s\n",destdir);
1183
1184
1185 /* Chromosome IIT file. Need to determine coord_values_8p and genomelength */
1186 filename = (char *) CALLOC(strlen(sourcedir)+strlen("/")+
1187 strlen(fileroot)+strlen(".chromosome.iit")+1,sizeof(char));
1188 sprintf(filename,"%s/%s.chromosome.iit",sourcedir,fileroot);
1189 if ((chromosome_iit = Univ_IIT_read(filename,/*readonlyp*/true,/*add_iit_p*/false)) == NULL) {
1190 fprintf(stderr,"IIT file %s is not valid\n",filename);
1191 exit(9);
1192 } else {
1193 coord_values_8p = Univ_IIT_coord_values_8p(chromosome_iit);
1194 }
1195 FREE(filename);
1196
1197
1198 ifilenames = Indexdb_get_filenames(&compression_type,&index1part,&index1interval,
1199 sourcedir,fileroot,IDX_FILESUFFIX,snps_root,
1200 required_index1part,required_index1interval,
1201 /*offsets_only_p*/false);
1202 if ((indexdb = Indexdb_new_genome(&index1part,&index1interval,
1203 sourcedir,fileroot,IDX_FILESUFFIX,/*snps_root*/NULL,
1204 required_index1part,required_index1interval,
1205 /*offsetsstrm_access*/USE_ALLOCATE,
1206 /*positions_access*/USE_ALLOCATE,/*sharedp*/false,
1207 /*multiple_sequences_p*/false,/*preload_shared_memory_p*/false,/*unload_shared_memory_p*/false)) != NULL) {
1208
1209 #ifdef HAVE_64_BIT
1210 index_mask = ~(~0ULL << 2*index1part);
1211 #else
1212 index_mask = ~(~0U << 2*index1part);
1213 #endif
1214 oligospace = power(4,index1part);
1215
1216
1217 /* Compute and write CT files */
1218 new_pointers_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1219 strlen(".")+strlen("metct")+strlen(ifilenames->pointers_index1info_ptr)+1,sizeof(char));
1220 sprintf(new_pointers_filename,"%s/%s.%s%s",destdir,fileroot,"metct",ifilenames->pointers_index1info_ptr);
1221
1222 new_offsets_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1223 strlen(".")+strlen("metct")+strlen(ifilenames->offsets_index1info_ptr)+1,sizeof(char));
1224 sprintf(new_offsets_filename,"%s/%s.%s%s",destdir,fileroot,"metct",ifilenames->offsets_index1info_ptr);
1225
1226 new_positions_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1227 strlen(".")+strlen("metct")+strlen(ifilenames->positions_index1info_ptr)+1,sizeof(char));
1228 sprintf(new_positions_filename,"%s/%s.%s%s",destdir,fileroot,"metct",ifilenames->positions_index1info_ptr);
1229
1230 if (coord_values_8p == true) {
1231 new_positions_high_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1232 strlen(".")+strlen("metct")+strlen(ifilenames->positions_high_index1info_ptr)+1,sizeof(char));
1233 sprintf(new_positions_high_filename,"%s/%s.%s%s",destdir,fileroot,"metct",ifilenames->positions_high_index1info_ptr);
1234 }
1235
1236 compute_ct(new_pointers_filename,new_offsets_filename,
1237 new_positions_high_filename,new_positions_filename,
1238 indexdb,oligospace,index_mask,coord_values_8p);
1239
1240 if (coord_values_8p == true) {
1241 FREE(new_positions_high_filename);
1242 }
1243 FREE(new_positions_filename);
1244 FREE(new_offsets_filename);
1245 FREE(new_pointers_filename);
1246
1247 /* regiondb */
1248 filename = (char *) CALLOC(strlen(sourcedir)+strlen("/")+strlen(fileroot)+
1249 strlen(".genomecomp")+1,sizeof(char));
1250 sprintf(filename,"%s/%s.genomecomp",sourcedir,fileroot);
1251 if ((sequence_fp = fopen(filename,"rb")) == NULL) {
1252 fprintf(stderr,"Could not open file %s\n",filename);
1253 exit(9);
1254 }
1255 FREE(filename);
1256
1257 Regiondb_write(destdir,/*region_interval_char*/'1',sequence_fp,chromosome_iit,
1258 region1part,region1interval,
1259 /*genome_lc_p*/false,fileroot,/*mask_lowercase_p*/false,
1260 Cmet_reduce_ct);
1261 fclose(sequence_fp);
1262
1263
1264 /* Compute and write GA files */
1265 new_pointers_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1266 strlen(".")+strlen("metga")+strlen(ifilenames->pointers_index1info_ptr)+1,sizeof(char));
1267 sprintf(new_pointers_filename,"%s/%s.%s%s",destdir,fileroot,"metga",ifilenames->pointers_index1info_ptr);
1268
1269 new_offsets_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1270 strlen(".")+strlen("metga")+strlen(ifilenames->offsets_index1info_ptr)+1,sizeof(char));
1271 sprintf(new_offsets_filename,"%s/%s.%s%s",destdir,fileroot,"metga",ifilenames->offsets_index1info_ptr);
1272
1273 new_positions_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1274 strlen(".")+strlen("metga")+strlen(ifilenames->positions_index1info_ptr)+1,sizeof(char));
1275 sprintf(new_positions_filename,"%s/%s.%s%s",destdir,fileroot,"metga",ifilenames->positions_index1info_ptr);
1276
1277 if (coord_values_8p == true) {
1278 new_positions_high_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1279 strlen(".")+strlen("metga")+strlen(ifilenames->positions_high_index1info_ptr)+1,sizeof(char));
1280 sprintf(new_positions_high_filename,"%s/%s.%s%s",destdir,fileroot,"metga",ifilenames->positions_high_index1info_ptr);
1281 }
1282
1283 compute_ga(new_pointers_filename,new_offsets_filename,
1284 new_positions_high_filename,new_positions_filename,
1285 indexdb,oligospace,index_mask,coord_values_8p);
1286 if (coord_values_8p == true) {
1287 FREE(new_positions_high_filename);
1288 }
1289 FREE(new_positions_filename);
1290 FREE(new_offsets_filename);
1291 FREE(new_pointers_filename);
1292
1293 /* regiondb */
1294 filename = (char *) CALLOC(strlen(sourcedir)+strlen("/")+strlen(fileroot)+
1295 strlen(".genomecomp")+1,sizeof(char));
1296 sprintf(filename,"%s/%s.genomecomp",sourcedir,fileroot);
1297 if ((sequence_fp = fopen(filename,"rb")) == NULL) {
1298 fprintf(stderr,"Could not open file %s\n",filename);
1299 exit(9);
1300 }
1301 FREE(filename);
1302
1303 Regiondb_write(destdir,/*region_interval_char*/'1',sequence_fp,chromosome_iit,
1304 region1part,region1interval,
1305 /*genome_lc_p*/false,fileroot,/*mask_lowercase_p*/false,
1306 Cmet_reduce_ga);
1307 fclose(sequence_fp);
1308
1309 Indexdb_free(&indexdb);
1310 Indexdb_filenames_free(&ifilenames);
1311 }
1312
1313 Univ_IIT_free(&chromosome_iit);
1314
1315 FREE(destdir);
1316 FREE(dbversion);
1317 FREE(fileroot);
1318 FREE(sourcedir);
1319
1320 return 0;
1321 }
1322
1323
1324
1325 static void
print_program_usage()1326 print_program_usage () {
1327 fprintf(stdout,"\
1328 Usage: cmetindex [OPTIONS...] -d <genome>\n\
1329 \n\
1330 ");
1331
1332 /* Input options */
1333 fprintf(stdout,"Options (must include -d)\n");
1334 fprintf(stdout,"\
1335 -F, --sourcedir=directory Directory where to read cmet index files (default is\n\
1336 GMAP genome directory specified at compile time)\n\
1337 -D, --destdir=directory Directory where to write cmet index files (default is\n\
1338 value of -F, if provided; otherwise the value of the\n\
1339 GMAP genome directory specified at compile time)\n\
1340 -d, --db=STRING Genome database\n\
1341 -k, --kmer=INT kmer size to use in genome database (allowed values: 16 or less).\n\
1342 If not specified, the program will find the highest available\n\
1343 kmer size in the genome database\n\
1344 -q, --sampling=INT Sampling to use in genome database. If not specified, the program\n\
1345 will find the smallest available sampling value in the genome database\n\
1346 within selected basesize and k-mer size\n\
1347 -v, --use-snps=STRING Use database containing known SNPs (in <STRING>.iit, built\n\
1348 previously using snpindex) for tolerance to SNPs\n\
1349 \n\
1350 --version Show version\n\
1351 --help Show this help message\n\
1352 ");
1353 return;
1354 }
1355
1356