1 static char rcsid[] = "$Id: atoiindex.c 222799 2020-06-03 21:56:34Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11
12 #ifdef WORDS_BIGENDIAN
13 #include "bigendian.h"
14 #else
15 #include "littleendian.h"
16 #endif
17
18 #include <stdio.h>
19 #include <stddef.h>
20 #include <stdlib.h>
21 #include <string.h> /* For memset */
22 #include <ctype.h> /* For toupper */
23 #include <sys/mman.h> /* For munmap */
24 #include <math.h> /* For qsort */
25 #ifdef HAVE_UNISTD_H
26 #include <unistd.h> /* For lseek and close */
27 #endif
28 #ifdef HAVE_SYS_TYPES_H
29 #include <sys/types.h> /* For off_t */
30 #endif
31 #if HAVE_DIRENT_H
32 # include <dirent.h>
33 # define NAMLEN(dirent) strlen((dirent)->d_name)
34 #else
35 # define dirent direct
36 # define NAMLEN(dirent) (dirent)->d_namlen
37 # if HAVE_SYS_NDIR_H
38 # include <sys/ndir.h>
39 # endif
40 # if HAVE_SYS_DIR_H
41 # include <sys/dir.h>
42 # endif
43 # if HAVE_NDIR_H
44 # include <ndir.h>
45 # endif
46 #endif
47
48 #include "mem.h"
49 #include "fopen.h"
50 #include "access.h"
51 #include "types.h" /* For Positionsptr_T and Oligospace_T */
52 #include "mode.h"
53 #include "filesuffix.h"
54
55 #include "atoi.h"
56
57 #include "assert.h"
58 #include "bool.h"
59 #include "genomicpos.h"
60 #include "iit-read-univ.h"
61 #include "indexdb.h"
62 #include "indexdbdef.h"
63 #include "indexdb-write.h"
64 #include "genome.h"
65 #include "genome128_hr.h"
66
67 #include "bitpack64-write.h"
68 #include "bitpack64-read.h"
69 #include "bitpack64-readtwo.h"
70 #include "bitpack64-access.h"
71 #include "bitpack64-incr.h"
72
73 #include "regiondb-write.h"
74 #include "datadir.h"
75 #include "getopt.h"
76
77
78 #define POSITIONS8_HIGH_SHIFT 32
79 #define POSITIONS8_LOW_MASK 0xFFFFFFFF
80
81 #define LOCTABLE_SIZE 2
82 #define DIFFERENTIAL_METAINFO_SIZE 2
83 #define BLOCKSIZE 64
84 #define BUFFER_SIZE 1000000
85
86 #define MONITOR_INTERVAL 10000000
87 #define MONITOR_INTERVAL_REGION 100000
88
89
90 #ifdef DEBUG
91 #define debug(x) x
92 #else
93 #define debug(x)
94 #endif
95
96
97
98 static char *user_sourcedir = NULL;
99 static char *user_destdir = NULL;
100 static char *dbroot = NULL;
101 static char *dbversion = NULL;
102 static int compression_type;
103
104 static int index1part = 15;
105 static int required_index1part = 0;
106 static int index1interval;
107 static int required_index1interval = 0;
108
109 static int region1part = 6;
110 static int region1interval = 1;
111
112 static char *snps_root = NULL;
113
114
115 static struct option long_options[] = {
116 /* Input options */
117 {"sourcedir", required_argument, 0, 'F'}, /* user_sourcedir */
118 {"destdir", required_argument, 0, 'D'}, /* user_destdir */
119 {"kmer", required_argument, 0, 'k'}, /* required_index1part */
120 {"sampling", required_argument, 0, 'q'}, /* required_interval */
121 {"db", required_argument, 0, 'd'}, /* dbroot */
122 {"usesnps", required_argument, 0, 'v'}, /* snps_root */
123
124 /* Help options */
125 {"version", no_argument, 0, 0}, /* print_program_version */
126 {"help", no_argument, 0, 0}, /* print_program_usage */
127 {0, 0, 0, 0}
128 };
129
130
131 static void
print_program_version()132 print_program_version () {
133 fprintf(stdout,"\n");
134 fprintf(stdout,"ATOIINDEX: Builds GMAP index files for A-to-I RNA editing\n");
135 fprintf(stdout,"Part of GMAP package, version %s\n",PACKAGE_VERSION);
136 fprintf(stdout,"Default gmap directory: %s\n",GMAPDB);
137 fprintf(stdout,"Thomas D. Wu, Genentech, Inc.\n");
138 fprintf(stdout,"Contact: twu@gene.com\n");
139 fprintf(stdout,"\n");
140 return;
141 }
142
143 static void
144 print_program_usage ();
145
146
147 static Oligospace_T
power(int base,int exponent)148 power (int base, int exponent) {
149 Oligospace_T result = 1UL;
150 int i;
151
152 for (i = 0; i < exponent; i++) {
153 result *= base;
154 }
155 return result;
156 }
157
158
159 /* 87654321 */
160 #define LOW_TWO_BITS 0x00000003
161
162
163 #if 0
164 static Oligospace_T
165 reduce_oligo_old (Oligospace_T oligo, int oligosize) {
166 Oligospace_T reduced = 0U, lowbits;
167 int i;
168
169 for (i = 0; i < oligosize; i++) {
170 lowbits = oligo & LOW_TWO_BITS;
171 reduced >>= 2;
172 switch (lowbits) {
173 case RIGHT_A: break;
174 case RIGHT_C: reduced |= LEFT_T; break;
175 case RIGHT_G: reduced |= LEFT_G; break;
176 case RIGHT_T: reduced |= LEFT_T; break;
177 }
178 oligo >>= 2;
179 }
180
181 for ( ; i < 16; i++) {
182 reduced >>= 2;
183 }
184
185 return reduced;
186 }
187 #endif
188
189
190 #if 0
191 static char *
192 shortoligo_nt (Oligospace_T oligo, int oligosize) {
193 char *nt;
194 int i, j;
195 Oligospace_T lowbits;
196
197 nt = (char *) CALLOC(oligosize+1,sizeof(char));
198 j = oligosize-1;
199 for (i = 0; i < oligosize; i++) {
200 lowbits = oligo & LOW_TWO_BITS;
201 switch (lowbits) {
202 case RIGHT_A: nt[j] = 'A'; break;
203 case RIGHT_C: nt[j] = 'C'; break;
204 case RIGHT_G: nt[j] = 'G'; break;
205 case RIGHT_T: nt[j] = 'T'; break;
206 }
207 oligo >>= 2;
208 j--;
209 }
210
211 return nt;
212 }
213 #endif
214
215
216 #if 0
217 /* Old method */
218 static Positionsptr_T *
219 compute_offsets_ag_using_array (Positionsptr_T *oldoffsets, Oligospace_T oligospace, Oligospace_T mask) {
220 Positionsptr_T *offsets;
221 Oligospace_T oligoi, reduced;
222 #ifdef DEBUG
223 char *nt1, *nt2;
224 #endif
225
226 /* Fill with sizes */
227 offsets = (Positionsptr_T *) CALLOC(oligospace+1,sizeof(Positionsptr_T));
228
229 for (oligoi = 0; oligoi < oligospace; oligoi++) {
230 #if 0
231 if (reduce_oligo(oligoi) != reduce_oligo_old(oligoi,index1part)) {
232 abort();
233 }
234 #endif
235 reduced = Atoi_reduce_ag(oligoi) & mask;
236 debug(
237 nt1 = shortoligo_nt(oligoi,index1part);
238 nt2 = shortoligo_nt(reduced,index1part);
239 printf("For oligo %s, updating sizes for %s from %u",nt1,nt2,offsets[reduced+1]);
240 );
241 #ifdef WORDS_BIGENDIAN
242 /*size*/offsets[reduced+1] += (Bigendian_convert_uint(oldoffsets[oligoi+1]) - Bigendian_convert_uint(oldoffsets[oligoi]));
243 #else
244 /*size*/offsets[reduced+1] += (oldoffsets[oligoi+1] - oldoffsets[oligoi]);
245 #endif
246 debug(
247 printf(" to %u\n",offsets[reduced+1]);
248 FREE(nt2);
249 FREE(nt1);
250 );
251 }
252
253 offsets[0] = 0U;
254 for (oligoi = 1; oligoi <= oligospace; oligoi++) {
255 offsets[oligoi] = offsets[oligoi-1] + /*size*/offsets[oligoi];
256 debug(if (offsets[oligoi] != offsets[oligoi-1]) {
257 printf("Offset for %06X: %u\n",oligoi,offsets[oligoi]);
258 });
259 }
260
261 return offsets;
262 }
263 #endif
264
265
266 #if 0
267 /* Old method */
268 static Positionsptr_T *
269 compute_offsets_tc (Positionsptr_T *oldoffsets, Oligospace_T oligospace, Oligospace_T mask) {
270 Positionsptr_T *offsets;
271 Oligospace_T oligoi, reduced;
272 #ifdef DEBUG
273 char *nt1, *nt2;
274 #endif
275
276 /* Fill with sizes */
277 offsets = (Positionsptr_T *) CALLOC(oligospace+1,sizeof(Positionsptr_T));
278
279 for (oligoi = 0; oligoi < oligospace; oligoi++) {
280 reduced = Atoi_reduce_tc(oligoi) & mask;
281 debug(
282 nt1 = shortoligo_nt(oligoi,index1part);
283 nt2 = shortoligo_nt(reduced,index1part);
284 printf("For oligo %s, updating sizes for %s from %u",nt1,nt2,offsets[reduced+1]);
285 );
286 #ifdef WORDS_BIGENDIAN
287 /*size*/offsets[reduced+1] += (Bigendian_convert_uint(oldoffsets[oligoi+1]) - Bigendian_convert_uint(oldoffsets[oligoi]));
288 #else
289 /*size*/offsets[reduced+1] += (oldoffsets[oligoi+1] - oldoffsets[oligoi]);
290 #endif
291 debug(
292 printf(" to %u\n",offsets[reduced+1]);
293 FREE(nt2);
294 FREE(nt1);
295 );
296 }
297
298 offsets[0] = 0U;
299 for (oligoi = 1; oligoi <= oligospace; oligoi++) {
300 offsets[oligoi] = offsets[oligoi-1] + /*size*/offsets[oligoi];
301 debug(if (offsets[oligoi] != offsets[oligoi-1]) {
302 printf("Offset for %06X: %u\n",oligoi,offsets[oligoi]);
303 });
304 }
305
306 return offsets;
307 }
308 #endif
309
310
311 static void
index_offsets_ag_using_bitpack(char * new_pointers_filename,char * new_offsets_filename,UINT4 * oldoffsetsmeta,UINT4 * oldoffsetsstrm,Oligospace_T oligospace,Oligospace_T mask)312 index_offsets_ag_using_bitpack (char *new_pointers_filename, char *new_offsets_filename,
313 UINT4 *oldoffsetsmeta, UINT4 *oldoffsetsstrm,
314 Oligospace_T oligospace, Oligospace_T mask) {
315 Oligospace_T bmerspace;
316 char *packsizes;
317 UINT4 **bitpacks;
318 UINT4 increment;
319 int new_packsize;
320 UINT4 offsets[BLOCKSIZE+1];
321 Oligospace_T oligoi, reduced, bmer;
322 int ii;
323 #ifdef DEBUG
324 char *nt1, *nt2;
325 #endif
326
327 /* Fill with sizes */
328 bmerspace = oligospace/BLOCKSIZE;
329 packsizes = (char *) CALLOC(bmerspace,sizeof(char));
330 bitpacks = (UINT4 **) CALLOC(bmerspace,sizeof(UINT4 *));
331
332 for (oligoi = 0; oligoi < oligospace; oligoi += BLOCKSIZE) {
333 Bitpack64_block_offsets(offsets,oligoi,oldoffsetsmeta,oldoffsetsstrm);
334 for (ii = 0; ii < BLOCKSIZE; ii++) {
335 if ((increment = offsets[ii+1] - offsets[ii]) > 0) {
336 reduced = Atoi_reduce_ag(oligoi + ii) & mask;
337 bmer = reduced/BLOCKSIZE;
338 if ((new_packsize = Bitpack64_access_new_packsize(reduced,/*old_packsize*/(int) packsizes[bmer],
339 bitpacks[bmer],increment)) != (int) packsizes[bmer]) {
340 bitpacks[bmer] = Bitpack64_realloc_multiple((int) packsizes[bmer],new_packsize,bitpacks[bmer]);
341 packsizes[bmer] = (char) new_packsize;
342 }
343 Bitpack64_add_bitpack(reduced,(int) packsizes[bmer],bitpacks[bmer],increment);
344 }
345 }
346 }
347
348 Bitpack64_write_differential_bitpacks(new_pointers_filename,new_offsets_filename,packsizes,bitpacks,oligospace);
349
350 for (bmer = 0; bmer < bmerspace; bmer++) {
351 FREE(bitpacks[bmer]);
352 }
353 FREE(bitpacks);
354 FREE(packsizes);
355
356 return;
357 }
358
359
360 static void
index_offsets_tc_using_bitpack(char * new_pointers_filename,char * new_offsets_filename,UINT4 * oldoffsetsmeta,UINT4 * oldoffsetsstrm,Oligospace_T oligospace,Oligospace_T mask)361 index_offsets_tc_using_bitpack (char *new_pointers_filename, char *new_offsets_filename,
362 UINT4 *oldoffsetsmeta, UINT4 *oldoffsetsstrm,
363 Oligospace_T oligospace, Oligospace_T mask) {
364 Oligospace_T bmerspace;
365 char *packsizes;
366 UINT4 **bitpacks;
367 UINT4 increment;
368 int new_packsize;
369 UINT4 offsets[BLOCKSIZE+1];
370 Oligospace_T oligoi, reduced, bmer;
371 int ii;
372 #ifdef DEBUG
373 char *nt1, *nt2;
374 #endif
375
376 /* Fill with sizes */
377 bmerspace = oligospace/BLOCKSIZE;
378 packsizes = (char *) CALLOC(bmerspace,sizeof(char));
379 bitpacks = (UINT4 **) CALLOC(bmerspace,sizeof(UINT4 *));
380
381 for (oligoi = 0; oligoi < oligospace; oligoi += BLOCKSIZE) {
382 Bitpack64_block_offsets(offsets,oligoi,oldoffsetsmeta,oldoffsetsstrm);
383 for (ii = 0; ii < BLOCKSIZE; ii++) {
384 if ((increment = offsets[ii+1] - offsets[ii]) > 0) {
385 reduced = Atoi_reduce_tc(oligoi + ii) & mask;
386 bmer = reduced/BLOCKSIZE;
387 if ((new_packsize = Bitpack64_access_new_packsize(reduced,/*old_packsize*/(int) packsizes[bmer],
388 bitpacks[bmer],increment)) != (int) packsizes[bmer]) {
389 bitpacks[bmer] = Bitpack64_realloc_multiple((int) packsizes[bmer],new_packsize,bitpacks[bmer]);
390 packsizes[bmer] = (char) new_packsize;
391 }
392 Bitpack64_add_bitpack(reduced,(int) packsizes[bmer],bitpacks[bmer],increment);
393 }
394 }
395 }
396
397 Bitpack64_write_differential_bitpacks(new_pointers_filename,new_offsets_filename,packsizes,bitpacks,oligospace);
398
399 for (bmer = 0; bmer < bmerspace; bmer++) {
400 FREE(bitpacks[bmer]);
401 }
402 FREE(bitpacks);
403 FREE(packsizes);
404
405 return;
406 }
407
408
409 #if 0
410 /* Needs work. See cmetindex.c for preliminary version */
411 static void
412 sort_positions_local_snps (FILE *positions_fp, UINT2 *positions2,
413 UINT2 *newoffsetsmeta, UINT2 *newoffsetsstrm, Localspace_T localspace) {
414 }
415 #endif
416
417
418 static void
sort_8mers(unsigned char * positions8_high,UINT4 * positions8_low,Positionsptr_T npositions)419 sort_8mers (unsigned char *positions8_high, UINT4 *positions8_low, Positionsptr_T npositions) {
420 UINT8 *positions8;
421 Positionsptr_T i;
422
423 positions8 = (UINT8 *) MALLOC(npositions*sizeof(UINT8));
424 for (i = 0; i < npositions; i++) {
425 positions8[i] = ((UINT8) positions8_high[i] << 32) + positions8_low[i];
426 }
427 qsort(positions8,npositions,sizeof(UINT8),UINT8_compare);
428 for (i = 0; i < npositions; i++) {
429 positions8_high[i] = positions8[i] >> POSITIONS8_HIGH_SHIFT;
430 positions8_low[i] = positions8[i] & POSITIONS8_LOW_MASK;
431 }
432 return;
433 }
434
435
436 static void
sort_positions(char * ptrsfile,char * compfile,FILE * positions_high_fp,FILE * positions_fp,unsigned char * positions8_high,UINT4 * positions8_low,UINT4 * positions4,UINT4 * newoffsetsmeta,UINT4 * newoffsetsstrm,Oligospace_T oligospace,bool coord_values_8p)437 sort_positions (char *ptrsfile, char *compfile, FILE *positions_high_fp, FILE *positions_fp,
438 unsigned char *positions8_high, UINT4 *positions8_low, UINT4 *positions4,
439 UINT4 *newoffsetsmeta, UINT4 *newoffsetsstrm, Oligospace_T oligospace, bool coord_values_8p) {
440 Oligospace_T oligoi;
441 Positionsptr_T block_start, block_end, j, npositions;
442
443 FILE *ptrs_fp, *comp_fp;
444 UINT4 *ptrs, nregisters;
445 UINT4 ptri;
446 int ii;
447
448 /* Buffer is used to avoid frequent writes to the file */
449 UINT4 *buffer;
450 int buffer_size = BUFFER_SIZE;
451 int buffer_i;
452
453 UINT4 newoffsets[BLOCKSIZE+1];
454 UINT4 diffs[BLOCKSIZE], ascending[BLOCKSIZE+1];
455 UINT4 totalcount;
456 int packsize;
457
458
459 /* Sort positions in each block. For snps_root, use code from Bitpack64_write_differential */
460 if (snps_root) {
461 ptrs = (UINT4 *) CALLOC(((oligospace + BLOCKSIZE)/BLOCKSIZE + 1) * DIFFERENTIAL_METAINFO_SIZE,sizeof(UINT4));
462 ptri = 0;
463
464 if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
465 fprintf(stderr,"Can't write to file %s\n",compfile);
466 exit(9);
467 }
468 buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
469 buffer_i = 0;
470 nregisters = 0;
471 }
472
473 totalcount = 0;
474 for (oligoi = 0; oligoi + BLOCKSIZE <= oligospace; oligoi += BLOCKSIZE) {
475 if (snps_root) {
476 ptrs[ptri++] = nregisters;
477 ptrs[ptri++] = totalcount;
478 }
479 ascending[0] = totalcount;
480 Bitpack64_block_offsets(newoffsets,oligoi,newoffsetsmeta,newoffsetsstrm);
481 for (ii = 0; ii < BLOCKSIZE; ii++) {
482 if ((oligoi + ii) % MONITOR_INTERVAL == 0) {
483 fprintf(stderr,".");
484 }
485 block_start = newoffsets[ii];
486 block_end = newoffsets[ii+1];
487 if ((npositions = block_end - block_start) > 0) {
488 if (coord_values_8p == true) {
489 sort_8mers(&(positions8_high[block_start]),&(positions8_low[block_start]),npositions);
490 if (snps_root == NULL) {
491 /* FWRITE_UINT8S(&(positions8[block_start]),npositions,positions_fp); */
492 FWRITE_CHARS(&(positions8_high[block_start]),npositions,positions_high_fp);
493 FWRITE_UINTS(&(positions8_low[block_start]),npositions,positions_fp);
494 } else {
495 /* FWRITE_UINT8(positions8[block_start],positions_fp); */
496 FWRITE_CHAR(positions8_high[block_start],positions_high_fp);
497 FWRITE_UINT(positions8_low[block_start],positions_fp);
498 for (j = block_start+1; j < block_end; j++) {
499 if (positions8_high[j] == positions8_high[j-1] && positions8_low[j] == positions8_low[j-1]) {
500 npositions--;
501 } else {
502 /* FWRITE_UINT8(positions8[j],positions_fp); */
503 FWRITE_CHAR(positions8_high[j],positions_high_fp);
504 FWRITE_UINT(positions8_low[j],positions_fp);
505 }
506 }
507 }
508
509 } else {
510 qsort(&(positions4[block_start]),npositions,sizeof(UINT4),UINT4_compare);
511 if (snps_root == NULL) {
512 FWRITE_UINTS(&(positions4[block_start]),npositions,positions_fp);
513 } else {
514 FWRITE_UINT(positions4[block_start],positions_fp);
515 for (j = block_start+1; j < block_end; j++) {
516 if (positions4[j] == positions4[j-1]) {
517 npositions--;
518 } else {
519 FWRITE_UINT(positions4[j],positions_fp);
520 }
521 }
522 }
523 }
524 totalcount += npositions;
525 }
526 ascending[ii+1] = totalcount;
527 }
528 if (snps_root) {
529 packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending);
530 buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
531 nregisters += packsize / 2;
532 }
533 }
534
535 if (oligoi <= oligospace) {
536 /* Finish last block (containing 1 entry, but expand to 64) */
537 if (snps_root) {
538 ptrs[ptri++] = nregisters;
539 ptrs[ptri++] = totalcount;
540 }
541 ascending[0] = totalcount;
542 for (ii = 0; ii <= (int) (oligospace - oligoi); ii++) {
543 block_start = Bitpack64_read_two(&block_end,oligoi+ii,newoffsetsmeta,newoffsetsstrm);
544 if ((npositions = block_end - block_start) > 0) {
545 if (coord_values_8p == true) {
546 sort_8mers(&(positions8_high[block_start]),&(positions8_low[block_start]),npositions);
547 if (snps_root == NULL) {
548 /* FWRITE_UINT8S(&(positions8[block_start]),npositions,positions_fp); */
549 FWRITE_CHARS(&(positions8_high[block_start]),npositions,positions_high_fp);
550 FWRITE_UINTS(&(positions8_low[block_start]),npositions,positions_fp);
551 } else {
552 /* FWRITE_UINT8(positions8[block_start],positions_fp); */
553 FWRITE_CHAR(positions8_high[block_start],positions_high_fp);
554 FWRITE_UINT(positions8_low[block_start],positions_fp);
555 for (j = block_start+1; j < block_end; j++) {
556 if (positions8_high[j] == positions8_high[j-1] && positions8_low[j] == positions8_low[j-1]) {
557 npositions--;
558 } else {
559 /* FWRITE_UINT8(positions8[j],positions_fp); */
560 FWRITE_CHAR(positions8_high[j],positions_high_fp);
561 FWRITE_UINT(positions8_low[j],positions_fp);
562 }
563 }
564 }
565
566 } else {
567 qsort(&(positions4[block_start]),npositions,sizeof(UINT4),UINT4_compare);
568 if (snps_root == NULL) {
569 FWRITE_UINTS(&(positions4[block_start]),npositions,positions_fp);
570 } else {
571 FWRITE_UINT(positions4[block_start],positions_fp);
572 for (j = block_start+1; j < block_end; j++) {
573 if (positions4[j] == positions4[j-1]) {
574 npositions--;
575 } else {
576 FWRITE_UINT(positions4[j],positions_fp);
577 }
578 }
579 }
580 }
581 totalcount += npositions;
582 }
583 ascending[ii+1] = totalcount;
584 }
585 if (snps_root) {
586 for ( ; ii < BLOCKSIZE; ii++) {
587 ascending[ii+1] = totalcount;
588 }
589 packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending);
590 buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
591 nregisters += packsize / 2;
592 }
593 }
594
595 if (snps_root) {
596 /* Write the final pointer, which will point after the end of the file */
597 ptrs[ptri++] = nregisters; /* In 128-bit registers */
598
599 /* Value for end of block */
600 ptrs[ptri++] = totalcount;
601
602 if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
603 fprintf(stderr,"Can't write to file %s\n",ptrsfile);
604 exit(9);
605 } else {
606 FWRITE_UINTS(ptrs,ptri,ptrs_fp);
607 FREE(ptrs);
608 fclose(ptrs_fp);
609 }
610
611 /* Empty buffer */
612 if (buffer_i > 0) {
613 FWRITE_UINTS(buffer,buffer_i,comp_fp);
614 buffer_i = 0;
615 }
616 FREE(buffer);
617 fclose(comp_fp);
618 }
619
620 return;
621 }
622
623
624 static void
compute_ag(char * new_pointers_filename,char * new_offsets_filename,char * new_positions_high_filename,char * new_positions_filename,Indexdb_T indexdb,Oligospace_T oligospace,Oligospace_T mask,bool coord_values_8p)625 compute_ag (char *new_pointers_filename, char *new_offsets_filename,
626 char *new_positions_high_filename, char *new_positions_filename,
627 Indexdb_T indexdb, Oligospace_T oligospace, Oligospace_T mask,
628 bool coord_values_8p) {
629 char *ptrsfile, *compfile;
630 FILE *positions_high_fp, *positions_fp;
631
632 unsigned char *positions8_high;
633 UINT4 *positions8_low;
634 UINT4 *positions4;
635
636 Oligospace_T oligoi, reduced;
637 UINT4 oldoffsets[BLOCKSIZE+1];
638
639 UINT4 *newoffsetsmeta, *newoffsetsstrm, *countermeta, *counterstrm;
640 Positionsptr_T preunique_totalcounts, block_start, block_end, j, offset;
641 size_t offsetsmeta_len, offsetsstrm_len;
642 int ii;
643 #ifdef HAVE_MMAP
644 int offsetsmeta_fd, offsetsstrm_fd;
645 #else
646 Access_T offsetsmeta_access, offsetsstrm_access;
647 #endif
648 double seconds;
649 #ifdef DEBUG
650 char *nt1, *nt2;
651 #endif
652
653
654 /* offsets = compute_offsets_ag_using_array(oldoffsets,oligospace,mask); */
655 index_offsets_ag_using_bitpack(new_pointers_filename,new_offsets_filename,
656 indexdb->offsetsmeta,indexdb->offsetsstrm,
657 oligospace,mask);
658
659 preunique_totalcounts = Bitpack64_read_one(oligospace,indexdb->offsetsmeta,indexdb->offsetsstrm);
660 if (preunique_totalcounts == 0) {
661 fprintf(stderr,"Something is wrong with the offsets. Total counts is zero.\n");
662 exit(9);
663
664 } else if (coord_values_8p == true) {
665 fprintf(stderr,"Trying to allocate %u*(%d+%d) bytes of memory...",
666 preunique_totalcounts,(int) sizeof(unsigned char),(int) sizeof(UINT4));
667 positions4 = (UINT4 *) NULL;
668 positions8_high = (unsigned char *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(unsigned char));
669 positions8_low = (UINT4 *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(UINT4));
670 if (positions8_high == NULL || positions8_low == NULL) {
671 fprintf(stderr,"failed. Need a computer with sufficient memory.\n");
672 exit(9);
673 } else {
674 fprintf(stderr,"done\n");
675 }
676
677 } else {
678 fprintf(stderr,"Trying to allocate %u*%d bytes of memory...",preunique_totalcounts,(int) sizeof(UINT4));
679 positions8_high = (unsigned char *) NULL;
680 positions8_low = (UINT4 *) NULL;
681 positions4 = (UINT4 *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(UINT4));
682 if (positions4 == NULL) {
683 fprintf(stderr,"failed. Need a computer with sufficient memory.\n");
684 exit(9);
685 } else {
686 fprintf(stderr,"done\n");
687 }
688 }
689
690 /* Point to offsets and revise */
691 fprintf(stderr,"Rearranging AG offsets for indexdb...");
692 #ifdef HAVE_MMAP
693 newoffsetsmeta = (UINT4 *) Access_mmap(&offsetsmeta_fd,&offsetsmeta_len,&seconds,new_pointers_filename,/*randomp*/false);
694 newoffsetsstrm = (UINT4 *) Access_mmap(&offsetsstrm_fd,&offsetsstrm_len,&seconds,new_offsets_filename,/*randomp*/false);
695 #else
696 newoffsetsmeta = (UINT4 *) Access_allocate_private(&offsetsmeta_access,&offsetsmeta_len,&seconds,new_pointers_filename,sizeof(UINT4));
697 newoffsetsstrm = (UINT4 *) Access_allocate_private(&offsetsstrm_access,&offsetsstrm_len,&seconds,new_offsets_filename,sizeof(UINT4));
698 #endif
699 countermeta = Indexdb_bitpack_counter(&counterstrm,newoffsetsmeta,index1part);
700
701 for (oligoi = 0; oligoi < oligospace; oligoi += BLOCKSIZE) {
702 Bitpack64_block_offsets(oldoffsets,oligoi,indexdb->offsetsmeta,indexdb->offsetsstrm);
703 for (ii = 0; ii < BLOCKSIZE; ii++) {
704 if ((oligoi + ii) % MONITOR_INTERVAL == 0) {
705 fprintf(stderr,".");
706 }
707 block_start = oldoffsets[ii];
708 block_end = oldoffsets[ii+1];
709
710 if (block_end > block_start) {
711 reduced = Atoi_reduce_ag(oligoi + ii) & mask;
712 offset = Bitpack64_read_one(reduced,newoffsetsmeta,newoffsetsstrm) + Bitpack64_access(reduced,countermeta,counterstrm);
713 if (coord_values_8p == true) {
714 for (j = block_start; j < block_end; j++) {
715 positions8_high[offset] = indexdb->positions_high[j];
716 positions8_low[offset] = indexdb->positions[j];
717 offset++;
718 }
719 } else {
720 for (j = block_start; j < block_end; j++) {
721 positions4[offset] = indexdb->positions[j];
722 offset++;
723 }
724 }
725 Bitpack64_add(reduced,countermeta,counterstrm,block_end - block_start);
726 }
727 }
728 }
729
730 FREE(counterstrm);
731 FREE(countermeta);
732 fprintf(stderr,"done\n");
733
734
735 fprintf(stderr,"Sorting AG positions for indexdb...");
736 if (snps_root == NULL) {
737 ptrsfile = compfile = (char *) NULL;
738 } else {
739 ptrsfile = (char *) MALLOC((strlen(new_pointers_filename) + strlen(".temp") + 1) * sizeof(char));
740 sprintf(ptrsfile,"%s.temp",new_pointers_filename);
741 compfile = (char *) MALLOC((strlen(new_offsets_filename) + strlen(".temp") + 1) * sizeof(char));
742 sprintf(compfile,"%s.temp",new_offsets_filename);
743 }
744
745 if (new_positions_high_filename != NULL) {
746 if ((positions_high_fp = FOPEN_WRITE_BINARY(new_positions_high_filename)) == NULL) {
747 fprintf(stderr,"Can't write to file %s\n",new_positions_high_filename);
748 exit(9);
749 }
750 }
751
752 if ((positions_fp = FOPEN_WRITE_BINARY(new_positions_filename)) == NULL) {
753 fprintf(stderr,"Can't write to file %s\n",new_positions_filename);
754 exit(9);
755 }
756
757 sort_positions(ptrsfile,compfile,positions_high_fp,positions_fp,
758 positions8_high,positions8_low,positions4,
759 newoffsetsmeta,newoffsetsstrm,oligospace,coord_values_8p);
760
761 /* Clean up */
762 #ifdef HAVE_MMAP
763 munmap((void *) newoffsetsmeta,offsetsmeta_len);
764 munmap((void *) newoffsetsstrm,offsetsstrm_len);
765 close(offsetsmeta_fd);
766 close(offsetsstrm_fd);
767 #else
768 FREE(newoffsetsmeta);
769 FREE(newoffsetsstrm);
770 #endif
771
772 if (coord_values_8p == true) {
773 FREE(positions8_high);
774 FREE(positions8_low);
775 fclose(positions_high_fp);
776 fclose(positions_fp);
777 } else {
778 FREE(positions4);
779 fclose(positions_fp);
780 }
781
782 if (snps_root) {
783 rename(ptrsfile,new_pointers_filename);
784 rename(compfile,new_offsets_filename);
785 FREE(ptrsfile);
786 FREE(compfile);
787 }
788
789 fprintf(stderr,"done\n");
790
791 return;
792 }
793
794
795 static void
compute_tc(char * new_pointers_filename,char * new_offsets_filename,char * new_positions_high_filename,char * new_positions_filename,Indexdb_T indexdb,Oligospace_T oligospace,Oligospace_T mask,bool coord_values_8p)796 compute_tc (char *new_pointers_filename, char *new_offsets_filename,
797 char *new_positions_high_filename, char *new_positions_filename,
798 Indexdb_T indexdb, Oligospace_T oligospace, Oligospace_T mask,
799 bool coord_values_8p) {
800 char *ptrsfile, *compfile;
801 FILE *positions_high_fp, *positions_fp;
802
803 unsigned char *positions8_high;
804 UINT4 *positions8_low;
805 UINT4 *positions4;
806
807 Oligospace_T oligoi, reduced;
808 UINT4 oldoffsets[BLOCKSIZE+1];
809
810 UINT4 *newoffsetsmeta, *newoffsetsstrm, *countermeta, *counterstrm;
811 Positionsptr_T preunique_totalcounts, block_start, block_end, j, offset;
812 size_t offsetsmeta_len, offsetsstrm_len;
813 int ii;
814 #ifdef HAVE_MMAP
815 int offsetsmeta_fd, offsetsstrm_fd;
816 #else
817 Access_T offsetsmeta_access, offsetsstrm_access;
818 #endif
819 double seconds;
820 #ifdef DEBUG
821 char *nt1, *nt2;
822 #endif
823
824
825 /* offsets = compute_offsets_tc_using_array(oldoffsets,oligospace,mask); */
826 index_offsets_tc_using_bitpack(new_pointers_filename,new_offsets_filename,
827 indexdb->offsetsmeta,indexdb->offsetsstrm,
828 oligospace,mask);
829
830 preunique_totalcounts = Bitpack64_read_one(oligospace,indexdb->offsetsmeta,indexdb->offsetsstrm);
831 if (preunique_totalcounts == 0) {
832 fprintf(stderr,"Something is wrong with the offsets. Total counts is zero.\n");
833 exit(9);
834
835 } else if (coord_values_8p == true) {
836 fprintf(stderr,"Trying to allocate %u*(%d+%d) bytes of memory...",
837 preunique_totalcounts,(int) sizeof(unsigned char),(int) sizeof(UINT4));
838 positions4 = (UINT4 *) NULL;
839 positions8_high = (unsigned char *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(unsigned char));
840 positions8_low = (UINT4 *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(UINT4));
841 if (positions8_high == NULL || positions8_low == NULL) {
842 fprintf(stderr,"failed. Need a computer with sufficient memory.\n");
843 exit(9);
844 } else {
845 fprintf(stderr,"done\n");
846 }
847
848 } else {
849 fprintf(stderr,"Trying to allocate %u*%d bytes of memory...",preunique_totalcounts,(int) sizeof(UINT4));
850 positions8_high = (unsigned char *) NULL;
851 positions8_low = (UINT4 *) NULL;
852 positions4 = (UINT4 *) CALLOC_NO_EXCEPTION(preunique_totalcounts,sizeof(UINT4));
853 if (positions4 == NULL) {
854 fprintf(stderr,"failed. Need a computer with sufficient memory.\n");
855 exit(9);
856 } else {
857 fprintf(stderr,"done\n");
858 }
859 }
860
861 /* Point to offsets and revise */
862 fprintf(stderr,"Rearranging TC offsets for indexdb...");
863 #ifdef HAVE_MMAP
864 newoffsetsmeta = (UINT4 *) Access_mmap(&offsetsmeta_fd,&offsetsmeta_len,&seconds,new_pointers_filename,/*randomp*/false);
865 newoffsetsstrm = (UINT4 *) Access_mmap(&offsetsstrm_fd,&offsetsstrm_len,&seconds,new_offsets_filename,/*randomp*/false);
866 #else
867 newoffsetsmeta = (UINT4 *) Access_allocate_private(&offsetsmeta_access,&offsetsmeta_len,&seconds,new_pointers_filename,sizeof(UINT4));
868 newoffsetsstrm = (UINT4 *) Access_allocate_private(&offsetsstrm_access,&offsetsstrm_len,&seconds,new_offsets_filename,sizeof(UINT4));
869 #endif
870 countermeta = Indexdb_bitpack_counter(&counterstrm,newoffsetsmeta,index1part);
871
872 for (oligoi = 0; oligoi < oligospace; oligoi += BLOCKSIZE) {
873 Bitpack64_block_offsets(oldoffsets,oligoi,indexdb->offsetsmeta,indexdb->offsetsstrm);
874 for (ii = 0; ii < BLOCKSIZE; ii++) {
875 if ((oligoi + ii) % MONITOR_INTERVAL == 0) {
876 fprintf(stderr,".");
877 }
878 block_start = oldoffsets[ii];
879 block_end = oldoffsets[ii+1];
880
881 if (block_end > block_start) {
882 reduced = Atoi_reduce_tc(oligoi + ii) & mask;
883 offset = Bitpack64_read_one(reduced,newoffsetsmeta,newoffsetsstrm) + Bitpack64_access(reduced,countermeta,counterstrm);
884 if (coord_values_8p == true) {
885 for (j = block_start; j < block_end; j++) {
886 positions8_high[offset] = indexdb->positions_high[j];
887 positions8_low[offset] = indexdb->positions[j];
888 offset++;
889 }
890 } else {
891 for (j = block_start; j < block_end; j++) {
892 positions4[offset] = indexdb->positions[j];
893 offset++;
894 }
895 }
896 Bitpack64_add(reduced,countermeta,counterstrm,block_end - block_start);
897 }
898 }
899 }
900
901 FREE(counterstrm);
902 FREE(countermeta);
903 fprintf(stderr,"done\n");
904
905
906 fprintf(stderr,"Sorting TC positions for indexdb...");
907 if (snps_root == NULL) {
908 ptrsfile = compfile = (char *) NULL;
909 } else {
910 ptrsfile = (char *) MALLOC((strlen(new_pointers_filename) + strlen(".temp") + 1) * sizeof(char));
911 sprintf(ptrsfile,"%s.temp",new_pointers_filename);
912 compfile = (char *) MALLOC((strlen(new_offsets_filename) + strlen(".temp") + 1) * sizeof(char));
913 sprintf(compfile,"%s.temp",new_offsets_filename);
914 }
915
916 if (new_positions_high_filename != NULL) {
917 if ((positions_high_fp = FOPEN_WRITE_BINARY(new_positions_high_filename)) == NULL) {
918 fprintf(stderr,"Can't write to file %s\n",new_positions_high_filename);
919 exit(9);
920 }
921 }
922
923 if ((positions_fp = FOPEN_WRITE_BINARY(new_positions_filename)) == NULL) {
924 fprintf(stderr,"Can't write to file %s\n",new_positions_filename);
925 exit(9);
926 }
927
928 sort_positions(ptrsfile,compfile,positions_high_fp,positions_fp,
929 positions8_high,positions8_low,positions4,
930 newoffsetsmeta,newoffsetsstrm,oligospace,coord_values_8p);
931
932 /* Clean up */
933 #ifdef HAVE_MMAP
934 munmap((void *) newoffsetsmeta,offsetsmeta_len);
935 munmap((void *) newoffsetsstrm,offsetsstrm_len);
936 close(offsetsmeta_fd);
937 close(offsetsstrm_fd);
938 #else
939 FREE(newoffsetsmeta);
940 FREE(newoffsetsstrm);
941 #endif
942
943 if (coord_values_8p == true) {
944 FREE(positions8_high);
945 FREE(positions8_low);
946 fclose(positions_high_fp);
947 fclose(positions_fp);
948 } else {
949 FREE(positions4);
950 fclose(positions_fp);
951 }
952
953 if (snps_root) {
954 rename(ptrsfile,new_pointers_filename);
955 rename(compfile,new_offsets_filename);
956 FREE(ptrsfile);
957 FREE(compfile);
958 }
959
960 fprintf(stderr,"done\n");
961
962 return;
963 }
964
965
966 /* Usage: atoiindex -d <genome> */
967
968
969 /* Note: Depends on having gmapindex sampled on mod 3 bounds */
970 int
main(int argc,char * argv[])971 main (int argc, char *argv[]) {
972 char *sourcedir = NULL, *destdir = NULL, *filename, *fileroot;
973 Indexdb_filenames_T ifilenames;
974 char *new_pointers_filename = NULL, *new_offsets_filename = NULL,
975 *new_positions_high_filename = NULL, *new_positions_filename = NULL;
976 FILE *sequence_fp;
977
978 Univ_IIT_T chromosome_iit;
979 /* size_t totalcounts, i; */
980 Oligospace_T oligospace, index_mask;
981 bool coord_values_8p;
982
983 Indexdb_T indexdb;
984
985 int opt;
986 extern int optind;
987 extern char *optarg;
988 int long_option_index = 0;
989 const char *long_name;
990
991 while ((opt = getopt_long(argc,argv,"F:D:d:k:q:v:",
992 long_options,&long_option_index)) != -1) {
993 switch (opt) {
994 case 0:
995 long_name = long_options[long_option_index].name;
996 if (!strcmp(long_name,"version")) {
997 print_program_version();
998 exit(0);
999 } else if (!strcmp(long_name,"help")) {
1000 print_program_usage();
1001 exit(0);
1002
1003 } else {
1004 /* Shouldn't reach here */
1005 fprintf(stderr,"Don't recognize option %s. For usage, run 'atoiindex --help'",long_name);
1006 exit(9);
1007 }
1008 break;
1009
1010 case 'F': user_sourcedir = optarg; break;
1011 case 'D': user_destdir = optarg; break;
1012 case 'd': dbroot = optarg; break;
1013 case 'k': required_index1part = atoi(optarg); break;
1014 case 'q': required_index1interval = atoi(optarg); break;
1015 case 'v':
1016 snps_root = optarg;
1017 fprintf(stderr,"Combination of cmetindex and snps is not yet supported in 2018 versions\n");
1018 exit(9);
1019 break;
1020 default: fprintf(stderr,"Do not recognize flag %c\n",opt); exit(9);
1021 }
1022 }
1023 argc -= (optind - 1);
1024 argv += (optind - 1);
1025
1026 if (dbroot == NULL) {
1027 fprintf(stderr,"Missing name of genome database. Must specify with -d flag.\n");
1028 fprintf(stderr,"Usage: atoiindex -d <genome>\n");
1029 exit(9);
1030 } else {
1031 sourcedir = Datadir_find_genomesubdir(&fileroot,&dbversion,user_sourcedir,dbroot);
1032 fprintf(stderr,"Reading source files from %s\n",sourcedir);
1033 }
1034
1035 if (user_destdir == NULL) {
1036 destdir = (char *) MALLOC((strlen(sourcedir)+strlen("/")+strlen(dbroot)+1)*sizeof(char));
1037 sprintf(destdir,"%s/%s",sourcedir,dbroot);
1038 } else {
1039 destdir = (char *) MALLOC((strlen(user_destdir)+strlen("/")+strlen(dbroot)+1)*sizeof(char));
1040 sprintf(destdir,"%s/%s",user_destdir,dbroot);
1041 }
1042 fprintf(stderr,"Writing atoi index files to %s\n",destdir);
1043
1044
1045 /* Chromosome IIT file. Need to determine coord_values_8p and genomelength */
1046 filename = (char *) CALLOC(strlen(sourcedir)+strlen("/")+
1047 strlen(fileroot)+strlen(".chromosome.iit")+1,sizeof(char));
1048 sprintf(filename,"%s/%s.chromosome.iit",sourcedir,fileroot);
1049 if ((chromosome_iit = Univ_IIT_read(filename,/*readonlyp*/true,/*add_iit_p*/false)) == NULL) {
1050 fprintf(stderr,"IIT file %s is not valid\n",filename);
1051 exit(9);
1052 } else {
1053 coord_values_8p = Univ_IIT_coord_values_8p(chromosome_iit);
1054 }
1055 FREE(filename);
1056
1057
1058 ifilenames = Indexdb_get_filenames(&compression_type,&index1part,&index1interval,
1059 sourcedir,fileroot,IDX_FILESUFFIX,snps_root,
1060 required_index1part,required_index1interval,
1061 /*offsets_only_p*/false);
1062 if ((indexdb = Indexdb_new_genome(&index1part,&index1interval,
1063 sourcedir,fileroot,IDX_FILESUFFIX,/*snps_root*/NULL,
1064 required_index1part,required_index1interval,
1065 /*offsetsstrm_access*/USE_ALLOCATE,
1066 /*positions_access*/USE_ALLOCATE,/*sharedp*/false,
1067 /*multiple_sequences_p*/false,/*preload_shared_memory_p*/false,/*unload_shared_memory_p*/false)) != NULL) {
1068
1069 #ifdef HAVE_64_BIT
1070 index_mask = ~(~0ULL << 2*index1part);
1071 #else
1072 index_mask = ~(~0U << 2*index1part);
1073 #endif
1074 oligospace = power(4,index1part);
1075
1076
1077 /* Compute and write AG files */
1078 new_pointers_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1079 strlen(".")+strlen("a2iag")+strlen(ifilenames->pointers_index1info_ptr)+1,sizeof(char));
1080 sprintf(new_pointers_filename,"%s/%s.%s%s",destdir,fileroot,"a2iag",ifilenames->pointers_index1info_ptr);
1081
1082 new_offsets_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1083 strlen(".")+strlen("a2iag")+strlen(ifilenames->offsets_index1info_ptr)+1,sizeof(char));
1084 sprintf(new_offsets_filename,"%s/%s.%s%s",destdir,fileroot,"a2iag",ifilenames->offsets_index1info_ptr);
1085
1086 new_positions_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1087 strlen(".")+strlen("a2iag")+strlen(ifilenames->positions_index1info_ptr)+1,sizeof(char));
1088 sprintf(new_positions_filename,"%s/%s.%s%s",destdir,fileroot,"a2iag",ifilenames->positions_index1info_ptr);
1089
1090 if (coord_values_8p == true) {
1091 new_positions_high_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1092 strlen(".")+strlen("a2iag")+strlen(ifilenames->positions_high_index1info_ptr)+1,sizeof(char));
1093 sprintf(new_positions_high_filename,"%s/%s.%s%s",destdir,fileroot,"a2iag",ifilenames->positions_high_index1info_ptr);
1094 }
1095
1096 compute_ag(new_pointers_filename,new_offsets_filename,
1097 new_positions_high_filename,new_positions_filename,
1098 indexdb,oligospace,index_mask,coord_values_8p);
1099
1100 if (coord_values_8p == true) {
1101 FREE(new_positions_high_filename);
1102 }
1103 FREE(new_positions_filename);
1104 FREE(new_offsets_filename);
1105 FREE(new_pointers_filename);
1106
1107 /* regiondb */
1108 filename = (char *) CALLOC(strlen(sourcedir)+strlen("/")+strlen(fileroot)+
1109 strlen(".genomecomp")+1,sizeof(char));
1110 sprintf(filename,"%s/%s.genomecomp",sourcedir,fileroot);
1111 if ((sequence_fp = fopen(filename,"rb")) == NULL) {
1112 fprintf(stderr,"Could not open file %s\n",filename);
1113 exit(9);
1114 }
1115 FREE(filename);
1116
1117 Regiondb_write(destdir,/*region_interval_char*/'1',sequence_fp,chromosome_iit,
1118 region1part,region1interval,
1119 /*genome_lc_p*/false,fileroot,/*mask_lowercase_p*/false,
1120 Atoi_reduce_ag);
1121 fclose(sequence_fp);
1122
1123
1124 /* Compute and write TC files */
1125 new_pointers_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1126 strlen(".")+strlen("a2itc")+strlen(ifilenames->pointers_index1info_ptr)+1,sizeof(char));
1127 sprintf(new_pointers_filename,"%s/%s.%s%s",destdir,fileroot,"a2itc",ifilenames->pointers_index1info_ptr);
1128
1129 new_offsets_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1130 strlen(".")+strlen("a2itc")+strlen(ifilenames->offsets_index1info_ptr)+1,sizeof(char));
1131 sprintf(new_offsets_filename,"%s/%s.%s%s",destdir,fileroot,"a2itc",ifilenames->offsets_index1info_ptr);
1132
1133 new_positions_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1134 strlen(".")+strlen("a2itc")+strlen(ifilenames->positions_index1info_ptr)+1,sizeof(char));
1135 sprintf(new_positions_filename,"%s/%s.%s%s",destdir,fileroot,"a2itc",ifilenames->positions_index1info_ptr);
1136
1137 if (coord_values_8p == true) {
1138 new_positions_high_filename = (char *) CALLOC(strlen(destdir)+strlen("/")+strlen(fileroot)+
1139 strlen(".")+strlen("a2itc")+strlen(ifilenames->positions_high_index1info_ptr)+1,sizeof(char));
1140 sprintf(new_positions_high_filename,"%s/%s.%s%s",destdir,fileroot,"a2itc",ifilenames->positions_high_index1info_ptr);
1141 }
1142
1143 compute_tc(new_pointers_filename,new_offsets_filename,
1144 new_positions_high_filename,new_positions_filename,
1145 indexdb,oligospace,index_mask,coord_values_8p);
1146 if (coord_values_8p == true) {
1147 FREE(new_positions_high_filename);
1148 }
1149 FREE(new_positions_filename);
1150 FREE(new_offsets_filename);
1151 FREE(new_pointers_filename);
1152
1153 /* regiondb */
1154 filename = (char *) CALLOC(strlen(sourcedir)+strlen("/")+strlen(fileroot)+
1155 strlen(".genomecomp")+1,sizeof(char));
1156 sprintf(filename,"%s/%s.genomecomp",sourcedir,fileroot);
1157 if ((sequence_fp = fopen(filename,"rb")) == NULL) {
1158 fprintf(stderr,"Could not open file %s\n",filename);
1159 exit(9);
1160 }
1161 FREE(filename);
1162
1163 Regiondb_write(destdir,/*region_interval_char*/'1',sequence_fp,chromosome_iit,
1164 region1part,region1interval,
1165 /*genome_lc_p*/false,fileroot,/*mask_lowercase_p*/false,
1166 Atoi_reduce_tc);
1167 fclose(sequence_fp);
1168
1169 Indexdb_free(&indexdb);
1170 Indexdb_filenames_free(&ifilenames);
1171 }
1172
1173 Univ_IIT_free(&chromosome_iit);
1174
1175 FREE(destdir);
1176 FREE(dbversion);
1177 FREE(fileroot);
1178 FREE(sourcedir);
1179
1180 return 0;
1181 }
1182
1183
1184
1185 static void
print_program_usage()1186 print_program_usage () {
1187 fprintf(stdout,"\
1188 Usage: atoiindex [OPTIONS...] -d <genome>\n\
1189 \n\
1190 ");
1191
1192 /* Input options */
1193 fprintf(stdout,"Options (must include -d)\n");
1194 fprintf(stdout,"\
1195 -F, --sourcedir=directory Directory where to read cmet index files (default is\n\
1196 GMAP genome directory specified at compile time)\n\
1197 -D, --destdir=directory Directory where to write cmet index files (default is\n\
1198 value of -F, if provided; otherwise the value of the\n\
1199 GMAP genome directory specified at compile time)\n\
1200 -d, --db=STRING Genome database\n\
1201 -k, --kmer=INT kmer size to use in genome database (allowed values: 16 or less).\n\
1202 If not specified, the program will find the highest available\n\
1203 kmer size in the genome database\n\
1204 -q, --sampling=INT Sampling to use in genome database. If not specified, the program\n\
1205 will find the smallest available sampling value in the genome database\n\
1206 within selected basesize and k-mer size\n\
1207 -v, --use-snps=STRING Use database containing known SNPs (in <STRING>.iit, built\n\
1208 previously using snpindex) for tolerance to SNPs\n\
1209 \n\
1210 --version Show version\n\
1211 --help Show this help message\n\
1212 ");
1213 return;
1214 }
1215
1216