1 /***************************************************************
2
3 The Subread and Rsubread software packages are free
4 software packages:
5
6 you can redistribute it and/or modify it under the terms
7 of the GNU General Public License as published by the
8 Free Software Foundation, either version 3 of the License,
9 or (at your option) any later version.
10
11 Subread is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty
13 of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
14
15 See the GNU General Public License for more details.
16
17 Authors: Drs Yang Liao and Wei Shi
18
19 ***************************************************************/
20
21
22 #include <stdio.h>
23 #include <ctype.h>
24 #include <string.h>
25 #include <getopt.h>
26 #include <signal.h>
27 #include <unistd.h>
28 #include <time.h>
29 #include <sys/types.h>
30 #include "hashtable.h"
31 #include "gene-value-index.h"
32 #include "HelperFunctions.h"
33 #include "gene-algorithms.h"
34 #include "sorted-hashtable.h"
35 #include "seek-zlib.h"
36 #include "hashtable.h"
37 #include "input-files.h"
38
39 #define NO_GENE_DEBUG_
40 #define _GENE_DEBUG_SIZE_ 40000
41 #define MIN_READ_SPLICING 2000000
42
43
44 #define MAX_KEY_MATCH GENE_VOTE_SPACE
45 int GENE_SLIDING_STEP = 3;
46 int IS_COLOR_SPACE = 0;
47 int MARK_NONINFORMATIVE_SUBREADS = 0;
48 int IS_FORCED_ONE_BLOCK = 0;
49 int ignore_bar_in_seqnames = 0;
50 double begin00_ftime = 0.0;
51
52 #define NEXT_READ 1
53 #define NEXT_FILE 2
54 #define FULL_PARTITION 4
55
estimate_memory_peak(unsigned int * bucket_sizes,unsigned int bucket_no,unsigned int tables)56 long long estimate_memory_peak(unsigned int * bucket_sizes, unsigned int bucket_no, unsigned int tables ){
57 long long peakuse = 0;
58 int tn;
59 for(tn=0; tn<tables; tn++){
60 long long this_peak = 0;
61 unsigned int bn;
62 for(bn=0; bn<bucket_no; bn++)
63 this_peak += bucket_sizes[bn + tn*bucket_no];
64 peakuse = max(peakuse, this_peak);
65 }
66 //SUBREADprintf("EST MEMORY PEAK=%lld, bucks=%u, tabs=%u\n", peakuse, bucket_no, tables);
67 return peakuse *(sizeof(short)+sizeof(gehash_data_t)) + sizeof(struct gehash_bucket)*bucket_no + sizeof(int) * tables*bucket_no;
68 }
69
print_build_log(double finished_rate,double read_per_second,double expected_seconds,unsigned long long int total_reads)70 void print_build_log(double finished_rate, double read_per_second, double expected_seconds, unsigned long long int total_reads)
71 {
72 print_in_box( 81,0,0,"%4d%%%%, %3d mins elapsed, rate=%.1fk bps/s", (int)(finished_rate*100), (int)(miltime()-begin_ftime)/60, read_per_second/1000 );
73 }
74
75
76 #define MAX_BASES_IN_INDEX 4294900000.0
77
build_gene_index(const char index_prefix[],char ** chro_files,int chro_file_number,int threshold,HashTable * huge_table,unsigned int * chro_lens,long long actual_bases,int for_measure_buckets,unsigned int ** bucket_sizes,unsigned int expected_hash_items,unsigned int bucket_no,unsigned int * total_tables)78 int build_gene_index(const char index_prefix [], char ** chro_files, int chro_file_number, int threshold, HashTable * huge_table, unsigned int * chro_lens, long long actual_bases, int for_measure_buckets, unsigned int ** bucket_sizes, unsigned int expected_hash_items, unsigned int bucket_no, unsigned int * total_tables){
79 int file_number, table_no, status = NEXT_FILE;
80 unsigned int offset, read_no, current_tab_items;
81 long long int all_bases = guess_gene_bases(chro_files,chro_file_number);
82
83 int chro_table_maxsize=100, dump_res = 0;
84 unsigned int * read_offsets = malloc(sizeof(unsigned int) * chro_table_maxsize);
85 char * read_names = malloc(MAX_READ_NAME_LEN * chro_table_maxsize);
86 gehash_t table;
87 // gehash_t huge_table;
88 gene_value_index_t value_array_index;
89
90 gene_input_t * ginp = malloc(sizeof(gene_input_t));
91 begin_ftime = miltime();
92
93 // SUBREADprintf ("Index items per partition = %u\n\n", expected_hash_items);
94
95 if (chro_file_number > 199)
96 {
97 SUBREADprintf("ERROR: There are too many FASTA files. You may merge them into one FASTA file.\n");
98 return -1;
99 }
100
101 if (strlen (index_prefix) > 290)
102 {
103 SUBREADprintf("ERROR: The path is too long. It should not be longer than 290 chars.\n");
104 return -1;
105 }
106
107 if(all_bases<1)
108 {
109 SUBREADprintf("ERROR: File '%s' is inaccessible.\n", chro_files[-all_bases-1]);
110 return -1;
111 }
112
113 current_tab_items = 0;
114 table_no = 0;
115 int padding_around_contigs = MAX_READ_LENGTH;
116 if(0 == for_measure_buckets){
117 if(gehash_create_ex(& table, expected_hash_items, 0, SUBINDEX_VER2, GENE_SLIDING_STEP, padding_around_contigs)) return 1;
118 if(gvindex_init(&value_array_index, 0, (unsigned int) actual_bases))return 1;
119 }
120 if(1==for_measure_buckets){
121 *bucket_sizes = realloc(*bucket_sizes, sizeof(int) * (table_no+1) * bucket_no);
122 memset((*bucket_sizes) + table_no * bucket_no, 0, sizeof(int) * bucket_no);
123 memset(&table,0 , sizeof(gehash_t));
124 memset(&value_array_index, 0, sizeof(gene_value_index_t));
125 table.padding = padding_around_contigs;
126 table.index_gap = GENE_SLIDING_STEP;
127 }
128
129 file_number = 0;
130 offset = table.padding;
131 read_no = 0;
132
133 char * fn = malloc(3100);
134 memset(read_offsets,0, chro_table_maxsize*sizeof(int));
135 sprintf(fn, "%s.files", index_prefix);
136 unlink(fn);
137
138 status = NEXT_FILE;
139
140 if(1==for_measure_buckets)
141 print_in_box(80,0,0,"Estimate the index size...");
142 else
143 print_in_box(80,0,0,"Build the index...");
144
145 {
146 char window [16], last_color_base=-1, last_last_color_base=-1;
147 int i, read_len = 0;
148 unsigned int int_key = 0, array_int_key = 0;
149 int skips=0, all_skips = 0;
150
151 //Pre-fill
152 while(1)
153 {
154 //Subread Cycle
155 char next_char;
156 if (status == NEXT_FILE)
157 {
158 if(file_number == chro_file_number)
159 {
160 FILE * fp;
161
162 geinput_close(ginp);
163
164 //SUBREADprintf ("Processing chromosome files ...\n");
165
166 if(0==for_measure_buckets){
167 sprintf (fn, "%s.%02d.%c.tab", index_prefix, table_no, IS_COLOR_SPACE?'c':'b');
168 SUBREADfflush(stdout);
169
170 if(!dump_res)dump_res |= gehash_dump(&table, fn);
171
172 sprintf (fn, "%s.%02d.%c.array", index_prefix, table_no, IS_COLOR_SPACE?'c':'b');
173 if(!dump_res)dump_res |= gvindex_dump(&value_array_index, fn);
174 gvindex_destory(&value_array_index) ;
175
176 gehash_destory(&table);
177 }
178
179 read_offsets[read_no-1] = offset + table.padding;
180
181 for(i=table_no+1; i<100; i++)
182 {
183 sprintf(fn, "%s.%02d.%c.tab", index_prefix, i, IS_COLOR_SPACE?'c':'b');
184 unlink(fn);
185 sprintf(fn, "%s.%02d.%c.array", index_prefix, i, IS_COLOR_SPACE?'c':'b');
186 unlink(fn);
187 }
188
189 sprintf (fn, "%s.reads", index_prefix);
190 fp = f_subr_open(fn, "wb");
191 for (i=0; i<read_no; i++)
192 fprintf(fp, "%u\t%s\n", read_offsets[i], read_names+i*MAX_READ_NAME_LEN);
193
194 fclose(fp);
195
196 break;
197 }
198 else
199 {
200 if (file_number)
201 geinput_close(ginp);
202 geinput_open(chro_files[file_number++], ginp);
203 status = NEXT_READ;
204 }
205 }
206 if (status == NEXT_READ)
207 {
208
209 geinput_readline(ginp, fn, 0);
210
211 if(read_no>0){
212 read_offsets[read_no-1] = offset + table.padding;
213 offset += 2*table.padding;
214 }
215
216 //printf("TTTXT FN=%s\n",fn);
217
218 for(i=0;(fn[i+1] != ' ' && fn[i+1] != '\0' && fn[i+1] != '\t' && i<MAX_CHROMOSOME_NAME_LEN - 1 && ( fn[i+1] != '|' || !ignore_bar_in_seqnames )); i++)
219 *(read_names + MAX_READ_NAME_LEN*read_no + i) = fn[i+1];
220
221 *(read_names + MAX_READ_NAME_LEN*read_no + i) = 0;
222
223 sprintf(fn, "%s.files", index_prefix);
224 FILE * fname_fp = f_subr_open(fn, "a");
225 fprintf(fname_fp, "%s\t%s\t%ld\n", read_names+read_no*MAX_READ_NAME_LEN, ginp -> filename, ftell(ginp -> input_fp));
226 fclose(fname_fp);
227
228 for (i=0; i<16; i++)
229 {
230 char nch = geinput_next_char(ginp);
231 if (nch == 'N') skips = 16;
232 else if (skips>0) skips--;
233 window[i] = nch;
234 }
235
236 read_len = 16;
237 read_no ++;
238
239 if(read_no >= chro_table_maxsize)
240 {
241 read_offsets = realloc(read_offsets, 2* chro_table_maxsize * sizeof(unsigned int));
242 read_names = realloc(read_names, 2* chro_table_maxsize * MAX_READ_NAME_LEN);
243 chro_table_maxsize *= 2;
244 }
245
246 if(IS_COLOR_SPACE)
247 {
248 int_key = genekey2color('A',window);
249 array_int_key = genekey2int(window,GENE_SPACE_BASE);
250 last_last_color_base = -1;
251 last_color_base = window[15];
252 }
253 else
254 array_int_key = int_key = genekey2int(window,GENE_SPACE_BASE);
255
256 }
257 if (status == FULL_PARTITION)
258 {
259 int seek_back_reads ;
260
261 if (read_len < MIN_READ_SPLICING)
262 {
263 seek_back_reads = read_len;
264 offset -= seek_back_reads;
265 offset += 16;
266 }
267 else
268 {
269 seek_back_reads = MIN_READ_SPLICING - 10;
270 seek_back_reads -= seek_back_reads%3;
271 seek_back_reads += 1;
272 offset -= seek_back_reads;
273 offset += 16;
274 }
275
276 if(0==for_measure_buckets){
277 sprintf(fn, "%s.%02d.%c.tab", index_prefix, table_no, IS_COLOR_SPACE?'c':'b');
278 SUBREADfflush(stdout);
279
280 if(!dump_res)dump_res |= gehash_dump(&table, fn);
281 sprintf(fn, "%s.%02d.%c.array", index_prefix, table_no, IS_COLOR_SPACE?'c':'b');
282 if(!dump_res)dump_res |= gvindex_dump(&value_array_index, fn);
283 }
284
285 table_no ++;
286
287 if(0==for_measure_buckets){
288 gehash_destory(&table);
289 gvindex_destory(&value_array_index);
290 }
291
292 read_len -= seek_back_reads;
293 read_len += 16;
294
295 while(seek_back_reads)
296 {
297 fseek(ginp -> input_fp, -1, SEEK_CUR);
298 char bnch = fgetc(ginp -> input_fp);
299 if ((bnch >='A' && bnch <= 'Z' ) || (bnch >='a' && bnch <= 'z' ) || bnch == '-' || bnch == 'N' || bnch=='.')seek_back_reads--;
300 fseek(ginp -> input_fp, -1, SEEK_CUR);
301 }
302
303 for (i=0; i<16; i++)
304 {
305 char nch = geinput_next_char(ginp);
306 if (nch == 'N' ) skips = 16;
307 else if (skips>0) skips--;
308 window[i] = nch;
309 }
310
311 if(IS_COLOR_SPACE)
312 {
313 int_key = genekey2color('A',window);
314 array_int_key = genekey2int(window,GENE_SPACE_BASE);
315 last_last_color_base = -1;
316 last_color_base = window[15];
317 }
318 else
319 array_int_key = int_key = genekey2int(window, GENE_SPACE_BASE);
320
321 current_tab_items = 0;
322 if(0==for_measure_buckets){
323 if(gehash_create_ex(&table, expected_hash_items, 0, SUBINDEX_VER2, GENE_SLIDING_STEP, padding_around_contigs)) return 1;
324 if(gvindex_init(&value_array_index, offset, (unsigned int) actual_bases)) return 1;
325 }
326
327 if(1==for_measure_buckets){
328 //unsigned int * tmpp = *bucket_sizes;
329 *bucket_sizes = realloc(*bucket_sizes, sizeof(int) * (table_no+1) * bucket_no);
330 assert(NULL!= *bucket_sizes);
331 memset((*bucket_sizes) + table_no * bucket_no, 0, sizeof(int) * bucket_no);
332 }
333 }
334
335 status = 0;
336
337 if(skips || (IS_COLOR_SPACE && last_last_color_base<0))
338 all_skips ++;
339 else
340 {
341 int is_no_info = HashTableGet(huge_table, NULL+1+int_key)-NULL;
342 if(is_no_info){
343 // SUBREADprintf("NOINFO KEY=%u AT %u\n", int_key, offset);
344 }else {
345 if(0 == for_measure_buckets)if(gehash_insert(&table, int_key, offset - (IS_COLOR_SPACE?1:0), (*bucket_sizes)+ table_no * bucket_no))return 1;
346 if(1 == for_measure_buckets) gehash_try_insert_measure((*bucket_sizes)+ table_no * bucket_no , bucket_no, int_key);
347 current_tab_items++;
348 }
349 if( 0 == for_measure_buckets )
350 gvindex_set(&value_array_index, offset - (IS_COLOR_SPACE?0:0), array_int_key, padding_around_contigs);
351 }
352
353 if((!IS_FORCED_ONE_BLOCK) && current_tab_items >= expected_hash_items && (read_len > MIN_READ_SPLICING || read_len < 32))
354 {
355 status = FULL_PARTITION;
356 continue;
357 }
358
359 for (i=0; i<GENE_SLIDING_STEP; i++)
360 {
361 next_char = geinput_next_char(ginp);
362 if(next_char < 0) {
363 if( 0 == for_measure_buckets ) gvindex_set(&value_array_index, offset - (IS_COLOR_SPACE?0:0), array_int_key, padding_around_contigs);
364
365 if (next_char == -1) status = NEXT_READ;
366 if (next_char == -2) status = NEXT_FILE;
367 if (next_char == -3) return 0;
368 break;
369 }
370 //SUBREADprintf("NEXT_CH=%c\n", next_char);
371
372 if (next_char == 'N' )skips = 16;
373 else if (skips>0){
374 skips--;
375 last_color_base = -1;
376 }
377
378 int_key = int_key << 2;
379
380
381 if (IS_COLOR_SPACE)
382 {
383 if(last_color_base>0)
384 int_key += chars2color(last_color_base, next_char);
385 array_int_key = array_int_key << 2;
386 array_int_key += base2int (next_char);
387
388 last_last_color_base = last_color_base;
389 last_color_base = next_char;
390 }
391 else
392 {
393 int_key += base2int (next_char);
394 array_int_key = int_key;
395 }
396
397 if(actual_bases > 12){
398 if (offset>0 && offset % (actual_bases / 12) == 0)
399 {
400 double finished_rate = offset*1.0/actual_bases;
401 double base_per_second = offset / (miltime()-begin_ftime);
402 double ETA = (actual_bases-offset) / base_per_second;
403 print_build_log(finished_rate,base_per_second,ETA, actual_bases);
404 SUBREADfflush(stdout) ;
405 }
406 }
407
408 offset ++;
409 read_len ++;
410
411 if(offset > 0xFFFFFFFDU)
412 {
413 SUBREADprintf("ERROR: the provided reference sequences include more than 4 billion bases.\n") ;
414 return -1;
415 }
416
417 }
418
419 }
420 }
421 free(read_names);
422 free(read_offsets);
423 if(dump_res){
424 SUBREADprintf("No index was built.\n");
425 sprintf(fn, "%s.files", index_prefix);
426 unlink(fn);
427 sprintf(fn, "%s.reads", index_prefix);
428 unlink(fn);
429 int index_i;
430 for(index_i = 0; index_i <= 99; index_i++){
431 sprintf(fn, "%s.%02d.b.tab", index_prefix, index_i);
432 unlink(fn);
433 sprintf(fn, "%s.%02d.c.tab", index_prefix, index_i);
434 unlink(fn);
435 sprintf(fn, "%s.%02d.b.array", index_prefix, index_i);
436 unlink(fn);
437 sprintf(fn, "%s.%02d.c.array", index_prefix, index_i);
438 unlink(fn);
439 }
440 }
441 free(fn);
442 free(ginp);
443 *total_tables = table_no+1;
444 return 0;
445 }
446
add_repeated_subread(gehash_t * tab,unsigned int subr,unsigned char ** huge_index)447 int add_repeated_subread(gehash_t * tab , unsigned int subr, unsigned char ** huge_index)
448 {
449 int huge_byte = (subr>>1);
450 int huge_offset = (subr % 2) * 4;
451 int ind_id = (huge_byte >> 24) & 127;
452
453 unsigned int byte_value = huge_index[ ind_id ][huge_byte&0xffffff] ;
454
455 int huge_value = (byte_value>> huge_offset) & 0xf;
456 if(huge_value <15){
457 huge_value ++;
458 byte_value = (byte_value & (~(0xf << huge_offset))) | (huge_value << huge_offset);
459 huge_index[ ind_id ][huge_byte&0xffffff] = (unsigned char)(byte_value&0xff);
460 return 0;
461 }
462 unsigned int times = 0;
463 int matched = gehash_get(tab, subr, ×, 1);
464 if(matched)
465 gehash_update(tab, subr, times+1);
466 else
467 if(gehash_insert(tab, subr, 16, NULL)) return 1;
468
469 return 0;
470 }
471
scan_gene_index(const char index_prefix[],char ** chro_files,int chro_file_number,int threshold,HashTable * huge_table,long long * actual_total_bases_inc_marging)472 int scan_gene_index(const char index_prefix [], char ** chro_files, int chro_file_number, int threshold, HashTable *huge_table, long long * actual_total_bases_inc_marging)
473 {
474 int file_number, i ,j;
475 int status = NEXT_FILE;
476 unsigned int offset, read_no;
477 long long int guess_all_bases = guess_gene_bases(chro_files,chro_file_number);
478 int padding_around_contigs = MAX_READ_LENGTH;
479 *actual_total_bases_inc_marging = padding_around_contigs;
480
481 gehash_t occurrence_table;
482 unsigned char * huge_index[128];
483
484 for(i=0;i<128;i++) {
485 huge_index[i] = calloc(1024*1024*16,1);
486 if(NULL == huge_index[i]){
487 SUBREADprintf("ERROR: No memory can be allocated.\n");
488 return -1;
489 }
490 }
491
492 if(gehash_create_ex(&occurrence_table, 500000, 0, SUBINDEX_VER0, 1, 0)) return 1;
493
494 gene_input_t * ginp = malloc(sizeof(gene_input_t));
495
496 print_in_box(80,0,0,"Scan uninformative subreads in reference sequences ...");
497
498 if (chro_file_number > 199)
499 {
500 SUBREADprintf("ERROR: There are too many FASTA files. You may merge them into one FASTA file.\n");
501 return -1;
502 }
503
504 if (strlen (index_prefix) > 290)
505 {
506 SUBREADprintf("ERROR: The path is too long. It should not be longer than 290 chars.\n");
507 return -1;
508 }
509 if(guess_all_bases<1)
510 {
511 SUBREADprintf("ERROR: File '%s' is inaccessible.\n", chro_files[-guess_all_bases-1]);
512 return -1;
513 }
514
515
516 file_number = 0;
517 offset = 0;
518 read_no = 0;
519
520 char * fn = malloc(3100);
521 sprintf(fn, "%s.files", index_prefix);
522 unlink(fn);
523
524 status = NEXT_FILE;
525
526
527 {
528 char window [16], last_color_base=-1, last_last_color_base=-1;
529 int i, read_len = 0;
530 unsigned int int_key = 0, array_int_key = 0;
531 int skips=0, all_skips = 0;
532
533 //Pre-fill
534 while(1)
535 {
536 //Subread Cycle
537 char next_char;
538 //if(status > 0)SUBREADprintf("BDSSTATUS = %d\n", status);
539
540 if (status == NEXT_FILE)
541 {
542 (*actual_total_bases_inc_marging)+=padding_around_contigs;
543 if(file_number == chro_file_number)
544 {
545 geinput_close(ginp);
546
547 break;
548 }
549 else
550 {
551 if (file_number)
552 geinput_close(ginp);
553 geinput_open(chro_files[file_number++], ginp);
554 status = NEXT_READ;
555 }
556 }
557 if (status == NEXT_READ)
558 {
559 (*actual_total_bases_inc_marging)+=2*padding_around_contigs;
560
561 geinput_readline(ginp, fn, 0);
562 //SUBREADprintf("HEADER_SCAN '''%s'''\n", fn);
563
564 for (i=0; i<16; i++)
565 {
566 char nch = geinput_next_char(ginp);
567 if (nch == 'N') skips = 16;
568 else if (skips>0) skips--;
569 window[i] = nch;
570 }
571 read_len = 16;
572 read_no ++;
573
574 if(IS_COLOR_SPACE)
575 {
576 int_key = genekey2color('A',window);
577 array_int_key = genekey2int(window,GENE_SPACE_BASE);
578 last_last_color_base = -1;
579 last_color_base = window[15];
580 }
581 else
582 array_int_key = int_key = genekey2int(window,GENE_SPACE_BASE);
583
584 }
585
586 status = 0;
587
588 if(skips || (IS_COLOR_SPACE && last_last_color_base<0))
589 all_skips ++;
590 else
591 {
592 int rv = add_repeated_subread(&occurrence_table, int_key, huge_index);
593 if(rv<0) {
594 return -1;
595 }
596 }
597
598
599 for (i=0; i<GENE_SLIDING_STEP; i++)
600 {
601 next_char = geinput_next_char(ginp);
602 if(next_char < 0)
603 {
604 if (next_char == -1) status = NEXT_READ;
605 if (next_char == -2) status = NEXT_FILE;
606 if (next_char == -3) return 0;
607 break;
608 }
609
610 if (next_char == 'N' )skips = 16;
611 else if (skips>0){
612 skips--;
613 last_color_base = -1;
614 }
615
616 int_key = int_key << 2;
617
618 if (IS_COLOR_SPACE)
619 {
620 if(last_color_base>0)
621 int_key += chars2color(last_color_base, next_char);
622 array_int_key = array_int_key << 2;
623 array_int_key += base2int (next_char);
624
625 last_last_color_base = last_color_base;
626 last_color_base = next_char;
627 }
628 else
629 {
630 int_key += base2int (next_char);
631 array_int_key = int_key;
632 }
633
634 offset ++;
635 (*actual_total_bases_inc_marging)++;
636 read_len ++;
637
638 if(offset > 0xFFFFFFFDU)
639 {
640 SUBREADprintf("ERROR: the provided reference sequences include more than 4 billion bases.\n") ;
641 return -1;
642 }
643
644 }
645
646 }
647 }
648
649
650
651 free(fn);
652
653 for (i=0; i<occurrence_table.buckets_number; i++)
654 {
655 struct gehash_bucket * current_bucket = &(occurrence_table.buckets[i]);
656
657 if(current_bucket -> current_items>=1)
658 {
659 for (j=0; j<current_bucket -> current_items; j++)
660 {
661 if(current_bucket -> item_values [j] > threshold)
662 {
663 HashTablePut(huge_table,NULL+1+ current_bucket -> item_keys[j], NULL+current_bucket -> item_values [j]);
664 }
665 }
666 }
667 }
668
669
670 for(i=0;i<128;i++)
671 if(huge_index[i]) free(huge_index[i]);
672
673 free(ginp);
674 gehash_destory(&occurrence_table);
675
676 if(huge_table -> numOfElements)
677 {
678 print_in_box(80,0,0,"%llu uninformative subreads were found.", huge_table -> numOfElements);
679 print_in_box(80,0,0,"These subreads were excluded from index building.");
680 }
681
682 return 0;
683 }
684
685
rtrim(char * s)686 char *rtrim(char *s)
687 {
688 char* back = s + strlen(s);
689 while(isspace(*--back));
690 *(back+1) = '\0';
691 return s;
692 }
693
694 typedef struct{
695 char * filename;
696 unsigned int line_no;
697 int byte_in_line;
698 } format_check_context_t;
699
700 int ERROR_FOUND_IN_FASTA = 0;
check_and_convert_warn(format_check_context_t * fcc,char * msg,FILE * log_fp)701 void check_and_convert_warn(format_check_context_t * fcc, char * msg, FILE * log_fp){
702 ERROR_FOUND_IN_FASTA += 1;
703 fprintf(log_fp, "A format issue is found in the %u-th line in file '%s': %s.\n", fcc->line_no, fcc->filename, msg);
704 }
check_and_convert_warnOLD(char * FN,long long int fpos_line_head,unsigned line_no,int line_pos,char * msg,FILE * log_fp)705 void check_and_convert_warnOLD(char * FN, long long int fpos_line_head, unsigned line_no, int line_pos, char * msg, FILE * log_fp)
706 {
707 #define CHAR_ESC 27
708 int x1,brs=0;
709 long long int back_search_ptr;
710 char * line_buf = malloc(MAX_READ_LENGTH+1);
711
712 ERROR_FOUND_IN_FASTA += 1;
713
714 fprintf(log_fp,"\n");
715
716 //fprintf(log_fp,"%c[33m", CHAR_ESC);
717 for(x1=0;x1<81;x1++)
718 fprintf(log_fp,"=");
719 fprintf(log_fp,"\n");
720 //fprintf(log_fp,"%c[32m", CHAR_ESC);
721 fprintf(log_fp,"Input file '%s':\n", FN);
722 //fprintf(log_fp,"%c[31m", CHAR_ESC);
723 fprintf(log_fp,"%s\n", msg);
724 //fprintf(log_fp,"%c[33m", CHAR_ESC);
725 for(x1=0;x1<81;x1++)
726 fprintf(log_fp,".");
727 fprintf(log_fp,"\n");
728 //fprintf(log_fp,"%c[37m", CHAR_ESC);
729
730
731 FILE * warn_fp = f_subr_open(FN, "r");
732
733 for(back_search_ptr = fpos_line_head - 1; back_search_ptr>=0; back_search_ptr--)
734 {
735 int nch;
736 fseeko(warn_fp, back_search_ptr, SEEK_SET);
737 nch = fgetc(warn_fp);
738 if(nch == '\n') brs++;
739 if(brs >2) break;
740 fseeko(warn_fp, back_search_ptr, SEEK_SET);
741 }
742
743 if(back_search_ptr<=0)brs++;
744
745 int print_line_no = line_no - brs + 1;
746 while(1)
747 {
748 char * ret = fgets(line_buf, MAX_READ_LENGTH, warn_fp);
749 if(!ret)break;
750
751 /*
752 if(ftello(warn_fp) > fpos_line_head)
753 fprintf(log_fp,"%c[9m%c[31m", CHAR_ESC, CHAR_ESC);
754 else
755 fprintf(log_fp,"%c[29m%c[37m", CHAR_ESC, CHAR_ESC);
756 */
757 fprintf(log_fp," % 9d ", print_line_no++);
758
759 rtrim(line_buf);
760
761 fprintf(log_fp,"%s%s\n",line_buf,strlen(line_buf)<16?" ":"");
762 if(ftello(warn_fp) > fpos_line_head)
763 break;
764 }
765 for(x1=0;x1<line_pos+11;x1++)
766 fprintf(log_fp," ");
767 fprintf(log_fp,"^\n");
768
769 //fprintf(log_fp,"%c[29m%c[37m", CHAR_ESC, CHAR_ESC);
770 for(x1=0;x1<2;x1++)
771 {
772 char * ret = fgets(line_buf, MAX_READ_LENGTH, warn_fp);
773 if(!ret)break;
774 fprintf(log_fp," % 9d ", print_line_no++);
775 fprintf(log_fp,"%s",line_buf);
776 }
777 fclose(warn_fp);
778 //fprintf(log_fp,"%c[33m", CHAR_ESC);
779 for(x1=0;x1<81;x1++)
780 fprintf(log_fp,"=");
781 fprintf(log_fp,"\n");
782 fprintf(log_fp,"\n");
783
784 //fprintf(log_fp,"%c[0m", CHAR_ESC);
785 free(line_buf);
786 }
787
check_and_convert_FastA(char ** input_fas,int fa_number,char * out_fa,unsigned int ** chrom_lens,FILE * log_fp,char * log_fn)788 int check_and_convert_FastA(char ** input_fas, int fa_number, char * out_fa, unsigned int ** chrom_lens, FILE * log_fp, char * log_fn)
789 {
790 int is_R_warnned = 0, is_repeated_chro= 0;
791 char * line_buf = malloc(MAX_READ_LENGTH);
792 char * read_head_buf = malloc(MAX_READ_LENGTH * 3);
793 unsigned int inp_file_no, line_no;
794 int written_chrs = 0, is_disk_full = 0;
795 int chrom_lens_max_len = 100;
796 int chrom_lens_len = 0;
797 ERROR_FOUND_IN_FASTA = 0;
798 FILE * out_fp = f_subr_open(out_fa,"wb");
799 format_check_context_t fcc;
800 memset(&fcc,0,sizeof(fcc));
801
802 if(!out_fp)
803 {
804 SUBREADprintf("ERROR: the output directory is not writable, but the index builder needs to create temporary files in the current directory. Please change the working directory and rerun the index builder.\n");
805 return -1;
806 }
807
808 (*chrom_lens) = malloc(chrom_lens_max_len*sizeof(unsigned int));
809 memset((*chrom_lens), 0, chrom_lens_max_len*sizeof(unsigned int));
810
811
812 HashTable * rep_name_table = HashTableCreate(9999);
813 HashTableSetDeallocationFunctions(rep_name_table, free, NULL);
814 HashTableSetKeyComparisonFunction(rep_name_table, (int (*) (const void *, const void *)) strcmp);
815 HashTableSetHashFunction(rep_name_table , fc_chro_hash);
816
817 print_in_box( 80,0,0,"Check the integrity of provided reference sequences ...");
818 for(inp_file_no = 0; inp_file_no < fa_number; inp_file_no++)
819 {
820 autozip_fp fafp;
821 memset(&fafp, 0, sizeof(fafp));
822 int faret = autozip_open(input_fas[inp_file_no], &fafp);
823 fcc.filename = input_fas[inp_file_no];
824
825 if(faret <0) {
826 SUBREADprintf("ERROR: Input file '%s' is not found or is not accessible. No index was built.\n", input_fas[inp_file_no]);
827 HashTableDestroy(rep_name_table);
828 return -1;
829 }
830
831 line_no = 0;
832 int is_head_written=0;
833 read_head_buf[0]=0;
834 while(1) {
835 unsigned int read_len = 0;
836 int ret = autozip_gets(&fafp, line_buf, MAX_READ_LENGTH);
837 if(ret<1) break;
838 if(ret >= MAX_READ_LENGTH-2){
839 SUBREADprintf("ERROR: A fasta file cannot have a line longer than 1000 bytes. You need to split a very long line into many lines.\n");
840 return -1;
841 }
842
843 line_no ++;
844 fcc.line_no = line_no;
845 int line_buf_len = strlen(line_buf);
846
847 for(; line_buf[line_buf_len-1] == '\r' || line_buf[line_buf_len-1]=='\n' ;line_buf_len--)
848 {
849 if(line_buf[line_buf_len-1]=='\r')
850 {
851 if(!is_R_warnned)
852 {
853 is_R_warnned=1;
854 fcc.byte_in_line = line_buf_len-1;
855 check_and_convert_warn(&fcc ,"This line ends with '\\r\\n'. It is not a problem for building the index but we suggest to use Unix-styled line breaks.", log_fp);
856 }
857 }
858 line_buf[line_buf_len-1] =0;
859 }
860 //if(strchr(line_buf,'>') || strchr(line_buf,'\r') || strchr(line_buf,'\n'))SUBREADprintf("TESTING: string contains '''%s''' at %I64u\n", line_buf, ftello(fafp.plain_fp));
861
862 if(line_buf_len<1)
863 {
864 fcc.byte_in_line = -1;
865 check_and_convert_warn(&fcc ,"This line is empty. This is not allowed in the FASTA file.", log_fp);
866 continue;
867 }
868
869 if(line_buf[0]=='>')
870 {
871 if(line_no>1 &&!is_head_written)
872 {
873 fcc.byte_in_line = 0;
874 check_and_convert_warn(&fcc,"This sequence has less than 16 bases. It is ignored in the index because no subreads can be extracted.", log_fp);
875 }
876 is_head_written = 0;
877 read_len = 0;
878 read_head_buf[0]=0;
879
880 strcat(read_head_buf, line_buf);
881 strcat(read_head_buf, "\n");
882
883 int chro_name_end=1;
884 while(line_buf[chro_name_end])
885 {
886 if(line_buf[chro_name_end]==' '||line_buf[chro_name_end]=='|' || line_buf[chro_name_end]=='\t')break;
887 chro_name_end++;
888 }
889
890 line_buf[chro_name_end] = 0;
891 int is_exist = HashTableGet(rep_name_table , line_buf+1) - NULL;
892 if(is_exist)
893 {
894 SUBREADprintf("ERROR: repeated chromosome name '%s' is observed in the FASTA file(s).\nThe index was NOT built.\n", line_buf+1);
895 is_repeated_chro=1;
896 break;
897 }
898
899 char * keymem = malloc(chro_name_end);
900 strcpy(keymem, line_buf+1);
901 HashTablePut(rep_name_table, keymem, NULL+1);
902
903 }
904 else if(line_no<1)
905 {
906 fcc.byte_in_line=0;
907 check_and_convert_warn(&fcc ,"This file is not started with a header line. It seems not to be a FASTA file.", log_fp);
908 }
909 else
910 {
911 int xk2;
912 for(xk2=0; xk2 < line_buf_len; xk2++)
913 {
914 int nextch = line_buf[xk2];
915 int lowerch = tolower(nextch);
916 if(!( lowerch == 'a' || lowerch == 't' || lowerch == 'g' || lowerch == 'c' || nextch == '.' || nextch=='-' || lowerch=='n'))
917 {
918 fcc.byte_in_line=xk2;
919 check_and_convert_warn(&fcc, "The non-ACGT base was converted to an 'A'.", log_fp);
920 line_buf[xk2] = 'A';
921 }
922 else if(nextch == '.' || nextch=='-' || lowerch=='n')
923 line_buf[xk2] = 'A';
924 else
925 line_buf[xk2] = toupper(nextch);
926 }
927
928 read_len += line_buf_len;
929
930 if(read_len > 16 && !is_head_written)
931 {
932 fputs(read_head_buf, out_fp);
933 written_chrs++;
934 is_head_written = 1;
935
936 chrom_lens_len++;
937 if((chrom_lens_max_len-1) <= chrom_lens_len)
938 {
939 (*chrom_lens) = realloc((*chrom_lens), 2*chrom_lens_max_len*sizeof(unsigned int));
940 chrom_lens_max_len*=2;
941 }
942 }
943
944 if(is_head_written)
945 {
946 int line_buf_len = strlen(line_buf);
947 int writen_len = fprintf(out_fp,"%s\n", line_buf);
948 if(writen_len < line_buf_len){
949 SUBREADprintf("ERROR: unable to write into the temporary file. Please check the free space of the output directory.\n");
950 is_disk_full = 1;
951 break;
952 }
953 (*chrom_lens)[chrom_lens_len-1] = read_len;
954 (*chrom_lens)[chrom_lens_len] = 0;
955 }
956 else
957 {
958 strcat(read_head_buf, line_buf);
959 strcat(read_head_buf, "\n");
960 }
961
962 }
963 }
964
965 autozip_close(&fafp);
966 if(is_disk_full) break;
967 }
968
969
970 HashTableDestroy(rep_name_table);
971 free(line_buf);
972 free(read_head_buf);
973 fclose(out_fp);
974
975 if(!written_chrs){
976 SUBREADprintf("ERROR: No index was built because there were no subreads extracted. A chromosome needs at least 16 bases to be indexed.");
977 return 1;
978 }
979
980 if(is_repeated_chro|| is_disk_full)
981 return 1;
982
983 if(ERROR_FOUND_IN_FASTA)
984 {
985 print_in_box( 80,0,0,"There were %d notes for reference sequences.", ERROR_FOUND_IN_FASTA);
986 print_in_box( 89,0,0,"The notes can be found in the log file, %c[36m'%s'%c[0m.", CHAR_ESC, log_fn, CHAR_ESC);
987 }
988 else print_in_box( 80,0,0,"No format issues were found");
989
990 return 0;
991 }
992
993 char * tmp_file_for_signal;
994
SIGINT_hook(int param)995 void SIGINT_hook(int param)
996 {
997 #ifdef MAKE_STANDALONE
998 if(tmp_file_for_signal[0])
999 {
1000 unlink(tmp_file_for_signal);
1001 SUBREADprintf("\n\nReceived a terminal signal. The temporary file was removed. The index was NOT built successfully. Please DO NOT use the new index until they are rebuilt.\n\n");
1002 }
1003
1004 exit(param);
1005 #endif
1006 }
1007
1008 static struct option ib_long_options[]={
1009 {0, 0, 0, 0}
1010 };
1011
1012 #ifdef MAKE_STANDALONE
main(int argc,char ** argv)1013 int main(int argc,char ** argv)
1014 #else
1015 int main_buildindex(int argc,char ** argv)
1016 #endif
1017 {
1018 int threshold = 100, optindex=0;
1019 int memory_limit; // 8000 MBytes
1020 char output_file[MAX_FILE_NAME_LENGTH], c, tmp_fa_file[MAX_FILE_NAME_LENGTH], log_file_name[MAX_FILE_NAME_LENGTH+20];
1021 char *ptr_tmp_fa_file[1];
1022 unsigned int * chromosome_lengths;
1023 begin00_ftime = miltime();
1024
1025 if(sizeof(char *)>4) memory_limit=8000;
1026 else memory_limit=3000;
1027
1028 ptr_tmp_fa_file[0]=tmp_fa_file;
1029 output_file[0] = 0;
1030 tmp_fa_file[0] = 0;
1031 tmp_file_for_signal = tmp_fa_file;
1032
1033 IS_FORCED_ONE_BLOCK = 0;
1034 GENE_SLIDING_STEP = 3;
1035 IS_COLOR_SPACE = 0;
1036
1037 SUBREADprintf("\n");
1038
1039 optind = 0;
1040
1041 while ((c = getopt_long (argc, argv, "kvcBFM:o:f:Db?", ib_long_options, &optindex)) != -1)
1042 switch(c)
1043 {
1044 case 'b':
1045 ignore_bar_in_seqnames= 1;
1046 break;
1047 case 'B':
1048 IS_FORCED_ONE_BLOCK =1;
1049 break;
1050 case 'F':
1051 GENE_SLIDING_STEP =1;
1052 break;
1053 case 'v':
1054 core_version_number("Subread-buildindex");
1055 return 0;
1056 case 'c':
1057 IS_COLOR_SPACE = 1;
1058 break;
1059 case 'M':
1060 memory_limit = atoi(optarg);
1061 break;
1062 case 'f':
1063 threshold = atoi(optarg);
1064 break;
1065 case 'o':
1066 strncpy(output_file, optarg, MAX_FILE_NAME_LENGTH-1);
1067 break;
1068 case 'k':
1069 if(memcmp(SUBREAD_VERSION , "1.3.",4)==0)
1070 {
1071 SUBREADprintf("The \"-k\" option is not supported in version 1.3.x. ");
1072 return -1;
1073 }
1074 MARK_NONINFORMATIVE_SUBREADS = 1;
1075 break;
1076 case '?':
1077 return -1 ;
1078 }
1079
1080 if (argc == optind || !output_file[0])
1081 {
1082 SUBREADprintf("Version %s\n\n", SUBREAD_VERSION);
1083
1084 /*
1085 SUBREADputs("Usage:");
1086 SUBREADputs("");
1087 SUBREADputs(" ./subread-buildindex [options] -o <basename> {FASTA file1} [FASTA file2] ...");
1088 SUBREADputs("");
1089 SUBREADputs("Required arguments:");
1090 SUBREADputs("");
1091 SUBREADputs(" -o <basename> base name of the index to be created");
1092 SUBREADputs("");
1093 SUBREADputs("Optional arguments:");
1094 SUBREADputs("");
1095 SUBREADputs(" -G build a gapped index for the reference genome. 16bp subreads");
1096 SUBREADputs("");
1097 SUBREADputs(" -M <int> size of requested memory(RAM) in megabytes. Index is split into blocks if necessary.");
1098 SUBREADputs("");
1099 SUBREADputs(" -f <int> specify the threshold for removing uninformative subreads");
1100 SUBREADputs(" (highly repetitive 16mers in the reference). 24 by default.");
1101 SUBREADputs("");
1102 SUBREADputs(" -c build a color-space index.");
1103 SUBREADputs("");
1104 SUBREADputs(" -v output version of the program.");
1105 SUBREADputs("");
1106 SUBREADputs("For more information about these arguments, please refer to the User Manual.\n");
1107 */
1108 SUBREADputs("Usage:");
1109 SUBREADputs("");
1110 SUBREADputs(" ./subread-buildindex [options] -o <basename> {FASTA[.gz] file1}\\");
1111 SUBREADputs(" [FASTA[.gz] file2] ...");
1112 SUBREADputs("");
1113 SUBREADputs("Required arguments:");
1114 SUBREADputs("");
1115 SUBREADputs(" -o <basename> base name of the index to be created");
1116 SUBREADputs("");
1117 SUBREADputs("Optional arguments:");
1118 SUBREADputs("");
1119 SUBREADputs(" -F build a full index for the reference genome. 16bp subreads");
1120 SUBREADputs(" will be extracted from every position of the reference");
1121 SUBREADputs(" genome. Size of the index is typically 3 times the size of");
1122 SUBREADputs(" index built from using the default setting.");
1123 SUBREADputs("");
1124 SUBREADputs(" -B create one block of index. The built index will not be split");
1125 SUBREADputs(" into multiple pieces. This makes the largest amount of");
1126 SUBREADputs(" memory be requested when running alignments, but it enables");
1127 SUBREADputs(" the maximum mapping speed to be achieved. This option");
1128 SUBREADputs(" overrides -M when it is provided as well.");
1129 SUBREADputs("");
1130 SUBREADputs(" -M <int> size of requested memory(RAM) in megabytes, 8000 by default.");
1131 SUBREADputs("");
1132 SUBREADputs(" -f <int> specify the threshold for removing uninformative subreads");
1133 SUBREADputs(" (highly repetitive 16mers in the reference). 100 by default.");
1134 SUBREADputs("");
1135 SUBREADputs(" -c build a color-space index.");
1136 SUBREADputs("");
1137 SUBREADputs(" -v output version of the program.");
1138 SUBREADputs("");
1139 SUBREADputs("For more information about these arguments, please refer to the User Manual.\n");
1140
1141 return -1 ;
1142 }
1143
1144 if(threshold < 5) {
1145 SUBREADprintf("ERROR: The threshold of non-informative reads cannot be less than 5.\n");
1146 return -1;
1147 }
1148
1149 // **********************************************
1150 // print config summary
1151 // **********************************************
1152
1153 print_subread_logo();
1154 size_t free_mem=-1, total_mem=-1;
1155 if(get_free_total_mem( &total_mem, &free_mem )){
1156 free_mem=-1;
1157 total_mem=-1;
1158 }
1159
1160 SUBREADputs("");
1161 print_in_box(80, 1, 1, "setting");
1162 print_in_box(80, 0, 1, "");
1163 print_in_box(80, 0, 0, " Index name : %s", get_short_fname(output_file));
1164 print_in_box(80, 0, 0, " Index space : %s", IS_COLOR_SPACE?"color space":"base space");
1165
1166 if(IS_FORCED_ONE_BLOCK)
1167 {
1168 print_in_box(80, 0, 0, " Index split : no-split");
1169 memory_limit = GENE_SLIDING_STEP==1?22000:11500;
1170 }
1171 else
1172 {
1173 if(memory_limit > 12000 && GENE_SLIDING_STEP>2)
1174 {
1175 print_in_box(80, 0, 0, " Memory : %u -> %u Mbytes", memory_limit, 12000);
1176 memory_limit = 12000;
1177 }
1178 else
1179 print_in_box(80, 0, 0, " Memory : %u Mbytes", memory_limit);
1180 }
1181 print_in_box(80, 0, 0, " Repeat threshold : %d repeats", threshold);
1182 print_in_box(80, 0, 0, " Gapped index : %s", GENE_SLIDING_STEP>1?"yes":"no");
1183 print_in_box(80, 0, 0, "");
1184 if(free_mem>0)print_in_box(80, 0, 0, " Free / total memory : %.1fGB / %.1fGB", free_mem*1./1024/1024/1024, total_mem*1./1024/1024/1024);
1185 print_in_box(80, 0, 0, "");
1186 print_in_box(80, 0, 0, " Input files : %d file%s in total", argc - optind, (argc - optind>1)?"s":"");
1187
1188 int x1;
1189 for(x1=0;x1< argc - optind; x1++)
1190 {
1191 char * fasta_fn = *(argv+optind+x1);
1192 int f_type = probe_file_type_fast(fasta_fn);
1193 char o_char = '?';
1194 if(f_type == FILE_TYPE_FASTA){
1195 o_char = 'o';
1196 }
1197 if(f_type == FILE_TYPE_GZIP_FASTA){
1198 o_char = 'o';
1199 }
1200 print_in_box(94, 0, 0, " %c[32m%c%c[36m %s%c[0m", CHAR_ESC, o_char, CHAR_ESC, get_short_fname(fasta_fn) , CHAR_ESC);
1201 }
1202 print_in_box(80, 0, 0, "");
1203
1204 #ifndef __MINGW32__
1205 if(free_mem>0 && free_mem < 3*1024ll*1024*1024){
1206 print_in_box(80, 0, 0, "");
1207 print_in_box( 80, 0, 0, " WARNING: the free memory is lower than 3.0GB." );
1208 print_in_box( 80, 0, 0, " the program may run very slow or crash." );
1209 print_in_box(80, 0, 0, "");
1210 }
1211 #endif
1212
1213 print_in_box(80, 2, 1, "");
1214 SUBREADputs("");
1215
1216 print_in_box(80, 1, 1, "Running");
1217 print_in_box(80, 0, 0, "");
1218
1219 for(x1=0;x1< argc - optind; x1++)
1220 {
1221 char * fasta_fn = *(argv+optind+x1);
1222 int f_type = probe_file_type_fast(fasta_fn);
1223 if(f_type != FILE_TYPE_GZIP_FASTA && f_type != FILE_TYPE_FASTA && f_type != FILE_TYPE_NONEXIST){
1224 SUBREADprintf("ERROR: '%s' is not a Fasta file.\n", fasta_fn);
1225 STANDALONE_exit(-1);
1226 }
1227 }
1228
1229 for(x1 = strlen(output_file); x1 >=0; x1--){
1230 if(output_file[x1]=='/'){
1231 memcpy(tmp_fa_file, output_file, x1);
1232 tmp_fa_file[x1]=0;
1233 break;
1234 }
1235 }
1236 if(tmp_fa_file[0]==0)strcpy(tmp_fa_file, "./");
1237
1238 #ifdef __MINGW32__
1239 sprintf(tmp_fa_file+strlen(tmp_fa_file), "/subread-index-sam-%06u-%06d", getpid(),(int)(time(NULL) % 1000000));
1240 #else
1241 sprintf(tmp_fa_file+strlen(tmp_fa_file), "/subread-index-sam-%06u-XXXXXX", getpid());
1242 int tmpfdd = mkstemp(tmp_fa_file);
1243 if(tmpfdd == -1){
1244 SUBREADprintf("ERROR: cannot create temp file\n");
1245 return -1;
1246 }
1247 #endif
1248
1249 sprintf(log_file_name, "%s.log", output_file);
1250 FILE * log_fp = f_subr_open(log_file_name,"wb");
1251
1252 signal (SIGTERM, SIGINT_hook);
1253 signal (SIGINT, SIGINT_hook);
1254
1255
1256 int ret = check_and_convert_FastA(argv+optind , argc - optind, tmp_fa_file, &chromosome_lengths, log_fp, log_file_name);
1257 if(log_fp)
1258 fclose(log_fp);
1259
1260 if(!ret)
1261 {
1262 long long actual_bases=0;
1263 HashTable * huge_table;
1264 huge_table = HashTableCreate(200000);
1265 unsigned int expected_hash_items = (unsigned int)(memory_limit * 1024.0 / 8.) * 1024 ;
1266 unsigned int * bucket_sizes = NULL, total_tables=0;
1267 unsigned int bucket_no = calculate_buckets_by_size(expected_hash_items, SUBINDEX_VER2, 0, GENE_SLIDING_STEP);
1268 ret = ret || scan_gene_index(output_file, ptr_tmp_fa_file , 1, threshold, huge_table, &actual_bases);
1269 ret = ret || build_gene_index(output_file, ptr_tmp_fa_file , 1, threshold, huge_table, chromosome_lengths, actual_bases, 1, &bucket_sizes, expected_hash_items, bucket_no, &total_tables);
1270
1271 long long estm_need_memory = estimate_memory_peak( bucket_sizes, bucket_no, total_tables );
1272 long long needed_mem = estm_need_memory + huge_table->numOfElements * sizeof(KeyValuePair) + 560ll*1024*1024 + actual_bases / ((GENE_SLIDING_STEP > 1 ) ? 7:5);
1273
1274 if(free_mem>0 && free_mem < needed_mem){
1275 print_in_box(80, 0, 1, "");
1276 print_in_box(80, 0, 1, "WARNING: available memory is lower than %.1f GB." ,needed_mem*1./1024l/1024/1024);
1277 print_in_box(80, 0, 1, " The program may run very slow.");
1278 print_in_box(80, 0, 1, "Build a gapped index and/or split index into blocks to reduce memory use.");
1279 print_in_box(80, 0, 1, "");
1280 }else
1281 print_in_box(80, 0, 0, "%.1f GB of memory is needed for index building." ,needed_mem*1./1024l/1024/1024);
1282
1283
1284 ret = ret || build_gene_index(output_file, ptr_tmp_fa_file , 1, threshold, huge_table, chromosome_lengths, actual_bases, 0, &bucket_sizes, expected_hash_items, bucket_no, &total_tables);
1285
1286 if(!ret){
1287 print_in_box(80, 0, 1, "Total running time: %.1f minutes.", (miltime()-begin00_ftime)/60);
1288 print_in_box(89, 0, 1, "Index %c[36m%s%c[0m was successfully built.", CHAR_ESC, output_file, CHAR_ESC);
1289 }
1290 HashTableDestroy(huge_table);
1291 free(chromosome_lengths);
1292 free(bucket_sizes);
1293 }
1294
1295 unlink(tmp_fa_file);
1296 tmp_fa_file[0]=0;
1297
1298 print_in_box(80, 0, 0, "");
1299 print_in_box(80, 2, 1, "");
1300 SUBREADputs("");
1301 return ret;
1302 }
1303
1304