1 /***************************************************************
2 
3    The Subread and Rsubread software packages are free
4    software packages:
5 
6    you can redistribute it and/or modify it under the terms
7    of the GNU General Public License as published by the
8    Free Software Foundation, either version 3 of the License,
9    or (at your option) any later version.
10 
11    Subread is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty
13    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
14 
15    See the GNU General Public License for more details.
16 
17    Authors: Drs Yang Liao and Wei Shi
18 
19   ***************************************************************/
20 
21 
22 #include <stdio.h>
23 #include <ctype.h>
24 #include <string.h>
25 #include <getopt.h>
26 #include <signal.h>
27 #include <unistd.h>
28 #include <time.h>
29 #include <sys/types.h>
30 #include "hashtable.h"
31 #include "gene-value-index.h"
32 #include "HelperFunctions.h"
33 #include "gene-algorithms.h"
34 #include "sorted-hashtable.h"
35 #include "seek-zlib.h"
36 #include "hashtable.h"
37 #include "input-files.h"
38 
39 #define NO_GENE_DEBUG_
40 #define _GENE_DEBUG_SIZE_ 40000
41 #define MIN_READ_SPLICING 2000000
42 
43 
44 #define MAX_KEY_MATCH GENE_VOTE_SPACE
45 int GENE_SLIDING_STEP = 3;
46 int IS_COLOR_SPACE = 0;
47 int MARK_NONINFORMATIVE_SUBREADS = 0;
48 int IS_FORCED_ONE_BLOCK = 0;
49 int ignore_bar_in_seqnames = 0;
50 double begin00_ftime = 0.0;
51 
52 #define NEXT_READ 1
53 #define NEXT_FILE 2
54 #define FULL_PARTITION 4
55 
estimate_memory_peak(unsigned int * bucket_sizes,unsigned int bucket_no,unsigned int tables)56 long long estimate_memory_peak(unsigned int * bucket_sizes, unsigned int bucket_no, unsigned int tables ){
57 	long long peakuse = 0;
58 	int tn;
59 	for(tn=0; tn<tables; tn++){
60 		long long this_peak = 0;
61 		unsigned int bn;
62 		for(bn=0; bn<bucket_no; bn++)
63 			this_peak += bucket_sizes[bn + tn*bucket_no];
64 		peakuse = max(peakuse, this_peak);
65 	}
66 	//SUBREADprintf("EST MEMORY PEAK=%lld, bucks=%u, tabs=%u\n", peakuse, bucket_no, tables);
67 	return peakuse *(sizeof(short)+sizeof(gehash_data_t)) + sizeof(struct gehash_bucket)*bucket_no + sizeof(int) * tables*bucket_no;
68 }
69 
print_build_log(double finished_rate,double read_per_second,double expected_seconds,unsigned long long int total_reads)70 void print_build_log(double finished_rate, double read_per_second, double expected_seconds, unsigned long long int total_reads)
71 {
72 	print_in_box( 81,0,0,"%4d%%%%, %3d mins elapsed, rate=%.1fk bps/s", (int)(finished_rate*100), (int)(miltime()-begin_ftime)/60, read_per_second/1000 );
73 }
74 
75 
76 #define MAX_BASES_IN_INDEX 4294900000.0
77 
build_gene_index(const char index_prefix[],char ** chro_files,int chro_file_number,int threshold,HashTable * huge_table,unsigned int * chro_lens,long long actual_bases,int for_measure_buckets,unsigned int ** bucket_sizes,unsigned int expected_hash_items,unsigned int bucket_no,unsigned int * total_tables)78 int build_gene_index(const char index_prefix [], char ** chro_files, int chro_file_number, int threshold, HashTable * huge_table, unsigned int * chro_lens, long long actual_bases, int for_measure_buckets, unsigned int ** bucket_sizes, unsigned int expected_hash_items, unsigned int bucket_no, unsigned int * total_tables){
79 	int file_number, table_no, status = NEXT_FILE;
80 	unsigned int offset, read_no, current_tab_items;
81 	long long int all_bases = guess_gene_bases(chro_files,chro_file_number);
82 
83 	int chro_table_maxsize=100, dump_res = 0;
84 	unsigned int * read_offsets = malloc(sizeof(unsigned int) * chro_table_maxsize);
85 	char * read_names = malloc(MAX_READ_NAME_LEN * chro_table_maxsize);
86 	gehash_t table;
87 //	gehash_t huge_table;
88 	gene_value_index_t value_array_index;
89 
90 	gene_input_t * ginp = malloc(sizeof(gene_input_t));
91 	begin_ftime = miltime();
92 
93 //	SUBREADprintf ("Index items per partition = %u\n\n", expected_hash_items);
94 
95 	if (chro_file_number > 199)
96 	{
97 		SUBREADprintf("ERROR: There are too many FASTA files. You may merge them into one FASTA file.\n");
98 		return -1;
99 	}
100 
101 	if (strlen (index_prefix) > 290)
102 	{
103 		SUBREADprintf("ERROR: The path is too long. It should not be longer than 290 chars.\n");
104 		return -1;
105 	}
106 
107 	if(all_bases<1)
108 	{
109 		SUBREADprintf("ERROR: File '%s' is inaccessible.\n", chro_files[-all_bases-1]);
110 		return -1;
111 	}
112 
113 	current_tab_items = 0;
114 	table_no = 0;
115 	int padding_around_contigs = MAX_READ_LENGTH;
116 	if(0 == for_measure_buckets){
117 		if(gehash_create_ex(& table, expected_hash_items, 0, SUBINDEX_VER2, GENE_SLIDING_STEP, padding_around_contigs)) return 1;
118 		if(gvindex_init(&value_array_index, 0, (unsigned int) actual_bases))return 1;
119 	}
120 	if(1==for_measure_buckets){
121 		*bucket_sizes = realloc(*bucket_sizes, sizeof(int) * (table_no+1) * bucket_no);
122 		memset((*bucket_sizes) + table_no * bucket_no, 0, sizeof(int) *  bucket_no);
123 		memset(&table,0 , sizeof(gehash_t));
124 		memset(&value_array_index, 0, sizeof(gene_value_index_t));
125 		table.padding = padding_around_contigs;
126 		table.index_gap = GENE_SLIDING_STEP;
127 	}
128 
129 	file_number = 0;
130 	offset = table.padding;
131 	read_no = 0;
132 
133 	char * fn = malloc(3100);
134 	memset(read_offsets,0, chro_table_maxsize*sizeof(int));
135 	sprintf(fn, "%s.files", index_prefix);
136 	unlink(fn);
137 
138 	status = NEXT_FILE;
139 
140 	if(1==for_measure_buckets)
141 		print_in_box(80,0,0,"Estimate the index size...");
142 	else
143 		print_in_box(80,0,0,"Build the index...");
144 
145 	{
146 		char window [16], last_color_base=-1, last_last_color_base=-1;
147 		int i, read_len = 0;
148 		unsigned int int_key = 0, array_int_key = 0;
149 		int skips=0, all_skips = 0;
150 
151 		//Pre-fill
152 		while(1)
153 		{
154 			//Subread Cycle
155 			char next_char;
156 			if (status == NEXT_FILE)
157 			{
158 				if(file_number == chro_file_number)
159 				{
160 					FILE * fp;
161 
162 					geinput_close(ginp);
163 
164 					//SUBREADprintf ("Processing chromosome files ...\n");
165 
166 					if(0==for_measure_buckets){
167 						sprintf (fn, "%s.%02d.%c.tab", index_prefix, table_no, IS_COLOR_SPACE?'c':'b');
168 						SUBREADfflush(stdout);
169 
170 						if(!dump_res)dump_res |= gehash_dump(&table, fn);
171 
172 						sprintf (fn, "%s.%02d.%c.array", index_prefix, table_no, IS_COLOR_SPACE?'c':'b');
173 						if(!dump_res)dump_res |= gvindex_dump(&value_array_index, fn);
174 						gvindex_destory(&value_array_index) ;
175 
176 						gehash_destory(&table);
177 					}
178 
179 					read_offsets[read_no-1] = offset + table.padding;
180 
181 					for(i=table_no+1; i<100; i++)
182 					{
183 						sprintf(fn, "%s.%02d.%c.tab", index_prefix, i, IS_COLOR_SPACE?'c':'b');
184 						unlink(fn);
185 						sprintf(fn, "%s.%02d.%c.array", index_prefix, i, IS_COLOR_SPACE?'c':'b');
186 						unlink(fn);
187 					}
188 
189 					sprintf (fn, "%s.reads", index_prefix);
190 					fp = f_subr_open(fn, "wb");
191 					for (i=0; i<read_no; i++)
192 						fprintf(fp, "%u\t%s\n", read_offsets[i], read_names+i*MAX_READ_NAME_LEN);
193 
194 					fclose(fp);
195 
196 					break;
197 				}
198 				else
199 				{
200 					if (file_number)
201 						geinput_close(ginp);
202 					geinput_open(chro_files[file_number++], ginp);
203 					status = NEXT_READ;
204 				}
205 			}
206 			if (status == NEXT_READ)
207 			{
208 
209 				geinput_readline(ginp, fn, 0);
210 
211 				if(read_no>0){
212 					read_offsets[read_no-1] = offset + table.padding;
213 					offset += 2*table.padding;
214 				}
215 
216 				//printf("TTTXT FN=%s\n",fn);
217 
218 				for(i=0;(fn[i+1] != ' ' && fn[i+1] != '\0' && fn[i+1] != '\t' && i<MAX_CHROMOSOME_NAME_LEN - 1 && ( fn[i+1] != '|'  || !ignore_bar_in_seqnames )); i++)
219 					*(read_names + MAX_READ_NAME_LEN*read_no + i) = fn[i+1];
220 
221 				*(read_names + MAX_READ_NAME_LEN*read_no + i) = 0;
222 
223 				sprintf(fn, "%s.files", index_prefix);
224 				FILE * fname_fp = f_subr_open(fn, "a");
225 				fprintf(fname_fp, "%s\t%s\t%ld\n", read_names+read_no*MAX_READ_NAME_LEN, ginp -> filename, ftell(ginp -> input_fp));
226 				fclose(fname_fp);
227 
228 				for (i=0; i<16; i++)
229 				{
230 					char nch = geinput_next_char(ginp);
231 					if (nch == 'N') skips = 16;
232 					else if (skips>0) skips--;
233 					window[i] = nch;
234 				}
235 
236 				read_len = 16;
237 				read_no ++;
238 
239 				if(read_no >= chro_table_maxsize)
240 				{
241 					read_offsets = realloc(read_offsets, 2* chro_table_maxsize * sizeof(unsigned int));
242 					read_names = realloc(read_names, 2* chro_table_maxsize * MAX_READ_NAME_LEN);
243 					chro_table_maxsize *= 2;
244 				}
245 
246 				if(IS_COLOR_SPACE)
247 				{
248 					int_key = genekey2color('A',window);
249 					array_int_key = genekey2int(window,GENE_SPACE_BASE);
250 					last_last_color_base = -1;
251 					last_color_base = window[15];
252 				}
253 				else
254 					array_int_key = int_key = genekey2int(window,GENE_SPACE_BASE);
255 
256 			}
257 			if (status == FULL_PARTITION)
258 			{
259 				int seek_back_reads ;
260 
261 				if (read_len < MIN_READ_SPLICING)
262 				{
263 					seek_back_reads = read_len;
264 					offset -= seek_back_reads;
265 					offset += 16;
266 				}
267 				else
268 				{
269 					seek_back_reads = MIN_READ_SPLICING - 10;
270 					seek_back_reads -= seek_back_reads%3;
271 					seek_back_reads += 1;
272 					offset -= seek_back_reads;
273 					offset += 16;
274 				}
275 
276 				if(0==for_measure_buckets){
277 					sprintf(fn, "%s.%02d.%c.tab", index_prefix, table_no, IS_COLOR_SPACE?'c':'b');
278 					SUBREADfflush(stdout);
279 
280 					if(!dump_res)dump_res |= gehash_dump(&table, fn);
281 					sprintf(fn, "%s.%02d.%c.array", index_prefix, table_no, IS_COLOR_SPACE?'c':'b');
282 					if(!dump_res)dump_res |= gvindex_dump(&value_array_index, fn);
283 				}
284 
285 				table_no ++;
286 
287 				if(0==for_measure_buckets){
288 					gehash_destory(&table);
289 					gvindex_destory(&value_array_index);
290 				}
291 
292 				read_len -= seek_back_reads;
293 				read_len += 16;
294 
295 				while(seek_back_reads)
296 				{
297 					fseek(ginp -> input_fp, -1, SEEK_CUR);
298 					char bnch = fgetc(ginp -> input_fp);
299 					if ((bnch >='A' && bnch <= 'Z' ) || (bnch >='a' && bnch <= 'z' ) || bnch == '-' || bnch == 'N' || bnch=='.')seek_back_reads--;
300 					fseek(ginp -> input_fp, -1, SEEK_CUR);
301 				}
302 
303 				for (i=0; i<16; i++)
304 				{
305 					char nch = geinput_next_char(ginp);
306 					if (nch == 'N' ) skips = 16;
307 					else if (skips>0) skips--;
308 					window[i] = nch;
309 				}
310 
311 				if(IS_COLOR_SPACE)
312 				{
313 					int_key = genekey2color('A',window);
314 					array_int_key = genekey2int(window,GENE_SPACE_BASE);
315 					last_last_color_base = -1;
316 					last_color_base = window[15];
317 				}
318 				else
319 					array_int_key = int_key = genekey2int(window, GENE_SPACE_BASE);
320 
321 				current_tab_items = 0;
322 				if(0==for_measure_buckets){
323 					if(gehash_create_ex(&table, expected_hash_items, 0, SUBINDEX_VER2, GENE_SLIDING_STEP, padding_around_contigs)) return 1;
324 					if(gvindex_init(&value_array_index, offset, (unsigned int) actual_bases)) return 1;
325 				}
326 
327 				if(1==for_measure_buckets){
328 					//unsigned int * tmpp = *bucket_sizes;
329 					*bucket_sizes = realloc(*bucket_sizes, sizeof(int) * (table_no+1) * bucket_no);
330 					assert(NULL!= *bucket_sizes);
331 					memset((*bucket_sizes) + table_no * bucket_no, 0,   sizeof(int) * bucket_no);
332 				}
333 			}
334 
335 			status = 0;
336 
337 			if(skips || (IS_COLOR_SPACE && last_last_color_base<0))
338 				all_skips ++;
339 			else
340 			{
341 				int is_no_info = HashTableGet(huge_table, NULL+1+int_key)-NULL;
342 				if(is_no_info){
343 					//	SUBREADprintf("NOINFO KEY=%u AT %u\n", int_key, offset);
344 				}else {
345 					if(0 == for_measure_buckets)if(gehash_insert(&table, int_key, offset - (IS_COLOR_SPACE?1:0), (*bucket_sizes)+ table_no * bucket_no))return 1;
346 					if(1 == for_measure_buckets) gehash_try_insert_measure((*bucket_sizes)+ table_no * bucket_no , bucket_no, int_key);
347 					current_tab_items++;
348 				}
349 				if( 0 == for_measure_buckets )
350 					gvindex_set(&value_array_index, offset - (IS_COLOR_SPACE?0:0), array_int_key, padding_around_contigs);
351 			}
352 
353 			if((!IS_FORCED_ONE_BLOCK) && current_tab_items >= expected_hash_items && (read_len > MIN_READ_SPLICING || read_len < 32))
354 			{
355 				status = FULL_PARTITION;
356 				continue;
357 			}
358 
359 			for (i=0; i<GENE_SLIDING_STEP; i++)
360 			{
361 				next_char = geinput_next_char(ginp);
362 				if(next_char < 0) {
363 					if( 0 == for_measure_buckets ) gvindex_set(&value_array_index, offset - (IS_COLOR_SPACE?0:0), array_int_key, padding_around_contigs);
364 
365 					if (next_char == -1) status = NEXT_READ;
366 					if (next_char == -2) status = NEXT_FILE;
367 					if (next_char == -3) return 0;
368 					break;
369 				}
370 				//SUBREADprintf("NEXT_CH=%c\n", next_char);
371 
372 				if (next_char == 'N' )skips = 16;
373 				else if (skips>0){
374 					skips--;
375 					last_color_base = -1;
376 				}
377 
378 				int_key = int_key << 2;
379 
380 
381 				if (IS_COLOR_SPACE)
382 				{
383 					if(last_color_base>0)
384 						int_key += chars2color(last_color_base, next_char);
385 					array_int_key = array_int_key << 2;
386 					array_int_key += base2int (next_char);
387 
388 					last_last_color_base = last_color_base;
389 					last_color_base = next_char;
390 				}
391 				else
392 				{
393 					int_key += base2int (next_char);
394 					array_int_key = int_key;
395 				}
396 
397 				if(actual_bases > 12){
398 					if (offset>0 && offset % (actual_bases / 12) == 0)
399 					{
400 						double finished_rate = offset*1.0/actual_bases;
401 						double base_per_second = offset / (miltime()-begin_ftime);
402 						double ETA = (actual_bases-offset) / base_per_second;
403 						print_build_log(finished_rate,base_per_second,ETA, actual_bases);
404 						SUBREADfflush(stdout) ;
405 					}
406 				}
407 
408 				offset ++;
409 				read_len ++;
410 
411 				if(offset > 0xFFFFFFFDU)
412 				{
413 					SUBREADprintf("ERROR: the provided reference sequences include more than 4 billion bases.\n") ;
414 					return -1;
415 				}
416 
417 			}
418 
419 		}
420 	}
421 	free(read_names);
422 	free(read_offsets);
423 	if(dump_res){
424 		SUBREADprintf("No index was built.\n");
425 		sprintf(fn, "%s.files", index_prefix);
426 		unlink(fn);
427 		sprintf(fn, "%s.reads", index_prefix);
428 		unlink(fn);
429 		int index_i;
430 		for(index_i = 0; index_i <= 99; index_i++){
431 			sprintf(fn, "%s.%02d.b.tab", index_prefix, index_i);
432 			unlink(fn);
433 			sprintf(fn, "%s.%02d.c.tab", index_prefix, index_i);
434 			unlink(fn);
435 			sprintf(fn, "%s.%02d.b.array", index_prefix, index_i);
436 			unlink(fn);
437 			sprintf(fn, "%s.%02d.c.array", index_prefix, index_i);
438 			unlink(fn);
439 		}
440 	}
441 	free(fn);
442 	free(ginp);
443 	*total_tables = table_no+1;
444 	return 0;
445 }
446 
add_repeated_subread(gehash_t * tab,unsigned int subr,unsigned char ** huge_index)447 int add_repeated_subread(gehash_t * tab , unsigned int subr, unsigned char ** huge_index)
448 {
449 	int huge_byte = (subr>>1);
450 	int huge_offset = (subr % 2) * 4;
451 	int ind_id = (huge_byte >> 24) & 127;
452 
453 	unsigned int byte_value = huge_index[ ind_id ][huge_byte&0xffffff] ;
454 
455 	int huge_value = (byte_value>> huge_offset) & 0xf;
456 	if(huge_value <15){
457 		huge_value ++;
458 		byte_value = (byte_value & (~(0xf << huge_offset))) | (huge_value << huge_offset);
459 		huge_index[ ind_id ][huge_byte&0xffffff] = (unsigned char)(byte_value&0xff);
460 		return 0;
461 	}
462 	unsigned int times = 0;
463 	int matched = gehash_get(tab, subr, &times, 1);
464 	if(matched)
465 		gehash_update(tab, subr, times+1);
466 	else
467 		if(gehash_insert(tab, subr, 16, NULL)) return 1;
468 
469 	return 0;
470 }
471 
scan_gene_index(const char index_prefix[],char ** chro_files,int chro_file_number,int threshold,HashTable * huge_table,long long * actual_total_bases_inc_marging)472 int scan_gene_index(const char index_prefix [], char ** chro_files, int chro_file_number, int threshold, HashTable *huge_table, long long * actual_total_bases_inc_marging)
473 {
474 	int file_number, i ,j;
475 	int status = NEXT_FILE;
476 	unsigned int offset, read_no;
477 	long long int guess_all_bases = guess_gene_bases(chro_files,chro_file_number);
478 	int padding_around_contigs = MAX_READ_LENGTH;
479 	*actual_total_bases_inc_marging = padding_around_contigs;
480 
481 	gehash_t occurrence_table;
482 	unsigned char * huge_index[128];
483 
484 	for(i=0;i<128;i++) {
485 		huge_index[i] = calloc(1024*1024*16,1);
486 		if(NULL == huge_index[i]){
487 			SUBREADprintf("ERROR: No memory can be allocated.\n");
488 			return -1;
489 		}
490 	}
491 
492 	if(gehash_create_ex(&occurrence_table, 500000, 0, SUBINDEX_VER0, 1, 0)) return 1;
493 
494 	gene_input_t * ginp = malloc(sizeof(gene_input_t));
495 
496 	print_in_box(80,0,0,"Scan uninformative subreads in reference sequences ...");
497 
498 	if (chro_file_number > 199)
499 	{
500 		SUBREADprintf("ERROR: There are too many FASTA files. You may merge them into one FASTA file.\n");
501 		return -1;
502 	}
503 
504 	if (strlen (index_prefix) > 290)
505 	{
506 		SUBREADprintf("ERROR: The path is too long. It should not be longer than 290 chars.\n");
507 		return -1;
508 	}
509 	if(guess_all_bases<1)
510 	{
511 		SUBREADprintf("ERROR: File '%s' is inaccessible.\n", chro_files[-guess_all_bases-1]);
512 		return -1;
513 	}
514 
515 
516 	file_number = 0;
517 	offset = 0;
518 	read_no = 0;
519 
520 	char * fn = malloc(3100);
521 	sprintf(fn, "%s.files", index_prefix);
522 	unlink(fn);
523 
524 	status = NEXT_FILE;
525 
526 
527 	{
528 		char window [16], last_color_base=-1, last_last_color_base=-1;
529 		int i, read_len = 0;
530 		unsigned int int_key = 0, array_int_key = 0;
531 		int skips=0, all_skips = 0;
532 
533 		//Pre-fill
534 		while(1)
535 		{
536 			//Subread Cycle
537 			char next_char;
538 			//if(status > 0)SUBREADprintf("BDSSTATUS = %d\n", status);
539 
540 			if (status == NEXT_FILE)
541 			{
542 				(*actual_total_bases_inc_marging)+=padding_around_contigs;
543 				if(file_number == chro_file_number)
544 				{
545 					geinput_close(ginp);
546 
547 					break;
548 				}
549 				else
550 				{
551 					if (file_number)
552 						geinput_close(ginp);
553 					geinput_open(chro_files[file_number++], ginp);
554 					status = NEXT_READ;
555 				}
556 			}
557 			if (status == NEXT_READ)
558 			{
559 				(*actual_total_bases_inc_marging)+=2*padding_around_contigs;
560 
561 				geinput_readline(ginp, fn, 0);
562 				//SUBREADprintf("HEADER_SCAN '''%s'''\n", fn);
563 
564 				for (i=0; i<16; i++)
565 				{
566 					char nch = geinput_next_char(ginp);
567 					if (nch == 'N') skips = 16;
568 					else if (skips>0) skips--;
569 					window[i] = nch;
570 				}
571 				read_len = 16;
572 				read_no ++;
573 
574 				if(IS_COLOR_SPACE)
575 				{
576 					int_key = genekey2color('A',window);
577 					array_int_key = genekey2int(window,GENE_SPACE_BASE);
578 					last_last_color_base = -1;
579 					last_color_base = window[15];
580 				}
581 				else
582 					array_int_key = int_key = genekey2int(window,GENE_SPACE_BASE);
583 
584 			}
585 
586 			status = 0;
587 
588 			if(skips || (IS_COLOR_SPACE && last_last_color_base<0))
589 				all_skips ++;
590 			else
591 			{
592 				int rv = add_repeated_subread(&occurrence_table, int_key, huge_index);
593 				if(rv<0) {
594 					return -1;
595 				}
596 			}
597 
598 
599 			for (i=0; i<GENE_SLIDING_STEP; i++)
600 			{
601 				next_char = geinput_next_char(ginp);
602 				if(next_char < 0)
603 				{
604 					if (next_char == -1) status = NEXT_READ;
605 					if (next_char == -2) status = NEXT_FILE;
606 					if (next_char == -3) return 0;
607 					break;
608 				}
609 
610 				if (next_char == 'N' )skips = 16;
611 				else if (skips>0){
612 					skips--;
613 					last_color_base = -1;
614 				}
615 
616 				int_key = int_key << 2;
617 
618 				if (IS_COLOR_SPACE)
619 				{
620 					if(last_color_base>0)
621 						int_key += chars2color(last_color_base, next_char);
622 					array_int_key = array_int_key << 2;
623 					array_int_key += base2int (next_char);
624 
625 					last_last_color_base = last_color_base;
626 					last_color_base = next_char;
627 				}
628 				else
629 				{
630 					int_key += base2int (next_char);
631 					array_int_key = int_key;
632 				}
633 
634 				offset ++;
635 				(*actual_total_bases_inc_marging)++;
636 				read_len ++;
637 
638 				if(offset > 0xFFFFFFFDU)
639 				{
640 					SUBREADprintf("ERROR: the provided reference sequences include more than 4 billion bases.\n") ;
641 					return -1;
642 				}
643 
644 			}
645 
646 		}
647 	}
648 
649 
650 
651 	free(fn);
652 
653 	for (i=0; i<occurrence_table.buckets_number; i++)
654 	{
655 		struct gehash_bucket * current_bucket = &(occurrence_table.buckets[i]);
656 
657 		if(current_bucket -> current_items>=1)
658 		{
659 			for (j=0; j<current_bucket -> current_items; j++)
660 			{
661 				if(current_bucket -> item_values [j] > threshold)
662 				{
663 					HashTablePut(huge_table,NULL+1+ current_bucket -> item_keys[j], NULL+current_bucket -> item_values [j]);
664 				}
665 			}
666 		}
667 	}
668 
669 
670 	for(i=0;i<128;i++)
671 		if(huge_index[i])  free(huge_index[i]);
672 
673 	free(ginp);
674 	gehash_destory(&occurrence_table);
675 
676 	if(huge_table -> numOfElements)
677 	{
678 		print_in_box(80,0,0,"%llu uninformative subreads were found.", huge_table -> numOfElements);
679 		print_in_box(80,0,0,"These subreads were excluded from index building.");
680 	}
681 
682 	return 0;
683 }
684 
685 
rtrim(char * s)686 char *rtrim(char *s)
687 {
688 	char* back = s + strlen(s);
689 	while(isspace(*--back));
690 	*(back+1) = '\0';
691 	return s;
692 }
693 
694 typedef struct{
695 	char * filename;
696 	unsigned int line_no;
697 	int byte_in_line;
698 } format_check_context_t;
699 
700 int ERROR_FOUND_IN_FASTA = 0;
check_and_convert_warn(format_check_context_t * fcc,char * msg,FILE * log_fp)701 void check_and_convert_warn(format_check_context_t * fcc, char * msg, FILE * log_fp){
702 	ERROR_FOUND_IN_FASTA += 1;
703 	fprintf(log_fp, "A format issue is found in the %u-th line in file '%s': %s.\n", fcc->line_no, fcc->filename, msg);
704 }
check_and_convert_warnOLD(char * FN,long long int fpos_line_head,unsigned line_no,int line_pos,char * msg,FILE * log_fp)705 void check_and_convert_warnOLD(char * FN, long long int fpos_line_head, unsigned line_no, int line_pos, char * msg, FILE * log_fp)
706 {
707 #define CHAR_ESC 27
708 	int x1,brs=0;
709 	long long int back_search_ptr;
710 	char * line_buf = malloc(MAX_READ_LENGTH+1);
711 
712 	ERROR_FOUND_IN_FASTA += 1;
713 
714 	fprintf(log_fp,"\n");
715 
716 	//fprintf(log_fp,"%c[33m", CHAR_ESC);
717 	for(x1=0;x1<81;x1++)
718 		fprintf(log_fp,"=");
719 	fprintf(log_fp,"\n");
720 	//fprintf(log_fp,"%c[32m", CHAR_ESC);
721 	fprintf(log_fp,"Input file '%s':\n", FN);
722 	//fprintf(log_fp,"%c[31m", CHAR_ESC);
723 	fprintf(log_fp,"%s\n", msg);
724 	//fprintf(log_fp,"%c[33m", CHAR_ESC);
725 	for(x1=0;x1<81;x1++)
726 		fprintf(log_fp,".");
727 	fprintf(log_fp,"\n");
728 	//fprintf(log_fp,"%c[37m", CHAR_ESC);
729 
730 
731 	FILE * warn_fp = f_subr_open(FN, "r");
732 
733 	for(back_search_ptr = fpos_line_head - 1; back_search_ptr>=0; back_search_ptr--)
734 	{
735 		int nch;
736 		fseeko(warn_fp, back_search_ptr, SEEK_SET);
737 		nch = fgetc(warn_fp);
738 		if(nch == '\n') brs++;
739 		if(brs >2) break;
740 		fseeko(warn_fp, back_search_ptr, SEEK_SET);
741 	}
742 
743 	if(back_search_ptr<=0)brs++;
744 
745 	int print_line_no = line_no - brs + 1;
746 	while(1)
747 	{
748 		char * ret = fgets(line_buf, MAX_READ_LENGTH, warn_fp);
749 		if(!ret)break;
750 
751 		/*
752 		if(ftello(warn_fp) > fpos_line_head)
753 			fprintf(log_fp,"%c[9m%c[31m", CHAR_ESC, CHAR_ESC);
754 		else
755 			fprintf(log_fp,"%c[29m%c[37m", CHAR_ESC, CHAR_ESC);
756 		*/
757 		fprintf(log_fp," % 9d ", print_line_no++);
758 
759 		rtrim(line_buf);
760 
761 		fprintf(log_fp,"%s%s\n",line_buf,strlen(line_buf)<16?"              ":"");
762 		if(ftello(warn_fp) > fpos_line_head)
763 			break;
764 	}
765 	for(x1=0;x1<line_pos+11;x1++)
766 		fprintf(log_fp," ");
767 	fprintf(log_fp,"^\n");
768 
769 	//fprintf(log_fp,"%c[29m%c[37m", CHAR_ESC, CHAR_ESC);
770 	for(x1=0;x1<2;x1++)
771 	{
772 		char * ret = fgets(line_buf, MAX_READ_LENGTH, warn_fp);
773 		if(!ret)break;
774 		fprintf(log_fp," % 9d ", print_line_no++);
775 		fprintf(log_fp,"%s",line_buf);
776 	}
777 	fclose(warn_fp);
778 	//fprintf(log_fp,"%c[33m", CHAR_ESC);
779 	for(x1=0;x1<81;x1++)
780 		fprintf(log_fp,"=");
781 	fprintf(log_fp,"\n");
782 	fprintf(log_fp,"\n");
783 
784 	//fprintf(log_fp,"%c[0m", CHAR_ESC);
785 	free(line_buf);
786 }
787 
check_and_convert_FastA(char ** input_fas,int fa_number,char * out_fa,unsigned int ** chrom_lens,FILE * log_fp,char * log_fn)788 int check_and_convert_FastA(char ** input_fas, int fa_number, char * out_fa, unsigned int ** chrom_lens, FILE * log_fp, char * log_fn)
789 {
790 	int is_R_warnned = 0, is_repeated_chro= 0;
791 	char * line_buf = malloc(MAX_READ_LENGTH);
792 	char * read_head_buf = malloc(MAX_READ_LENGTH * 3);
793 	unsigned int inp_file_no, line_no;
794 	int written_chrs = 0, is_disk_full = 0;
795 	int chrom_lens_max_len = 100;
796 	int chrom_lens_len = 0;
797 	ERROR_FOUND_IN_FASTA = 0;
798 	FILE * out_fp = f_subr_open(out_fa,"wb");
799 	format_check_context_t fcc;
800 	memset(&fcc,0,sizeof(fcc));
801 
802 	if(!out_fp)
803 	{
804 		SUBREADprintf("ERROR: the output directory is not writable, but the index builder needs to create temporary files in the current directory. Please change the working directory and rerun the index builder.\n");
805 		return -1;
806 	}
807 
808 	(*chrom_lens) = malloc(chrom_lens_max_len*sizeof(unsigned int));
809 	memset((*chrom_lens), 0, chrom_lens_max_len*sizeof(unsigned int));
810 
811 
812 	HashTable * rep_name_table = HashTableCreate(9999);
813 	HashTableSetDeallocationFunctions(rep_name_table, free, NULL);
814 	HashTableSetKeyComparisonFunction(rep_name_table, (int (*) (const void *, const void *)) strcmp);
815 	HashTableSetHashFunction(rep_name_table , fc_chro_hash);
816 
817 	print_in_box( 80,0,0,"Check the integrity of provided reference sequences ...");
818 	for(inp_file_no = 0; inp_file_no < fa_number; inp_file_no++)
819 	{
820 		autozip_fp fafp;
821 		memset(&fafp, 0, sizeof(fafp));
822 		int faret = autozip_open(input_fas[inp_file_no], &fafp);
823 		fcc.filename = input_fas[inp_file_no];
824 
825 		if(faret <0) {
826 			SUBREADprintf("ERROR: Input file '%s' is not found or is not accessible. No index was built.\n", input_fas[inp_file_no]);
827 			HashTableDestroy(rep_name_table);
828 			return -1;
829 		}
830 
831 		line_no = 0;
832 		int is_head_written=0;
833 		read_head_buf[0]=0;
834 		while(1) {
835 			unsigned int read_len = 0;
836 			int ret = autozip_gets(&fafp, line_buf, MAX_READ_LENGTH);
837 			if(ret<1) break;
838 			if(ret >= MAX_READ_LENGTH-2){
839 				SUBREADprintf("ERROR: A fasta file cannot have a line longer than 1000 bytes. You need to split a very long line into many lines.\n");
840 				return -1;
841 			}
842 
843 			line_no ++;
844 			fcc.line_no = line_no;
845 			int line_buf_len = strlen(line_buf);
846 
847 			for(; line_buf[line_buf_len-1] == '\r' || line_buf[line_buf_len-1]=='\n' ;line_buf_len--)
848 			{
849 				if(line_buf[line_buf_len-1]=='\r')
850 				{
851 					if(!is_R_warnned)
852 					{
853 						is_R_warnned=1;
854 						fcc.byte_in_line = line_buf_len-1;
855 						check_and_convert_warn(&fcc ,"This line ends with '\\r\\n'. It is not a problem for building the index but we suggest to use Unix-styled line breaks.", log_fp);
856 					}
857 				}
858 				line_buf[line_buf_len-1] =0;
859 			}
860 			//if(strchr(line_buf,'>') || strchr(line_buf,'\r') || strchr(line_buf,'\n'))SUBREADprintf("TESTING: string contains '''%s'''  at %I64u\n", line_buf, ftello(fafp.plain_fp));
861 
862 			if(line_buf_len<1)
863 			{
864 				fcc.byte_in_line = -1;
865 				check_and_convert_warn(&fcc ,"This line is empty. This is not allowed in the FASTA file.", log_fp);
866 				continue;
867 			}
868 
869 			if(line_buf[0]=='>')
870 			{
871 				if(line_no>1 &&!is_head_written)
872 				{
873 					fcc.byte_in_line = 0;
874 					check_and_convert_warn(&fcc,"This sequence has less than 16 bases. It is ignored in the index because no subreads can be extracted.", log_fp);
875 				}
876 				is_head_written = 0;
877 				read_len = 0;
878 				read_head_buf[0]=0;
879 
880 				strcat(read_head_buf, line_buf);
881 				strcat(read_head_buf, "\n");
882 
883 				int chro_name_end=1;
884 				while(line_buf[chro_name_end])
885 				{
886 					if(line_buf[chro_name_end]==' '||line_buf[chro_name_end]=='|' || line_buf[chro_name_end]=='\t')break;
887 					chro_name_end++;
888 				}
889 
890 				line_buf[chro_name_end] = 0;
891 				int is_exist = HashTableGet(rep_name_table , line_buf+1) - NULL;
892 				if(is_exist)
893 				{
894 					SUBREADprintf("ERROR: repeated chromosome name '%s' is observed in the FASTA file(s).\nThe index was NOT built.\n", line_buf+1);
895 					is_repeated_chro=1;
896 					break;
897 				}
898 
899 				char * keymem = malloc(chro_name_end);
900 				strcpy(keymem, line_buf+1);
901 				HashTablePut(rep_name_table, keymem, NULL+1);
902 
903 			}
904 			else if(line_no<1)
905 			{
906 				fcc.byte_in_line=0;
907 				check_and_convert_warn(&fcc ,"This file is not started with a header line. It seems not to be a FASTA file.", log_fp);
908 			}
909 			else
910 			{
911 				int xk2;
912 				for(xk2=0; xk2 < line_buf_len; xk2++)
913 				{
914 					int nextch = line_buf[xk2];
915 					int lowerch = tolower(nextch);
916 					if(!( lowerch == 'a' || lowerch == 't' || lowerch == 'g' || lowerch == 'c' || nextch == '.' || nextch=='-' || lowerch=='n'))
917 					{
918 						fcc.byte_in_line=xk2;
919 						check_and_convert_warn(&fcc, "The non-ACGT base was converted to an 'A'.", log_fp);
920 						line_buf[xk2] = 'A';
921 					}
922 					else if(nextch == '.' || nextch=='-' ||  lowerch=='n')
923 						line_buf[xk2] = 'A';
924 					else
925 						line_buf[xk2] = toupper(nextch);
926 				}
927 
928 				read_len += line_buf_len;
929 
930 				if(read_len > 16 && !is_head_written)
931 				{
932 					fputs(read_head_buf, out_fp);
933 					written_chrs++;
934 					is_head_written = 1;
935 
936 					chrom_lens_len++;
937 					if((chrom_lens_max_len-1) <= chrom_lens_len)
938 					{
939 						(*chrom_lens) = realloc((*chrom_lens), 2*chrom_lens_max_len*sizeof(unsigned int));
940 						chrom_lens_max_len*=2;
941 					}
942 				}
943 
944 				if(is_head_written)
945 				{
946 					int line_buf_len = strlen(line_buf);
947 					int writen_len = fprintf(out_fp,"%s\n", line_buf);
948 					if(writen_len < line_buf_len){
949 						SUBREADprintf("ERROR: unable to write into the temporary file. Please check the free space of the output directory.\n");
950 						is_disk_full = 1;
951 						break;
952 					}
953 					(*chrom_lens)[chrom_lens_len-1] = read_len;
954 					(*chrom_lens)[chrom_lens_len] = 0;
955 				}
956 				else
957 				{
958 					strcat(read_head_buf, line_buf);
959 					strcat(read_head_buf, "\n");
960 				}
961 
962 			}
963 		}
964 
965 		autozip_close(&fafp);
966 		if(is_disk_full) break;
967 	}
968 
969 
970 	HashTableDestroy(rep_name_table);
971 	free(line_buf);
972 	free(read_head_buf);
973 	fclose(out_fp);
974 
975 	if(!written_chrs){
976 		SUBREADprintf("ERROR: No index was built because there were no subreads extracted. A chromosome needs at least 16 bases to be indexed.");
977 		return 1;
978 	}
979 
980 	if(is_repeated_chro|| is_disk_full)
981 		return 1;
982 
983 	if(ERROR_FOUND_IN_FASTA)
984 	{
985 		print_in_box( 80,0,0,"There were %d notes for reference sequences.", ERROR_FOUND_IN_FASTA);
986 		print_in_box( 89,0,0,"The notes can be found in the log file, %c[36m'%s'%c[0m.", CHAR_ESC, log_fn, CHAR_ESC);
987 	}
988 	else	print_in_box( 80,0,0,"No format issues were found");
989 
990 	return 0;
991 }
992 
993 char * tmp_file_for_signal;
994 
SIGINT_hook(int param)995 void SIGINT_hook(int param)
996 {
997 	#ifdef MAKE_STANDALONE
998 	if(tmp_file_for_signal[0])
999 	{
1000 		unlink(tmp_file_for_signal);
1001 		SUBREADprintf("\n\nReceived a terminal signal. The temporary file was removed. The index was NOT built successfully. Please DO NOT use the new index until they are rebuilt.\n\n");
1002 	}
1003 
1004 	exit(param);
1005 	#endif
1006 }
1007 
1008 static struct option ib_long_options[]={
1009 	{0, 0, 0, 0}
1010 };
1011 
1012 #ifdef MAKE_STANDALONE
main(int argc,char ** argv)1013 int main(int argc,char ** argv)
1014 #else
1015 int main_buildindex(int argc,char ** argv)
1016 #endif
1017 {
1018 	int threshold = 100, optindex=0;
1019 	int memory_limit;	// 8000 MBytes
1020 	char output_file[MAX_FILE_NAME_LENGTH], c, tmp_fa_file[MAX_FILE_NAME_LENGTH], log_file_name[MAX_FILE_NAME_LENGTH+20];
1021 	char *ptr_tmp_fa_file[1];
1022 	unsigned int * chromosome_lengths;
1023 	begin00_ftime = miltime();
1024 
1025 	if(sizeof(char *)>4) memory_limit=8000;
1026 	else memory_limit=3000;
1027 
1028 	ptr_tmp_fa_file[0]=tmp_fa_file;
1029 	output_file[0] = 0;
1030 	tmp_fa_file[0] = 0;
1031 	tmp_file_for_signal = tmp_fa_file;
1032 
1033 	IS_FORCED_ONE_BLOCK = 0;
1034 	GENE_SLIDING_STEP = 3;
1035 	IS_COLOR_SPACE = 0;
1036 
1037 	SUBREADprintf("\n");
1038 
1039 	optind = 0;
1040 
1041 	while ((c = getopt_long (argc, argv, "kvcBFM:o:f:Db?", ib_long_options, &optindex)) != -1)
1042 		switch(c)
1043 		{
1044 			case 'b':
1045 				ignore_bar_in_seqnames= 1;
1046 				break;
1047 			case 'B':
1048 				IS_FORCED_ONE_BLOCK =1;
1049 				break;
1050 			case 'F':
1051 				GENE_SLIDING_STEP =1;
1052 				break;
1053 			case 'v':
1054 				core_version_number("Subread-buildindex");
1055 				return 0;
1056 			case 'c':
1057 				IS_COLOR_SPACE = 1;
1058 				break;
1059 			case 'M':
1060 				memory_limit = atoi(optarg);
1061 				break;
1062 			case 'f':
1063 				threshold = atoi(optarg);
1064 				break;
1065 			case 'o':
1066 				strncpy(output_file, optarg, MAX_FILE_NAME_LENGTH-1);
1067 				break;
1068 			case 'k':
1069 				if(memcmp(SUBREAD_VERSION , "1.3.",4)==0)
1070 				{
1071 					SUBREADprintf("The \"-k\" option is not supported in version 1.3.x. ");
1072 					return -1;
1073 				}
1074 				MARK_NONINFORMATIVE_SUBREADS = 1;
1075 				break;
1076 			case '?':
1077 				return -1 ;
1078 		}
1079 
1080 	if (argc == optind || !output_file[0])
1081 	{
1082 		SUBREADprintf("Version %s\n\n", SUBREAD_VERSION);
1083 
1084 	/*
1085 		SUBREADputs("Usage:");
1086 		SUBREADputs("");
1087 		SUBREADputs(" ./subread-buildindex [options] -o <basename> {FASTA file1} [FASTA file2] ...");
1088 		SUBREADputs("");
1089 		SUBREADputs("Required arguments:");
1090 		SUBREADputs("");
1091 		SUBREADputs("    -o <basename>   base name of the index to be created");
1092 		SUBREADputs("");
1093 		SUBREADputs("Optional arguments:");
1094 		SUBREADputs("");
1095 		SUBREADputs("    -G              build a gapped index for the reference genome. 16bp subreads");
1096 		SUBREADputs("");
1097 		SUBREADputs("    -M <int>        size of requested memory(RAM) in megabytes. Index is split into blocks if necessary.");
1098 		SUBREADputs("");
1099 		SUBREADputs("    -f <int>        specify the threshold for removing uninformative subreads");
1100 		SUBREADputs("                    (highly repetitive 16mers in the reference). 24 by default.");
1101 		SUBREADputs("");
1102 		SUBREADputs("    -c              build a color-space index.");
1103 		SUBREADputs("");
1104 		SUBREADputs("    -v              output version of the program.");
1105 		SUBREADputs("");
1106 		SUBREADputs("For more information about these arguments, please refer to the User Manual.\n");
1107 	*/
1108 		 SUBREADputs("Usage:");
1109 		 SUBREADputs("");
1110 		 SUBREADputs(" ./subread-buildindex [options] -o <basename> {FASTA[.gz] file1}\\");
1111 		 SUBREADputs("      [FASTA[.gz] file2] ...");
1112 		 SUBREADputs("");
1113 		 SUBREADputs("Required arguments:");
1114 		 SUBREADputs("");
1115 		 SUBREADputs("    -o <basename>   base name of the index to be created");
1116 		 SUBREADputs("");
1117 		 SUBREADputs("Optional arguments:");
1118 		 SUBREADputs("");
1119 		 SUBREADputs("    -F              build a full index for the reference genome. 16bp subreads");
1120 		 SUBREADputs("                    will be extracted from every position of the reference");
1121 		 SUBREADputs("                    genome. Size of the index is typically 3 times the size of");
1122 		 SUBREADputs("                    index built from using the default setting.");
1123 		 SUBREADputs("");
1124 		 SUBREADputs("    -B              create one block of index. The built index will not be split");
1125 		 SUBREADputs("                    into multiple pieces. This makes the largest amount of");
1126 		 SUBREADputs("                    memory be requested when running alignments, but it enables");
1127 		 SUBREADputs("                    the maximum mapping speed to be achieved. This option");
1128 		 SUBREADputs("                    overrides -M when it is provided as well.");
1129 		 SUBREADputs("");
1130 		 SUBREADputs("    -M <int>        size of requested memory(RAM) in megabytes, 8000 by default.");
1131 		 SUBREADputs("");
1132 		 SUBREADputs("    -f <int>        specify the threshold for removing uninformative subreads");
1133 		 SUBREADputs("                    (highly repetitive 16mers in the reference). 100 by default.");
1134 		 SUBREADputs("");
1135 		 SUBREADputs("    -c              build a color-space index.");
1136 		 SUBREADputs("");
1137 		 SUBREADputs("    -v              output version of the program.");
1138 		 SUBREADputs("");
1139 		 SUBREADputs("For more information about these arguments, please refer to the User Manual.\n");
1140 
1141 		return -1 ;
1142 	}
1143 
1144 	if(threshold < 5) {
1145 		SUBREADprintf("ERROR: The threshold of non-informative reads cannot be less than 5.\n");
1146 		return -1;
1147 	}
1148 
1149 	// **********************************************
1150 	//	print config summary
1151 	// **********************************************
1152 
1153 	print_subread_logo();
1154 	size_t free_mem=-1, total_mem=-1;
1155 	if(get_free_total_mem( &total_mem, &free_mem )){
1156 		free_mem=-1;
1157 		total_mem=-1;
1158 	}
1159 
1160 	SUBREADputs("");
1161 	print_in_box(80, 1, 1, "setting");
1162 	print_in_box(80, 0, 1, "");
1163 	print_in_box(80, 0, 0, "               Index name : %s", get_short_fname(output_file));
1164 	print_in_box(80, 0, 0, "              Index space : %s", IS_COLOR_SPACE?"color space":"base space");
1165 
1166 	if(IS_FORCED_ONE_BLOCK)
1167 	{
1168 		print_in_box(80, 0, 0, "              Index split : no-split");
1169 		memory_limit = GENE_SLIDING_STEP==1?22000:11500;
1170 	}
1171 	else
1172 	{
1173 		if(memory_limit > 12000 && GENE_SLIDING_STEP>2)
1174 		{
1175 			print_in_box(80, 0, 0, "                   Memory : %u -> %u Mbytes", memory_limit, 12000);
1176 			memory_limit = 12000;
1177 		}
1178 		else
1179 			print_in_box(80, 0, 0, "                   Memory : %u Mbytes", memory_limit);
1180 	}
1181 	print_in_box(80, 0, 0, "         Repeat threshold : %d repeats", threshold);
1182 	print_in_box(80, 0, 0, "             Gapped index : %s", GENE_SLIDING_STEP>1?"yes":"no");
1183 	print_in_box(80, 0, 0, "");
1184 	if(free_mem>0)print_in_box(80, 0, 0, "      Free / total memory : %.1fGB / %.1fGB", free_mem*1./1024/1024/1024, total_mem*1./1024/1024/1024);
1185 	print_in_box(80, 0, 0, "");
1186 	print_in_box(80, 0, 0, "              Input files : %d file%s in total",  argc - optind, (argc - optind>1)?"s":"");
1187 
1188 	int x1;
1189 	for(x1=0;x1< argc - optind; x1++)
1190 	{
1191 		char * fasta_fn = *(argv+optind+x1);
1192 		int f_type = probe_file_type_fast(fasta_fn);
1193 		char o_char = '?';
1194 		if(f_type == FILE_TYPE_FASTA){
1195 			o_char = 'o';
1196 		}
1197 		if(f_type == FILE_TYPE_GZIP_FASTA){
1198 			o_char = 'o';
1199 		}
1200 		print_in_box(94, 0, 0, "                            %c[32m%c%c[36m %s%c[0m", CHAR_ESC, o_char, CHAR_ESC,  get_short_fname(fasta_fn) , CHAR_ESC);
1201 	}
1202 	print_in_box(80, 0, 0, "");
1203 
1204 	#ifndef __MINGW32__
1205 	if(free_mem>0 && free_mem < 3*1024ll*1024*1024){
1206 		print_in_box(80, 0, 0, "");
1207 		print_in_box( 80, 0, 0, "  WARNING: the free memory is lower than 3.0GB." );
1208 		print_in_box( 80, 0, 0, "           the program may run very slow or crash." );
1209 		print_in_box(80, 0, 0, "");
1210 	}
1211 	#endif
1212 
1213 	print_in_box(80, 2, 1, "");
1214 	SUBREADputs("");
1215 
1216 	print_in_box(80, 1, 1, "Running");
1217 	print_in_box(80, 0, 0, "");
1218 
1219 	for(x1=0;x1< argc - optind; x1++)
1220 	{
1221 		char * fasta_fn = *(argv+optind+x1);
1222 		int f_type = probe_file_type_fast(fasta_fn);
1223 		if(f_type != FILE_TYPE_GZIP_FASTA && f_type != FILE_TYPE_FASTA && f_type != FILE_TYPE_NONEXIST){
1224 			SUBREADprintf("ERROR: '%s' is not a Fasta file.\n", fasta_fn);
1225 			STANDALONE_exit(-1);
1226 		}
1227 	}
1228 
1229 	for(x1 = strlen(output_file); x1 >=0; x1--){
1230 		if(output_file[x1]=='/'){
1231 			memcpy(tmp_fa_file, output_file, x1);
1232 			tmp_fa_file[x1]=0;
1233 			break;
1234 		}
1235 	}
1236 	if(tmp_fa_file[0]==0)strcpy(tmp_fa_file, "./");
1237 
1238 	#ifdef __MINGW32__
1239 	sprintf(tmp_fa_file+strlen(tmp_fa_file), "/subread-index-sam-%06u-%06d", getpid(),(int)(time(NULL) % 1000000));
1240 	#else
1241 	sprintf(tmp_fa_file+strlen(tmp_fa_file), "/subread-index-sam-%06u-XXXXXX", getpid());
1242 	int tmpfdd = mkstemp(tmp_fa_file);
1243 	if(tmpfdd == -1){
1244 		SUBREADprintf("ERROR: cannot create temp file\n");
1245 		return -1;
1246 	}
1247 	#endif
1248 
1249 	sprintf(log_file_name, "%s.log", output_file);
1250 	FILE * log_fp = f_subr_open(log_file_name,"wb");
1251 
1252 	signal (SIGTERM, SIGINT_hook);
1253 	signal (SIGINT, SIGINT_hook);
1254 
1255 
1256 	int ret = check_and_convert_FastA(argv+optind , argc - optind, tmp_fa_file, &chromosome_lengths, log_fp, log_file_name);
1257 	if(log_fp)
1258 		fclose(log_fp);
1259 
1260 	if(!ret)
1261 	{
1262 		long long actual_bases=0;
1263 		HashTable * huge_table;
1264 		huge_table = HashTableCreate(200000);
1265 		unsigned int expected_hash_items = (unsigned int)(memory_limit * 1024.0 / 8.) * 1024 ;
1266 		unsigned int * bucket_sizes = NULL, total_tables=0;
1267 		unsigned int bucket_no = calculate_buckets_by_size(expected_hash_items, SUBINDEX_VER2, 0, GENE_SLIDING_STEP);
1268 		ret = ret || scan_gene_index(output_file, ptr_tmp_fa_file , 1, threshold, huge_table, &actual_bases);
1269 		ret = ret || build_gene_index(output_file, ptr_tmp_fa_file , 1, threshold, huge_table, chromosome_lengths, actual_bases, 1, &bucket_sizes, expected_hash_items, bucket_no, &total_tables);
1270 
1271 		long long estm_need_memory = estimate_memory_peak( bucket_sizes, bucket_no, total_tables );
1272 		long long needed_mem =  estm_need_memory + huge_table->numOfElements * sizeof(KeyValuePair) + 560ll*1024*1024 + actual_bases / ((GENE_SLIDING_STEP > 1 ) ?  7:5);
1273 
1274 		if(free_mem>0 && free_mem < needed_mem){
1275 			print_in_box(80, 0, 1, "");
1276 			print_in_box(80, 0, 1, "WARNING: available memory is lower than %.1f GB." ,needed_mem*1./1024l/1024/1024);
1277 			print_in_box(80, 0, 1, "         The program may run very slow.");
1278 			print_in_box(80, 0, 1, "Build a gapped index and/or split index into blocks to reduce memory use.");
1279 			print_in_box(80, 0, 1, "");
1280 		}else
1281 			print_in_box(80, 0, 0, "%.1f GB of memory is needed for index building." ,needed_mem*1./1024l/1024/1024);
1282 
1283 
1284 		ret = ret || build_gene_index(output_file, ptr_tmp_fa_file , 1, threshold, huge_table, chromosome_lengths, actual_bases, 0, &bucket_sizes, expected_hash_items, bucket_no, &total_tables);
1285 
1286 		if(!ret){
1287 			print_in_box(80, 0, 1, "Total running time: %.1f minutes.", (miltime()-begin00_ftime)/60);
1288 			print_in_box(89, 0, 1, "Index %c[36m%s%c[0m was successfully built.", CHAR_ESC, output_file, CHAR_ESC);
1289 		}
1290 		HashTableDestroy(huge_table);
1291 		free(chromosome_lengths);
1292 		free(bucket_sizes);
1293 	}
1294 
1295 	unlink(tmp_fa_file);
1296 	tmp_fa_file[0]=0;
1297 
1298 	print_in_box(80, 0, 0, "");
1299 	print_in_box(80, 2, 1, "");
1300 	SUBREADputs("");
1301 	return ret;
1302 }
1303 
1304