1 #include <stdio.h>
2 #include <signal.h>
3 #include <dirent.h>
4 #include <string.h>
5 #include <stdlib.h>
6 #include <ctype.h>
7 #include <sys/types.h>
8 
9 #ifndef __MINGW32__
10 #include <sys/resource.h>
11 #endif
12 
13 #include <sys/stat.h>
14 #include <unistd.h>
15 #include <zlib.h>
16 #include <stdio.h>
17 #include <assert.h>
18 
19 #include "LRMfile-io.h"
20 #include "LRMsorted-hashtable.h"
21 
LRMgenekey2int(char key[])22 int LRMgenekey2int(char key []){
23 	int i;
24 	int ret;
25 
26 	ret = 0;
27 	for (i=0; i<16; i++)
28 		ret |= (LRMbase2int(key[i]))<<(2*(15-i));
29 
30 	return ret;
31 }
32 
LRMgeinput_open(const char * filename,LRMgene_input_t * input)33 int LRMgeinput_open(const char * filename, LRMgene_input_t * input){
34 	int ret = 0;
35 	if(strlen(filename)>LRMMAX_FILENAME_LENGTH-2)
36 		return 1;
37 
38 	strcpy(input->filename, filename);
39 	FILE * TMP_FP = fopen(filename, "rb");
40 
41 	if(TMP_FP == NULL)
42 		return 1;
43 
44 	int id1, id2;
45 	id1 = fgetc(TMP_FP);
46 	id2 = fgetc(TMP_FP);
47 
48 	if(id1 == 31 && id2 == 139) {
49 		fclose(TMP_FP);
50 		input->input_fp = malloc(sizeof(seekable_zfile_t));
51 		input->file_type = LRMGENE_INPUT_GZIP_FASTQ;
52 		ret = LRMseekgz_open(filename, input->input_fp);
53 	}else{
54 		input->file_type = LRMGENE_INPUT_FASTQ;
55 		input->input_fp = TMP_FP;
56 		fseek(input->input_fp, 0, SEEK_SET);
57 	}
58 
59 	return ret;
60 }
61 
LRMgeinput_close(LRMgene_input_t * input)62 void LRMgeinput_close(LRMgene_input_t * input){
63 		if(input -> file_type == LRMGENE_INPUT_GZIP_FASTQ)
64 		LRMseekgz_close((seekable_zfile_t * ) input->input_fp);
65 	else
66 		fclose((FILE*)input->input_fp);
67 }
68 
69 char * LRM__converting_char_table = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTNGNNNCNNNNNNNNNNNNAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTNGNNNCNNNNNNNNNNNNAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN  ";
70 
LRMreverse_read(char * InBuff,int read_len)71 void LRMreverse_read(char * InBuff, int read_len){
72 	int i;
73 
74 	for (i=0; i<read_len/2; i++)
75 	{
76 		int rll1 = read_len - 1 - i;
77 		unsigned char tmp = InBuff[rll1];
78 
79 		InBuff[rll1] = *(LRM__converting_char_table+InBuff[i]);
80 		InBuff[i] = *(LRM__converting_char_table+tmp);
81 
82 	}
83 	if(i*2 == read_len-1)
84 	{
85 		InBuff[i] = *(LRM__converting_char_table+InBuff[i]);
86 	}
87 }
88 
89 
90 
LRMgeinput_getc(LRMgene_input_t * input)91 int LRMgeinput_getc(LRMgene_input_t * input){
92 	if(input -> file_type == LRMGENE_INPUT_GZIP_FASTQ){
93 		return LRMseekgz_next_char((seekable_zfile_t*)input -> input_fp);
94 	}else{
95 		return fgetc((FILE*)input -> input_fp);
96 	}
97 }
98 
LRMgeinput_readline(LRMgene_input_t * input,int buf_len,char * linebuffer)99 int LRMgeinput_readline(LRMgene_input_t * input, int buf_len, char * linebuffer){
100 	int ret =0;
101 	while(1){
102 		char ch = LRMgeinput_getc(input);
103 		if (ch == '\n' || ch == EOF) break;
104 		if(ret < buf_len-1)
105 			linebuffer[ret++] = ch;
106 	}
107 	linebuffer[ret]=0;
108 	return ret;
109 }
110 
111 
112 #define SKIP_LINE { nch=' '; while(nch != EOF && nch != '\n') nch = LRMgeinput_getc(input); }
113 
LRMgeinput_next_read(LRMgene_input_t * input,char * read_name,char * read_string,char * quality_string)114 int LRMgeinput_next_read(LRMgene_input_t * input, char * read_name, char * read_string, char * quality_string){
115 	char nch;
116 	int ret;
117 
118 	//READ NAME
119 	if (read_name == NULL){
120 		SKIP_LINE;
121 		if(nch == EOF) return -1;
122 	} else {
123 		int ATnch = LRMgeinput_getc(input);
124 		assert(ATnch == '@' || ATnch <0);
125 		if(ATnch < 0) return -1;
126 
127 		int retv = LRMgeinput_readline(input, LRMMAX_READ_NAME_LEN, read_name);
128 		if(retv<=0) return -1;
129 
130 		int cursor = 1;
131 		while(read_name[cursor])
132 		{
133 			if(read_name[cursor] == ' ' || read_name[cursor] == '\t')
134 			{
135 				read_name [cursor] = 0;
136 				break;
137 			}
138 			cursor++;
139 		}
140 	}
141 	// READ LINE
142 	ret = LRMgeinput_readline(input, LRMMAX_READ_LENGTH, read_string);
143 
144 	// SKIP "+"
145 	do{
146 		nch = LRMgeinput_getc(input);
147 	} while( nch == '\n' );
148 	SKIP_LINE;
149 
150 	// QUAL LINE
151 	if (quality_string)
152 		LRMgeinput_readline(input, LRMMAX_READ_LENGTH, quality_string);
153 	else
154 		SKIP_LINE;
155 
156 	return ret;
157 }
158 
LRMfetch_next_read(LRMcontext_t * context,LRMthread_context_t * thread_context,unsigned int * read_len,char * read_name,char * read_text,char * qual_text,unsigned int * read_no_in_chunk)159 int LRMfetch_next_read(LRMcontext_t * context, LRMthread_context_t * thread_context, unsigned int *read_len, char * read_name, char * read_text, char * qual_text, unsigned int * read_no_in_chunk){
160 	int this_number = -1;
161 	int this_rlen = 0;
162 
163 	LRMthread_lock(&context -> input_lock);
164 	this_rlen = LRMgeinput_next_read(&context -> input_file, read_name, read_text, qual_text);
165 	if(this_rlen > 0 ){
166 		this_number = context -> processed_reads_in_chunk;
167 		context -> processed_reads_in_chunk ++;
168 	}else context -> input_exhausted = 1;
169 
170 	LRMthread_lockrelease(&context -> input_lock);
171 
172 	if(this_rlen && this_number>=0)
173 	{
174 		*read_no_in_chunk = this_number;
175 		*read_len = this_rlen;
176 		return 0;
177 	}else{
178 		*read_no_in_chunk = -1;
179 		return 1;
180 	}
181 }
182 
LRMreverse_quality(char * InBuff,int read_len)183 void LRMreverse_quality(char * InBuff, int read_len){
184 	int i;
185 	if(!InBuff) return;
186 	if(!InBuff[0]) return;
187 	for (i=0; i<read_len/2; i++)
188 	{
189 		char tmp;
190 		tmp = InBuff[i];
191 		InBuff[i] = InBuff[read_len -1-i];
192 		InBuff[read_len -1-i] = tmp;
193 	}
194 }
195 
LRMquality_64_to_33(char * qs)196 void LRMquality_64_to_33(char *qs){
197 	int i;
198 	for(i=0; qs[i]; i++){
199 		qs[i] -= (64-33);
200 	}
201 }
202 
LRMgenerate_bam_record_encode_cigar(LRMcontext_t * context,int * cigar_int,char * cigar,int * mapped_length,int read_text_len)203 int LRMgenerate_bam_record_encode_cigar(LRMcontext_t * context, int * cigar_int, char * cigar, int * mapped_length, int read_text_len ){
204 	int tmp_int=0;
205 	int reported_read_len = 0;
206 	int cigar_cursor = 0, num_opt = 0;
207 	(*mapped_length) = 0;
208 
209 	if(cigar[0]=='*') return 0;
210 
211 	while(1)
212 	{
213 		char nch = cigar[cigar_cursor++];
214 		if(!nch)break;
215 		if(isdigit(nch))
216 		{
217 			tmp_int = tmp_int*10+(nch-'0');
218 		}
219 		else
220 		{
221 			int int_opt=0;
222 			if(nch == 'M' ||nch == 'N'||nch == 'D') (*mapped_length) += tmp_int;
223 			if(nch == 'M' ||nch == 'I'||nch == 'S') reported_read_len += tmp_int;
224 			for(; int_opt<8; int_opt++) if("MIDNSHP=X"[int_opt] == nch)break;
225 			cigar_int[num_opt ++] = (tmp_int << 4) | int_opt;
226 			tmp_int = 0;
227 			if(num_opt >= context -> max_cigars_in_read){
228 				int_opt = 4; //'S'
229 				tmp_int = read_text_len - reported_read_len;
230 				cigar_int[num_opt ++] = (tmp_int << 4) | int_opt;
231 				LRMprintf("CIGAR_TRIMMED : %d bases\n", tmp_int);
232 				break;
233 			}
234 		}
235 	}
236 
237 	return num_opt;
238 }
239 
LRMreg2bin(int beg,int end)240 int LRMreg2bin(int beg, int end){
241 	--end;
242 	if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14);
243 	if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17);
244 	if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20);
245 	if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23);
246 	if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26);
247 	return 0;
248 }
249 
LRMgenerate_bam_record_encode_read_qual(char * bin,char * read,char * qual,int rlen)250 int LRMgenerate_bam_record_encode_read_qual(char * bin, char * read, char * qual, int rlen){
251 	int w_ptr = 0, xk1;
252 
253 	for(xk1 = 0; xk1 < rlen; xk1++){
254 		int fourbit;
255 		for(fourbit=0;fourbit<15;fourbit++) if("=ACMGRSVTWYHKDBN"[fourbit] == read[xk1])break;
256 		if( xk1 % 2 == 0){
257 			fourbit = fourbit << 4;
258 			bin[w_ptr]=0;
259 		}
260 
261 		bin[w_ptr] |= fourbit;
262 
263 		if(xk1 % 2 == 1)w_ptr++;
264 	}
265 	if(rlen %2) w_ptr++;
266 
267 	for(xk1=0; xk1<rlen; xk1++)
268 		bin[w_ptr+xk1] = qual[xk1]-33;
269 	return w_ptr+rlen;
270 }
271 
LRMgenerate_bam_record(LRMcontext_t * context,LRMthread_context_t * thread_context,LRMread_iteration_context_t * iteration_context,char * target_ptr,int flags,unsigned int chro_pos,char * chro_name,int map_quality,char * cigar,int mis_matched)272 int LRMgenerate_bam_record(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, char * target_ptr, int flags, unsigned int chro_pos,char *  chro_name, int map_quality, char * cigar, int mis_matched){
273 	int mapped_length = 0;
274 
275 	int chro_number = LRMHashTableGet(context->sam_bam_chromosome_table, chro_name) - NULL;
276 	chro_number -=1;
277 	memcpy(target_ptr+4, &chro_number, 4);
278 	memcpy(target_ptr+8, &chro_pos, 4);
279 
280 	int name_len = strlen(iteration_context -> read_name)+1;
281 
282 	memcpy(target_ptr+20, &iteration_context->read_length, 4);
283 	int next_non = -1;
284 	memcpy(target_ptr+24, &next_non, 4);
285 	memcpy(target_ptr+28, &next_non, 4);
286 	memset(target_ptr+32, 0, 4);	//TLEN
287 	memcpy(target_ptr+36, iteration_context->read_name, name_len);
288 	int bin_ptr = 36+name_len;
289 
290 	int cigar_opts = LRMgenerate_bam_record_encode_cigar(context, (int *)(target_ptr + bin_ptr), cigar, &mapped_length, iteration_context->read_length);
291 	int flag_nc = flags << 16 | cigar_opts;
292 	memcpy(target_ptr+16, &flag_nc, 4);
293 	bin_ptr += cigar_opts*4;
294 
295 
296 	int bin_mq_nl = (LRMreg2bin(chro_pos, chro_pos+mapped_length)<<16) | map_quality << 8 | name_len;
297 	memcpy(target_ptr+12, &bin_mq_nl, 4);
298 
299 	//LRMprintf("READ TEXT %s : len=%d \n", iteration_context->read_name, iteration_context->read_length);
300 	bin_ptr += LRMgenerate_bam_record_encode_read_qual(target_ptr + bin_ptr, iteration_context->read_text, iteration_context -> qual_text, iteration_context->read_length);
301 
302 	memcpy(target_ptr+bin_ptr, "NM",2);
303 	target_ptr[bin_ptr+2]='C';
304 	target_ptr[bin_ptr+3]=mis_matched;
305 
306 	memcpy(target_ptr, &bin_ptr, 4); // block len not include itself.
307 	bin_ptr += 4;
308 	return bin_ptr;
309 }
310 
LRM_CRC32(char * dat,int len)311 unsigned int LRM_CRC32(char * dat, int len){
312 	unsigned int crc0 = crc32(0, NULL, 0);
313 	unsigned int ret = crc32(crc0, (unsigned char *)dat, len);
314 	return ret;
315 }
316 
LRMwrite_chunk_compress_bam_block(LRMcontext_t * context,LRMthread_context_t * thread_context,char * bin_buf,char * bam_buf,int bin_len)317 int LRMwrite_chunk_compress_bam_block(LRMcontext_t * context,  LRMthread_context_t * thread_context, char * bin_buf, char * bam_buf, int bin_len){
318 	char * compressed_buff = bam_buf + 18;
319 
320 	int compressed_size ;
321 	unsigned int CRC32;
322 	thread_context -> bam_file_output_stream.avail_out = 66600;
323 	thread_context -> bam_file_output_stream.avail_in = bin_len;
324 	CRC32 = LRM_CRC32(bin_buf , bin_len);
325 
326  	int Z_DEFAULT_MEM_LEVEL = 8;
327 	thread_context -> bam_file_output_stream.zalloc = Z_NULL;
328 	thread_context -> bam_file_output_stream.zfree = Z_NULL;
329 	thread_context -> bam_file_output_stream.opaque = Z_NULL;
330 
331 	deflateInit2(&thread_context -> bam_file_output_stream, LRMBAM_COMPRESS_LEVEL, Z_DEFLATED,
332 		LRMGZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
333 
334 	thread_context -> bam_file_output_stream.next_in = (unsigned char*) bin_buf;
335 	thread_context -> bam_file_output_stream.next_out = (unsigned char*) compressed_buff;
336 
337 	deflate(&thread_context -> bam_file_output_stream, Z_FINISH);
338 	deflateEnd(&thread_context -> bam_file_output_stream);
339 
340 	compressed_size = 66600 -thread_context -> bam_file_output_stream.avail_out;
341 
342 	//LRMprintf("COMPRESS: %d -> %d\n", bin_len, compressed_size);
343 
344 	bam_buf[0]=31;
345 	bam_buf[1]=(char)139;
346 	bam_buf[2]=8;
347 	bam_buf[3]=4;
348 	memset(bam_buf+4, 0, 5);
349 	bam_buf[9] = 0xff;	// OS
350 
351 	int tmpi = 6;
352 	memcpy(bam_buf+10, &tmpi, 2); //XLSN
353 	bam_buf[12]=66; // SI1
354 	bam_buf[13]=67; // SI2
355 	tmpi = 2;
356 	memcpy(bam_buf+14, &tmpi, 2); //BSIZE
357 	tmpi = compressed_size + 19 + 6;
358 	memcpy(bam_buf+16, &tmpi, 2); //BSIZE
359 
360 	memcpy(bam_buf+18+compressed_size, &CRC32, 4);
361 	memcpy(bam_buf+18+compressed_size+4, &bin_len, 4);
362 	return compressed_size + 26;
363 }
364 
LRMhash_strcmp(const void * s1,const void * s2)365 int LRMhash_strcmp(const void * s1, const void * s2){
366 	return strcmp(s1, s2);
367 }
368 
LRMhash_strhash(const void * sv)369 srUInt_64 LRMhash_strhash(const void * sv){
370 	unsigned char *s = (unsigned char *)sv;
371 	unsigned long ret = 0;
372 	while(*s){
373 		ret ^= (ret << 6);
374 		ret ^= *s;
375 		s++;
376 	}
377 	return ret;
378 }
379 
LRMreadline(FILE * fp,char * buf,int buflen)380 int LRMreadline(FILE * fp, char * buf, int buflen){
381 	int cr = 0;
382 	while(!feof(fp)){
383 		char c = fgetc(fp);
384 		if(c == '\r'){
385 			continue;
386 		}else if(c == '\n'){
387 			break;
388 		}else if(cr < buflen-1)
389 			buf[cr++]=c;
390 	}
391 	buf[cr]=0;
392 	return cr;
393 }
394 
LRMload_offsets(LRMcontext_t * context)395 int LRMload_offsets(LRMcontext_t * context){
396 	char fn[LRMMAX_FILENAME_LENGTH+20];
397 	FILE * fp;
398 	int padding = 0;
399 	unsigned int last_end = 0;
400 
401 	LRMgehash_load_option(context -> index_prefix, LRMSUBREAD_INDEX_OPTION_INDEX_PADDING , &padding);
402 	assert(padding>16);
403 
404 	sprintf(fn, "%s.reads", context->index_prefix);
405 	fp = fopen(fn, "r");
406 
407 	if(!fp){
408 		LRMprintf("file not found :%s\n", fn);
409 		return 1;
410 	}
411 
412 	context -> current_index_padding = padding;
413 	int chromosome_no = 0;
414 	while (!feof(fp))
415 	{
416 		int i=0, step = 0, j=0;
417 		unsigned int this_offset =0;
418 
419 		LRMreadline(fp, fn, LRMMAX_FILENAME_LENGTH-1);
420 		if (strlen(fn)<2)continue;
421 		char * chro_name = malloc(LRMMAX_CHROMOSOME_NAME_LEN);
422 		while (fn[i])
423 		{
424 			if (fn[i] == '\t') {
425 				fn[i]=0;
426 				this_offset = (unsigned int)atoll(fn);
427 				step = 1;
428 			}else if (step)	{
429 				if(j<LRMMAX_CHROMOSOME_NAME_LEN-1){
430 					chro_name[j++]=fn[i];
431 					chro_name[j]=0;
432 				}
433 			}
434 			i++;
435 		}
436 
437 		LRMHashTablePut(context->sam_bam_chromosome_table, chro_name, NULL + chromosome_no + 1);
438 		LRMArrayListPush(context->sam_bam_chromosome_list, chro_name);
439 
440 		LRMHashTablePut(context->chromosome_size_table, chro_name, NULL + (this_offset - last_end + 16 - context->current_index_padding * 2));
441 		LRMArrayListPush(context->chromosome_size_list, NULL + this_offset);
442 
443 		chromosome_no++;
444 		last_end = this_offset;
445 	}
446 
447 	fclose(fp);
448 	return 0;
449 }
450 
451 #define LRMcheck_resize_buff 			if(new_need >= thread_context -> out_buff_capacity){\
452 				thread_context -> out_buff_capacity = max(thread_context -> out_buff_capacity*2, new_need );\
453 				thread_context -> out_SAMBAM_buffer=realloc(thread_context -> out_SAMBAM_buffer, thread_context -> out_buff_capacity);\
454 			}\
455 
456 #define LRMBAM_COMPRESS_BLOCK 63000
457 #define LRMBAM_COMPRESS_TRIGGER 53000
458 
LRMsambam_write_header(LRMcontext_t * context,LRMthread_context_t * thread_context)459 void LRMsambam_write_header(LRMcontext_t * context, LRMthread_context_t * thread_context){
460 	if(context -> sam_bam_file_header_written) return;
461 
462 	thread_context -> out_SAMBAM_buffer = malloc(3000000);
463 	thread_context -> out_buff_capacity = 3000000;
464 	thread_context -> out_buff_used = 0;
465 
466 	int chro_no;
467 	if(!context -> is_SAM_output){
468 		memcpy(thread_context -> out_SAMBAM_buffer,"BAM\1",4);
469 		thread_context -> out_buff_used = 8;
470 	}
471 
472 	for(chro_no = -1; chro_no < context -> sam_bam_chromosome_list->numOfElements + 2; chro_no++){
473 		char * header_line = malloc(10100);
474 		int wrlen = 0;
475 		if(chro_no >=0 && chro_no < context -> sam_bam_chromosome_list->numOfElements){
476 			char * chro_name = LRMArrayListGet(context -> sam_bam_chromosome_list, chro_no);
477 			int chro_length = LRMHashTableGet(context -> chromosome_size_table, chro_name) - NULL;
478 
479 			wrlen = sprintf(header_line, "@SQ\tSN:%s\tLN:%d\n",chro_name,chro_length);
480 		}else if(chro_no == -1){
481 			wrlen = sprintf(header_line, "@HD\tVN:1.0\tSO:unsorted\n");
482 		}else if(chro_no == context -> sam_bam_chromosome_list->numOfElements ){
483 			wrlen = sprintf(header_line, "@PG\tID:subread-long-read-mapping\tPN:subread-long-read-mapping\tCL:%s\n", context -> user_command_line);
484 		}
485 		if(context -> is_SAM_output){
486 			fwrite( header_line, 1, wrlen, context -> sam_bam_file);
487 		}else{
488 			int new_need = thread_context -> out_buff_used + wrlen +1;
489 			LRMcheck_resize_buff;
490 			memcpy(thread_context -> out_SAMBAM_buffer + thread_context -> out_buff_used,header_line,wrlen);
491 			thread_context -> out_buff_used += wrlen;
492 		}
493 		free(header_line);
494 	}
495 
496 	int new_need = thread_context -> out_buff_used + 10;
497 	LRMcheck_resize_buff;
498 
499 	int BAM_text_len = thread_context -> out_buff_used-8;
500 	memcpy(thread_context -> out_SAMBAM_buffer+4, &BAM_text_len, 4);
501 	memcpy(thread_context -> out_SAMBAM_buffer + thread_context -> out_buff_used, &context ->sam_bam_chromosome_list->numOfElements , 4);
502 	thread_context -> out_buff_used +=4;
503 	for(chro_no = 0; chro_no < context -> sam_bam_chromosome_list->numOfElements ; chro_no++){
504 		char * chro_name = LRMArrayListGet(context -> sam_bam_chromosome_list, chro_no);
505 		int chro_namelen = strlen(chro_name)+1;
506 		int new_need = thread_context -> out_buff_used + chro_namelen + 9;
507 		LRMcheck_resize_buff;
508 
509 		memcpy(thread_context -> out_SAMBAM_buffer+thread_context -> out_buff_used, &chro_namelen, 4);
510 		thread_context -> out_buff_used +=4;
511 		memcpy(thread_context -> out_SAMBAM_buffer+thread_context -> out_buff_used, chro_name, chro_namelen);
512 		thread_context -> out_buff_used += chro_namelen;
513 		int chro_length = LRMHashTableGet(context -> chromosome_size_table, chro_name) - NULL;
514 		memcpy(thread_context -> out_SAMBAM_buffer+thread_context -> out_buff_used, &chro_length, 4);
515 		thread_context -> out_buff_used +=4;
516 	}
517 
518 	if(!context -> is_SAM_output){
519 		LRMwrite_chunk_check_buffer_write(context, thread_context, 1);
520 
521 	}
522 	context -> sam_bam_file_header_written = 1;
523 	free(thread_context -> out_SAMBAM_buffer);
524 }
525 
LRMbam_generate_tail_binary(LRMcontext_t * context,LRMthread_context_t * thread_context)526 void LRMbam_generate_tail_binary(LRMcontext_t * context, LRMthread_context_t * thread_context){
527 	if(context -> bam_file_tail_length>0)return;
528 	context -> bam_file_tail_length = LRMwrite_chunk_compress_bam_block(context, thread_context, "", context -> bam_file_tail_binary , 0);
529 }
530 
LRMwrite_chunk_add_buffered_output(LRMcontext_t * context,LRMthread_context_t * thread_context,LRMread_iteration_context_t * iteration_context,int flags,char * chro_name,unsigned int chro_pos,int map_quality,char * cigar,int mis_matched)531 int LRMwrite_chunk_add_buffered_output(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, int flags,  char * chro_name, unsigned int chro_pos, int map_quality, char * cigar, int mis_matched){
532 	char * target_ptr;
533 	int cigar_len = strlen(cigar);
534 	int rname_len = strlen(iteration_context->read_name);
535 	int read_binlen, actural_target_len;
536 	read_binlen = cigar_len+rname_len+2.5*iteration_context->read_length+500;
537 
538 	if(read_binlen + thread_context->out_buff_used >= thread_context -> out_buff_capacity){
539 		thread_context -> out_buff_capacity = max( thread_context -> out_buff_capacity*1.3, read_binlen);
540 		thread_context -> out_SAMBAM_buffer=realloc(thread_context -> out_SAMBAM_buffer, thread_context -> out_buff_capacity);
541 	}
542 
543 	target_ptr = thread_context -> out_SAMBAM_buffer +  thread_context->out_buff_used;
544 	if(context -> is_Phred_64) LRMquality_64_to_33(iteration_context->qual_text) ;
545 
546 	if(context->is_SAM_output){
547 		actural_target_len = sprintf(target_ptr,"%s\t%d\t%s\t%u\t%d\t%s\t*\t0\t0\t%s\t%s\tNM:%d\n", iteration_context -> read_name, flags, chro_name, chro_pos + 1, map_quality, cigar, iteration_context->read_text, iteration_context->qual_text, mis_matched);
548 	}else{
549 		actural_target_len = LRMgenerate_bam_record(context, thread_context, iteration_context, target_ptr, flags, chro_pos, chro_name, map_quality,  cigar, mis_matched);
550 	}
551 	thread_context -> out_buff_used+=actural_target_len;
552 	LRMwrite_chunk_check_buffer_write(context, thread_context, 0);
553 	return 0;
554 }
LRMwrite_chunk_check_buffer_write(LRMcontext_t * context,LRMthread_context_t * thread_context,int force_write)555 int LRMwrite_chunk_check_buffer_write(LRMcontext_t * context, LRMthread_context_t * thread_context, int force_write){
556 	if(force_write || thread_context -> out_buff_used > LRMBAM_COMPRESS_TRIGGER){
557 		if(!context->is_SAM_output){
558 			int write_cursor = 0;
559 			int compressed_cursor = 0;
560 			for(write_cursor = 0; write_cursor < thread_context -> out_buff_used; write_cursor += LRMBAM_COMPRESS_BLOCK){
561 				char compressed_data [66666];
562 				int bin_len = min(LRMBAM_COMPRESS_BLOCK,thread_context -> out_buff_used - write_cursor);
563 				int compressed_len = LRMwrite_chunk_compress_bam_block(context, thread_context, thread_context -> out_SAMBAM_buffer + write_cursor, compressed_data, bin_len);
564 				if(compressed_len > bin_len){	// very unlikely
565 					int new_needed_size = thread_context -> out_buff_used  + (compressed_len - bin_len);
566 					if(new_needed_size > thread_context -> out_buff_capacity){
567 						thread_context -> out_buff_capacity = new_needed_size;
568 						thread_context -> out_SAMBAM_buffer=realloc(thread_context -> out_SAMBAM_buffer, thread_context -> out_buff_capacity);
569 					}
570 					int x1;
571 					for(x1 = thread_context -> out_buff_used - 1; x1 >= write_cursor + bin_len; x1 --)
572 						thread_context -> out_SAMBAM_buffer[ x1 + (compressed_len - bin_len) ] = thread_context -> out_SAMBAM_buffer[ x1 ];
573 				}
574 				memcpy(thread_context -> out_SAMBAM_buffer + compressed_cursor, compressed_data, compressed_len);
575 				compressed_cursor += compressed_len;
576 			}
577 			thread_context -> out_buff_used = compressed_cursor;
578 		}
579 		LRMthread_lock(&context->sam_bam_file_lock);
580 		fwrite(thread_context -> out_SAMBAM_buffer,1, thread_context -> out_buff_used, context -> sam_bam_file);
581 		LRMthread_lockrelease(&context->sam_bam_file_lock);
582 		thread_context -> out_buff_used = 0;
583 	}
584 
585 	return 0;
586 }
587 
588 
589 
LRMpos2txt(LRMcontext_t * context,unsigned int linear,char * txt)590 void LRMpos2txt(LRMcontext_t * context, unsigned int linear, char * txt){
591 	char * chro_name = NULL;
592 	int pos;
593 
594 	int retv = LRMlocate_gene_position(context, linear, &chro_name, &pos);
595 	if(chro_name && !retv)
596 		sprintf(txt, "%s:%d", chro_name, pos+1);
597 	else	strcpy(txt, "*");
598 }
599 
LRMlocate_gene_position(LRMcontext_t * context,unsigned int linear,char ** chro_name,int * pos)600 int LRMlocate_gene_position(LRMcontext_t * context, unsigned int linear, char ** chro_name, int * pos) {
601 	int n = 0;
602 	int total_offsets = context -> chromosome_size_list -> numOfElements;
603 	int jump_ns = total_offsets/4;
604 
605 	//LRMprintf("LINEAR = %u\n", linear);
606 
607 	(*chro_name)=NULL;
608 	(*pos) = -1;
609 	while (jump_ns > 5)
610 	{
611 		while(n+jump_ns < total_offsets && LRMArrayListGet(context->chromosome_size_list, n + jump_ns) - NULL <= linear)
612 			n+=jump_ns;
613 		jump_ns /=4;
614 	}
615 
616 	for (; n < total_offsets; n++)
617 	{
618 		//LRMprintf("TESTING %u\n",(LRMArrayListGet(context->chromosome_size_list, n) - NULL) );
619 		if ( (LRMArrayListGet(context->chromosome_size_list, n) - NULL)  > linear)
620 		{
621 			*pos = linear;
622 			if(n>0)(*pos) -= (LRMArrayListGet(context->chromosome_size_list, n-1)-NULL);
623 			if( (*pos) < context -> current_index_padding )	return 1;
624 			else (*pos) -= context -> current_index_padding;
625 			(*chro_name) = LRMArrayListGet(context -> sam_bam_chromosome_list, n);
626 
627 			return 0;
628 		}
629 	}
630 	return -1;
631 }
632 
LRMlocate_chro_length(LRMcontext_t * context,unsigned int linear,char ** chro_name,long long * end_pos)633 int LRMlocate_chro_length(LRMcontext_t * context, unsigned int linear, char ** chro_name, long long * end_pos){
634 	int n = 0;
635 	int total_offsets = context -> chromosome_size_list -> numOfElements;
636 	int jump_ns = total_offsets/4;
637 
638 	while (jump_ns > 5)
639 	{
640 		while(n+jump_ns < total_offsets && LRMArrayListGet(context->chromosome_size_list, n + jump_ns) - NULL <= linear)
641 			n+=jump_ns;
642 		jump_ns /=4;
643 	}
644 
645 	for (; n < total_offsets; n++)
646 	{
647 		//LRMprintf("TESTING %u\n",(LRMArrayListGet(context->chromosome_size_list, n) - NULL) );
648 		if ( (LRMArrayListGet(context->chromosome_size_list, n) - NULL)  > linear) {
649 			(* chro_name) = LRMArrayListGet(context -> sam_bam_chromosome_list, n);
650 			(* end_pos) = LRMArrayListGet(context->chromosome_size_list, n)-NULL;
651 			if((* end_pos) > context -> current_index_padding) (* end_pos) -=  context -> current_index_padding ;
652 			return 0;
653 		}
654 	}
655 	return -1;
656 }
657