1 #include <stdio.h>
2 #include <signal.h>
3 #include <dirent.h>
4 #include <string.h>
5 #include <stdlib.h>
6 #include <ctype.h>
7 #include <sys/types.h>
8
9 #ifndef __MINGW32__
10 #include <sys/resource.h>
11 #endif
12
13 #include <sys/stat.h>
14 #include <unistd.h>
15 #include <zlib.h>
16 #include <stdio.h>
17 #include <assert.h>
18
19 #include "LRMfile-io.h"
20 #include "LRMsorted-hashtable.h"
21
LRMgenekey2int(char key[])22 int LRMgenekey2int(char key []){
23 int i;
24 int ret;
25
26 ret = 0;
27 for (i=0; i<16; i++)
28 ret |= (LRMbase2int(key[i]))<<(2*(15-i));
29
30 return ret;
31 }
32
LRMgeinput_open(const char * filename,LRMgene_input_t * input)33 int LRMgeinput_open(const char * filename, LRMgene_input_t * input){
34 int ret = 0;
35 if(strlen(filename)>LRMMAX_FILENAME_LENGTH-2)
36 return 1;
37
38 strcpy(input->filename, filename);
39 FILE * TMP_FP = fopen(filename, "rb");
40
41 if(TMP_FP == NULL)
42 return 1;
43
44 int id1, id2;
45 id1 = fgetc(TMP_FP);
46 id2 = fgetc(TMP_FP);
47
48 if(id1 == 31 && id2 == 139) {
49 fclose(TMP_FP);
50 input->input_fp = malloc(sizeof(seekable_zfile_t));
51 input->file_type = LRMGENE_INPUT_GZIP_FASTQ;
52 ret = LRMseekgz_open(filename, input->input_fp);
53 }else{
54 input->file_type = LRMGENE_INPUT_FASTQ;
55 input->input_fp = TMP_FP;
56 fseek(input->input_fp, 0, SEEK_SET);
57 }
58
59 return ret;
60 }
61
LRMgeinput_close(LRMgene_input_t * input)62 void LRMgeinput_close(LRMgene_input_t * input){
63 if(input -> file_type == LRMGENE_INPUT_GZIP_FASTQ)
64 LRMseekgz_close((seekable_zfile_t * ) input->input_fp);
65 else
66 fclose((FILE*)input->input_fp);
67 }
68
69 char * LRM__converting_char_table = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTNGNNNCNNNNNNNNNNNNAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTNGNNNCNNNNNNNNNNNNAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN ";
70
LRMreverse_read(char * InBuff,int read_len)71 void LRMreverse_read(char * InBuff, int read_len){
72 int i;
73
74 for (i=0; i<read_len/2; i++)
75 {
76 int rll1 = read_len - 1 - i;
77 unsigned char tmp = InBuff[rll1];
78
79 InBuff[rll1] = *(LRM__converting_char_table+InBuff[i]);
80 InBuff[i] = *(LRM__converting_char_table+tmp);
81
82 }
83 if(i*2 == read_len-1)
84 {
85 InBuff[i] = *(LRM__converting_char_table+InBuff[i]);
86 }
87 }
88
89
90
LRMgeinput_getc(LRMgene_input_t * input)91 int LRMgeinput_getc(LRMgene_input_t * input){
92 if(input -> file_type == LRMGENE_INPUT_GZIP_FASTQ){
93 return LRMseekgz_next_char((seekable_zfile_t*)input -> input_fp);
94 }else{
95 return fgetc((FILE*)input -> input_fp);
96 }
97 }
98
LRMgeinput_readline(LRMgene_input_t * input,int buf_len,char * linebuffer)99 int LRMgeinput_readline(LRMgene_input_t * input, int buf_len, char * linebuffer){
100 int ret =0;
101 while(1){
102 char ch = LRMgeinput_getc(input);
103 if (ch == '\n' || ch == EOF) break;
104 if(ret < buf_len-1)
105 linebuffer[ret++] = ch;
106 }
107 linebuffer[ret]=0;
108 return ret;
109 }
110
111
112 #define SKIP_LINE { nch=' '; while(nch != EOF && nch != '\n') nch = LRMgeinput_getc(input); }
113
LRMgeinput_next_read(LRMgene_input_t * input,char * read_name,char * read_string,char * quality_string)114 int LRMgeinput_next_read(LRMgene_input_t * input, char * read_name, char * read_string, char * quality_string){
115 char nch;
116 int ret;
117
118 //READ NAME
119 if (read_name == NULL){
120 SKIP_LINE;
121 if(nch == EOF) return -1;
122 } else {
123 int ATnch = LRMgeinput_getc(input);
124 assert(ATnch == '@' || ATnch <0);
125 if(ATnch < 0) return -1;
126
127 int retv = LRMgeinput_readline(input, LRMMAX_READ_NAME_LEN, read_name);
128 if(retv<=0) return -1;
129
130 int cursor = 1;
131 while(read_name[cursor])
132 {
133 if(read_name[cursor] == ' ' || read_name[cursor] == '\t')
134 {
135 read_name [cursor] = 0;
136 break;
137 }
138 cursor++;
139 }
140 }
141 // READ LINE
142 ret = LRMgeinput_readline(input, LRMMAX_READ_LENGTH, read_string);
143
144 // SKIP "+"
145 do{
146 nch = LRMgeinput_getc(input);
147 } while( nch == '\n' );
148 SKIP_LINE;
149
150 // QUAL LINE
151 if (quality_string)
152 LRMgeinput_readline(input, LRMMAX_READ_LENGTH, quality_string);
153 else
154 SKIP_LINE;
155
156 return ret;
157 }
158
LRMfetch_next_read(LRMcontext_t * context,LRMthread_context_t * thread_context,unsigned int * read_len,char * read_name,char * read_text,char * qual_text,unsigned int * read_no_in_chunk)159 int LRMfetch_next_read(LRMcontext_t * context, LRMthread_context_t * thread_context, unsigned int *read_len, char * read_name, char * read_text, char * qual_text, unsigned int * read_no_in_chunk){
160 int this_number = -1;
161 int this_rlen = 0;
162
163 LRMthread_lock(&context -> input_lock);
164 this_rlen = LRMgeinput_next_read(&context -> input_file, read_name, read_text, qual_text);
165 if(this_rlen > 0 ){
166 this_number = context -> processed_reads_in_chunk;
167 context -> processed_reads_in_chunk ++;
168 }else context -> input_exhausted = 1;
169
170 LRMthread_lockrelease(&context -> input_lock);
171
172 if(this_rlen && this_number>=0)
173 {
174 *read_no_in_chunk = this_number;
175 *read_len = this_rlen;
176 return 0;
177 }else{
178 *read_no_in_chunk = -1;
179 return 1;
180 }
181 }
182
LRMreverse_quality(char * InBuff,int read_len)183 void LRMreverse_quality(char * InBuff, int read_len){
184 int i;
185 if(!InBuff) return;
186 if(!InBuff[0]) return;
187 for (i=0; i<read_len/2; i++)
188 {
189 char tmp;
190 tmp = InBuff[i];
191 InBuff[i] = InBuff[read_len -1-i];
192 InBuff[read_len -1-i] = tmp;
193 }
194 }
195
LRMquality_64_to_33(char * qs)196 void LRMquality_64_to_33(char *qs){
197 int i;
198 for(i=0; qs[i]; i++){
199 qs[i] -= (64-33);
200 }
201 }
202
LRMgenerate_bam_record_encode_cigar(LRMcontext_t * context,int * cigar_int,char * cigar,int * mapped_length,int read_text_len)203 int LRMgenerate_bam_record_encode_cigar(LRMcontext_t * context, int * cigar_int, char * cigar, int * mapped_length, int read_text_len ){
204 int tmp_int=0;
205 int reported_read_len = 0;
206 int cigar_cursor = 0, num_opt = 0;
207 (*mapped_length) = 0;
208
209 if(cigar[0]=='*') return 0;
210
211 while(1)
212 {
213 char nch = cigar[cigar_cursor++];
214 if(!nch)break;
215 if(isdigit(nch))
216 {
217 tmp_int = tmp_int*10+(nch-'0');
218 }
219 else
220 {
221 int int_opt=0;
222 if(nch == 'M' ||nch == 'N'||nch == 'D') (*mapped_length) += tmp_int;
223 if(nch == 'M' ||nch == 'I'||nch == 'S') reported_read_len += tmp_int;
224 for(; int_opt<8; int_opt++) if("MIDNSHP=X"[int_opt] == nch)break;
225 cigar_int[num_opt ++] = (tmp_int << 4) | int_opt;
226 tmp_int = 0;
227 if(num_opt >= context -> max_cigars_in_read){
228 int_opt = 4; //'S'
229 tmp_int = read_text_len - reported_read_len;
230 cigar_int[num_opt ++] = (tmp_int << 4) | int_opt;
231 LRMprintf("CIGAR_TRIMMED : %d bases\n", tmp_int);
232 break;
233 }
234 }
235 }
236
237 return num_opt;
238 }
239
LRMreg2bin(int beg,int end)240 int LRMreg2bin(int beg, int end){
241 --end;
242 if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14);
243 if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17);
244 if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20);
245 if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23);
246 if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26);
247 return 0;
248 }
249
LRMgenerate_bam_record_encode_read_qual(char * bin,char * read,char * qual,int rlen)250 int LRMgenerate_bam_record_encode_read_qual(char * bin, char * read, char * qual, int rlen){
251 int w_ptr = 0, xk1;
252
253 for(xk1 = 0; xk1 < rlen; xk1++){
254 int fourbit;
255 for(fourbit=0;fourbit<15;fourbit++) if("=ACMGRSVTWYHKDBN"[fourbit] == read[xk1])break;
256 if( xk1 % 2 == 0){
257 fourbit = fourbit << 4;
258 bin[w_ptr]=0;
259 }
260
261 bin[w_ptr] |= fourbit;
262
263 if(xk1 % 2 == 1)w_ptr++;
264 }
265 if(rlen %2) w_ptr++;
266
267 for(xk1=0; xk1<rlen; xk1++)
268 bin[w_ptr+xk1] = qual[xk1]-33;
269 return w_ptr+rlen;
270 }
271
LRMgenerate_bam_record(LRMcontext_t * context,LRMthread_context_t * thread_context,LRMread_iteration_context_t * iteration_context,char * target_ptr,int flags,unsigned int chro_pos,char * chro_name,int map_quality,char * cigar,int mis_matched)272 int LRMgenerate_bam_record(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, char * target_ptr, int flags, unsigned int chro_pos,char * chro_name, int map_quality, char * cigar, int mis_matched){
273 int mapped_length = 0;
274
275 int chro_number = LRMHashTableGet(context->sam_bam_chromosome_table, chro_name) - NULL;
276 chro_number -=1;
277 memcpy(target_ptr+4, &chro_number, 4);
278 memcpy(target_ptr+8, &chro_pos, 4);
279
280 int name_len = strlen(iteration_context -> read_name)+1;
281
282 memcpy(target_ptr+20, &iteration_context->read_length, 4);
283 int next_non = -1;
284 memcpy(target_ptr+24, &next_non, 4);
285 memcpy(target_ptr+28, &next_non, 4);
286 memset(target_ptr+32, 0, 4); //TLEN
287 memcpy(target_ptr+36, iteration_context->read_name, name_len);
288 int bin_ptr = 36+name_len;
289
290 int cigar_opts = LRMgenerate_bam_record_encode_cigar(context, (int *)(target_ptr + bin_ptr), cigar, &mapped_length, iteration_context->read_length);
291 int flag_nc = flags << 16 | cigar_opts;
292 memcpy(target_ptr+16, &flag_nc, 4);
293 bin_ptr += cigar_opts*4;
294
295
296 int bin_mq_nl = (LRMreg2bin(chro_pos, chro_pos+mapped_length)<<16) | map_quality << 8 | name_len;
297 memcpy(target_ptr+12, &bin_mq_nl, 4);
298
299 //LRMprintf("READ TEXT %s : len=%d \n", iteration_context->read_name, iteration_context->read_length);
300 bin_ptr += LRMgenerate_bam_record_encode_read_qual(target_ptr + bin_ptr, iteration_context->read_text, iteration_context -> qual_text, iteration_context->read_length);
301
302 memcpy(target_ptr+bin_ptr, "NM",2);
303 target_ptr[bin_ptr+2]='C';
304 target_ptr[bin_ptr+3]=mis_matched;
305
306 memcpy(target_ptr, &bin_ptr, 4); // block len not include itself.
307 bin_ptr += 4;
308 return bin_ptr;
309 }
310
LRM_CRC32(char * dat,int len)311 unsigned int LRM_CRC32(char * dat, int len){
312 unsigned int crc0 = crc32(0, NULL, 0);
313 unsigned int ret = crc32(crc0, (unsigned char *)dat, len);
314 return ret;
315 }
316
LRMwrite_chunk_compress_bam_block(LRMcontext_t * context,LRMthread_context_t * thread_context,char * bin_buf,char * bam_buf,int bin_len)317 int LRMwrite_chunk_compress_bam_block(LRMcontext_t * context, LRMthread_context_t * thread_context, char * bin_buf, char * bam_buf, int bin_len){
318 char * compressed_buff = bam_buf + 18;
319
320 int compressed_size ;
321 unsigned int CRC32;
322 thread_context -> bam_file_output_stream.avail_out = 66600;
323 thread_context -> bam_file_output_stream.avail_in = bin_len;
324 CRC32 = LRM_CRC32(bin_buf , bin_len);
325
326 int Z_DEFAULT_MEM_LEVEL = 8;
327 thread_context -> bam_file_output_stream.zalloc = Z_NULL;
328 thread_context -> bam_file_output_stream.zfree = Z_NULL;
329 thread_context -> bam_file_output_stream.opaque = Z_NULL;
330
331 deflateInit2(&thread_context -> bam_file_output_stream, LRMBAM_COMPRESS_LEVEL, Z_DEFLATED,
332 LRMGZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
333
334 thread_context -> bam_file_output_stream.next_in = (unsigned char*) bin_buf;
335 thread_context -> bam_file_output_stream.next_out = (unsigned char*) compressed_buff;
336
337 deflate(&thread_context -> bam_file_output_stream, Z_FINISH);
338 deflateEnd(&thread_context -> bam_file_output_stream);
339
340 compressed_size = 66600 -thread_context -> bam_file_output_stream.avail_out;
341
342 //LRMprintf("COMPRESS: %d -> %d\n", bin_len, compressed_size);
343
344 bam_buf[0]=31;
345 bam_buf[1]=(char)139;
346 bam_buf[2]=8;
347 bam_buf[3]=4;
348 memset(bam_buf+4, 0, 5);
349 bam_buf[9] = 0xff; // OS
350
351 int tmpi = 6;
352 memcpy(bam_buf+10, &tmpi, 2); //XLSN
353 bam_buf[12]=66; // SI1
354 bam_buf[13]=67; // SI2
355 tmpi = 2;
356 memcpy(bam_buf+14, &tmpi, 2); //BSIZE
357 tmpi = compressed_size + 19 + 6;
358 memcpy(bam_buf+16, &tmpi, 2); //BSIZE
359
360 memcpy(bam_buf+18+compressed_size, &CRC32, 4);
361 memcpy(bam_buf+18+compressed_size+4, &bin_len, 4);
362 return compressed_size + 26;
363 }
364
LRMhash_strcmp(const void * s1,const void * s2)365 int LRMhash_strcmp(const void * s1, const void * s2){
366 return strcmp(s1, s2);
367 }
368
LRMhash_strhash(const void * sv)369 srUInt_64 LRMhash_strhash(const void * sv){
370 unsigned char *s = (unsigned char *)sv;
371 unsigned long ret = 0;
372 while(*s){
373 ret ^= (ret << 6);
374 ret ^= *s;
375 s++;
376 }
377 return ret;
378 }
379
LRMreadline(FILE * fp,char * buf,int buflen)380 int LRMreadline(FILE * fp, char * buf, int buflen){
381 int cr = 0;
382 while(!feof(fp)){
383 char c = fgetc(fp);
384 if(c == '\r'){
385 continue;
386 }else if(c == '\n'){
387 break;
388 }else if(cr < buflen-1)
389 buf[cr++]=c;
390 }
391 buf[cr]=0;
392 return cr;
393 }
394
LRMload_offsets(LRMcontext_t * context)395 int LRMload_offsets(LRMcontext_t * context){
396 char fn[LRMMAX_FILENAME_LENGTH+20];
397 FILE * fp;
398 int padding = 0;
399 unsigned int last_end = 0;
400
401 LRMgehash_load_option(context -> index_prefix, LRMSUBREAD_INDEX_OPTION_INDEX_PADDING , &padding);
402 assert(padding>16);
403
404 sprintf(fn, "%s.reads", context->index_prefix);
405 fp = fopen(fn, "r");
406
407 if(!fp){
408 LRMprintf("file not found :%s\n", fn);
409 return 1;
410 }
411
412 context -> current_index_padding = padding;
413 int chromosome_no = 0;
414 while (!feof(fp))
415 {
416 int i=0, step = 0, j=0;
417 unsigned int this_offset =0;
418
419 LRMreadline(fp, fn, LRMMAX_FILENAME_LENGTH-1);
420 if (strlen(fn)<2)continue;
421 char * chro_name = malloc(LRMMAX_CHROMOSOME_NAME_LEN);
422 while (fn[i])
423 {
424 if (fn[i] == '\t') {
425 fn[i]=0;
426 this_offset = (unsigned int)atoll(fn);
427 step = 1;
428 }else if (step) {
429 if(j<LRMMAX_CHROMOSOME_NAME_LEN-1){
430 chro_name[j++]=fn[i];
431 chro_name[j]=0;
432 }
433 }
434 i++;
435 }
436
437 LRMHashTablePut(context->sam_bam_chromosome_table, chro_name, NULL + chromosome_no + 1);
438 LRMArrayListPush(context->sam_bam_chromosome_list, chro_name);
439
440 LRMHashTablePut(context->chromosome_size_table, chro_name, NULL + (this_offset - last_end + 16 - context->current_index_padding * 2));
441 LRMArrayListPush(context->chromosome_size_list, NULL + this_offset);
442
443 chromosome_no++;
444 last_end = this_offset;
445 }
446
447 fclose(fp);
448 return 0;
449 }
450
451 #define LRMcheck_resize_buff if(new_need >= thread_context -> out_buff_capacity){\
452 thread_context -> out_buff_capacity = max(thread_context -> out_buff_capacity*2, new_need );\
453 thread_context -> out_SAMBAM_buffer=realloc(thread_context -> out_SAMBAM_buffer, thread_context -> out_buff_capacity);\
454 }\
455
456 #define LRMBAM_COMPRESS_BLOCK 63000
457 #define LRMBAM_COMPRESS_TRIGGER 53000
458
LRMsambam_write_header(LRMcontext_t * context,LRMthread_context_t * thread_context)459 void LRMsambam_write_header(LRMcontext_t * context, LRMthread_context_t * thread_context){
460 if(context -> sam_bam_file_header_written) return;
461
462 thread_context -> out_SAMBAM_buffer = malloc(3000000);
463 thread_context -> out_buff_capacity = 3000000;
464 thread_context -> out_buff_used = 0;
465
466 int chro_no;
467 if(!context -> is_SAM_output){
468 memcpy(thread_context -> out_SAMBAM_buffer,"BAM\1",4);
469 thread_context -> out_buff_used = 8;
470 }
471
472 for(chro_no = -1; chro_no < context -> sam_bam_chromosome_list->numOfElements + 2; chro_no++){
473 char * header_line = malloc(10100);
474 int wrlen = 0;
475 if(chro_no >=0 && chro_no < context -> sam_bam_chromosome_list->numOfElements){
476 char * chro_name = LRMArrayListGet(context -> sam_bam_chromosome_list, chro_no);
477 int chro_length = LRMHashTableGet(context -> chromosome_size_table, chro_name) - NULL;
478
479 wrlen = sprintf(header_line, "@SQ\tSN:%s\tLN:%d\n",chro_name,chro_length);
480 }else if(chro_no == -1){
481 wrlen = sprintf(header_line, "@HD\tVN:1.0\tSO:unsorted\n");
482 }else if(chro_no == context -> sam_bam_chromosome_list->numOfElements ){
483 wrlen = sprintf(header_line, "@PG\tID:subread-long-read-mapping\tPN:subread-long-read-mapping\tCL:%s\n", context -> user_command_line);
484 }
485 if(context -> is_SAM_output){
486 fwrite( header_line, 1, wrlen, context -> sam_bam_file);
487 }else{
488 int new_need = thread_context -> out_buff_used + wrlen +1;
489 LRMcheck_resize_buff;
490 memcpy(thread_context -> out_SAMBAM_buffer + thread_context -> out_buff_used,header_line,wrlen);
491 thread_context -> out_buff_used += wrlen;
492 }
493 free(header_line);
494 }
495
496 int new_need = thread_context -> out_buff_used + 10;
497 LRMcheck_resize_buff;
498
499 int BAM_text_len = thread_context -> out_buff_used-8;
500 memcpy(thread_context -> out_SAMBAM_buffer+4, &BAM_text_len, 4);
501 memcpy(thread_context -> out_SAMBAM_buffer + thread_context -> out_buff_used, &context ->sam_bam_chromosome_list->numOfElements , 4);
502 thread_context -> out_buff_used +=4;
503 for(chro_no = 0; chro_no < context -> sam_bam_chromosome_list->numOfElements ; chro_no++){
504 char * chro_name = LRMArrayListGet(context -> sam_bam_chromosome_list, chro_no);
505 int chro_namelen = strlen(chro_name)+1;
506 int new_need = thread_context -> out_buff_used + chro_namelen + 9;
507 LRMcheck_resize_buff;
508
509 memcpy(thread_context -> out_SAMBAM_buffer+thread_context -> out_buff_used, &chro_namelen, 4);
510 thread_context -> out_buff_used +=4;
511 memcpy(thread_context -> out_SAMBAM_buffer+thread_context -> out_buff_used, chro_name, chro_namelen);
512 thread_context -> out_buff_used += chro_namelen;
513 int chro_length = LRMHashTableGet(context -> chromosome_size_table, chro_name) - NULL;
514 memcpy(thread_context -> out_SAMBAM_buffer+thread_context -> out_buff_used, &chro_length, 4);
515 thread_context -> out_buff_used +=4;
516 }
517
518 if(!context -> is_SAM_output){
519 LRMwrite_chunk_check_buffer_write(context, thread_context, 1);
520
521 }
522 context -> sam_bam_file_header_written = 1;
523 free(thread_context -> out_SAMBAM_buffer);
524 }
525
LRMbam_generate_tail_binary(LRMcontext_t * context,LRMthread_context_t * thread_context)526 void LRMbam_generate_tail_binary(LRMcontext_t * context, LRMthread_context_t * thread_context){
527 if(context -> bam_file_tail_length>0)return;
528 context -> bam_file_tail_length = LRMwrite_chunk_compress_bam_block(context, thread_context, "", context -> bam_file_tail_binary , 0);
529 }
530
LRMwrite_chunk_add_buffered_output(LRMcontext_t * context,LRMthread_context_t * thread_context,LRMread_iteration_context_t * iteration_context,int flags,char * chro_name,unsigned int chro_pos,int map_quality,char * cigar,int mis_matched)531 int LRMwrite_chunk_add_buffered_output(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, int flags, char * chro_name, unsigned int chro_pos, int map_quality, char * cigar, int mis_matched){
532 char * target_ptr;
533 int cigar_len = strlen(cigar);
534 int rname_len = strlen(iteration_context->read_name);
535 int read_binlen, actural_target_len;
536 read_binlen = cigar_len+rname_len+2.5*iteration_context->read_length+500;
537
538 if(read_binlen + thread_context->out_buff_used >= thread_context -> out_buff_capacity){
539 thread_context -> out_buff_capacity = max( thread_context -> out_buff_capacity*1.3, read_binlen);
540 thread_context -> out_SAMBAM_buffer=realloc(thread_context -> out_SAMBAM_buffer, thread_context -> out_buff_capacity);
541 }
542
543 target_ptr = thread_context -> out_SAMBAM_buffer + thread_context->out_buff_used;
544 if(context -> is_Phred_64) LRMquality_64_to_33(iteration_context->qual_text) ;
545
546 if(context->is_SAM_output){
547 actural_target_len = sprintf(target_ptr,"%s\t%d\t%s\t%u\t%d\t%s\t*\t0\t0\t%s\t%s\tNM:%d\n", iteration_context -> read_name, flags, chro_name, chro_pos + 1, map_quality, cigar, iteration_context->read_text, iteration_context->qual_text, mis_matched);
548 }else{
549 actural_target_len = LRMgenerate_bam_record(context, thread_context, iteration_context, target_ptr, flags, chro_pos, chro_name, map_quality, cigar, mis_matched);
550 }
551 thread_context -> out_buff_used+=actural_target_len;
552 LRMwrite_chunk_check_buffer_write(context, thread_context, 0);
553 return 0;
554 }
LRMwrite_chunk_check_buffer_write(LRMcontext_t * context,LRMthread_context_t * thread_context,int force_write)555 int LRMwrite_chunk_check_buffer_write(LRMcontext_t * context, LRMthread_context_t * thread_context, int force_write){
556 if(force_write || thread_context -> out_buff_used > LRMBAM_COMPRESS_TRIGGER){
557 if(!context->is_SAM_output){
558 int write_cursor = 0;
559 int compressed_cursor = 0;
560 for(write_cursor = 0; write_cursor < thread_context -> out_buff_used; write_cursor += LRMBAM_COMPRESS_BLOCK){
561 char compressed_data [66666];
562 int bin_len = min(LRMBAM_COMPRESS_BLOCK,thread_context -> out_buff_used - write_cursor);
563 int compressed_len = LRMwrite_chunk_compress_bam_block(context, thread_context, thread_context -> out_SAMBAM_buffer + write_cursor, compressed_data, bin_len);
564 if(compressed_len > bin_len){ // very unlikely
565 int new_needed_size = thread_context -> out_buff_used + (compressed_len - bin_len);
566 if(new_needed_size > thread_context -> out_buff_capacity){
567 thread_context -> out_buff_capacity = new_needed_size;
568 thread_context -> out_SAMBAM_buffer=realloc(thread_context -> out_SAMBAM_buffer, thread_context -> out_buff_capacity);
569 }
570 int x1;
571 for(x1 = thread_context -> out_buff_used - 1; x1 >= write_cursor + bin_len; x1 --)
572 thread_context -> out_SAMBAM_buffer[ x1 + (compressed_len - bin_len) ] = thread_context -> out_SAMBAM_buffer[ x1 ];
573 }
574 memcpy(thread_context -> out_SAMBAM_buffer + compressed_cursor, compressed_data, compressed_len);
575 compressed_cursor += compressed_len;
576 }
577 thread_context -> out_buff_used = compressed_cursor;
578 }
579 LRMthread_lock(&context->sam_bam_file_lock);
580 fwrite(thread_context -> out_SAMBAM_buffer,1, thread_context -> out_buff_used, context -> sam_bam_file);
581 LRMthread_lockrelease(&context->sam_bam_file_lock);
582 thread_context -> out_buff_used = 0;
583 }
584
585 return 0;
586 }
587
588
589
LRMpos2txt(LRMcontext_t * context,unsigned int linear,char * txt)590 void LRMpos2txt(LRMcontext_t * context, unsigned int linear, char * txt){
591 char * chro_name = NULL;
592 int pos;
593
594 int retv = LRMlocate_gene_position(context, linear, &chro_name, &pos);
595 if(chro_name && !retv)
596 sprintf(txt, "%s:%d", chro_name, pos+1);
597 else strcpy(txt, "*");
598 }
599
LRMlocate_gene_position(LRMcontext_t * context,unsigned int linear,char ** chro_name,int * pos)600 int LRMlocate_gene_position(LRMcontext_t * context, unsigned int linear, char ** chro_name, int * pos) {
601 int n = 0;
602 int total_offsets = context -> chromosome_size_list -> numOfElements;
603 int jump_ns = total_offsets/4;
604
605 //LRMprintf("LINEAR = %u\n", linear);
606
607 (*chro_name)=NULL;
608 (*pos) = -1;
609 while (jump_ns > 5)
610 {
611 while(n+jump_ns < total_offsets && LRMArrayListGet(context->chromosome_size_list, n + jump_ns) - NULL <= linear)
612 n+=jump_ns;
613 jump_ns /=4;
614 }
615
616 for (; n < total_offsets; n++)
617 {
618 //LRMprintf("TESTING %u\n",(LRMArrayListGet(context->chromosome_size_list, n) - NULL) );
619 if ( (LRMArrayListGet(context->chromosome_size_list, n) - NULL) > linear)
620 {
621 *pos = linear;
622 if(n>0)(*pos) -= (LRMArrayListGet(context->chromosome_size_list, n-1)-NULL);
623 if( (*pos) < context -> current_index_padding ) return 1;
624 else (*pos) -= context -> current_index_padding;
625 (*chro_name) = LRMArrayListGet(context -> sam_bam_chromosome_list, n);
626
627 return 0;
628 }
629 }
630 return -1;
631 }
632
LRMlocate_chro_length(LRMcontext_t * context,unsigned int linear,char ** chro_name,long long * end_pos)633 int LRMlocate_chro_length(LRMcontext_t * context, unsigned int linear, char ** chro_name, long long * end_pos){
634 int n = 0;
635 int total_offsets = context -> chromosome_size_list -> numOfElements;
636 int jump_ns = total_offsets/4;
637
638 while (jump_ns > 5)
639 {
640 while(n+jump_ns < total_offsets && LRMArrayListGet(context->chromosome_size_list, n + jump_ns) - NULL <= linear)
641 n+=jump_ns;
642 jump_ns /=4;
643 }
644
645 for (; n < total_offsets; n++)
646 {
647 //LRMprintf("TESTING %u\n",(LRMArrayListGet(context->chromosome_size_list, n) - NULL) );
648 if ( (LRMArrayListGet(context->chromosome_size_list, n) - NULL) > linear) {
649 (* chro_name) = LRMArrayListGet(context -> sam_bam_chromosome_list, n);
650 (* end_pos) = LRMArrayListGet(context->chromosome_size_list, n)-NULL;
651 if((* end_pos) > context -> current_index_padding) (* end_pos) -= context -> current_index_padding ;
652 return 0;
653 }
654 }
655 return -1;
656 }
657