1 /***************************************************************
2
3 The Subread software package is free software package:
4 you can redistribute it and/or modify it under the terms
5 of the GNU General Public License as published by the
6 Free Software Foundation, either version 3 of the License,
7 or (at your option) any later version.
8
9 Subread is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty
11 of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12
13 See the GNU General Public License for more details.
14
15 Authors: Drs Yang Liao and Wei Shi
16
17 ***************************************************************/
18
19
20 #include <stdio.h>
21 #include <signal.h>
22 #include <dirent.h>
23 #include <string.h>
24 #include <stdlib.h>
25 #include <ctype.h>
26 #include <sys/types.h>
27 #ifndef __MINGW32__
28 #include <sys/resource.h>
29 #endif
30 #include <sys/stat.h>
31 #include <unistd.h>
32 #include <zlib.h>
33 #include <stdio.h>
34 #include <assert.h>
35 #include "input-files.h"
36 #include "input-blc.h"
37 #include "sambam-file.h"
38 #include "HelperFunctions.h"
39 #include "hashtable.h"
40 #include "seek-zlib.h"
41 #include "gene-algorithms.h"
42 #include "sublog.h"
43
44 unsigned int BASE_BLOCK_LENGTH = 15000000;
45
// Opens `fname` with the stdio mode string `mode`.
// MinGW builds and every target not matched by the LP64/macOS/BSD test go
// through fopen64; the matched targets use plain fopen.
// Returns the stream produced by the underlying call (NULL on failure).
FILE * f_subr_open(const char * fname, const char * mode)
{
#if !defined(__MINGW32__) && (defined(__LP64__) || defined(_LP64) || defined(MACOS) || defined(__FreeBSD__) || defined(__DragonFly__))
	FILE * opened_fp = fopen(fname, mode);
#else
	FILE * opened_fp = fopen64(fname, mode);
#endif
	return opened_fp;
}
// Subtracts 31 from every character of the NUL-terminated string `qs`, in place
// (the offset between the two quality encodings named in the function name).
void fastq_64_to_33(char * qs)
{
	char * cursor;
	for(cursor = qs; *cursor; cursor++)
		*cursor -= 31;
}
65
// Thread entry point used by delay_realloc: waits 100000 microseconds and then
// frees `ptr`. Always returns NULL.
void * delay_run(void * ptr)
{
	const unsigned int grace_period_us = 100000;

	usleep(grace_period_us);
	free(ptr);
	return NULL;
}
71
delay_realloc(void * old_pntr,size_t old_size,size_t new_size)72 void * delay_realloc(void * old_pntr, size_t old_size, size_t new_size){
73 pthread_t thread;
74 void * new_ret = malloc(new_size);
75 memcpy(new_ret, old_pntr, old_size);
76 pthread_create(&thread, NULL, delay_run, old_pntr);
77 return new_ret;
78 }
79
// Returns a newly malloc'ed copy of the NUL-terminated string `in`,
// or NULL if the allocation fails.
// The caller is in charge of deallocation (free).
char * memstrcpy(char * in){
	size_t ilen = strlen(in);
	char * ret = malloc(ilen + 1);
	if(ret == NULL) return NULL;
	// copy the terminator together with the text
	memcpy(ret, in, ilen + 1);
	return ret;
}
88
// Convenience wrapper around guess_reads_density_format that does not ask for
// the Phred score range or the number of tested reads.
double guess_reads_density(char * fname, int is_sam)
{
	double density = guess_reads_density_format(fname, is_sam, NULL, NULL, NULL);
	return density;
}
93
geinput_file_offset(gene_input_t * input)94 srInt_64 geinput_file_offset( gene_input_t * input){
95 if(input -> file_type == GENE_INPUT_GZIP_FASTQ || input -> file_type == GENE_INPUT_GZIP_FASTA){
96 if(((seekable_zfile_t*)input -> input_fp) -> blocks_in_chain<1)return 0;
97 seekable_decompressed_block_t * ct = ((seekable_zfile_t*)input -> input_fp) -> block_rolling_chain+((seekable_zfile_t*)input -> input_fp) -> block_chain_current_no;
98 return ct -> block_start_in_file_offset + ((seekable_zfile_t*)input -> input_fp) -> current_block_txt_read_ptr * 5/16; // compressed text ~= plain text * 28%
99 }else{
100 return ftello((FILE*)input ->input_fp);
101 }
102 }
103
guess_reads_density_format(char * fname,int is_sam,int * min_phred_score,int * max_phred_score,int * tested_reads)104 double guess_reads_density_format(char * fname, int is_sam, int * min_phred_score, int * max_phred_score, int * tested_reads)
105 {
106 gene_input_t *ginp = malloc(sizeof(gene_input_t));
107 srInt_64 fpos =0, fpos2 = 0;
108 int i;
109 int max_qual_chr = -1, min_qual_chr = 127;
110 char buff[MAX_READ_LENGTH] , qbuf[MAX_READ_LENGTH];
111
112 float retv = 0;
113
114 if(is_sam == 0)
115 {
116 if(geinput_open(fname, ginp))retv= -1.0;
117 }else if(is_sam == 1)
118 {
119 if(geinput_open_sam(fname, ginp,0))retv= -1.0;
120 }else if(is_sam == 2)
121 {
122 if(geinput_open_sam(fname, ginp,1))retv= -1.0;
123 }
124
125 if(retv > -0.1){
126 geinput_next_read(ginp, NULL, buff, NULL);
127
128 fpos = geinput_file_offset(ginp);
129 for(i=0; i<3000; i++)
130 {
131 if(geinput_next_read(ginp, NULL, buff, qbuf)<0) break;
132 if(qbuf[0])
133 {
134 int xk=0;
135 while(qbuf[xk])
136 {
137 min_qual_chr = min(min_qual_chr,qbuf[xk]);
138 max_qual_chr = max(max_qual_chr,qbuf[xk++]);
139 }
140 }
141 if(tested_reads)
142 (*tested_reads) ++;
143
144 }
145
146 if(min_phred_score)
147 {
148 (*min_phred_score) = min_qual_chr;
149 (*max_phred_score) = max_qual_chr;
150
151 }
152 fpos2 = geinput_file_offset(ginp) - fpos;
153 geinput_close(ginp);
154
155 retv= fpos2*1.0/i;
156 }
157
158 free(ginp);
159 return retv;
160 }
161
is_gene_char(char c)162 int is_gene_char(char c)
163 {
164 //if(c== 'M' || c == 'm' || c == 'U' || c == 'u' || c == 'A' || c=='a' || c=='G' || c=='g' || c=='C' || c=='c' || c=='T' || c=='t' || c=='N' || c=='n')
165 if(c=='-' || c == '.' || c == 'N')
166 return GENE_SPACE_BASE;
167 if((c>='A' && c<'Z') || (c>='a' && c<='z'))
168 return GENE_SPACE_BASE;
169 if(c>='0' && c<'9')
170 return GENE_SPACE_COLOR;
171 return 0;
172 }
173
guess_gene_bases(char ** files,int file_number)174 srInt_64 guess_gene_bases(char ** files, int file_number)
175 {
176 int i;
177 srInt_64 ret = 0;
178
179 for(i=0; i<file_number; i++)
180 {
181 char * fname = files[i];
182 struct stat statbuf;
183
184 if (stat(fname , &statbuf))
185 {
186 //SUBREADprintf("guess_gene_bases NOT FOUND!!%s\n", fname);
187 return -i-1;
188 }
189
190 ret += statbuf.st_size;
191 ret -= 150;
192 if(ret<2)ret=2;
193 }
194 return ret * 70 / 71;
195 }
196
// Reads one character from `input`: through seekgz_next_char for gzipped FASTQ
// inputs, through fgetc for everything else. The macro argument is parenthesised
// so that any pointer expression can be passed safely.
#define geinput_getc(input) ( (input) -> file_type == GENE_INPUT_GZIP_FASTQ ? (seekgz_next_char((seekable_zfile_t*)(input) -> input_fp)) : (fgetc((FILE*)(input) -> input_fp)) )
198
// For gzipped FASTQ inputs, forwards to seekgz_preload_buffer with `read_lock`
// and returns its result. Every other input type is a no-op that returns 0.
int geinput_preload_buffer(gene_input_t * input, subread_lock_t * read_lock){
	int preload_ret = 0;

	if(input -> file_type == GENE_INPUT_GZIP_FASTQ)
		preload_ret = seekgz_preload_buffer((seekable_zfile_t*)input -> input_fp, read_lock);
	return preload_ret;
}
204
205
206
207 // read the line EXCLUDE last \n
208 // returns bytes WITHOUT \n
read_line_noempty(int max_read_len,gene_input_t * input,char * buff,int must_upper)209 int read_line_noempty(int max_read_len, gene_input_t * input, char * buff, int must_upper)
210 {
211 int ret =0;
212
213 if(input -> file_type == GENE_INPUT_GZIP_FASTQ || input -> file_type == GENE_INPUT_GZIP_FASTA){
214 seekgz_preload_buffer((seekable_zfile_t*)input -> input_fp, NULL);
215 ret = seekgz_gets((seekable_zfile_t*)input->input_fp, buff, MAX_READ_LENGTH-1);
216 if(ret > 0){
217 if(must_upper){
218 int ii;
219 for(ii=0; ii<ret-1;ii++) buff[ii] = toupper(buff[ii]);
220 }
221 buff[ret-1] =0;
222 return ret - 1;
223 } else return 0;
224 }
225 if(must_upper)
226 {
227 while(1)
228 {
229 char ch = geinput_getc(input);
230 #ifdef __MINGW32__
231 if(ch == '\r') continue;
232 #endif
233 if(ch == EOF) break;
234 if(ch == '\n'){
235 if(ret)
236 break;
237 }
238 else if(ret < max_read_len-1)
239 buff[ret++] = toupper(ch);
240 }
241 }
242 else
243 {
244 while(1)
245 {
246 char ch = geinput_getc(input);
247 #ifdef __MINGW32__
248 if(ch == '\r') continue;
249 #endif
250 if (ch == EOF) break;
251 if(ch == '\n'){
252 if(ret)
253 break;
254 }
255 else if(ret < max_read_len-1) buff[ret++] = ch;
256 }
257
258 }
259 buff[ret]=0;
260 return ret;
261 }
262
263
264
// Reads one line from `fp` into `buff`, stopping at '\n' or at end of file.
// The '\n' is consumed but not stored. At most max_read_len-1 characters are
// stored; the remainder of an over-long line is read and discarded.
// When `must_upper` is non-zero every stored character is upper-cased.
// `buff` is always NUL-terminated. Returns the number of characters stored
// (0 for an empty line and at end of file).
int read_line(int max_read_len, FILE * fp, char * buff, int must_upper)
{
	int ret =0;

	while(1)
	{
		// fgetc returns an int; storing it in a char would make byte 0xFF look
		// like EOF (signed char) or make EOF undetectable (unsigned char)
		int ch = fgetc(fp);
		#ifdef __MINGW32__
		if(ch == '\r') continue;
		#endif
		if(ch == '\n' || ch == EOF) break;
		if(ret < max_read_len-1)
			buff[ret++] = (char)(must_upper ? toupper(ch) : ch);
	}
	buff[ret]=0;
	return ret;
}
298
299
300
// Reads the next non-empty line from `fp` into `buff`.
// Leading '\n' characters are skipped; reading stops at the first '\n' after
// some other character was seen, or at end of file. '\r' is never stored.
// With `must_upper` non-zero the text is upper-cased and spaces and tabs are
// dropped; otherwise spaces and tabs are kept.
// Up to `max_read_len` characters are stored followed by a NUL terminator, so
// `buff` must have room for max_read_len+1 bytes.
// Returns the number of characters stored.
int read_line_back(int max_read_len, FILE * fp, char * buff, int must_upper)
{
	int ret =0;
	int started = 0;

	while(1)
	{
		// int, not char: keeps EOF distinct from every data byte
		int ch = fgetc(fp);
		if(ch == EOF) break;
		if(ch == '\n')
		{
			if(started) break;
			continue;
		}
		started = 1;

		if(ret >= max_read_len || ch == '\r') continue;
		if(must_upper)
		{
			if(ch != ' ' && ch != '\t')
				buff[ret++] = (char)toupper(ch);
		}
		else
			buff[ret++] = (char)ch;
	}
	buff[ret]=0;
	return ret;
}
345
// Reads one line (at most MAX_READ_LENGTH-1 characters) from the stream of
// `input` into `buff` via read_line; `conv_to_upper` requests upper-casing.
// Returns the number of characters stored.
int geinput_readline(gene_input_t * input, char * buff, int conv_to_upper)
{
	int line_len = read_line(MAX_READ_LENGTH, input -> input_fp, buff, conv_to_upper);
	return line_len;
}
350
is_read(char * in_buff)351 int is_read(char * in_buff)
352 {
353 int p=0;
354 char c;
355 int space_type = GENE_SPACE_BASE;
356 while((c=in_buff[p++])!='\0')
357 {
358 if(c!='\r' && c!='\n'){
359 int x = is_gene_char(c);
360 if (x == GENE_SPACE_COLOR)
361 space_type = GENE_SPACE_COLOR;
362 else if(!x)
363 return 0;
364 }
365 }
366 return space_type;
367 }
368
// Tokeniser that splits on a multi-character delimiter STRING (not a set of
// characters like strtok). The input is modified in place: the delimiter that
// ends the returned token is overwritten with '\0'.
//
//   str   : text to start tokenising, or NULL to continue from `*next`.
//   delim : delimiter string. An empty delimiter never matches, so the whole
//           remaining text is returned as the last token.
//   next  : state pointer owned by the caller; set to the text after the
//           delimiter, or to NULL when the returned token is the last one.
//
// Returns the token, or NULL when `delim` or `next` is NULL or nothing is left.
// Adjacent delimiters produce empty tokens.
char *strtokmm(char *str, const char *delim, char ** next) {
	char *tok;
	char *m;
	size_t delim_len;

	if (delim == NULL || next == NULL) return NULL;

	tok = (str) ? str : (*next);
	if (tok == NULL) return NULL;

	delim_len = strlen(delim);
	// strstr() matches "" at the very start, which would return empty tokens forever
	m = delim_len ? strstr(tok, delim) : NULL;

	if (m) {
		(*next) = m + delim_len;
		*m = '\0';
	} else {
		(*next) = NULL;
	}

	return tok;
}
389
// Prepares `input` for reading single-cell RNA BAM input described by `rfnames`.
// The name is copied into input->filename, the scBAM reader is initialised and
// the input is marked as base-space GENE_INPUT_SCRNA_BAM regardless of the
// initialisation result. `reads_per_chunk` and `threads` are not used here.
// Returns the value returned by input_scBAM_init.
// NOTE(review): the copy into input->filename is unbounded; the caller has to
// make sure `rfnames` fits.
int geinput_open_scRNA_BAM(char * rfnames, gene_input_t * input, int reads_per_chunk, int threads ){
	int init_ret;

	strcpy(input->filename,rfnames);
	init_ret = input_scBAM_init(&input -> scBAM_input, rfnames);

	input -> space_type = GENE_SPACE_BASE;
	input -> file_type = GENE_INPUT_SCRNA_BAM;
	return init_ret;
}
397
// Prepares `input` for reading single-cell RNA FASTQ input described by the
// single string `rfnames`. The string is copied into input->filename, the
// multi-FASTQ reader is initialised from it and the input is marked as
// base-space GENE_INPUT_SCRNA_FASTQ regardless of the initialisation result.
// `reads_per_chunk` and `threads` are not used here.
// Returns the value returned by input_mFQ_init_by_one_string.
// NOTE(review): the copy into input->filename is unbounded; the caller has to
// make sure `rfnames` fits.
int geinput_open_scRNA_fqs(char * rfnames, gene_input_t * input, int reads_per_chunk, int threads ){
	int init_ret;

	strcpy(input->filename,rfnames);
	init_ret = input_mFQ_init_by_one_string(&input -> scRNA_fq_input, rfnames);

	input -> space_type = GENE_SPACE_BASE;
	input -> file_type = GENE_INPUT_SCRNA_FASTQ;
	return init_ret;
}
405
// Prepares `input` for reading from the BCL directory `dir_name`.
// The BCL cache is initialised with `reads_per_chunk` and `threads`, then the
// directory name is copied into input->filename. Only on success is the input
// marked as base-space GENE_INPUT_BCL.
// Returns 0 on success and -1 when cacheBCL_init reports a failure.
// NOTE(review): the copy into input->filename is unbounded; the caller has to
// make sure `dir_name` fits.
int geinput_open_bcl( const char * dir_name, gene_input_t * input, int reads_per_chunk, int threads){
	int init_ret = cacheBCL_init(&input -> bcl_input , (char*) dir_name, reads_per_chunk, threads );

	strcpy(input->filename, dir_name);
	if(init_ret == 0){
		input -> file_type = GENE_INPUT_BCL;
		input -> space_type = GENE_SPACE_BASE;
		return 0;
	}
	return -1;
}
414
// Opens the SAM file `filename` for reading reads.
//
//   half_number : added to GENE_INPUT_SAM_SINGLE to form input->file_type.
//
// Header lines (starting with '@') are skipped. The first other line must have
// at least 10 tab characters; the text between the 9th and 10th tab is used to
// detect the space type of the reads. Unless the resulting file type is
// GENE_INPUT_SAM_PAIR_2, the stream is rewound to the start of that line.
// The resulting stream position is stored in input->read_chunk_start.
//
// Returns 0 on success and 1 when the file cannot be opened, an empty line or
// the end of file is met before a record, or the record has too few fields.
// NOTE(review): on the failure paths after a successful open the stream stays
// open in input->input_fp; the copy into input->filename is unbounded.
int geinput_open_sam(const char * filename, gene_input_t * input, int half_number)
{
	input->input_fp = f_subr_open(filename, "rb");

	strcpy(input->filename, filename);

	if(input->input_fp == NULL)
		return 1;
	input -> file_type = half_number + GENE_INPUT_SAM_SINGLE;

	while(1){
		char in_buff[3001];
		srInt_64 current_pos = ftello(input -> input_fp);
		int rlen = read_line(3000, input->input_fp, in_buff, 0);
		if(rlen < 1) return 1;

		if(in_buff[0] == '@') continue;	// header line

		int x, tab_no = 0;
		char *read_buf=NULL;
		for(x=0; x<rlen; x++)
		{
			if(in_buff[x]!='\t') continue;

			tab_no ++;
			if(tab_no ==9) read_buf = in_buff+x+1;	// start of the read text
			if(tab_no ==10) in_buff[x]=0;		// terminate the read text
		}
		if (tab_no<10)return 1;

		input->space_type = is_read(read_buf);
		if (GENE_INPUT_SAM_PAIR_2 != input -> file_type) fseeko(input -> input_fp , current_pos, SEEK_SET);
		// ftello, like everywhere else in this file: ftell returns a long, which
		// cannot represent offsets beyond 2GB where long is 32-bit
		input -> read_chunk_start = ftello(input -> input_fp);
		break;
	}

	return 0;
}
454
// Opens `filename` as a read input and detects its format.
//
// A file starting with the gzip magic bytes (31, 139) is opened through seekgz:
// the first non-empty line decides between GENE_INPUT_GZIP_FASTQ ('@') and
// GENE_INPUT_GZIP_FASTA, the second non-empty line is used to detect the space
// type, and the gzip stream is then reopened so that reading starts at the top.
//
// Any other file is read as text: if the very first non-empty line is itself a
// read the type is GENE_INPUT_PLAIN; a line starting with '>' selects
// GENE_INPUT_FASTA and a line starting with '@' selects GENE_INPUT_FASTQ. The
// stream is positioned at the start of the line that decided the type.
//
// input->read_chunk_start receives the resulting file offset and
// input->space_type falls back to GENE_SPACE_BASE when nothing was detected.
//
// Returns 0 on success. Returns 1 when the name is longer than 298 characters,
// the file cannot be opened, memory is exhausted or no usable line is found in
// a text file; returns the seekgz_open error code when the gzip stream cannot
// be opened (input->input_fp is NULL in that case).
int geinput_open(const char * filename, gene_input_t * input)
{
	char in_buff[MAX_READ_LENGTH];
	int line_no = 0, ret = 0;
	if(strlen(filename)>298)
		return 1;

	input -> gzfa_last_name[0]=0;
	// make the "nothing detected" test at the end well defined
	input -> space_type = 0;
	strcpy(input->filename, filename);
	FILE * TMP_FP = f_subr_open(filename, "rb");

	if(TMP_FP == NULL)
		return 1;

	int id1, id2;
	id1 = fgetc(TMP_FP);
	id2 = fgetc(TMP_FP);

	if(id1 == 31 && id2 == 139) {
		fclose(TMP_FP);
		input->input_fp = malloc(sizeof(seekable_zfile_t));
		if(input->input_fp == NULL)
			return 1;

		ret = seekgz_open(filename, input->input_fp, NULL );
		if(ret != 0){
			// nothing usable was opened: release the reader object and stop here
			// instead of querying an input whose type was never set
			free(input->input_fp);
			input->input_fp = NULL;
			return ret;
		}

		int fq_stat = 0;	// number of non-empty lines seen so far
		// defined type even if the file contains no text line at all
		input->file_type = GENE_INPUT_GZIP_FASTQ;
		for(line_no = 0; line_no < 1000; line_no++){
			int fl = seekgz_gets(input->input_fp, in_buff, 1000);
			if(fl < 1)break; // EOF
			else if(fl == 1)continue; // empty line
			else{ // text line

				// decide on the first TEXT line, so leading empty lines do not
				// leave the type undecided
				if(fq_stat==0)input->file_type = in_buff[0]=='@'? GENE_INPUT_GZIP_FASTQ: GENE_INPUT_GZIP_FASTA;
				if(fq_stat%4 == 1) // read text
				{
					input->space_type = is_read(in_buff);
					break;
				}
				fq_stat ++;
			}
		}
		// reopen so that the caller starts reading from the beginning
		seekgz_close(input->input_fp);
		seekgz_open(filename, input->input_fp, NULL);
	}else{
		input->file_type = GENE_INPUT_FASTQ;
		input->input_fp = TMP_FP;
		fseeko(input->input_fp, 0, SEEK_SET);
		while (1){
			srInt_64 last_pos = ftello(input->input_fp);
			int rlen = read_line_noempty(MAX_READ_LENGTH, input, in_buff, 0);
			if (rlen<=0){
				ret = 1;
				break;
			}else{
				if(line_no==0 && is_read(in_buff))
				{
					input->file_type = GENE_INPUT_PLAIN;
					input->space_type = is_read(in_buff);
					fseeko(input->input_fp,last_pos,SEEK_SET);
					break;
				}
				if(in_buff[0]=='>')
				{
					input->file_type = GENE_INPUT_FASTA;
					// the line after the header is the sequence text
					read_line(MAX_READ_LENGTH, input->input_fp, in_buff, 0);
					input->space_type = is_read(in_buff);

					fseeko(input->input_fp,last_pos,SEEK_SET);
					break;
				}
				if(in_buff[0]=='@')
				{
					input->file_type = GENE_INPUT_FASTQ;
					// the next non-empty line is the read text
					read_line_noempty(MAX_READ_LENGTH, input, in_buff, 0);
					input->space_type = is_read(in_buff);
					fseeko(input->input_fp, last_pos,SEEK_SET);
					break;
				}
				line_no++;
			}
		}
	}
	input -> read_chunk_start = geinput_file_offset(input);

	if(0 == input->space_type)input->space_type = GENE_SPACE_BASE;
	return ret;
}
543
// Returns the next sequence character of a FASTA chromosome file.
//
// Line breaks, spaces and tabs are skipped. Return values:
//   upper-cased character : a character accepted by is_gene_char;
//   'N'  : an unacceptable character was found; a diagnostic showing the
//          offending line is printed and the character is replaced;
//   -1   : a '>' was met after at least one line break (start of the next
//          sequence; the stream is moved back by the number of line breaks
//          seen), or a '>' directly follows the previous header line (empty
//          sequence; a message is printed);
//   -2   : end of file or read error;
//   -3   : `input` is not a GENE_INPUT_FASTA input.
int geinput_next_char(gene_input_t * input)
{
	if(input->file_type != GENE_INPUT_FASTA)
	{
		SUBREADprintf("Only the FASTA format is accepted for input chromosome data.\n");
		return -3;
	}

	int last_br = 0;	// number of '\r' / '\n' characters skipped so far
	while (1)
	{
		// int, not char: EOF must stay distinguishable from data bytes, also on
		// platforms where char is unsigned
		int nch = fgetc((FILE *)input->input_fp);
		if (nch == EOF)
			return -2;
		else if (nch > 126)SUBREADprintf("\nUnrecognised char = #%d\n", nch);

		if (nch == '\r')
		{
			#ifndef __MINGW32__
			SUBREADprintf("The input FASTA file contains \\r characters. This should not result in any problem but we suggest to use UNIX-style line breaks.\n");
			#endif
			last_br ++;
			continue;
		}
		if (nch == '\n')
		{
			last_br ++;
			continue;
		}
		if (nch == ' ' || nch == '\t')
			continue;

		if (nch == '>' && last_br)
		{
			// if this is a new segment

			fseeko(input->input_fp, -last_br , SEEK_CUR);
			return -1;
		}

		if (is_gene_char(nch))
			return toupper(nch);

		// Unacceptable character: locate the start of the current line so that
		// the line can be shown in the diagnostic.
		srInt_64 fpos = ftello(input->input_fp);
		int back_search_len =2;
		int is_empty_seq = 0;
		char out_buf[2000];

		while( fpos >= back_search_len )
		{
			fseeko(input->input_fp, fpos - back_search_len, SEEK_SET);
			int bc_nch = fgetc(input->input_fp);
			if(bc_nch=='\n')
			{
				// a '>' right at the start of a line: the previous sequence was empty
				if(nch == '>' && back_search_len==2) is_empty_seq=1;
				break;
			}
			back_search_len++;
		}

		char * fgin = fgets(out_buf, 1999,input->input_fp);
		if(NULL == fgin) out_buf[0]=0;

		if(is_empty_seq)
		{
			size_t shown_len = strlen(out_buf);
			if(shown_len>0)
				out_buf[shown_len-1]=0;
			SUBREADprintf ("\nEmpty chromosome sequence before '%s'. The file offset is %llu\n",out_buf, (unsigned long long)fpos);
			return -1;
		}

		#ifdef __MINGW32__
		SUBREADprintf ("\nUnknown character in the chromosome data: '%c' (ASCII:%02X), ignored. The file offset is %lu\n", nch, nch, (unsigned long)fpos);
		#else
		SUBREADprintf ("\nUnknown character in the chromosome data: '%c' (ASCII:%02X), ignored. The file offset is %llu\n", nch, nch, (unsigned long long)fpos);
		#endif
		SUBREADprintf("%s", out_buf);
		// put the caret under the offending character
		for(; back_search_len>2; back_search_len--)
			SUBREADprintf(" ");
		SUBREADprintf("^\n");

		// continue right after the offending character
		fseeko(input->input_fp, fpos, SEEK_SET);
		return 'N';
	}
}
639
640
// Peeks at the next line of `input`: reads up to 2999 characters of it into
// `linebuffer_3000` and then restores the stream position.
// Returns the line length, or -1 when the line is empty or the end of the file
// is reached (the position is not restored in that case).
int geinput_readline_back(gene_input_t * input, char * linebuffer_3000)
{
	srInt_64 start_pos = ftello(input -> input_fp);
	int line_len = read_line(3000, input->input_fp, linebuffer_3000, 0);

	if(line_len >= 1)
	{
		fseeko(input -> input_fp, start_pos, SEEK_SET);
		return line_len;
	}
	return -1;
}
649
// Consumes characters from `input` up to and including the next '\n', or until EOF.
// Both macros expect a variable `nch` and a gene_input_t pointer named `input`
// in the enclosing scope; afterwards `nch` holds the last character read.
// They expand to a brace block, not to a single statement.
#define SKIP_LINE { nch=' '; while(nch != EOF && nch != '\n') nch = geinput_getc(input); }
// Like SKIP_LINE, but a '\n' only ends the skipping after at least one other
// character has been consumed, so leading empty lines are skipped as well.
#define SKIP_LINE_NOEMPTY {int content_line_l = 0; nch=' '; while(nch != EOF && (nch != '\n' ||! content_line_l)){nch = geinput_getc(input); content_line_l += (nch != '\n');} }
652
653 //#define SKIP_LINE { nch=' '; while(nch != EOF && nch != '\n') nch = geinput_getc(input); }
654
// Estimates the number of reads between the current position of `input` and
// the end of the file by counting line breaks; the stream position is restored
// afterwards.
// For SAM-type inputs the leading header lines (starting with '@') are skipped
// first. The line count is divided by 4 for FASTQ and by 2 for FASTA inputs
// and returned unchanged otherwise. A last line without a trailing '\n' is not
// counted.
unsigned int read_numbers(gene_input_t * input)
{
	unsigned int ret = 0;
	// int, not char: the SKIP_LINE macro compares nch with EOF
	int nch;
	srInt_64 fpos = ftello(input->input_fp);
	if(input->file_type >= GENE_INPUT_SAM_SINGLE)
	{
		while(1)
		{
			nch = fgetc(input->input_fp);
			if(nch=='@')
				SKIP_LINE
			else break;
		}
	}

	while(1)
	{
		SKIP_LINE
		if(nch==EOF) break;
		ret ++;
	}
	fseeko(input->input_fp, fpos, SEEK_SET);
	if (input->file_type == GENE_INPUT_FASTQ) return ret/4;
	if (input->file_type == GENE_INPUT_FASTA) return ret/2;
	return ret;
}
682
// Stores the current reading position of `input` in `pos`, using the
// position representation that matches the input type:
// scBAM and multi-FASTQ inputs delegate to their own tell functions, gzipped
// inputs store the seekgz position together with gzfa_last_name, and plain
// streams store the ftello() offset. BCL inputs are not supported (assertion).
void geinput_tell(gene_input_t * input, gene_inputfile_position_t * pos){
	if(input -> file_type == GENE_INPUT_SCRNA_BAM){
		scBAM_tell(&input -> scBAM_input, &pos -> scBAM_position);
		return;
	}
	if(input -> file_type == GENE_INPUT_SCRNA_FASTQ){
		input_mFQ_tell(&input -> scRNA_fq_input, &pos -> mFQ_position);
		return;
	}
	if(input -> file_type == GENE_INPUT_BCL){
		// always fails inside this branch
		assert(input -> file_type != GENE_INPUT_BCL);
		return;
	}
	if(input -> file_type == GENE_INPUT_GZIP_FASTQ || input -> file_type == GENE_INPUT_GZIP_FASTA){
		seekgz_tell(( seekable_zfile_t *)input -> input_fp, &pos -> seekable_gzip_position);
		// copying an empty name also yields an empty name in `pos`
		strcpy(pos -> gzfa_last_name, input -> gzfa_last_name);
		return;
	}
	pos -> simple_file_position = ftello((FILE *)input -> input_fp);
}
698
// Moves `input` to a position previously recorded in `pos` by geinput_tell.
// scBAM and multi-FASTQ inputs delegate to their own seek functions, gzipped
// inputs restore the seekgz position together with gzfa_last_name, and plain
// streams are repositioned with fseeko(). BCL inputs are not supported (assertion).
void geinput_seek(gene_input_t * input, gene_inputfile_position_t * pos){
	if(input -> file_type == GENE_INPUT_SCRNA_BAM){
		scBAM_seek(&input -> scBAM_input, &pos -> scBAM_position);
		return;
	}
	if(input -> file_type == GENE_INPUT_SCRNA_FASTQ){
		input_mFQ_seek(&input -> scRNA_fq_input, &pos -> mFQ_position);
		return;
	}
	if(input -> file_type == GENE_INPUT_BCL){
		// always fails inside this branch
		assert(input -> file_type != GENE_INPUT_BCL);
		return;
	}
	if(input -> file_type == GENE_INPUT_GZIP_FASTQ || input -> file_type == GENE_INPUT_GZIP_FASTA){
		seekgz_seek(( seekable_zfile_t *)input -> input_fp, &pos -> seekable_gzip_position);
		// copying an empty name also yields an empty name in `input`
		strcpy(input -> gzfa_last_name, pos -> gzfa_last_name);
		return;
	}
	fseeko((FILE *)input -> input_fp, pos -> simple_file_position, SEEK_SET);
}
714
// Removes `t_5` characters from the start and `t_3` characters from the end of
// a read, in place.
//
//   read_text : read bases, at least `rlen` characters long.
//   qual_text : matching quality string, or NULL if there is none.
//   rlen      : current read length.
//   t_5, t_3  : number of characters to trim at each end (expected to be >= 0).
//
// Returns the new read length. When nothing would be left, both strings are
// made empty and 0 is returned.
int trim_read_inner(char * read_text, char * qual_text, int rlen, short t_5, short t_3)
{
	int after_5;

	if(rlen <= t_5)
	{
		read_text[0]=0;
		if(qual_text)qual_text[0]=0;
		return 0;
	}

	// drop the 5' end by shifting the remainder to the front
	after_5 = rlen - t_5;
	memmove(read_text, read_text + t_5, after_5);
	if(qual_text)
		memmove(qual_text, qual_text + t_5, after_5);

	if(after_5 <= t_3)
	{
		read_text[0]=0;
		if(qual_text)qual_text[0]=0;
		return 0;
	}

	// drop the 3' end by terminating the strings early; the length is >= 1 here
	read_text[after_5 - t_3]=0;
	if(qual_text)qual_text[after_5 - t_3]=0;
	return after_5 - t_3;
}
749
tell_current_line_no(gene_input_t * input)750 srInt_64 tell_current_line_no(gene_input_t * input){
751 srInt_64 fpos = ftello(input->input_fp);
752 fseeko(input->input_fp,0,SEEK_SET);
753 srInt_64 ret = 0, fscanpos = 0;
754 while(1)
755 {
756 char nch = fgetc(input->input_fp);
757 if(nch == EOF) return -1;
758 if(nch == '\n') ret ++;
759 fscanpos ++;
760 if(fscanpos >= fpos){
761 fseeko(input->input_fp, fpos, SEEK_SET);
762 return ret;
763 }
764 }
765 }
766
// Fetches the next read from `input` without trimming and without reporting
// the secondary-alignment flag; see geinput_next_read_trim for the meaning of
// the buffers and of the return value.
int geinput_next_read(gene_input_t * input, char * read_name, char * read_string, char * quality_string)
{
	const short no_trim = 0;
	int read_len = geinput_next_read_trim( input, read_name, read_string, quality_string, no_trim, no_trim, NULL);
	return read_len;
}
771
772 // returns read length if OK
geinput_next_read_trim(gene_input_t * input,char * read_name,char * read_string,char * quality_string,short trim_5,short trim_3,int * is_secondary)773 int geinput_next_read_trim(gene_input_t * input, char * read_name, char * read_string, char * quality_string, short trim_5, short trim_3, int * is_secondary)
774 {
775 if(input -> file_type == GENE_INPUT_BCL) {
776 int rv = cacheBCL_next_read(&input -> bcl_input, read_name, read_string, quality_string, NULL);
777 if(rv<=0) return -1;
778 if(trim_5 || trim_3) rv = trim_read_inner(read_string, quality_string, rv, trim_5, trim_3);
779 return rv;
780 } else if(input -> file_type == GENE_INPUT_SCRNA_FASTQ) {
781 int rv = input_mFQ_next_read(&input -> scRNA_fq_input, read_name, read_string, quality_string);
782 if(rv<=0) return -1;
783 if(trim_5 || trim_3) rv = trim_read_inner(read_string, quality_string, rv, trim_5, trim_3);
784 return rv;
785 } else if(input -> file_type == GENE_INPUT_SCRNA_BAM) {
786 int rv = scBAM_next_read(&input -> scBAM_input, read_name, read_string, quality_string);
787 if(rv<=0) return -1;
788 if(trim_5 || trim_3) rv = trim_read_inner(read_string, quality_string, rv, trim_5, trim_3);
789 return rv;
790 } else if(input -> file_type == GENE_INPUT_PLAIN) {
791 int ret = read_line(MAX_READ_LENGTH, input->input_fp, read_string, 0);
792 if(quality_string) *quality_string=0;
793
794 if(ret <3)return -1;
795
796 if(trim_5 || trim_3) ret = trim_read_inner(read_string, NULL, ret, trim_5, trim_3);
797 return ret;
798 } else if(input->file_type >= GENE_INPUT_SAM_SINGLE) {
799 char in_buff [3001];
800 int tabs;
801 int current_str_pos;
802 int i;
803 int ret = -1;
804 int need_reverse;
805 char mask_buf[5];
806
807
808
809 while(1)
810 {
811 // int is_second_map = 0;
812 int linelen = read_line(3000, input->input_fp, in_buff, 0);
813 if(linelen <1)return -1;
814 if(read_name)
815 *read_name = 0;
816 if(quality_string)
817 *quality_string = 0;
818 *read_string = 0;
819 need_reverse = 0;
820 current_str_pos = 0;
821 ret = -1;
822 tabs=0;
823
824 for(i=0; i<linelen+1; i++)
825 {
826 if(in_buff[i]=='\t'|| i ==linelen)
827 {
828 if(tabs == 0 && read_name)read_name[current_str_pos] = 0;
829 if(tabs == 1)
830 {
831 mask_buf[current_str_pos] = 0;
832 int flags = atoi(mask_buf) ;
833 if(is_secondary && (flags & SAM_FLAG_SECONDARY_MAPPING))
834 {
835 (*is_secondary) = 1;
836 }
837 need_reverse = ( flags & SAM_FLAG_REVERSE_STRAND_MATCHED )?1:0;
838
839 }
840 if(tabs == 9){
841 read_string[current_str_pos] = 0;
842 ret = current_str_pos;
843 }
844 if(tabs == 10 && quality_string){
845 quality_string[current_str_pos] = 0;
846 break;
847 }
848
849 current_str_pos = 0 ;
850 tabs +=1;
851 }
852 else
853 {
854 if(tabs == 9)// read
855 read_string[current_str_pos++] = in_buff[i];
856 else if(tabs == 10 && quality_string)// quality string
857 quality_string[current_str_pos++] = in_buff[i];
858 else if(tabs == 0 && read_name)// name
859 read_name[current_str_pos++] = in_buff[i];
860 else if(tabs == 1)
861 mask_buf[current_str_pos++] = in_buff[i];
862 }
863 }
864 if(input->file_type > GENE_INPUT_SAM_SINGLE)
865 // skip a line if not single-end
866 read_line(1, input->input_fp, in_buff, 0);
867
868 break;
869 //printf("Repeated read skipped : %s\n", read_name);
870 }
871
872 if(need_reverse)
873 {
874 if(quality_string)
875 reverse_quality(quality_string, ret);
876 reverse_read(read_string, ret, input->space_type);
877 }
878 if(trim_5 || trim_3) ret = trim_read_inner(read_string, quality_string, ret, trim_5, trim_3);
879 return ret;
880 } else if(input->file_type == GENE_INPUT_GZIP_FASTA) {
881 // it is currently at ">"
882 int tr = 0, ret = 0;
883 char rbuf [MAX_READ_LENGTH+2];
884 if(input -> gzfa_last_name [0] == 0){
885 ret = read_line_noempty(MAX_READ_NAME_LEN, input, rbuf, 0);
886 if(ret <1) return -1;
887 if(read_name)strcpy(read_name, rbuf+1);
888 }
889 else if(read_name)strcpy(read_name, input -> gzfa_last_name);
890 ret=0;
891
892 while(1){
893 tr = read_line_noempty(MAX_READ_LENGTH, input, rbuf, 0);
894 if(tr<1) {
895 if(ret<1) return -1;
896 break;
897 }else{
898 if(rbuf[0]=='>'){
899 strcpy(input -> gzfa_last_name, rbuf+1);
900 break;
901 }else{
902 strcpy(read_string+ret, rbuf);
903 ret += tr; // read_line_noempty have no \n
904 }
905 read_string[ret]=0;
906 }
907 }
908 if(trim_5 || trim_3) ret = trim_read_inner(read_string, quality_string, ret, trim_5, trim_3);
909 return ret;
910 } else if(input->file_type == GENE_INPUT_FASTA) {
911 int ret;
912 if(quality_string) (*quality_string)=0;
913 #ifdef __MINGW32__
914 assert(0);
915 #endif
916 while(1) // fetch read name
917 {
918 ret = read_line(MAX_READ_LENGTH, input->input_fp, read_string, 0);
919 if(ret <1)
920 {
921 sublog_printf(SUBLOG_STAGE_RELEASED,SUBLOG_LEVEL_DEBUG, "The input file normally exhausted.");
922 return -1;
923 }
924
925 int cursor = 0;
926 while(read_string[cursor])
927 {
928 if(cursor >=2 &&(read_string[cursor] == ' ' || read_string[cursor] == '\t'))
929 {
930 read_string [cursor] = 0;
931 break;
932 }
933 cursor++;
934 }
935
936 if(read_string[0]=='>'){
937 if (read_name != NULL)
938 strncpy(read_name, read_string+1, MAX_READ_NAME_LEN);
939 break;
940 }
941 else
942 sublog_printf(SUBLOG_STAGE_RELEASED,SUBLOG_LEVEL_FATAL,"The input file may be broken.");
943 }
944 ret = 0;
945 while(1) // fetch read text
946 {
947 char nch = 0;
948 ret += read_line(MAX_READ_LENGTH-ret, input->input_fp, read_string+ret, 1);
949
950 nch = fgetc(input->input_fp);
951
952 if(nch!=EOF)
953 fseeko(input->input_fp, -1, SEEK_CUR);
954
955 if(nch == '>'||nch<1 || nch == EOF)
956 break;
957 }
958 // printf("LOAD R=|%s|\nRETV=%d\n", read_string, ret);
959 if(ret <1)return -1;
960 if(trim_5 || trim_3) ret = trim_read_inner(read_string, quality_string, ret, trim_5, trim_3);
961 return ret;
962
963 } else if(input->file_type == GENE_INPUT_FASTQ || input->file_type == GENE_INPUT_GZIP_FASTQ) {
964 char nch = 0;
965 int ret;
966
967 //if(input->file_type == GENE_INPUT_GZIP_FASTQ)seekgz_preload_buffer(input, NULL);
968 //READ NAME
969 if (read_name == NULL)
970 {
971 SKIP_LINE_NOEMPTY;
972 if(nch == EOF) return -1;
973 }
974 else
975 {
976 do{
977 nch = geinput_getc(input);
978 //SUBREADprintf("B4_READ_NAME: %d '%c'\n", nch,nch);
979 } while (nch == '\n');
980 if(nch==EOF) return -1;
981
982 if(nch != '@') {
983 if(input->file_type == GENE_INPUT_FASTQ){
984 srInt_64 lineno = tell_current_line_no(input);
985 SUBREADprintf("ERROR: a format issue %d is found on the %lld-th line in input file '%s'.\nProgram aborted.\n", nch, lineno, input -> filename);
986 } else {
987 SUBREADprintf("ERROR: a format issue %d is found on the input file '%s'.\nProgram aborted.\n", nch, input -> filename);
988 SUBREADprintf("The lines after the error point:\n");
989 read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
990 SUBREADprintf("%s\n", read_string);
991 read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
992 SUBREADprintf("%s\n", read_string);
993 read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
994 SUBREADprintf("%s\n", read_string);
995 read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
996 SUBREADprintf("%s\n", read_string);
997 }
998 return -1;
999 }
1000
1001 read_line_noempty(MAX_READ_NAME_LEN, input, read_name, 0);
1002
1003 int cursor = 1;
1004 while(read_name[cursor])
1005 {
1006 if(read_name[cursor] == ' ' || read_name[cursor] == '\t')
1007 {
1008 read_name [cursor] = 0;
1009 break;
1010 }
1011 cursor++;
1012 }
1013 }
1014 //if(input->file_type == GENE_INPUT_GZIP_FASTQ)seekgz_preload_buffer(input, NULL);
1015 // READ LINE
1016 ret = read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
1017 //SUBREADprintf("READ_SHOULD_ATGC [len=%d] : '''%s'''\n", ret, read_string);
1018
1019 // SKIP "+"
1020 do{
1021 nch = geinput_getc(input);
1022 } while( nch == '\n' );
1023 if(nch != '+'){
1024 if(input->file_type == GENE_INPUT_FASTQ){
1025 srInt_64 lineno = tell_current_line_no(input);
1026 SUBREADprintf("ERROR: a format issue %c is found on the %lld-th line in input file '%s'.\nProgram aborted.\n", nch, lineno, input -> filename);
1027 }else{
1028 SUBREADprintf("ERROR: a format issue %d (should be +) is found on the input file '%s'.\nProgram aborted.\n", nch, input -> filename);
1029 read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
1030 SUBREADprintf("%s\n", read_string);
1031 read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
1032 SUBREADprintf("%s\n", read_string);
1033 read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
1034 SUBREADprintf("%s\n", read_string);
1035 read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
1036 SUBREADprintf("%s\n", read_string);
1037 }
1038 return -1;
1039 }
1040 SKIP_LINE;
1041
1042 // QUAL LINE
1043 if (quality_string)
1044 read_line_noempty(MAX_READ_LENGTH, input, quality_string, 0);
1045 else
1046 SKIP_LINE_NOEMPTY;
1047
1048
1049
1050 #ifdef MODIFIED_READ_LEN
1051 {
1052 int modified_start = 0;
1053 if(modified_start)
1054 {
1055 int i;
1056 for(i=0;i<MODIFIED_READ_LEN; i++)
1057 {
1058 read_string[i] = read_string[i+modified_start];
1059 if(quality_string)
1060 quality_string[i] = quality_string[i+modified_start];
1061 }
1062 }
1063 read_string[MODIFIED_READ_LEN]=0;
1064 if(quality_string)
1065 quality_string[MODIFIED_READ_LEN]=0;
1066 ret = MODIFIED_READ_LEN;
1067 }
1068 #endif
1069
1070 // printf("LOAD R=|%s|\nRETV=%d\n", read_string, ret);
1071
1072 if(trim_5 || trim_3) ret = trim_read_inner(read_string, quality_string, ret, trim_5, trim_3);
1073 return ret;
1074
1075 }else return -1;
1076 }
1077
// Closes the reader behind `input` with the close function that matches its
// type: scBAM, multi-FASTQ, BCL cache, seekgz for gzipped inputs, and fclose
// for everything else.
// NOTE(review): for gzipped inputs only seekgz_close is called on
// input->input_fp; whether the reader object itself is released elsewhere
// cannot be told from here.
void geinput_close(gene_input_t * input)
{
	if(input -> file_type == GENE_INPUT_SCRNA_BAM)
	{
		input_scBAM_close(&input -> scBAM_input);
		return;
	}
	if(input -> file_type == GENE_INPUT_SCRNA_FASTQ)
	{
		input_mFQ_close(&input -> scRNA_fq_input);
		return;
	}
	if(input -> file_type == GENE_INPUT_BCL)
	{
		cacheBCL_close(&input -> bcl_input);
		return;
	}
	if(input -> file_type == GENE_INPUT_GZIP_FASTQ || input -> file_type == GENE_INPUT_GZIP_FASTA)
	{
		seekgz_close((seekable_zfile_t * ) input->input_fp);
		return;
	}
	fclose((FILE*)input->input_fp);
}
1091
1092 char * __converting_char_table = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTNGNNNCNNNNNNNNNNNNAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTNGNNNCNNNNNNNNNNNNAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN ";
1093
reverse_read(char * InBuff,int read_len,int space_type)1094 void reverse_read(char * InBuff, int read_len, int space_type)
1095 {
1096 int i;
1097
1098 if(space_type == GENE_SPACE_COLOR)
1099 {
1100 int start_pos = 0;
1101 char last_base = InBuff[0];
1102
1103 //printf("CLRLEN0=%d\nS0=%s\n", read_len, InBuff);
1104 if(isalpha(last_base))
1105 {
1106 read_len ++;
1107
1108 for (i=1; i<read_len; i++)
1109 {
1110 int new_int = InBuff[i];
1111 int new_base = 0;
1112 if(new_int == '0')
1113 new_base=last_base;
1114 else if(new_int == '1')
1115 {
1116 if(last_base == 'A')new_base = 'C';
1117 else if(last_base == 'G')new_base = 'T';
1118 else if(last_base == 'T')new_base = 'G';
1119 else new_base = 'A';
1120 }
1121 else if(new_int == '2')
1122 {
1123 if(last_base == 'A')new_base = 'G';
1124 else if(last_base == 'G')new_base = 'A';
1125 else if(last_base == 'T')new_base = 'C';
1126 else new_base = 'T';
1127 }
1128 else
1129 {
1130 if(last_base == 'A')new_base = 'T';
1131 else if(last_base == 'G')new_base = 'C';
1132 else if(last_base == 'T')new_base = 'A';
1133 else new_base = 'G';
1134 }
1135 last_base = new_base;
1136 // putchar(last_base);
1137 }
1138 //puts("");
1139 InBuff[0] = *(__converting_char_table+last_base);
1140 start_pos = 1;
1141 }
1142 else read_len--;
1143
1144 for (i=0; i<(read_len - start_pos)/2; i++)
1145 {
1146 int rll1 = read_len - 1 - i;
1147 char tmp = InBuff[rll1];
1148 InBuff[rll1] = InBuff[i + start_pos];
1149 InBuff[i + start_pos] = tmp;
1150 }
1151 }
1152 else
1153 {
1154 for (i=0; i<read_len/2; i++)
1155 {
1156 int rll1 = read_len - 1 - i;
1157 unsigned char tmp = InBuff[rll1];
1158
1159 InBuff[rll1] = *(__converting_char_table+InBuff[i]);
1160 InBuff[i] = *(__converting_char_table+tmp);
1161
1162 }
1163 if(i*2 == read_len-1)
1164 {
1165 InBuff[i] = *(__converting_char_table+InBuff[i]);
1166 }
1167 }
1168
1169 }
1170
1171
1172
// Reverses the first read_len characters of a quality string in place.
// A NULL pointer or an empty string is left untouched.
void reverse_quality(char * InBuff, int read_len)
{
	int head, tail;

	if(InBuff == NULL || InBuff[0] == 0) return;

	// walk inwards from both ends, swapping as we go
	head = 0;
	tail = read_len - 1;
	while(head < tail)
	{
		char held = InBuff[tail];
		InBuff[tail] = InBuff[head];
		InBuff[head] = held;
		head++;
		tail--;
	}
}
1186
1187
genekey2intX(char * key,int space_type)1188 int genekey2intX(char * key,int space_type)
1189 {
1190 int i;
1191 int ret;
1192
1193 ret = 0;
1194 if(space_type == GENE_SPACE_BASE)
1195 for (i=30; i>=0; i-=2)
1196 {
1197 char kv = *(key++);
1198 ret |= base2int(kv)<<i;
1199 }
1200 else
1201 for (i=0; i<16; i++)
1202 {
1203 ret = ret << 2;
1204 ret |= color2int (key[i]);
1205 }
1206
1207 // printf("RET=%u\n",ret);
1208
1209 return ret;
1210 }
1211
1212
genekey2int(char * key,int space_type)1213 int genekey2int(char *key,int space_type)
1214 {
1215 int i;
1216 int ret;
1217
1218 ret = 0;
1219 if(space_type == GENE_SPACE_BASE)
1220 for (i=30; i>=0; i-=2)
1221 {
1222 char c1 = *(key++);
1223 ret |= base2int(c1)<<i;
1224 }
1225 else
1226 for (i=0; i<16; i++)
1227 {
1228 ret = ret << 2;
1229 ret |= color2int (key[i]);
1230 }
1231 return ret;
1232 }
1233
// Converts 16 bases into 16 packed 2-bit colours. Each colour is
// chars2color(previous base, current base); the "previous base" of key[0]
// is last_base. The first colour lands in the most significant bit pair.
int genekey2color(char last_base, char key [])
{
	int packed = 0;
	char previous = last_base;
	int n = 0;

	while(n < 16)
	{
		char current = key[n];

		packed = (packed << 2) + chars2color(previous, current);
		previous = current;
		n++;
	}

	return packed;
}
1251
// Decodes a colour-space read in place. read_buffer[0] is the starting base;
// each following character (up to index read_len-1) is a colour that is
// replaced by the base it implies given the previously decoded base.
// Colour '0' repeats the previous base; '1' and '2' apply their own
// transitions; every other character is decoded like colour '3'.
// A previous base other than A/G/T is handled by the final fallback of each
// transition.
void colorread2base(char * read_buffer, int read_len)
{
	char prev = read_buffer[0];
	int idx;

	for (idx = 1; idx < read_len; idx++)
	{
		char decoded;

		switch(read_buffer[idx])
		{
			case '0':
				decoded = prev;
				break;
			case '1':
				decoded = (prev == 'A') ? 'C' :
				          (prev == 'G') ? 'T' :
				          (prev == 'T') ? 'G' : 'A';
				break;
			case '2':
				decoded = (prev == 'A') ? 'G' :
				          (prev == 'G') ? 'A' :
				          (prev == 'T') ? 'C' : 'T';
				break;
			default:
				decoded = (prev == 'A') ? 'T' :
				          (prev == 'G') ? 'C' :
				          (prev == 'T') ? 'A' : 'G';
				break;
		}

		read_buffer[idx] = decoded;
		prev = decoded;
	}
}
1289
// Returns the base that follows base c1 when the colour between them is clr.
// clr is one of the characters '0'..'3'; any other colour yields 'N'.
// For colours '1'..'3', a c1 other than A/T/G takes the same result as 'C'.
char color2char(char clr, char c1)
{
	switch(clr)
	{
		case '0':
			return c1;

		case '1':
			switch(c1)
			{
				case 'A': return 'C';
				case 'T': return 'G';
				case 'G': return 'T';
				default:  return 'A';
			}

		case '2':
			switch(c1)
			{
				case 'A': return 'G';
				case 'T': return 'C';
				case 'G': return 'A';
				default:  return 'T';
			}

		case '3':
			switch(c1)
			{
				case 'A': return 'T';
				case 'T': return 'A';
				case 'G': return 'C';
				default:  return 'G';
			}

		default:
			return 'N';
	}
}
1317
// Returns the colour (0..3) of the transition from base c1 to base c2.
// Each base is reduced to a 2-bit code (A=0, C=1, G=2, anything else=3);
// the colour is the XOR of the two codes, which reproduces the full
// 4x4 transition table.
int chars2color(char c1, char c2)
{
	int from_code, to_code;

	if(c1 == 'A')      from_code = 0;
	else if(c1 == 'C') from_code = 1;
	else if(c1 == 'G') from_code = 2;
	else               from_code = 3;	// 'T', 'U' and everything else

	if(c2 == 'A')      to_code = 0;
	else if(c2 == 'C') to_code = 1;
	else if(c2 == 'G') to_code = 2;
	else               to_code = 3;

	return from_code ^ to_code;
}
1351
find_subread_end(int len,int TOTAL_SUBREADS,int subread)1352 int find_subread_end(int len, int TOTAL_SUBREADS, int subread)
1353 {
1354 if(len<= EXON_LONG_READ_LENGTH)
1355 {
1356 int subread_step = ((len<<16) - (19<<16))/(TOTAL_SUBREADS -1);
1357 return ((subread_step*(subread))>>16)+15;
1358 }
1359 else
1360 {
1361 int subread_step;
1362
1363 subread_step = 6<<16;
1364 if(((len - 18)<<16) / subread_step > 62)
1365 subread_step = ((len - 18)<<16)/62;
1366 return ((subread_step*(subread))>>16)+15;
1367 }
1368 }
1369
fix_cigar_SAM14(char * cig)1370 void fix_cigar_SAM14(char * cig){
1371 int tmpi = 0, ci = 0, tmpM = 0, wi = 0;
1372 char ncig[EXON_MAX_CIGAR_LEN];
1373
1374 if(cig[0]=='*'){
1375 return;
1376 }
1377 while(1){
1378 int nch = cig[ci];
1379 if(isdigit(nch)) tmpi = tmpi * 10 + nch - '0';
1380 else{
1381 if(nch == '=' || nch == 'X' || nch == 'M'){
1382 tmpM += tmpi;
1383 }else{
1384 if(tmpM > 0){
1385 wi += sprintf(ncig + wi, "%dM", tmpM);
1386 tmpM = 0;
1387 }
1388 if(0 == nch) break;
1389 else wi += sprintf(ncig + wi, "%d%c", tmpi, nch);
1390 }
1391 tmpi = 0;
1392 }
1393 ci++;
1394 }
1395 memcpy(cig, ncig, wi+1);
1396 }
1397
// Splits one tab-separated SAM line into its fields.
//
// Returns  0 if the line describes a mapped read,
//          1 if flag bit 4 (unmapped) is set,
//         -1 if the line has fewer than ten tab characters (wrong format).
//
// Text fields are copied into the caller's buffers and NUL-terminated:
//   column 1 -> read_name, 3 -> chro, 6 -> cigar, 10 -> sequence,
//   column 11 -> quality_string.  *rl receives the sequence length.
// Numeric fields: column 2 -> *flags, 4 -> *pos, 5 -> *mapping_quality
// (forced to 0 for unmapped reads), 9 -> *pair_dist (a '-' sign is skipped,
// so the magnitude is stored).  Columns 7 and 8 are ignored.
// In the optional columns, a field whose first two characters are "IH" has
// the characters from its sixth onwards accumulated as a decimal number into
// *repeated (presumably the IH:i:<n> hit-count tag -- confirm); *repeated is
// not touched when no such field exists.
// For mapped reads the CIGAR is normalised with fix_cigar_SAM14().
//
// No buffer sizes are passed in, so the caller must supply buffers large
// enough for the longest field in the line.
// On a -1 return the scalar outputs are not written.
int parse_SAM_line(char * sam_line, char * read_name, int * flags, char * chro, unsigned int * pos, char * cigar, int * mapping_quality, unsigned int * pair_dist, char * sequence, char * quality_string, int * rl, int * repeated)
{
	char cc;
	int ci = 0, k, field = 0, ret_quality = 0, ret_flag = 0, ret_pairdist = 0;
	unsigned int ret_pos = 0;
	int is_rep = 0;

	for(k = 0; (cc = sam_line[k]) != 0; k++)
	{
		if(cc == '\t')
		{
			// `field` now counts the columns completed so far
			field++;
			switch(field)
			{
				case 1:  read_name[ci] = 0; break;
				case 3:  chro[ci] = 0; break;
				case 6:  cigar[ci] = 0; break;
				case 10:
					sequence[ci] = 0;
					(*rl) = ci;
					break;
				case 11: quality_string[ci] = 0; break;
				default: break;
			}
			ci = 0;
			is_rep = 0;
			continue;
		}

		// `field` is the zero-based index of the column being read
		switch(field)
		{
			case 0:
				read_name[ci++] = cc;
				break;
			case 1:
				ret_flag = ret_flag*10 + (cc-'0');
				break;
			case 2:
				chro[ci++] = cc;
				break;
			case 3:
				ret_pos = ret_pos * 10 + (cc-'0');
				break;
			case 4:
				ret_quality = ret_quality * 10 + (cc-'0');
				break;
			case 5:
				cigar[ci++] = cc;
				break;
			case 6:
			case 7:
				// mate reference / mate position: not used
				break;
			case 8:
				if(cc != '-')
					ret_pairdist = ret_pairdist*10 + (cc-'0');
				break;
			case 9:
				sequence[ci++] = cc;
				break;
			case 10:
				quality_string[ci++] = cc;
				break;
			default:
				// optional columns: track a field starting with "IH"
				if(cc == 'I' && ci==0) is_rep = 1;
				if(cc != 'H' && ci==1 ) is_rep = 0;
				if(is_rep && ci == 4) *repeated = 0;
				if(is_rep && ci>4)
					(*repeated)=(*repeated)*10+(cc-'0');
				ci++;
				break;
		}
	}

	if(field < 10) return -1;

	// The line ended inside the quality column (no optional columns follow):
	// terminate it here, including the case where it is empty, so the caller
	// never reads an unterminated buffer.
	if(field == 10) quality_string[ci] = 0;

	if(ret_flag & 4)
		(*mapping_quality) = 0;
	else
		(*mapping_quality) = ret_quality;
	(*pos) = ret_pos;
	(*flags) = ret_flag;
	(*pair_dist) = ret_pairdist;

	if(((*flags) & 4) == 4) return 1;

	fix_cigar_SAM14(cigar);
	return 0;
}
1482
1483
1484 // This function returns 0 if the block is determined.
1485 // The block is undeterminable if the chromosome name is not in known_chromosomes, or the position is larger than the known length.
1486 // Pos is in terms of [1, ... , max_length]
get_read_block(char * chro,unsigned int pos,char * temp_file_suffix,chromosome_t * known_chromosomes,unsigned int * max_base_position)1487 int get_read_block(char *chro, unsigned int pos, char *temp_file_suffix, chromosome_t *known_chromosomes, unsigned int * max_base_position)
1488 {
1489 int chro_no;
1490 unsigned int max_known_chromosome=0;
1491
1492 for(chro_no=0;known_chromosomes[chro_no].chromosome_name[0]; chro_no++)
1493 {
1494 if(strcmp(chro , known_chromosomes[chro_no].chromosome_name) == 0)
1495 {
1496 max_known_chromosome = known_chromosomes[chro_no].known_length;
1497 break;
1498 }
1499 //if(chro_no > 1)
1500 // printf("TOO MANY CHROS:%d\n", chro_no);
1501 }
1502 if(!known_chromosomes[chro_no].chromosome_name[0]) return 1;
1503 if(pos >= known_chromosomes[chro_no].known_length) return 1;
1504
1505 int block_no = (pos-1) / BASE_BLOCK_LENGTH;
1506 sprintf(temp_file_suffix , "%s-%04u.bin", chro, block_no);
1507 if(max_base_position)*max_base_position=min((block_no+1)*BASE_BLOCK_LENGTH, max_known_chromosome);
1508
1509 return 0;
1510 }
1511
// Returns an open stream for appending to temp_file_name, using fp_table as
// a cache of open streams keyed by file name.
//
// Table values: a real FILE* is a cached open stream; the marker NULL+1 means
// "known file that is not kept open"; no entry means the file is new.
// fp_table->appendix1 carries the maximum number of table entries for which
// streams are kept open (stored as an offset from NULL).
//
// *close_immediately is set to 1 when the returned stream is not owned by the
// table and the caller must fclose() it after use; otherwise it is set to 0.
// Returns NULL if memory for the key cannot be allocated or if the file
// cannot be opened (a message is printed in the latter case).
//
// The key copy is sized to the name and is only made when it is going to be
// stored in the table, so nothing is leaked for files that are re-opened.
FILE * get_temp_file_pointer(char *temp_file_name, HashTable* fp_table, int * close_immediately)
{
	FILE * temp_file_pointer = (FILE *) HashTableGet(fp_table, temp_file_name);
	*close_immediately = 0;

	if(temp_file_pointer == NULL || temp_file_pointer == NULL + 1) {
		int need_put = (temp_file_pointer == NULL );
		char *key_name = NULL;

		if(need_put){
			key_name = (char *)SUBREAD_malloc(strlen(temp_file_name) + 1);
			if(!key_name)
				return NULL;
			strcpy(key_name, temp_file_name);
		}

		temp_file_pointer = f_subr_open(temp_file_name,"ab");

		if(!temp_file_pointer){
			SUBREADprintf("File cannot be opened: '%s'.\nPlease increase the maximum open files by command 'ulimit -n'.\nThis number should be set to at least 500 for human genome, and more chromosomes require more opened files.\n\n", temp_file_name);
			free(key_name);
			return NULL;
		}

		int maximum_open_file = fp_table -> appendix1 - NULL;
		if( fp_table -> numOfElements < maximum_open_file && need_put)
			HashTablePut(fp_table, key_name ,temp_file_pointer);
		else{
			// too many streams are cached already: remember the name only
			if(need_put)
				HashTablePut(fp_table, key_name , NULL + 1);
			*close_immediately = 1;
		}
	}

	return temp_file_pointer;
}
1543
// Value deallocator for the temp-file table: closes `fp` unless it is NULL
// or the NULL+1 placeholder that stands for "no open stream".
void my_fclose(void * fp)
{
	if(!fp || fp == NULL+1)
		return;

	fclose((FILE *)fp);
}
1549
// Key comparison callback with a void* signature: compares the two keys as
// NUL-terminated strings and returns the strcmp() result.
int my_strcmp(const void * s1, const void * s2)
{
	return strcmp((char*)s1, (char*)s2);
}
1556
write_read_block_file(FILE * temp_fp,unsigned int read_number,char * read_name,int flags,char * chro,unsigned int pos,char * cigar,int mapping_quality,char * sequence,char * quality_string,int rl,int is_sequence_needed,char strand,unsigned short read_pos,unsigned short read_len,unsigned short mapped_seg)1557 int write_read_block_file(FILE *temp_fp , unsigned int read_number, char *read_name, int flags, char * chro, unsigned int pos, char *cigar, int mapping_quality, char *sequence , char *quality_string, int rl , int is_sequence_needed, char strand, unsigned short read_pos, unsigned short read_len, unsigned short mapped_seg)
1558 {
1559 base_block_temp_read_t datum;
1560 memset(&datum,0,sizeof(datum));
1561 datum.record_type = 100;
1562 datum.read_number = read_number;
1563 datum.pos = pos;
1564 datum.flags = flags;
1565 datum.strand = strand;
1566 datum.read_pos = read_pos;
1567 datum.read_len = read_len;
1568 datum.mapping_quality = mapping_quality;
1569 datum.mapped_segment_in_read = mapped_seg;
1570
1571 if(rl < 1|| rl > MAX_READ_LENGTH)
1572 {
1573
1574 SUBREADprintf("READ IS TOO LONG:%d\n", rl);
1575 return -1;
1576 }
1577
1578 fwrite(&datum, sizeof(datum), 1, temp_fp);
1579 if(is_sequence_needed)
1580 {
1581 unsigned short srl = rl&0xffff;
1582 int wlen = fwrite(&srl, sizeof(short),1, temp_fp);
1583 if(wlen != 1) return -1;
1584 wlen = fwrite(sequence , 1, rl,temp_fp );
1585 if(wlen != rl) return -1;
1586 wlen = fwrite(quality_string , 1, rl,temp_fp );
1587 if(wlen != rl) return -1;
1588 }
1589 return 0;
1590 }
1591
1592
// Reads the header of a SAM/BAM file and appends one entry to
// known_chromosomes for every "@SQ" line: the name is taken from the second
// column with its first three characters skipped (the "SN:" prefix) and the
// length from the third column, likewise after a three-character prefix.
// Reading stops at the first line that does not start with '@'.
//
// known_chromosomes is a list terminated by an entry with an empty name; new
// entries are appended at the terminator and the following slot is marked
// empty again, the same way break_SAM_file() maintains the list.
//
// Returns 0 on success, -1 if the file cannot be opened or if the list
// already holds XOFFSET_TABLE_SIZE entries.
int get_known_chromosomes(char * in_SAM_file, chromosome_t * known_chromosomes)
{
	int i, is_first_read_PE;
	int is_BAM = is_certainly_bam_file(in_SAM_file, &is_first_read_PE, NULL);
	SamBam_FILE * fp = SamBam_fopen(in_SAM_file,is_BAM?SAMBAM_FILE_BAM:SAMBAM_FILE_SAM);

	if(!fp){
		SUBREADprintf("SAM file does not exist or is not accessible: '%s'\n", in_SAM_file);
		return -1;
	}

	while(1)
	{
		char line_buffer [3000];
		char * is_ret = SamBam_fgets(fp, line_buffer, 2999, 0);
		if(!is_ret) break;
		int linelen = strlen(line_buffer);

		// the header ends at the first non-'@' line
		if(line_buffer[0]!='@') break;

		int chro_numb=0, field = 0, ci=0, ciw = 0;
		if(line_buffer[1]!='S' || line_buffer[2]!='Q' || line_buffer[3]!='\t' ) continue;

		while(known_chromosomes[chro_numb].chromosome_name[0]!=0) chro_numb++;
		if(chro_numb > XOFFSET_TABLE_SIZE-1)
		{
			SUBREADprintf("FATAL ERROR: the number of chromosomes excessed %d\n", XOFFSET_TABLE_SIZE);
			SamBam_fclose(fp);
			return -1;
		}
		known_chromosomes[chro_numb].known_length = 0;
		for(i=0; i< linelen; i++)
		{
			char cc = line_buffer[i];

			if(cc == '\r' || cc=='\n') continue;

			if(cc == '\t')
			{
				if(field == 1)
					known_chromosomes[chro_numb].chromosome_name[ciw]=0;
				ci = 0;
				ciw = 0;
				field ++;
			}
			else if(field == 1)
			{
				// skip the three-character tag prefix
				if(ci >2)
					known_chromosomes[chro_numb].chromosome_name[ciw++]=cc;
				ci++;
			}
			else if(field == 2)
			{
				if(ci >2)
					known_chromosomes[chro_numb].known_length = known_chromosomes[chro_numb].known_length * 10 + (cc - '0');
				ci++;
			}
		}
		// keep the list terminated for the next scan
		if(chro_numb < XOFFSET_TABLE_SIZE-1) known_chromosomes[chro_numb+1].chromosome_name[0]=0;
	}
	SamBam_fclose(fp);
	return 0;
}
1652
// Records one indel observed in a CIGAR string in the event table.
//
//   chro, chro_pos : location of the event (the caller passes the last base
//                    before the indel).
//   indels         : positive for a deletion of that many bases, negative for
//                    an insertion; events longer than 100 bases are ignored.
//   ins_seq        : for insertions, the -indels inserted bases (copied).
//
// Table layout, as used by this function:
//   key "<chro>\t<pos>"      -> number of distinct indels stored at that
//                               location (kept as an offset from NULL);
//   key "<chro>\t<pos>\t<n>" -> low 8 bits: indel length + 0x80,
//                               next 24 bits: event id;
//   appendix1[event id]      -> copy of the inserted bases (NULL for
//                               deletions);
//   appendix2[event id]      -> support counter (unsigned short, capped at
//                               65000 for repeat observations);
//   counter1 / counter2      -> capacity / number of used event ids.
//
// If the same indel already exists at the location only its counter is
// increased. Otherwise a new event id is allocated; the two arrays are
// doubled when they are full. All memory is obtained before the table is
// modified; if an allocation fails the event is dropped and the table is left
// consistent.
void add_cigar_indel_event(HashTable * event_table_ptr, char * chro, unsigned int chro_pos, int indels , char * ins_seq)
{
	if(abs(indels)>100) return;

	char count_token[100];
	char event_token[100];
	int x1;
	unsigned int indel_event_id = 0xffffffff;

	snprintf(count_token, 99,"%s\t%u", chro, chro_pos);
	int exist_indel_count = HashTableGet(event_table_ptr, count_token) - NULL;
	unsigned short * app2_ptr = event_table_ptr->appendix2;

	// Is this exact indel already known at this location?
	for(x1 = 0; x1< exist_indel_count; x1++)
	{
		snprintf(event_token, 99,"%s\t%u\t%d", chro, chro_pos, x1);
		srInt_64 t64v = (HashTableGet(event_table_ptr, event_token)-NULL);
		srInt_64 indel_len = (t64v&0xff) - 0x80;
		if(indel_len == indels){
			indel_event_id = 0xffffff&(t64v >> 8) ;
			if(app2_ptr[indel_event_id]<65000)
				app2_ptr[indel_event_id] +=1;
			return;
		}
	}

	// event ids are stored in 24 bits
	if(event_table_ptr->counter2 >= 0xffff00) return;

	// ---- obtain all memory first ----
	char * count_key = NULL, * event_key = NULL, * ins_seq_2 = NULL;

	if(exist_indel_count<1)
	{
		count_key = malloc(strlen(count_token)+1);
		if(!count_key) return;
		strcpy(count_key, count_token);
	}

	snprintf(event_token, 99,"%s\t%u\t%d", chro, chro_pos, exist_indel_count);
	event_key = malloc(strlen(event_token)+1);
	if(!event_key)
	{
		free(count_key);
		return;
	}
	strcpy(event_key, event_token);

	if(indels<0)
	{
		ins_seq_2 = malloc(-indels);
		if(!ins_seq_2)
		{
			free(count_key);
			free(event_key);
			return;
		}
		memcpy(ins_seq_2, ins_seq, -indels);
	}

	unsigned int event_space_max_size = event_table_ptr-> counter1;
	indel_event_id = event_table_ptr->counter2;

	if(indel_event_id >= event_space_max_size)
	{
		// Grow both arrays; keep the old pointers until realloc succeeded.
		void * grown1 = realloc(event_table_ptr->appendix1 , sizeof(char *) * event_space_max_size*2);
		if(grown1) event_table_ptr->appendix1 = grown1;
		void * grown2 = grown1 ? realloc(event_table_ptr->appendix2 , sizeof(short) * event_space_max_size*2) : NULL;
		if(!grown2)
		{
			// capacity (counter1) is unchanged, so the table is still valid
			free(count_key);
			free(event_key);
			free(ins_seq_2);
			return;
		}
		event_table_ptr->appendix2 = grown2;
		// zero the counters of the newly added half
		memset((char *)grown2 + event_space_max_size * sizeof(short), 0, sizeof(short) * event_space_max_size);
		event_table_ptr-> counter1 = event_space_max_size*2;
		app2_ptr = event_table_ptr->appendix2;
	}

	// ---- commit ----
	event_table_ptr->counter2 ++;

	if(exist_indel_count<1)
		HashTablePut(event_table_ptr, count_key, NULL+1);
	else
		HashTablePutReplace(event_table_ptr, count_token, NULL+exist_indel_count+1, 0);

	srInt_64 indel_event_id_long = indel_event_id;
	app2_ptr[indel_event_id] +=1;

	HashTablePut(event_table_ptr, event_key, NULL + ((0xff & (0x80 + indels)) | ((indel_event_id_long&0xffffff) << 8)));

	// insertions keep a copy of the inserted bases; deletions get a defined NULL
	char ** app1_ptrptr = event_table_ptr->appendix1;
	app1_ptrptr[indel_event_id] = ins_seq_2;
}
1721
// Releases an indel event table: walks every bucket chain, frees each key
// string, then frees the two appendix arrays and destroys the table itself.
// For keys that contain exactly three tab characters, the sequence stored in
// appendix1 under the event id (bits 8..31 of the value) is freed as well.
// NOTE(review): the keys built by add_cigar_indel_event() contain one or two
// tabs, so the three-tab branch does not look reachable for them -- confirm
// whether the per-event sequences are meant to be released here.
void destroy_cigar_event_table(HashTable * event_table)
{
	char ** seq_tab = event_table->appendix1;
	int bucket;

	for(bucket=0; bucket<event_table -> numOfBuckets; bucket++)
	{
		KeyValuePair * cursor = event_table -> bucketArray[bucket];

		while(cursor)
		{
			char * token = (char *)cursor -> key;
			const char * scan;
			int tabs = 0;

			for(scan = token; *scan; scan++)
			{
				if(*scan == '\t') tabs++;
			}

			srInt_64 tmpv = cursor -> value - NULL;

			if(tabs==3)
			{
				unsigned int event_id = (tmpv>>8)&0xffffff;
				free(seq_tab[event_id]);
			}

			free(token);
			cursor = cursor->next;
		}
	}

	free(event_table->appendix1);
	free(event_table->appendix2);
	HashTableDestroy(event_table);
}
1756
// Distributes the SNP records of a VCF file into the per-block temporary
// files.
//
// Lines starting with '#', lines containing the text "INDEL" and lines with
// fewer than five tab-separated columns are skipped. A record counts as a SNP
// when REF and ALT have the same length, or, for a comma-separated ALT list,
// when at least one alternative has the same length as REF. For each SNP
// whose position maps to a block (see get_read_block) a VCF_temp_read_t
// record (record_type 200, type CHRO_EVENT_TYPE_SNP) is appended to the file
// "<temp_file_prefix><block suffix>", obtained through fp_table.
//
// Each column is checked through the token returned by strtok_r(); the save
// pointer is not a reliable end-of-line indicator.
void break_VCF_file(char * vcf_file, HashTable * fp_table, char * temp_file_prefix, chromosome_t* known_chromosomes)
{
	autozip_fp vzfp;
	int vret = autozip_open(vcf_file, &vzfp);
	char temp_file_suffix[MAX_CHROMOSOME_NAME_LEN+20];
	int close_now = 0;

	if(vret < 0)
	{
		SUBREADprintf("The specified VCF does not exist.\n");
		return;
	}

	char * linebuf = malloc(3000);
	char * tmpfname = malloc(MAX_FILE_NAME_LENGTH);
	if(!linebuf || !tmpfname)
	{
		free(linebuf);
		free(tmpfname);
		autozip_close(&vzfp);
		return;
	}

	while(1)
	{
		char * tok_tmp = NULL;
		int aretc = autozip_gets(&vzfp, linebuf, 2999);
		if(aretc < 1) break;
		if(linebuf[0]=='#') continue;
		if(strstr(linebuf, "INDEL")) continue;

		char * chro = strtok_r(linebuf, "\t", &tok_tmp);
		if(!chro) continue;
		char * pos_str = strtok_r(NULL, "\t", &tok_tmp);
		if(!pos_str) continue;

		if(!strtok_r(NULL, "\t", &tok_tmp)) continue;	// name

		char * ref_seq = strtok_r(NULL, "\t", &tok_tmp);
		if(!ref_seq) continue;
		char * alt_seq = strtok_r(NULL, "\t", &tok_tmp);
		if(!alt_seq) continue;

		int is_snp = 0;
		if(strstr(alt_seq,","))
		{
			// several alternatives: one of the right length is enough
			char * com_tmp = NULL;
			char * com_sec = strtok_r(alt_seq, ",", &com_tmp);
			while(com_sec)
			{
				if(strlen(com_sec)==strlen(ref_seq))
				{
					is_snp=1;
					break;
				}

				com_sec = strtok_r(NULL, ",", &com_tmp);
			}

		}else if(strlen(ref_seq) == strlen(alt_seq)) is_snp=1;

		if(!is_snp)continue;

		unsigned int max_section_pos;
		int snp_pos = atoi(pos_str);

		if(get_read_block(chro, snp_pos , temp_file_suffix, known_chromosomes, &max_section_pos))continue;
		snprintf(tmpfname, MAX_FILE_NAME_LENGTH, "%s%s", temp_file_prefix , temp_file_suffix);
		FILE * temp_fp = get_temp_file_pointer(tmpfname, fp_table, &close_now);
		if(temp_fp)
		{
			VCF_temp_read_t datum;
			// no uninitialised padding bytes in the temp file
			memset(&datum, 0, sizeof(datum));
			datum.record_type = 200;
			datum.pos = snp_pos;
			datum.type = CHRO_EVENT_TYPE_SNP;
			fwrite(&datum, sizeof(VCF_temp_read_t), 1, temp_fp);
			if(close_now) fclose(temp_fp);
		}
	}

	free(linebuf);
	free(tmpfname);
	autozip_close(&vzfp);
}
1834
// Splits a SAM/BAM file into per-block temporary files.
//
// Header "@SQ" lines extend known_chromosomes (name and length); other header
// lines are ignored. Before the first read is processed, the SNPs of VCF_file
// (if given) are distributed with break_VCF_file(). Every read line is parsed
// with parse_SAM_line(); unparsable and unmapped reads are only counted.
//
//   is_sequence_needed == 1 : the CIGAR is walked and every aligned stretch
//       ('M', and 'S' when use_softclipped_bases is set) is written, with its
//       bases and qualities, to the block file(s) it overlaps.
//       base_ignored_head_tail bases are dropped from both read ends.
//       'D' and 'I' operations are recorded in event_table when it is given.
//   is_sequence_needed == 2 : nothing is written for the read.
//   otherwise               : one header-only record per read is written at
//       its start position, moved left by a leading soft clip.
//   do_fragment_filtering   : drops paired reads whose template length is 0
//       or above 500000 and, when array_index is given, reads that
//       final_mapping_quality() scores below 160 or with more than 8
//       mismatches.
//
// Outputs (each optional): *real_read_count = number of read lines seen,
// *block_count = number of entries in the temp-file table,
// *all_mapped_reads / *all_mapped_bases are incremented.
//
// Returns 1 if the input cannot be opened, -1 on a fatal error (too many
// chromosomes, or a temporary file cannot be opened), otherwise the combined
// result of the record writes (0 when all succeeded).
// The reader and the temp-file table are released on every return path.
int break_SAM_file(char * in_SAM_file, int is_BAM_file, char * temp_file_prefix, unsigned int * real_read_count, int * block_count, chromosome_t * known_chromosomes, int is_sequence_needed, int base_ignored_head_tail, gene_value_index_t *array_index, gene_offset_t * offsets, srInt_64 * all_mapped_bases, HashTable * event_table, char * VCF_file, srInt_64 * all_mapped_reads, int do_fragment_filtering, int push_to_read_head, int use_softclipped_bases )
{
	int i, is_first_read=1, is_error = 0;
	HashTable * fp_table;
	unsigned int read_number = 0;
	char line_buffer [3000];
	SamBam_FILE * sambam_reader;

	sambam_reader = SamBam_fopen(in_SAM_file, is_BAM_file?SAMBAM_FILE_BAM:SAMBAM_FILE_SAM);

	if(!sambam_reader){
		SUBREADprintf("SAM file does not exist or is not accessible: '%s'\n", in_SAM_file);
		return 1;
	}
	if(push_to_read_head)assert(is_sequence_needed==0);

	fp_table = HashTableCreate( 11011 );
	HashTableSetDeallocationFunctions(fp_table, free, my_fclose);
	HashTableSetKeyComparisonFunction(fp_table, my_strcmp);
	HashTableSetHashFunction(fp_table,HashTableStringHashFunction);

	// Ask the shell for the open-file limit and keep two thirds of it
	// (clamped to [100, 3000]) as the number of streams held open.
	char fns[200];
	fns[0]=0;
	exec_cmd("ulimit -n", fns, 200);
	int max_open_file = atoi(fns);

	max_open_file = max(100, max_open_file);
	max_open_file = min(3000, max_open_file);

	fp_table -> appendix1 = NULL + max_open_file * 2/ 3;

	if(event_table!=NULL && event_table->appendix1==NULL)
	{
		event_table->appendix1 = malloc(sizeof(char *) * 100);
		event_table->appendix2 = malloc(sizeof(unsigned short) * 100);
		memset(event_table->appendix2, 0, sizeof(unsigned short) * 100);
		event_table->counter1 = 100;
		event_table->counter2 = 0;
	}

	while(1)
	{
		char * is_ret = SamBam_fgets(sambam_reader, line_buffer, 2999, 1);

		if(!is_ret) break;

		if(line_buffer[0]=='@')
		{
			int chro_numb=0, field = 0, ci=0, ciw = 0;
			if(line_buffer[1]!='S' || line_buffer[2]!='Q' || line_buffer[3]!='\t' ) continue;

			while(known_chromosomes[chro_numb].chromosome_name[0]!=0) chro_numb++;

			if(chro_numb > XOFFSET_TABLE_SIZE-1)
			{
				SUBREADprintf("FATAL ERROR: the number of chromosomes excessed %d\n", XOFFSET_TABLE_SIZE);
				HashTableDestroy(fp_table);
				SamBam_fclose(sambam_reader);
				return -1;
			}

			known_chromosomes[chro_numb].known_length = 0;
			for(i=0; ; i++)
			{
				char cc = line_buffer[i];
				if(!cc) break;

				if(cc == '\r' || cc=='\n') continue;

				if(cc == '\t')
				{
					if(field == 1)
						known_chromosomes[chro_numb].chromosome_name[ciw]=0;
					ci = 0;
					ciw = 0;
					field ++;
				}
				else if(field == 1)
				{
					// skip the three-character tag prefix
					if(ci >2)
						known_chromosomes[chro_numb].chromosome_name[ciw++]=cc;
					ci++;
				}
				else if(field == 2)
				{
					if(ci >2)
						known_chromosomes[chro_numb].known_length = known_chromosomes[chro_numb].known_length * 10 + (cc - '0');
					ci++;
				}
			}
			if(chro_numb < XOFFSET_TABLE_SIZE-1) known_chromosomes[chro_numb+1].chromosome_name[0]=0;
		}
		else
		{
			char read_name[MAX_READ_NAME_LEN], chro[MAX_CHROMOSOME_NAME_LEN], cigar[EXON_MAX_CIGAR_LEN], sequence[MAX_READ_LENGTH+1], quality_string[MAX_READ_LENGTH+1];
			int flags = 0, mapping_quality = 0, rl=0;
			char is_negative_strand = 0;
			unsigned int pos = 0, pairdist = 0;
			char temp_file_suffix[MAX_FILE_NAME_LENGTH];
			char temp_file_name[MAX_FILE_NAME_LENGTH];
			FILE * temp_fp;
			int repeated = -1, close_now = 0;

			// The parser leaves fields untouched on malformed lines; start
			// from empty strings so nothing below reads uninitialised bytes.
			read_name[0] = 0;
			chro[0] = 0;
			cigar[0] = 0;
			sequence[0] = 0;
			quality_string[0] = 0;

			if(is_first_read)
			{
				is_first_read=0;

				if(VCF_file && VCF_file[0])
					break_VCF_file(VCF_file, fp_table, temp_file_prefix, known_chromosomes);
			}

			int line_parse_result = parse_SAM_line(line_buffer, read_name, &flags, chro, &pos, cigar, & mapping_quality, &pairdist, sequence, quality_string, &rl, &repeated);
			if(line_parse_result<0)SUBREADprintf("WRONG LINE FORMAT: %s\n", line_buffer);

			// missing qualities ("*" or empty): substitute 'I' for every base
			if(strlen(quality_string)<2)
			{
				int xk1;
				for(xk1=0; xk1<rl; xk1++)
				{
					quality_string[xk1]='I';
				}
				quality_string[xk1]=0;
			}

			if(line_parse_result || (flags & SAM_FLAG_UNMAPPED)){
				read_number ++;
				continue;
			}

			if(do_fragment_filtering && (flags & SAM_FLAG_PAIRED_TASK) && (pairdist ==0 || pairdist > 500000)){
				read_number ++;
				continue;
			}

			if(do_fragment_filtering && array_index)
			{
				int mismatch = 0;

				unsigned int linear_pos = linear_gene_position(offsets , chro, pos)-1;
				float match_rate = final_mapping_quality(array_index, linear_pos, sequence, quality_string, cigar, FASTQ_PHRED33, & mismatch, rl, NULL, NULL);
				if(mismatch>8 || match_rate < 160)
				{
					read_number ++;
					continue;
				}
			}

			is_negative_strand = (flags & SAM_FLAG_REVERSE_STRAND_MATCHED)?1:0;
			if((flags & 4) ==0 && all_mapped_reads)(*all_mapped_reads)++;


			if(is_sequence_needed == 2)
			{
				// nothing is written in this mode
			}
			else if(is_sequence_needed == 1)
			{
				int read_cursor = 0;
				int is_first_S = 1;
				unsigned int chromosome_cursor = pos;
				int j, tmpv=0;
				char cc;
				unsigned short M_parts=0;

				for(j=0; cigar[j]; j++)
				{
					cc = cigar[j];
					if(cc>='0' && cc<='9') tmpv= tmpv*10+(cc-'0');
					else if(cc == 'S'||cc == 'M')
					{
						if(cc == 'M') is_first_S = 0;

						if(cc == 'M' || use_softclipped_bases)
						{
							// a leading soft clip lies to the left of the mapping position
							unsigned int insertion_cursor = chromosome_cursor - ((cc=='S' && is_first_S)?tmpv:0);
							unsigned int insertion_end = chromosome_cursor + ((cc=='S' && is_first_S)?0:tmpv);
							// DO INSERTION
							while(insertion_cursor < insertion_end && read_cursor < (rl - base_ignored_head_tail))
							{
								unsigned int max_section_pos, insert_length;
								int need_write = 1;

								if(get_read_block(chro, insertion_cursor , temp_file_suffix, known_chromosomes, &max_section_pos))break;
								// never cross a block boundary in one record
								insert_length = min(max_section_pos + 1, insertion_end) - insertion_cursor;
								if(insert_length<1) break;

								if(base_ignored_head_tail)
								{
									if(read_cursor+insert_length < base_ignored_head_tail)
										need_write = 0;
									else if(read_cursor < base_ignored_head_tail)
									{
										int ignored_length = base_ignored_head_tail - read_cursor;
										insert_length = read_cursor + insert_length - base_ignored_head_tail;

										read_cursor = base_ignored_head_tail;
										insertion_cursor += ignored_length;
									}

									if(read_cursor >= (rl - base_ignored_head_tail))
										need_write = 0;
									else if(read_cursor +insert_length >= (rl - base_ignored_head_tail))
										insert_length = (rl - base_ignored_head_tail) - read_cursor;
								}

								if(need_write && insert_length > 0 && sequence[0]!='*') {
									sprintf(temp_file_name, "%s%s", temp_file_prefix , temp_file_suffix);
									temp_fp = get_temp_file_pointer(temp_file_name, fp_table, &close_now);
									if(!temp_fp)
									{
										HashTableDestroy(fp_table);
										SamBam_fclose(sambam_reader);
										return -1;
									}
									if(all_mapped_bases)
										(*all_mapped_bases) += insert_length;

									is_error |= write_read_block_file(temp_fp , read_number, read_name, flags, chro, insertion_cursor, cigar, mapping_quality, sequence + read_cursor , quality_string + read_cursor, insert_length , 1, is_negative_strand, read_cursor, rl, M_parts);
									if(close_now) fclose(temp_fp);
								}
								insertion_cursor += insert_length;
								read_cursor += insert_length;
							}
							if(M_parts < 65535)M_parts ++;
						}
						else
							read_cursor += tmpv;

						if(!is_first_S)
							chromosome_cursor += tmpv;

						tmpv=0;
					}
					else if(cc == 'D' || cc == 'N')
					{
						// the left edge ( last WANTED base ) is chromosome_cursor-1
						// the indel length is tmpv;
						// now we add this into the event table.
						if(event_table && cc=='D')
							add_cigar_indel_event(event_table, chro, chromosome_cursor-1, tmpv, NULL);
						chromosome_cursor += tmpv;
						tmpv = 0;
					}
					else if(cc == 'I' )
					{
						// the left edge ( last WANTED base ) is chromosome_cursor-1
						// the indel length is -tmpv;
						// now we add this into the event table.
						if(event_table && sequence[0]!='*')
							add_cigar_indel_event(event_table, chro, chromosome_cursor-1, -tmpv, sequence + read_cursor);
						read_cursor += tmpv;
						tmpv = 0;
					}
					else tmpv = 0;

				}

			}else{ // NO sequence is needed : no CIGAR is parsed.
				int cgi, cc;
				int pushback = 0;

				// a leading soft clip moves the record to the read head
				for(cgi=0; cigar[cgi]; cgi++){
					cc = cigar[cgi];
					if(cc >='0' && cc<='9') pushback = pushback*10 + cc-'0';
					else{
						if(cc!='S') pushback=0;
						break;
					}
				}

				assert(pos>=pushback);
				pos -= pushback;

				if(get_read_block(chro, pos, temp_file_suffix, known_chromosomes, NULL)) {
					read_number ++;
					continue;
				}
				sprintf(temp_file_name, "%s%s", temp_file_prefix , temp_file_suffix);

				temp_fp = get_temp_file_pointer(temp_file_name, fp_table, &close_now);
				if(!temp_fp)
				{
					HashTableDestroy(fp_table);
					SamBam_fclose(sambam_reader);
					return -1;
				}
				is_error |= write_read_block_file(temp_fp , read_number, read_name, flags, chro, pos, cigar, mapping_quality, sequence , quality_string, rl , is_sequence_needed, is_negative_strand, 0,rl, 0);
				if(close_now)fclose(temp_fp);
			}
			read_number ++;
		}
	}

	if(block_count)
		(*block_count) = fp_table->numOfElements;
	HashTableDestroy(fp_table);
	SamBam_fclose(sambam_reader);
	if(real_read_count)
		(*real_read_count) = read_number;
	if(is_error){
		SUBREADprintf("ERROR: cannot write into the temporary files. Please check the disk space in the temp directory.\n");
	}
	return is_error;
}
2142
// Test whether a linear genome offset falls inside an annotated exon.
//
// output_genes : gene table; the scan stops at the first gene whose end_offset is zero
//                or after MAX_ANNOTATION_EXONS genes.
// offset       : the linear position to look up.
// is_start     : non-zero to test `offset` against exon starts, zero to test it against exon ends.
//
// Returns 2 if `offset` is exactly the start (is_start) or the end (!is_start) of the first
// exon enclosing it, 1 if it is merely enclosed by an exon, and 0 if no exon contains it.
int is_in_exon_annotations(gene_t *output_genes, unsigned int offset, int is_start)
{
	int gene_no, exon_no;

	for(gene_no = 0; gene_no < MAX_ANNOTATION_EXONS; gene_no++)
	{
		gene_t * gene = output_genes + gene_no;

		if(!gene -> end_offset) break;
		if(gene -> end_offset < offset || gene -> start_offset > offset) continue;

		for(exon_no = 0; exon_no < MAX_EXONS_PER_GENE; exon_no++)
		{
			// load_exon_annotation() closes every exon list with a zero exon_ends entry;
			// the slots after it were never written, so they must not be read.
			if(!gene -> exon_ends[exon_no]) break;

			if(gene -> exon_ends[exon_no] < offset || gene -> exon_starts[exon_no] > offset) continue;

			if(is_start && gene -> exon_starts[exon_no] == offset) return 2;	// 2==exactly matched
			if(!is_start && gene -> exon_ends[exon_no] == offset) return 2;
			return 1;	// 1==enclosed
		}
	}
	return 0;	// 0==exon not found
}
2165
// Load a tab-delimited exon annotation file into a newly allocated gene table.
//
// Each data line is read as: gene-id <TAB> chromosome <TAB> start <TAB> end.
// Lines whose first character is not a digit are skipped (title lines etc).
// Consecutive lines with the same gene id are collected into one gene_t; start and end
// are converted with linear_gene_position() (start is passed as start-1).
// Exons for which linear_gene_position() returns 0xffffffff for the start are ignored.
//
// annotation_file_name : path of the annotation file.
// output_genes         : receives a malloc'ed array of MAX_ANNOTATION_EXONS gene_t items.
//                        It is not freed by this function, also not on the "too many exons" error.
// offsets              : passed through to linear_gene_position().
//
// Every finished gene has its exon list closed by exon_ends[n] == 0, and the gene after the
// last finished gene has end_offset == 0 (unless the table is completely full).
//
// Returns 0 on success and -1 if the file cannot be opened, memory cannot be allocated or a
// gene has MAX_EXONS_PER_GENE or more exons.
int load_exon_annotation(char * annotation_file_name, gene_t ** output_genes, gene_offset_t* offsets)
{
	int line_len, gene_number = 0, exons = 0;
	char old_gene_name[MAX_GENE_NAME_LEN];
	FILE * fp = f_subr_open(annotation_file_name, "rb");

	if(!fp)
	{
		SUBREADprintf("Cannot open the exon annotation file: %s\n", annotation_file_name);
		return -1;
	}
	(*output_genes) = malloc(sizeof(gene_t)*MAX_ANNOTATION_EXONS);
	if(!*output_genes)
	{
		SUBREADprintf("Cannot allocate memory for the exon table. \n");
		fclose(fp);
		return -1;
	}

	old_gene_name[0]=0;
	(*output_genes)[0].end_offset = 0;
	(*output_genes)[0].start_offset = 0xffffffff;
	while(gene_number < MAX_ANNOTATION_EXONS)
	{
		char buff[1200], this_gene_name[MAX_GENE_NAME_LEN], chromosome_name[MAX_CHROMOSOME_NAME_LEN];
		int i = 0, copied = 0;
		unsigned int exon_location;

		this_gene_name[0] = 0;
		line_len = read_line(1200, fp, buff, 0);

		if(line_len>0) //Not EOF
		{
			if(!isdigit((unsigned char)buff[0])) // it is a title line or something else
				continue;

			// The whole field is scanned, but only as many characters as fit are kept.
			for(i=0; i < 1200 && buff[i] && buff[i] != '\t' && buff[i] != '\n'; i++)
				if(copied < MAX_GENE_NAME_LEN - 1) this_gene_name[copied++] = buff[i];
			this_gene_name[copied] = 0;
		}

		if(line_len<=0 || (exons && old_gene_name[0] && strcmp(this_gene_name , old_gene_name))) // it is a new gene
		{
			strncpy((*output_genes)[gene_number].gene_name , old_gene_name, MAX_GENE_NAME_LEN);
			(*output_genes)[gene_number].exon_ends[exons] = 0;
			gene_number++;
			exons = 0;

			// The table is full: there is no slot left for the sentinel or for this line.
			if(gene_number >= MAX_ANNOTATION_EXONS) break;

			(*output_genes)[gene_number].end_offset = 0;
			(*output_genes)[gene_number].start_offset = 0xffffffff;
		}

		if(line_len<=0) break;

		// copy chromosome name
		if(i < 1200 && buff[i] == '\t') i++;
		copied = 0;
		for(; i < 1200 && buff[i] && buff[i] != '\t' && buff[i] != '\n'; i++)
			if(copied < MAX_CHROMOSOME_NAME_LEN - 1) chromosome_name[copied++] = buff[i];
		chromosome_name[copied] = 0;

		// start location
		exon_location = 0;
		if(i < 1200 && buff[i] == '\t') i++;
		for(; i < 1200 && buff[i] && buff[i] != '\t' && buff[i] != '\n'; i++)
			if(isdigit((unsigned char)buff[i]))
				exon_location = exon_location*10 + buff[i] - '0';

		(*output_genes)[gene_number].exon_starts[exons] = linear_gene_position(offsets, chromosome_name , exon_location-1);
		if( (*output_genes)[gene_number].exon_starts[exons] == 0xffffffff)
			continue;

		if((*output_genes)[gene_number].start_offset > (*output_genes)[gene_number].exon_starts[exons])
			(*output_genes)[gene_number].start_offset = (*output_genes)[gene_number].exon_starts[exons];

		// end location
		exon_location = 0;
		if(i < 1200 && buff[i] == '\t') i++;
		for(; i < 1200 && buff[i] && buff[i] != '\t' && buff[i] != '\n'; i++)
			if(isdigit((unsigned char)buff[i]))
				exon_location = exon_location*10 + buff[i] - '0';

		(*output_genes)[gene_number].exon_ends[exons] = linear_gene_position(offsets, chromosome_name , exon_location);

		if((*output_genes)[gene_number].end_offset < (*output_genes)[gene_number].exon_ends[exons])
			(*output_genes)[gene_number].end_offset = (*output_genes)[gene_number].exon_ends[exons];

		exons ++;
		if(exons >= MAX_EXONS_PER_GENE)
		{
			SUBREADprintf("The number of exons excesses the limit. Please increase the value of MAX_EXONS_PER_GENE in subread.h.\n");
			fclose(fp);
			return -1;
		}

		// this_gene_name is always NUL-terminated within MAX_GENE_NAME_LEN, so is the copy.
		strncpy(old_gene_name, this_gene_name , MAX_GENE_NAME_LEN);
	}
	fclose(fp);
	return 0;
}
2260
// Tell whether `path` can be opened for reading.
// Returns 1 if f_subr_open(path, "rb") succeeds (the file is closed again), 0 otherwise.
int does_file_exist(char * path)
{
	FILE * probe = f_subr_open(path, "rb");

	if(probe == NULL) return 0;

	fclose(probe);
	return 1;
}
2270
sort_SAM_hash(char * str)2271 srUInt_64 sort_SAM_hash(char * str)
2272 {
2273 srUInt_64 hash = 5381;
2274 int c, xk1=0;
2275
2276 while (1)
2277 {
2278 c = str[xk1++];
2279 if(!c)break;
2280 hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
2281 }
2282 return hash;
2283 }
2284
2285
void do_SIGINT_remove(char * prefix, int param);

// The temporary-file prefixes that the two hooks below hand to do_SIGINT_remove().
char * _SAMSORT_SNP_delete_temp_prefix = NULL;
char * _REPAIRER_delete_temp_prefix = NULL;

// Hook using the SAM-sort prefix; `param` is forwarded unchanged.
void SAM_SORT_SIGINT_hook(int param)
{
	char * temp_prefix = _SAMSORT_SNP_delete_temp_prefix;
	do_SIGINT_remove(temp_prefix, param);
}

// Hook using the repairer prefix (installed for SIGTERM and SIGINT by SAM_pairer_create());
// `param` is forwarded unchanged.
void REPAIR_SIGINT_hook(int param)
{
	char * temp_prefix = _REPAIRER_delete_temp_prefix;
	do_SIGINT_remove(temp_prefix, param);
}
2295
delete_with_prefix(char * prefix)2296 void delete_with_prefix(char * prefix){
2297 if(prefix != NULL)
2298 {
2299 int xk1, last_slash = -1;
2300 char del2[MAX_FILE_NAME_LENGTH], del_suffix[MAX_FILE_NAME_LENGTH], del_name[MAX_FILE_NAME_LENGTH];
2301 for(xk1=0; prefix[xk1]; xk1++)
2302 {
2303 if(prefix[xk1]=='/') last_slash = xk1;
2304 else if(prefix[xk1]=='\\')
2305 {
2306 SUBREADprintf("The file name is unknown.\n");
2307 return;
2308 }
2309 }
2310 if(last_slash>=0)
2311 {
2312 memcpy(del2, prefix, last_slash);
2313 del2[last_slash] = 0;
2314 strcpy(del_suffix , prefix + last_slash + 1);
2315 }
2316 else
2317 {
2318 strcpy(del2,".");
2319 strcpy(del_suffix , prefix);
2320 }
2321
2322 //#warning ">>>>>>>> COMMENT THIS OUT <<<<<<<<<<<<<<<<<<<<<"
2323 //SUBREADprintf("SCANDEL: %s, PREFIX %s, SUFFIX %s\n", del2, prefix, del_suffix);
2324 if(strlen(del_suffix)>8)
2325 {
2326 DIR *d;
2327 struct dirent *dir;
2328
2329 d = opendir(del2);
2330 if (d)
2331 {
2332 while ((dir = readdir(d)) != NULL)
2333 {
2334 if(strstr(dir->d_name, del_suffix))
2335 {
2336 strcpy(del_name, del2);
2337 strcat(del_name, "/");
2338 strcat(del_name, dir->d_name);
2339 unlink(del_name);
2340
2341 // #warning ">>>>>>>> COMMENT THIS OUT <<<<<<<<<<<<<<<<<<<<<"
2342 // SUBREADprintf("DEL: %s\n", del_name);
2343 //test fix
2344 }
2345 }
2346 closedir(d);
2347 }
2348 }
2349
2350 }
2351
2352 }
2353
// Common body of the signal hooks.
// When MAKE_STANDALONE is defined: removes the temporary files of `prefix`, prints a notice
// and terminates the process with exit(param). Otherwise it does nothing.
void do_SIGINT_remove(char * prefix, int param) {
#ifndef MAKE_STANDALONE
	(void) prefix;
	(void) param;
#else
	delete_with_prefix(prefix);
	SUBREADprintf("\n\nReceived a terminal signal. The temporary files were removed.\n");
	exit(param);
#endif
}


// The handlers returned by signal() in SAM_pairer_create(); SAM_pairer_destroy() puts them back.
void * old_sig_TERM = NULL, * old_sig_INT = NULL;
2364
// Initialise a BAM writer: open `out_file` for binary writing and set up one deflate
// stream per thread.
//
// bam_main    : writer object; it is zeroed first.
// all_threads : number of per-thread writer contexts to allocate.
// has_dummy   : stored in bam_main -> has_dummy.
// BAM_input   : not used by this function.
// c_level     : zlib compression level used for every thread's stream.
// out_file    : path of the output file; also copied into bam_main -> bam_name.
//
// Returns 0 on success. Returns 1 if the file cannot be opened, memory cannot be allocated
// or a deflate stream cannot be initialised; in the latter two cases everything acquired so
// far is released again (the possibly created, empty output file is left on disk).
int SAM_pairer_writer_create( SAM_pairer_writer_main_t * bam_main , int all_threads , int has_dummy, int BAM_input, int c_level, char * out_file){
	int x1;

	memset(bam_main, 0, sizeof(SAM_pairer_writer_main_t));
	bam_main -> bam_fp = f_subr_open(out_file, "wb");
	if(NULL == bam_main -> bam_fp) return 1;

	bam_main -> threads = malloc(all_threads * sizeof(SAM_pairer_writer_thread_t));
	if(NULL == bam_main -> threads && all_threads > 0){
		fclose(bam_main -> bam_fp);
		bam_main -> bam_fp = NULL;
		return 1;
	}

	// NOTE(review): the size of bam_name is not visible here; the copy is unbounded as before.
	strcpy(bam_main -> bam_name, out_file);
	bam_main -> all_threads = all_threads;
	bam_main -> has_dummy = has_dummy;
	bam_main -> compression_level = c_level;
	subread_init_lock(&bam_main -> output_fp_lock);

	for(x1 = 0; x1 < all_threads ; x1 ++){
		int zret;

		bam_main -> threads[x1].BIN_buffer_ptr = 0;
		bam_main -> threads[x1].strm.zalloc = Z_NULL;
		bam_main -> threads[x1].strm.zfree = Z_NULL;
		bam_main -> threads[x1].strm.opaque = Z_NULL;
		bam_main -> threads[x1].strm.avail_in = 0;
		bam_main -> threads[x1].strm.next_in = Z_NULL;

		zret = deflateInit2(&bam_main -> threads[x1].strm, bam_main -> compression_level, Z_DEFLATED,
			PAIRER_GZIP_WINDOW_BITS, PAIRER_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
		if(zret != Z_OK){
			// Undo the streams that were already set up, then everything else.
			while(--x1 >= 0) deflateEnd(&bam_main -> threads[x1].strm);
			subread_destroy_lock(&bam_main -> output_fp_lock);
			free(bam_main -> threads);
			bam_main -> threads = NULL;
			fclose(bam_main -> bam_fp);
			bam_main -> bam_fp = NULL;
			return 1;
		}
	}
	return 0;
}
2391
// Write the 18-byte gzip header of one BGZF (BAM) block to `writer`.
//
// writer          : the output stream.
// compressed_size : number of deflate payload bytes that will follow this header.
//
// The two-byte fields are emitted explicitly as little-endian, so the output does not
// depend on the byte order of the host.
void SAM_pairer_write_BAM_header(FILE * writer, int compressed_size)
{
	// The last field holds compressed_size + 19 + 6, of which only the low 16 bits are stored.
	unsigned int bsize = (unsigned int)(compressed_size + 19 + 6);

	unsigned char header[18] = {
		31, 139, 8, 4,	// the four magic characters
		0, 0, 0, 0,	// time stamp: always zero
		0, 0xff,	// Extra flags and OS
		6, 0,		// Extra length
		66, 67,		// SI1 and SI2 magic numbers
		2, 0,		// SLEN
		0, 0		// block size field, filled in below
	};

	header[16] = (unsigned char)(bsize & 0xff);
	header[17] = (unsigned char)((bsize >> 8) & 0xff);

	fwrite(header, 1, sizeof(header), writer);
}
2422
2423
2424
// Compress the pending bytes of one writer thread and append them to the output file as
// one block: header, deflate payload, CRC32 of the uncompressed bytes, uncompressed length.
//
// When the thread buffer is empty (BIN_buffer_ptr == 0) a block without payload data is
// written, using a temporary deflate stream instead of the thread's own stream.
// The four writes to the file are done while holding bam_main -> output_fp_lock.
// On return the thread buffer is marked empty (BIN_buffer_ptr = 0).
//
// Returns 0 after the block was written, 1 if the compression buffer cannot be allocated
// (nothing is written and the thread buffer is left untouched in that case).
int SAM_pairer_multi_thread_compress(SAM_pairer_writer_main_t * bam_main , SAM_pairer_writer_thread_t * bam_thread)
{
	#define BAM_compressed_space 65536
	char * BAM_compressed = malloc(BAM_compressed_space);
	int ret, have;

	if(NULL == BAM_compressed){
		SUBREADprintf("ERROR: Cannot allocate memory for compressing a BAM block.\n");
		return 1;
	}

	if(bam_thread -> BIN_buffer_ptr>0){
		deflateReset(&bam_thread -> strm);
		bam_thread -> strm.avail_in = bam_thread -> BIN_buffer_ptr;
		bam_thread -> strm.next_in = bam_thread -> BIN_buffer;
		bam_thread -> strm.avail_out = BAM_compressed_space;
		bam_thread -> strm.next_out = (unsigned char *)BAM_compressed;
		ret = deflate( &bam_thread -> strm , Z_FINISH);

		have = BAM_compressed_space - bam_thread -> strm.avail_out;
		assert(bam_thread -> strm.avail_in == 0);
	}else{
		z_stream nstrm;
		nstrm.zalloc = Z_NULL;
		nstrm.zfree = Z_NULL;
		nstrm.opaque = Z_NULL;
		nstrm.avail_in = 0;
		nstrm.next_in = Z_NULL;

		deflateInit2(&nstrm, SAMBAM_COMPRESS_LEVEL_NORMAL, Z_DEFLATED,
			PAIRER_GZIP_WINDOW_BITS, PAIRER_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);

		nstrm.avail_in = 0;
		nstrm.next_in = bam_thread -> BIN_buffer;
		nstrm.avail_out = BAM_compressed_space;
		nstrm.next_out = (unsigned char *)BAM_compressed;
		ret = deflate(&nstrm, Z_FINISH);
		deflateEnd(&nstrm);
		have = BAM_compressed_space - nstrm.avail_out;
	}

	// NOTE(review): the block has always been written whatever deflate() returned (the old
	// test was `ret == Z_OK || 1`); that is kept. A completed deflate(Z_FINISH) is expected
	// to return Z_STREAM_END -- confirm before turning this into a hard error.
	(void) ret;

	unsigned int crc0 = crc32(0, NULL, 0);
	unsigned int CRC32 = crc32(crc0, (unsigned char *) bam_thread -> BIN_buffer ,bam_thread -> BIN_buffer_ptr);

	subread_lock_occupy( &bam_main -> output_fp_lock );
	SAM_pairer_write_BAM_header( bam_main -> bam_fp , have);
	fwrite(BAM_compressed,1, have, bam_main -> bam_fp );
	fwrite(&CRC32 , 4, 1, bam_main -> bam_fp);
	fwrite( &bam_thread -> BIN_buffer_ptr , 4, 1, bam_main -> bam_fp);
	subread_lock_release( &bam_main -> output_fp_lock );

	bam_thread -> BIN_buffer_ptr = 0;

	free(BAM_compressed);
	return 0;
}
2483
2484
2485
// Flush and close a BAM writer.
// Every thread with pending bytes gets its buffer compressed and written. After the last
// thread has been flushed, one more block is written from its (now empty) buffer.
// Then the deflate streams, the output lock, the file and the thread array are released.
void SAM_pairer_writer_destroy( SAM_pairer_writer_main_t * bam_main ) {
	int thread_no;
	int last_thread_no = bam_main -> all_threads - 1;

	for(thread_no = 0; thread_no < bam_main -> all_threads; thread_no++){
		SAM_pairer_writer_thread_t * writer_thread = bam_main -> threads + thread_no;

		if(writer_thread -> BIN_buffer_ptr > 0)
			SAM_pairer_multi_thread_compress(bam_main, writer_thread);

		if(thread_no == last_thread_no){
			// The buffer must be empty now: this call writes the block without data.
			assert(0 == writer_thread -> BIN_buffer_ptr);
			SAM_pairer_multi_thread_compress(bam_main, writer_thread);
		}

		deflateEnd(&writer_thread -> strm);
	}

	subread_destroy_lock(&bam_main -> output_fp_lock);
	fclose(bam_main -> bam_fp);
	free(bam_main -> threads);
}
2503
// Store the callback in pairer -> unsorted_notification.
// The callback takes the pairer (as void *) and two record buffers.
void SAM_pairer_set_unsorted_notification(
	SAM_pairer_context_t * pairer,
	void (* unsorted_notification) (void * pairer, char * bin1, char * bin2))
{
	pairer -> unsorted_notification = unsorted_notification;
}
2507
2508
SAM_pairer_warning_file_open_limit()2509 int SAM_pairer_warning_file_open_limit(){
2510 #ifndef __MINGW32__
2511 struct rlimit limit_st;
2512 getrlimit(RLIMIT_NOFILE, & limit_st);
2513
2514 if(min(limit_st.rlim_cur, limit_st.rlim_max ) < MIN_FILE_POINTERS_ALLOWED){
2515 SUBREADprintf(" ERROR: the maximum file open number (%d) is too low. Please increase this number to a number larger than 50 by using the 'ulimit -n' command.\n\n",(int)(min(limit_st.rlim_cur, limit_st.rlim_max)));
2516 return 1;
2517 }
2518 #endif
2519 return 0;
2520 }
2521
// Tiny_Mode only writes the following information:
// Name Flag Chro Pos Mapq Cigar MateChro MatePos Tlen N I NH:i:xx HI:i:xx
// Tiny_Mode does not work when the output and the input are both in the BAM format.
// BAM_input is non-zero if the input file is in the BAM format, and zero if it is in the SAM format.
// bin_buff_size_per_thread is in megabytes.
// It returns 0 if there is no error.
// Set up a pairer context for reading `in_file`.
//
// pairer                   : the context; it is zeroed first.
// all_threads              : number of thread contexts to create.
// bin_buff_size_per_thread : per-thread input buffer size, multiplied by 1024*1024 here.
// BAM_input                : non-zero for BAM input, zero for SAM input.
// is_Tiny_Mode, is_single_end_mode, force_do_not_sort, need_read_group_tag,
// display_progress, long_read_minimum_length, appendix1 and the three callbacks are
// stored in the context unchanged.
// in_file                  : input path; if it starts with '<', the rest is opened and the
//                            name shown for the input is "<STDIN>".
// tmp_path                 : prefix of the temporary files; it is also registered for
//                            removal by the SIGTERM/SIGINT hook.
//
// Returns 0 on success. Returns 1 if the input cannot be opened, tmp_path does not fit in
// the context or the thread array cannot be allocated; the input file is closed again in
// the latter two cases.
//
// NOTE(review): the per-thread buffer allocations and inflateInit2() below are still not
// checked for failure.
int SAM_pairer_create(SAM_pairer_context_t * pairer, int all_threads, int bin_buff_size_per_thread, int BAM_input, int is_Tiny_Mode, int is_single_end_mode, int force_do_not_sort, int need_read_group_tag, int display_progress, char * in_file, void (* reset_output_function) (void * pairer), int (* output_header_function) (void * pairer, int thread_no, int is_text, unsigned int items, char * bin, unsigned int bin_len), int (* output_function) (void * pairer, int thread_no, char * bin1, char * bin2), char * tmp_path, void * appendix1, int long_read_minimum_length) {

	memset(pairer, 0, sizeof(SAM_pairer_context_t));

	// The context was just zeroed, so copying at most size-1 bytes leaves the name terminated.
	if(in_file[0]=='<'){
		in_file++;
		strncpy(pairer -> in_file_name, "<STDIN>", sizeof(pairer -> in_file_name) - 1);
	}else
		strncpy(pairer -> in_file_name, in_file, sizeof(pairer -> in_file_name) - 1);

	pairer -> input_fp = f_subr_open(in_file, "rb");
	if(NULL == pairer -> input_fp) return 1;

	// The prefix decides which files get deleted later: it must never be truncated.
	if(strlen(tmp_path) >= sizeof(pairer -> tmp_file_prefix)){
		SUBREADprintf("ERROR: the path of the temporary files is too long.\n");
		fclose(pairer -> input_fp);
		pairer -> input_fp = NULL;
		return 1;
	}

	pairer -> threads = calloc(all_threads, sizeof(SAM_pairer_thread_t));
	if(NULL == pairer -> threads && all_threads > 0){
		SUBREADprintf("ERROR: cannot allocate memory for the thread contexts.\n");
		fclose(pairer -> input_fp);
		pairer -> input_fp = NULL;
		return 1;
	}

	SAM_pairer_warning_file_open_limit();

	pairer -> input_is_BAM = BAM_input;
	pairer -> tiny_mode = is_Tiny_Mode;
	pairer -> reset_output_function = reset_output_function;
	pairer -> output_function = output_function;
	pairer -> output_header = output_header_function;
	pairer -> display_progress = display_progress;
	pairer -> is_single_end_mode = is_single_end_mode;
	pairer -> force_do_not_sort = force_do_not_sort;
	pairer -> need_read_group_tag = need_read_group_tag;
	pairer -> long_read_minimum_length = long_read_minimum_length;

	subread_init_lock(&pairer -> unsorted_notification_lock);
	subread_init_lock(&pairer -> input_fp_lock);
	subread_init_lock(&pairer -> SAM_BAM_table_lock);

	pairer -> total_threads = all_threads;
	if(pairer ->input_is_BAM){
		pairer -> input_buff_SBAM_size = bin_buff_size_per_thread * 1024 * 1024;
	}else{
		pairer -> input_buff_SBAM_size = max(bin_buff_size_per_thread * 1024 * 1024 + FC_LONG_READ_RECORD_HARDLIMIT , 3*FC_LONG_READ_RECORD_HARDLIMIT/2);
	}

	pairer -> input_buff_BIN_size = max(1024*1024, pairer -> input_buff_SBAM_size );

	pairer -> appendix1 = appendix1;

	old_sig_TERM = signal (SIGTERM, REPAIR_SIGINT_hook);
	old_sig_INT = signal (SIGINT, REPAIR_SIGINT_hook);

	strcpy(pairer -> tmp_file_prefix, tmp_path);	// the length was checked above
	_REPAIRER_delete_temp_prefix = pairer -> tmp_file_prefix;

	if(pairer ->input_is_BAM){
		pairer ->bam_margin_table = HashTableCreate(2191);
		HashTableSetHashFunction(pairer -> bam_margin_table, fc_chro_hash);
		HashTableSetKeyComparisonFunction(pairer -> bam_margin_table, fc_strcmp_chro);
		HashTableSetDeallocationFunctions(pairer -> bam_margin_table, free, free);
	}else{
		pairer -> sam_contig_number_table = HashTableCreate(21907);
		HashTableSetHashFunction(pairer -> sam_contig_number_table, fc_chro_hash);
		HashTableSetKeyComparisonFunction(pairer -> sam_contig_number_table, fc_strcmp_chro);
		HashTableSetDeallocationFunctions(pairer -> sam_contig_number_table, free, NULL);
	}

	pairer -> unsorted_notification_table = HashTableCreate(2191);
	HashTableSetHashFunction(pairer -> unsorted_notification_table, fc_chro_hash);
	HashTableSetKeyComparisonFunction(pairer -> unsorted_notification_table, fc_strcmp_chro);
	HashTableSetDeallocationFunctions(pairer -> unsorted_notification_table, free, free);

	int x1;

	for(x1 = 0; x1 < all_threads ; x1++){
		pairer -> threads[x1].thread_id = x1;
		pairer -> threads[x1].reads_in_SBAM = 0;
		pairer -> threads[x1].input_buff_SBAM = malloc(pairer -> input_buff_SBAM_size);
		pairer -> threads[x1].input_buff_BIN_capacity = pairer -> input_buff_BIN_size;
		pairer -> threads[x1].input_buff_BIN = malloc(pairer -> threads[x1].input_buff_BIN_capacity );

		pairer -> threads[x1].input_buff_BIN_used = 0;
		pairer -> threads[x1].orphant_table = HashTableCreate(pairer -> input_buff_SBAM_size / 100);
		HashTableSetHashFunction(pairer -> threads[x1].orphant_table, fc_chro_hash);
		HashTableSetKeyComparisonFunction(pairer -> threads[x1].orphant_table, fc_strcmp_chro);
		HashTableSetDeallocationFunctions(pairer -> threads[x1].orphant_table, free, free);
		pairer -> threads[x1].strm.zalloc = Z_NULL;
		pairer -> threads[x1].strm.zfree = Z_NULL;
		pairer -> threads[x1].strm.opaque = Z_NULL;
		pairer -> threads[x1].strm.avail_in = 0;
		pairer -> threads[x1].strm.next_in = Z_NULL;

		inflateInit2(&pairer -> threads[x1].strm, PAIRER_GZIP_WINDOW_BITS);

		// The per-thread lock only exists in the do-not-sort mode.
		if(force_do_not_sort)
			subread_init_lock(&pairer -> threads[x1].SBAM_lock);
	}
	return 0;
}
2621
// Debug helper with the signature of a hash-table iteration callback: prints the key as a
// string together with the first four bytes of the stored object read as an int.
// `tab` is not used.
void SAM_pairer_print_keys(void * key, void * hashed_obj, HashTable * tab){
	char * key_text = (char *) key;
	int stored_len = 0;

	memcpy(&stored_len, hashed_obj, 4);
	SUBREADprintf("ESKY = %s LEN = %d\n", key_text, stored_len);
}
2627
// Release everything owned by a pairer context: per-thread inflate streams, buffers, locks
// and orphan tables, then the shared hash tables and locks, the temporary files (removed by
// their prefix), the input file and the thread array. Finally the SIGTERM and SIGINT
// handlers saved in old_sig_TERM / old_sig_INT are put back.
void SAM_pairer_destroy(SAM_pairer_context_t * pairer){
	int thread_no;
	srInt_64 all_orphants = 0;

	for(thread_no = 0; thread_no < pairer -> total_threads; thread_no++){
		inflateEnd(&pairer -> threads[thread_no].strm);
		free(pairer -> threads[thread_no].input_buff_BIN);
		free(pairer -> threads[thread_no].input_buff_SBAM);

		// The per-thread lock only exists in the do-not-sort mode.
		if(pairer -> force_do_not_sort)
			subread_destroy_lock(&pairer -> threads[thread_no].SBAM_lock);

		all_orphants += pairer -> threads[thread_no].orphant_table -> numOfElements;
		HashTableDestroy(pairer -> threads[thread_no].orphant_table);
	}

	// Only one of the two tables exists, depending on the input format.
	if(!pairer -> input_is_BAM)
		HashTableDestroy(pairer -> sam_contig_number_table);
	else
		HashTableDestroy(pairer -> bam_margin_table);

	HashTableDestroy(pairer -> unsorted_notification_table);

	subread_destroy_lock(&pairer -> unsorted_notification_lock);
	subread_destroy_lock(&pairer -> input_fp_lock);
	subread_destroy_lock(&pairer -> SAM_BAM_table_lock);

	delete_with_prefix(pairer -> tmp_file_prefix);
	fclose(pairer -> input_fp);
	free(pairer -> threads);

	signal (SIGTERM, old_sig_TERM);
	signal (SIGINT, old_sig_INT);
	//SUBREADprintf("All orphans=%llu frags\n", all_orphants);
}
2662
2663 // always assume that fp is at the start of a BAM GZ block.
// Read the compressed payload of one gzip block of a BAM file into `inbuff`.
// `fp` must be positioned at the start of a block.
//
// fp           : the input stream.
// max_read_len : number of bytes available at `inbuff`.
// inbuff       : receives the raw deflate data (header and trailer are not copied).
//
// The 12-byte gzip header is read, then the extra subfields are walked until XLEN bytes are
// consumed; the subfield with the identifiers 66,67 and a length of 2 gives the block size
// field. The payload length is that value minus XLEN minus 19. After the payload the 8
// trailing bytes (CRC and ISIZE) are skipped.
// All two-byte fields are decoded as little-endian, independent of the host byte order.
//
// Returns the number of payload bytes, or -1 at the end of the file, on a short read, on an
// unexpected header, or if the payload does not fit into max_read_len bytes.
int SAM_pairer_read_BAM_block(FILE * fp, int max_read_len, char * inbuff) {
	unsigned char gz_header_12 [12];
	int read_len = fread(gz_header_12, 1, 12, fp );
	if(read_len < 12){
		return -1;
	}
	if(gz_header_12[0]!=31 || gz_header_12[1]!=139){
		return -1;
	}

	unsigned int xlen = gz_header_12[10] | ((unsigned int)gz_header_12[11] << 8);
	unsigned int bsize = 0;
	unsigned int xlen_read = 0;

	while( xlen_read < xlen ){
		unsigned char x_header_4[4];
		unsigned int slen;

		read_len = fread(x_header_4, 1, 4, fp);
		if(read_len < 4){
			SUBREADprintf("BAD GZ BAM 6LEN\n");
			return -1;
		}
		slen = x_header_4[2] | ((unsigned int)x_header_4[3] << 8);
		xlen_read += 4;
		if(x_header_4[0]==66 && x_header_4[1]==67 && slen == 2){
			unsigned char bsize_2[2];
			read_len = fread(bsize_2, 1, 2, fp);
			if(read_len < 2){
				SUBREADprintf("BAD GZ BAM XLEN\n");
				return -1;
			}
			bsize = bsize_2[0] | ((unsigned int)bsize_2[1] << 8);
		}else{
			// An unrelated subfield: skip its data.
			fseeko(fp, slen, SEEK_CUR);
		}
		xlen_read += slen;
	}
	if(bsize < 1 || bsize < xlen + 19){
		SUBREADprintf("BAD GZ BAM BSIZE\n");
		return -1;
	}

	int payload_len = (int)(bsize - xlen - 19);
	if(payload_len > max_read_len){
		SUBREADprintf("BAD GZ BAM BLOCK : TOO LARGE FOR THE BUFFER\n");
		return -1;
	}
	read_len = fread(inbuff, 1, payload_len, fp);

	// seek over CRC and ISIZE
	fseeko(fp, 8, SEEK_CUR);
	if(read_len < payload_len) return -1;
	return read_len;
}
2712
2713 #define MIN_BAM_BLOCK_SIZE 66000
2714
SAM_pairer_read_SAM_MB(FILE * fp,int max_read_len,char * inbuff)2715 int SAM_pairer_read_SAM_MB( FILE * fp, int max_read_len, char * inbuff ){
2716 int ret = 0;
2717
2718 if(feof(fp)) return 0;
2719
2720 while(1){
2721 if(ret >= max_read_len - FC_LONG_READ_RECORD_HARDLIMIT || feof(fp))break;
2722 int rlen = fread(inbuff +ret , 1, max_read_len - FC_LONG_READ_RECORD_HARDLIMIT - ret , fp);
2723 if(rlen > 0){
2724 int x1;
2725 for(x1 = 0; x1 < min(200, rlen); x1++)
2726 if(*(inbuff+ret+x1)<8 || *(inbuff+ret+x1)> 127){
2727 SUBREADprintf("NOT_SAM_ACTUALLY\n");
2728 return -1;
2729 }
2730 ret += rlen;
2731 }
2732 }
2733 if(!feof(fp)){
2734 int nch;
2735 while(1){
2736 nch = fgetc(fp);
2737 if(nch < 0 || nch == '\n'){
2738 break;
2739 }else{
2740 inbuff[ret++]=nch;
2741 }
2742 }
2743 }
2744 if(inbuff[ret-1] != '\n') inbuff[ret++]='\n';
2745 inbuff[ret] = 0;
2746
2747 return ret;
2748 }
2749
// Refill the raw input buffer (input_buff_SBAM) of one thread from the input file and
// reset the thread's read positions.
//   BAM input: whole gzip blocks are appended while at least MIN_BAM_BLOCK_SIZE bytes are
//              free; the file offsets before and after are stored in the thread context.
//   SAM input: one chunk of whole lines is loaded.
// *is_finished is set to 1 when the input is exhausted or cannot be read any further; it is
// never reset to 0 here. If a BAM block cannot be read at the end of the file although the
// block before it had more than 2 bytes, the file is flagged as bad and incomplete.
void SAM_pairer_fill_BIN_buff(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context , int * is_finished){
	int bytes_loaded = 0;

	if(!pairer -> input_is_BAM){
		bytes_loaded = SAM_pairer_read_SAM_MB(pairer -> input_fp , pairer -> input_buff_SBAM_size , thread_context -> input_buff_SBAM);
		if(bytes_loaded < 1) *is_finished = 1;
	}else{
		int previous_block_len = -1, block_len;

		thread_context -> input_buff_SBAM_file_start = ftello(pairer -> input_fp);
		for(;;){
			if( feof(pairer -> input_fp)){
				*is_finished = 1;
				break;
			}

			// Stop when the free space could not take a block of the maximum size.
			if(pairer -> input_buff_SBAM_size - bytes_loaded < MIN_BAM_BLOCK_SIZE) break;

			block_len = SAM_pairer_read_BAM_block( pairer -> input_fp , pairer -> input_buff_SBAM_size - bytes_loaded , thread_context -> input_buff_SBAM + bytes_loaded);

			if(block_len < 0){
				if(feof(pairer -> input_fp) && previous_block_len != -1 ){
					pairer -> is_bad_format |= (previous_block_len > 2);
					pairer -> is_incomplete_BAM |= (previous_block_len > 2);
					if(pairer -> is_incomplete_BAM)SUBREADprintf("ERROR: the BAM file seems incomplete : this %d, last %d.\n", block_len , previous_block_len );
				}
				*is_finished = 1;
				break;
			}

			bytes_loaded += block_len;
			previous_block_len = block_len;
		}
		thread_context -> input_buff_SBAM_file_end = ftello(pairer -> input_fp);
	}

	thread_context -> input_buff_SBAM_used = bytes_loaded;
	thread_context -> input_buff_SBAM_ptr = 0;
	thread_context -> input_buff_BIN_used = 0;
	thread_context -> input_buff_BIN_ptr = 0;
	thread_context -> readno_in_chunk = 0;
}
2796
int SAM_pairer_find_start(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context );

// BAM_next_nch : fetch the next decompressed byte into `nch`.
// Statement macro; it needs `pairer`, `thread_context` and an assignable `nch` in the
// scope where it is used.
// While no unread byte is left in input_buff_BIN, SAM_pairer_fetch_BAM_block() is called to
// decompress more data. If that call reports an error (non-zero), `nch` is set to -1;
// otherwise `nch` gets the byte at input_buff_BIN_ptr and the pointer moves on by one.
#define BAM_next_nch { \
	int retXX = 0; while(thread_context -> input_buff_BIN_ptr >= thread_context -> input_buff_BIN_used){retXX = SAM_pairer_fetch_BAM_block(pairer, thread_context); if(retXX) break;}\
	if(retXX) nch=-1; else nch = thread_context -> input_buff_BIN[thread_context -> input_buff_BIN_ptr++];}

// SAM_next_line : take the next text line out of input_buff_SBAM.
// Statement macro; it needs `thread_context`, `line_ptr` and `line_len` in the scope where
// it is used.
// If everything in the buffer was consumed, `line_ptr` becomes NULL (`line_len` is left as
// it was). Otherwise `line_ptr` points at the start of the line inside the buffer,
// `line_len` is the number of characters before the next '\n' (or before the end of the
// used part of the buffer), and input_buff_SBAM_ptr moves past the line and its '\n'.
// The line is not NUL-terminated by this macro.
#define SAM_next_line {\
	if( thread_context -> input_buff_SBAM_used <= thread_context -> input_buff_SBAM_ptr ){ line_ptr = NULL;}else{\
	line_ptr = thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr;line_len = 0;\
	while(line_len + thread_context -> input_buff_SBAM_ptr < thread_context -> input_buff_SBAM_used){ int ccch = thread_context -> input_buff_SBAM[ thread_context -> input_buff_SBAM_ptr + line_len ]; if(ccch == '\n')break; line_len ++;}\
	thread_context -> input_buff_SBAM_ptr += line_len+1;}}
2807
// Decompress the next gzip block from the raw input buffer (input_buff_SBAM) of a thread
// and append the result to its decompressed buffer (input_buff_BIN).
//
// Bytes of input_buff_BIN that were not consumed yet are first moved to the front of the
// buffer. The buffer is enlarged by the factor 1.5 when fewer than 128KB are free, up to a
// limit of 1GB. If thread_context -> need_find_start is set, SAM_pairer_find_start() is
// called after decompression; a result below 1 with at least 32 bytes in the buffer flags
// the input as badly formatted.
//
// Returns 0 on success. Returns 1 if no raw data is left, the buffer cannot be enlarged, or
// inflate() fails (the pairer is then flagged with is_bad_format and is_internal_error).
int SAM_pairer_fetch_BAM_block(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context){
	if(thread_context -> input_buff_SBAM_used <= thread_context -> input_buff_SBAM_ptr){
		return 1;
	}

	// Move the unread tail to the front; the regions may overlap, hence memmove.
	int remained_BIN = thread_context -> input_buff_BIN_used - thread_context -> input_buff_BIN_ptr;
	if( remained_BIN > 0) {
		memmove(thread_context -> input_buff_BIN, thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_ptr, remained_BIN);
		thread_context -> input_buff_BIN_used = remained_BIN;
	} else thread_context -> input_buff_BIN_used = 0;

	thread_context -> input_buff_BIN_ptr = 0;

	inflateReset(&thread_context -> strm);

	int lin, lout;

	lin=thread_context -> strm.avail_in = (unsigned int)(thread_context -> input_buff_SBAM_used - thread_context -> input_buff_SBAM_ptr);
	thread_context -> strm.next_in = (unsigned char *)thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr;

	if( thread_context -> input_buff_BIN_capacity < thread_context -> input_buff_BIN_used + 128*1024){
		// The new size is only stored once the memory really exists; otherwise a failed
		// attempt would leave a capacity that is larger than the buffer.
		long long new_capacity = (long long)(max(thread_context -> input_buff_BIN_used, thread_context -> input_buff_BIN_capacity )*1.5);
		if(new_capacity > 1024*1024*1024){
			SUBREADprintf("ERROR: buffer size larger than 1GB\n");
			return 1;
		}

		void * new_buff = realloc( thread_context -> input_buff_BIN , new_capacity);
		if(NULL == new_buff){
			// The old buffer is still valid and stays owned by the thread context.
			SUBREADprintf("ERROR: cannot allocate memory for the BAM input buffer.\n");
			pairer -> is_internal_error = 1;
			return 1;
		}
		thread_context -> input_buff_BIN = new_buff;
		thread_context -> input_buff_BIN_capacity = new_capacity;
	}
	lout=thread_context -> strm.avail_out = thread_context -> input_buff_BIN_capacity - thread_context -> input_buff_BIN_used;
	thread_context -> strm.next_out = (unsigned char *)thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_used;

	int ret = inflate(&thread_context ->strm, Z_FINISH);
	if(ret == Z_OK || ret == Z_STREAM_END)
	{
		int have = lout - thread_context ->strm.avail_out;
		int used_BAM = lin - thread_context -> strm.avail_in;

		thread_context -> input_buff_BIN_used += have;
		thread_context -> input_buff_SBAM_ptr += used_BAM;

		if(thread_context -> need_find_start){
			int test_read_bin = SAM_pairer_find_start(pairer, thread_context);
			if(test_read_bin<1 && thread_context -> input_buff_BIN_used >= 32 ){
				pairer -> is_bad_format = 1;
				SUBREADprintf("ERROR: cannot find the start of the next BAM block.\n");
			}
		}
	} else {
		// Z_BUF_ERROR (-5): inflate() could not make progress with the space it was given.
		if(ret == Z_BUF_ERROR){
			SUBREADprintf("Cannot parse the input BAM file. If the BAM file contains long reads, please run featureCounts on the long-read mode.\n");
		}else{
			SUBREADprintf("GZIP ERROR:%d\n", ret);
		}
		pairer -> is_bad_format = 1;
		pairer -> is_internal_error = 1;
		return 1;
	}

	return 0;
}
2876
// BAM_next_u32(v) : read four bytes with BAM_next_nch and combine them into `v`, the first
// byte being the least significant one (weights 1, 256, 256^2, 256^3).
// Statement macro; it needs everything BAM_next_nch needs (`pairer`, `thread_context`,
// `nch`) in the scope where it is used.
// There is no error check: when BAM_next_nch fails, `nch` is -1 and is still added in.
// NOTE(review): the sum is only right if `nch` is never negative for a valid byte, i.e. if
// input_buff_BIN holds unsigned bytes -- confirm against the declaration of the buffer.
#define BAM_next_u32(v) {\
	(v) = 0; unsigned int poww = 1 ; \
	BAM_next_nch; (v) += nch*poww; poww *= 256;\
	BAM_next_nch; (v) += nch*poww; poww *= 256;\
	BAM_next_nch; (v) += nch*poww; poww *= 256;\
	BAM_next_nch; (v) += nch*poww;\
}
2884
// Shrink one binary alignment record in place by dropping its sequence and its qualities.
//
// bin_where : start of the record, beginning with its 4-byte length field.
// bin_len   : in: total number of bytes of the record including the length field;
//             out: the same for the shrunken record.
// pairer and thread_context are not used.
//
// The sequence length is read at byte 20, the name length from the lowest byte of the word
// at byte 12 and the number of CIGAR operations from the low 16 bits of the word at byte 16.
// Records with a sequence length of 0 or 1 are left alone. Otherwise the sequence length is
// set to 1, two 0xff bytes are stored right behind the CIGAR, everything that followed the
// old qualities is moved up behind them, and the length field is rewritten.
void SAM_pairer_reduce_BAM_bin(SAM_pairer_context_t * pairer, SAM_pairer_thread_t * thread_context, unsigned char * bin_where, int * bin_len){
	unsigned int seq_len, name_len, cigar_ops;

	memcpy(&seq_len, bin_where + 20, 4);
	if(seq_len <= 1) return;

	memcpy(&name_len, bin_where + 12, 4);
	name_len &= 0xff;
	memcpy(&cigar_ops, bin_where + 16, 4);
	cigar_ops &= 0xffff;

	// 36 bytes of fixed fields, then the name and the CIGAR; the sequence comes next.
	unsigned int seq_start = 36 + name_len + 4*cigar_ops;
	int write_at = seq_start + 2;
	int read_at = seq_start + (1+seq_len) / 2 + seq_len;

	bin_where[write_at - 2] = 0xff;
	bin_where[write_at - 1] = 0xff;

	seq_len = 1;
	memcpy(bin_where + 20, &seq_len, 4);

	// The target is always below the source, so a forward copy is safe.
	for(; read_at < (*bin_len); read_at++, write_at++)
		bin_where[write_at] = bin_where[read_at];

	// The length field does not count its own four bytes.
	(* bin_len) = write_at - 4;
	memcpy(bin_where, bin_len, 4);
	(* bin_len) += 4;
}
2912
// 20 * 1024 * 1024 bytes. Referenced by the record-length test in
// SAM_pairer_get_next_read_BIN (that test is currently switched off with `0 &&`).
#define MAX_BIN_RECORD_LENGTH ( 20*1024*1024)
// Forward declarations of functions that are used before (or without) a visible definition at this point.
int reduce_SAM_to_BAM(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context, int include_sequence);
int is_read_bin(char * bin, int bin_len, int max_refID);
2916
/*
 * Produces the next alignment record of this worker thread as a BAM-style binary block.
 *
 * BAM input:
 *   - On the first call (BAM_header_parsed == 0) the magic word, the header text and the
 *     reference list are read; both parts are passed to pairer->output_header when it is set.
 *   - Afterwards the next complete record is located in thread_context->input_buff_BIN;
 *     SAM_pairer_fetch_BAM_block() is called until a whole record is available.
 *     When that call reports a non-zero value, the unread tail of the buffer (if any) is saved in
 *     pairer->bam_margin_table under the key "E<input_buff_SBAM_file_end>" as
 *     [4-byte length][bytes], and 0 is returned.
 * SAM input:
 *   - On the first call the '@' lines are passed to output_header as text, then the @SQ lines
 *     are converted IN PLACE (over the header text itself) into the binary reference list
 *     and passed to output_header again; contig names are registered in
 *     pairer->sam_contig_number_table with value (index + 1).
 *   - Then one text line is converted with reduce_SAM_to_BAM().
 *
 * Outputs: *bin_where points at the 4-byte block_size field of the record; *bin_len is the
 * record length including that field.
 *
 * Returns 1 when a record was produced and 0 otherwise (no more data, bad format, or the
 * header callback returned non-zero, in which case pairer->is_incomplete_BAM is set).
 * NOTE(review): in the BAM path, if is_bad_format becomes set while fetching more data, the
 * function returns 1 without writing *bin_where / *bin_len; callers presumably re-check
 * pairer->is_bad_format - confirm.
 * NOTE(review): BAM_next_nch / BAM_next_u32 are macros defined elsewhere; if they can leave
 * this function early, header_txt is not released on that path - confirm against the macro.
 */
int SAM_pairer_get_next_read_BIN( SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context , unsigned char ** bin_where, int * bin_len ) {
	if( pairer -> input_is_BAM ){
		int nch = 0;
		while(1){
			if(!pairer -> BAM_header_parsed){
				int x1;
				unsigned int bam_signature;
				BAM_next_u32(bam_signature);
				BAM_next_u32(pairer -> BAM_l_text);
				char * header_txt = NULL;
				int header_txt_dynamic_length = -1;

				if(pairer->BAM_l_text>0){
					// Remember the capacity: the same buffer is reused for the reference list below,
					// which may need more room than the header text did.
					header_txt_dynamic_length = max(1000000,pairer->BAM_l_text);
					header_txt = malloc(max(1000000,pairer->BAM_l_text));
				}

				for(x1 = 0 ; x1 < pairer -> BAM_l_text; x1++){
					BAM_next_nch;
					header_txt [x1] = nch;
				}
				int is_OK = 0;
				if(pairer -> output_header)pairer -> output_header(pairer, thread_context -> thread_id, 1, pairer -> BAM_l_text , header_txt , pairer -> BAM_l_text );

				BAM_next_u32(pairer -> BAM_n_ref);
				unsigned int ref_bin_len = 0;
				for(x1 = 0; x1 < pairer -> BAM_n_ref; x1++) {
					unsigned int l_name, l_ref, x2;
					BAM_next_u32(l_name);
					assert(l_name < 256);

					if(header_txt == NULL){
						header_txt = malloc(3000000);
						header_txt_dynamic_length = 3000000;
					}

					// Keep at least 1,000,000 spare bytes; one entry needs at most 4 + 255 + 4.
					if( header_txt_dynamic_length>0 && ref_bin_len > header_txt_dynamic_length - 1000000 ){
						char * grown_txt = realloc( header_txt, header_txt_dynamic_length * 2);
						if(NULL == grown_txt){
							SUBREADprintf("ERROR: out of memory when loading the BAM header.\n");
							free(header_txt);
							pairer -> is_bad_format = 1;
							return 0;
						}
						header_txt = grown_txt;
						header_txt_dynamic_length *= 2;
					}

					memcpy(header_txt + ref_bin_len, &l_name, 4);
					ref_bin_len += 4;
					for(x2 = 0; x2 < l_name; x2++){
						BAM_next_nch;
						header_txt[ref_bin_len++] = nch;
					}
					BAM_next_u32(l_ref);
					memcpy(header_txt + ref_bin_len, &l_ref, 4);
					ref_bin_len += 4;
				}

				is_OK = is_OK || (pairer -> output_header?pairer -> output_header(pairer, thread_context -> thread_id, 0, pairer -> BAM_n_ref , header_txt , ref_bin_len ):0);
				//SUBREADprintf("TFMT:HEADER REFS=%d TXTS=%d SIGN=%u\n", pairer -> BAM_n_ref, pairer->BAM_l_text, bam_signature);

				free(header_txt);
				if(is_OK){
					pairer -> is_incomplete_BAM = 1;
					return 0;
				}

				pairer -> BAM_header_parsed = 1;
				SAM_pairer_fetch_BAM_block(pairer, thread_context);
			}

			if(pairer -> is_bad_format) return 0;

			// 0xffffffff marks "record length not known yet".
			unsigned int record_len = 0xffffffff;
			while(1){
				if(thread_context -> input_buff_BIN_ptr <= thread_context -> input_buff_BIN_used - 4)
					memcpy(&record_len, thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_ptr, 4);

				// Stop as soon as the whole record is inside the decompressed buffer.
				if(record_len < 0xfffffff0 && thread_context -> input_buff_BIN_ptr +4 + record_len <= thread_context -> input_buff_BIN_used){
					break;
				}

				int ret_fetch = SAM_pairer_fetch_BAM_block(pairer, thread_context); // if ret != 0 then load next big chunk of BAM.
				if(ret_fetch){
					if(thread_context -> input_buff_BIN_used > thread_context -> input_buff_BIN_ptr){
						// Park the incomplete tail as [4-byte length][bytes] in the margin table.
						char * margin_key = malloc(40);
						char * margin_data = malloc(thread_context -> input_buff_BIN_used - thread_context -> input_buff_BIN_ptr+4);
						int margin_size = thread_context -> input_buff_BIN_used - thread_context -> input_buff_BIN_ptr;
						memcpy(margin_data, &margin_size, 4);
						memcpy(margin_data+4, thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_ptr, thread_context -> input_buff_BIN_used - thread_context -> input_buff_BIN_ptr);
						#ifdef __MINGW32__
						sprintf(margin_key,"E%lu", (unsigned long)thread_context -> input_buff_SBAM_file_end);
						#else
						sprintf(margin_key,"E%llu", thread_context -> input_buff_SBAM_file_end);
						#endif
						subread_lock_occupy(&pairer -> SAM_BAM_table_lock);

						HashTablePut(pairer -> bam_margin_table, margin_key, margin_data);
						subread_lock_release(&pairer -> SAM_BAM_table_lock);
					}
					return 0;
				}
			}

			//SUBREADprintf("TFMT:RLEN=%d\n", record_len);

			if(!pairer -> is_bad_format){
				unsigned int seq_len = 0;
				thread_context -> input_buff_BIN_ptr += 4;
				memcpy(&seq_len, thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_ptr + 16, 4);

				//SUBREADprintf("REDUCE_2: record %u, %u\n", record_len, seq_len);
				// #warning "=========== CHECK IF '0 && ' IS CORRECT ==========="
				if(record_len < 32 || (0 && record_len > min(MAX_BIN_RECORD_LENGTH,60000))|| seq_len >= pairer -> long_read_minimum_length){
					if(seq_len >= pairer -> long_read_minimum_length) pairer -> is_single_end_mode = 1;
					SUBREADprintf("ERROR: sequence length in the BAM record is out of the expected region: %d, %d\n", record_len , seq_len );
					pairer -> is_bad_format = 1;
					return 0;
				}

				(* bin_where) = thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_ptr - 4;
				(* bin_len) = record_len + 4;

				thread_context -> input_buff_BIN_ptr += record_len;
			}
			return 1;
		}
	} else { // if input is SAM
		char *line_ptr;
		int line_len=0, passed_read_SBAM_ptr = -1;
		if(!pairer -> BAM_header_parsed){
			char * header_start = NULL;
			int header_len = 0;

			// Pass 1: measure the '@' header and remember where the first alignment line starts.
			while(1){
				SAM_next_line;
				//SUBREADprintf("LINE_PTR[%d][used bytes=%d]='''%s'''\n", thread_context -> thread_id, thread_context -> input_buff_SBAM_used, line_ptr);

				// The NULL test must come before any use of line_ptr[0].
				if(NULL == line_ptr){
					// No further line: treat the buffer as fully consumed so that no alignment
					// line is parsed from it (assumption about SAM_next_line - TODO confirm).
					passed_read_SBAM_ptr = thread_context -> input_buff_SBAM_used;
					//SUBREADprintf("FATAL: the header is too large to the buffer.\n");
					break;
				}
				//SUBREADprintf("LINELEN=%d, PTR=%d, FIRST=%c\n", line_len, thread_context -> input_buff_SBAM_ptr , line_ptr[0]);

				if(NULL == header_start && line_ptr[0] == '@') header_start = line_ptr;

				if(line_ptr[0]=='@'){
					header_len += 1+line_len;
				}else{
					passed_read_SBAM_ptr = line_ptr - thread_context -> input_buff_SBAM;
					break;
				}
			}

			// Same NULL guard on the callback as in the BAM branch.
			int is_OK = pairer -> output_header?pairer -> output_header(pairer, thread_context -> thread_id, 1, header_len , header_start , header_len):0;
			thread_context -> input_buff_SBAM_ptr = 0;
			int header_bin_ptr = 0, header_contigs = 0;

			// Pass 2: turn every @SQ line into a binary reference entry, written over the header text.
			while(1){
				SAM_next_line;
				if(line_ptr == NULL || line_ptr[0]!='@') break;
				if(memcmp(line_ptr, "@SQ\t",4)==0){
					// status: 0 = at the start of a field, 10 = inside SN:, 20 = inside LN:, 30 = inside any other field.
					unsigned int ct_len = 0, ctptr = 4, status = 0, sqname_len = 0;
					char * sqname = NULL;
					while(1){
						char ctnch = line_ptr[ctptr++];
						if( status == 0){
							if(ctnch=='S' && line_ptr[ctptr] == 'N' && line_ptr[ctptr+1] == ':'){
								ctptr += 2;
								status = 10;
								sqname = line_ptr + ctptr;
							}else if(ctnch=='L' && line_ptr[ctptr] == 'N' && line_ptr[ctptr+1] == ':'){
								ctptr += 2;
								status = 20;
							}else status = 30;
						}else if(status == 10 || status == 20 || status == 30){
							if(ctnch == '\t' || ctnch == '\n'){
								status = 0;
								if(ctnch == '\n') break;
								//break;
							}
							if(status == 10) sqname_len ++;
							else if(status == 20) ct_len = ct_len * 10 + ctnch - '0';
						}
					}


					sqname_len += 1;	// count the terminating NUL
					memcpy(header_start + header_bin_ptr, &sqname_len, 4);
					header_bin_ptr += 4;
					memcpy(header_start + header_bin_ptr, sqname, sqname_len-1);
					*(header_start + header_bin_ptr + sqname_len - 1) = 0;
					char * mem_contig_name = malloc(sqname_len);
					strcpy(mem_contig_name , header_start + header_bin_ptr);
					//SUBREADprintf("CONTIG %d : %s (len=%d = %d)\n", header_contigs, header_start + header_bin_ptr , sqname_len, strlen(mem_contig_name));
					// The table stores (contig index + 1) so that a NULL lookup result means "not found".
					HashTablePut(pairer -> sam_contig_number_table , mem_contig_name, NULL + 1 + header_contigs);
					header_bin_ptr += sqname_len;

					memcpy(header_start + header_bin_ptr, &ct_len, 4);
					header_bin_ptr += 4;
					header_contigs++;
				}
			}

			is_OK = is_OK || (pairer -> output_header?pairer -> output_header(pairer, thread_context -> thread_id, 0, header_contigs , header_start , header_bin_ptr):0);
			pairer -> BAM_header_parsed = 1;
			if(is_OK){
				pairer -> is_incomplete_BAM = 1;
				return 0;
			}
		}

		// Rewind to the first alignment line found in pass 1.
		if(passed_read_SBAM_ptr >=0)
			thread_context -> input_buff_SBAM_ptr = passed_read_SBAM_ptr;

		if( thread_context -> input_buff_SBAM_ptr < thread_context -> input_buff_SBAM_used ){
			thread_context -> input_buff_BIN_ptr = 0;
			//SUBREADprintf("reduce_SAM_to_BAM_0 \n");
			*bin_len = reduce_SAM_to_BAM(pairer, thread_context,!pairer -> tiny_mode);
			*bin_where = (unsigned char *)thread_context -> input_buff_BIN;

			return ((*bin_len) > 0 && !pairer->is_bad_format)?1:0;
		}
		return 0;
	}
	return 0;
}
3133
/*
 * Called when an alignment names a contig that is missing from the SAM header.
 * It reports the problem and stops through assert(0).
 *
 * The statements after the assert only run in builds where assert() is compiled out:
 * they register `ref` in pairer->sam_contig_number_table (under SAM_BAM_table_lock), emit a
 * one-entry binary reference record with length 0 through pairer->output_header, and
 * return the contig index.
 */
int online_register_contig(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context, char * ref){
	SUBREADprintf("ERROR: Unable to find chromosome '%s' in the SAM header.\n", ref);
	assert(0);

	// Binary entry layout: [4-byte name length incl. NUL][name + NUL][4-byte contig length = 0].
	int name_bytes = strlen(ref);
	char * ref_entry = malloc(name_bytes + 20);
	name_bytes += 1;
	memcpy(ref_entry, &name_bytes, 4);
	memcpy(ref_entry + 4, ref, name_bytes);
	memset(ref_entry + 4 + name_bytes, 0, 4);

	subread_lock_occupy(&pairer -> SAM_BAM_table_lock);

	// The table holds (index + 1); a missing key therefore yields -1 here.
	int contig_no = HashTableGet(pairer->sam_contig_number_table, ref) - NULL - 1;
	if(contig_no < 0){
		contig_no = pairer->sam_contig_number_table->numOfElements;
		pairer -> output_header(pairer, thread_context -> thread_id, 0, 1 , ref_entry , 8+name_bytes);

		char * stored_name = malloc(name_bytes+1);
		memcpy(stored_name, ref, name_bytes);
		stored_name[name_bytes] = 0;
		HashTablePut(pairer->sam_contig_number_table, stored_name, NULL + contig_no + 1);
	}

	subread_lock_release(&pairer -> SAM_BAM_table_lock);
	free(ref_entry);
	return contig_no;
}
3158
// Stores the 32-bit value (iii) at (ptr) as four bytes, least-significant byte first.
// Both arguments are fully parenthesised so that expressions such as `a | b` or `p + n`
// are handled correctly; note that each argument is still evaluated four times.
#define set_memory_int(ptr, iii) { *((ptr)) = (iii)&0xff; *((ptr)+1) = ((iii)>>8)&0xff; *((ptr)+2) = ((iii)>>16)&0xff;*((ptr)+3) = ((iii)>>24); }
3160
/*
 * Converts the SAM text line at input_buff_SBAM + input_buff_SBAM_ptr into a BAM-style
 * binary record written at input_buff_BIN + input_buff_BIN_ptr.
 *
 *   include_sequence : non-zero to store SEQ/QUAL; otherwise a 1-base placeholder
 *                      (0xff 0xff) is stored. It is forced to 0 when the read is at least
 *                      pairer->long_read_minimum_length long; in that case the pairer is also
 *                      switched to single-end, tiny and long-CIGAR modes.
 *
 * Side effects: the tabs that end the QNAME, RNAME, CIGAR, RNEXT and SEQ columns are
 * overwritten with NUL in the input buffer; input_buff_SBAM_ptr is advanced past the line.
 * In tiny mode only NH, HI, RG and NM tags of type Z/H/i are kept.
 *
 * Returns the size of the record including its 4-byte block_size field, or -1 when the line
 * has fewer than 10 tab separators (input_buff_SBAM_ptr is then left unchanged).
 */
int reduce_SAM_to_BAM(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context, int include_sequence){
	int column_no = 0, in_ptr = 0;
	char * in_str = thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr;
	char * read_name = NULL, * ref = NULL, * mate_ref = NULL, * cigar = NULL, * seq = NULL, * qual = NULL;
	int flag = 0, pos = 0, mapq = 0, old_read_pos = 0, tlen = 0, l_read_name = 0, tlen_sign = 1, l_seq = 0;

	// Pass over the mandatory columns: remember where the string columns start and
	// accumulate the numeric ones digit by digit.
	read_name = in_str;
	while(1){
		int nch = in_str[in_ptr];
		if(nch == '\n' || nch == '\0') {
			break;
		}else if(nch == '\t'){
			if(column_no == 0 || column_no == 2 || column_no == 5 || column_no == 6 || column_no == 9)
				in_str[in_ptr] = 0;
			column_no ++;
			if(column_no == 2) ref = in_str + in_ptr + 1;
			else if(column_no == 5) cigar = in_str + in_ptr + 1;
			else if(column_no == 6) mate_ref = in_str + in_ptr + 1;
			else if(column_no == 9) seq = in_str + in_ptr + 1;
			else if(column_no == 10) qual = in_str + in_ptr + 1;
			else if(column_no == 11) break;	// in_ptr stays on the tab that starts the tags
		}else{
			if(column_no == 0) l_read_name ++;
			else if(column_no == 1) flag = flag *10 + nch - '0';
			else if(column_no == 3) pos = pos *10 + nch - '0';
			else if(column_no == 4) mapq = mapq *10 + nch - '0';
			else if(column_no == 7) old_read_pos = old_read_pos *10 + nch - '0';
			else if(column_no == 9) l_seq ++;
			else if(column_no == 8){
				if(nch == '-') tlen_sign = -1;
				else tlen = tlen *10 + nch - '0';
			}
		}

		in_ptr++;
	}
	if(column_no < 10){
		//SUBREADprintf("RETURN_LESS:%d\n", column_no);
		return -1;
	}
	l_read_name++;	// include the terminating NUL

	char * bin_tmp = (char *)thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_ptr;

	// The table stores (contig index + 1), so an unknown name gives -1.
	int refID = HashTableGet(pairer->sam_contig_number_table, ref) - NULL - 1;
	if(refID < 0 && ref[0]!='*')
		refID = online_register_contig(pairer, thread_context, ref);
	set_memory_int(bin_tmp + 4, refID);

	pos -= 1;	// SAM is 1-based, the binary record is 0-based
	set_memory_int(bin_tmp + 8, pos);

	int mapq_nl = mapq << 8 | l_read_name;
	set_memory_int(bin_tmp + 12, mapq_nl);

	int coverage;
	int cigar_ops = SamBam_compress_cigar(cigar, (int *)(bin_tmp + 36 + l_read_name), &coverage, 65535);
	int flag_nc = flag << 16 | cigar_ops;
	set_memory_int(bin_tmp + 16, flag_nc);



	int seq_len = qual - seq - 1;

	if(seq_len >=pairer -> long_read_minimum_length ){
		pairer -> is_single_end_mode = 1;
		include_sequence = 0;
		pairer -> tiny_mode = 1;
		pairer -> long_cigar_mode = 1;
	}

	if(include_sequence){
		set_memory_int(bin_tmp + 20, l_seq); // SEQ_LEN
	}else set_memory_int(bin_tmp + 20, 1);

	int mate_refID = refID;
	if(mate_ref[0]!='=' || mate_ref[1]!=0)
		mate_refID = HashTableGet(pairer->sam_contig_number_table, mate_ref) - NULL - 1;

	if(mate_refID < 0 && mate_ref[0]!='*')
		mate_refID = online_register_contig(pairer, thread_context, mate_ref);

	set_memory_int(bin_tmp + 24, mate_refID);

	old_read_pos -= 1;
	set_memory_int(bin_tmp + 28, old_read_pos);

	tlen = tlen * tlen_sign;
	set_memory_int(bin_tmp + 32, tlen);

	memcpy(bin_tmp + 36, read_name, l_read_name);
	int bin_ptr = 36 + l_read_name + 4 * cigar_ops;

	if(include_sequence){
		int xk1, nch;
		//SUBREADprintf("SEQ (%d = %d) = %s\n", strlen(seq), l_seq, seq);
		//SUBREADprintf("QUA (%d = %d) = %s\n\n", strlen(qual), l_seq, qual);
		SamBam_read2bin(seq , bin_tmp + bin_ptr);
		bin_ptr += (l_seq + 1) / 2;
		for(xk1=0; xk1 < l_seq; xk1++){
			nch = qual[xk1];
			bin_tmp[bin_ptr++] = nch - 33;
		}
	}else{
		bin_tmp[bin_ptr ++] = 0xff;
		bin_tmp[bin_ptr ++] = 0xff;
	}

	if(column_no == 11) // has extra tags
	{
		// Each round starts with in_ptr on the tab in front of a "XX:T:value" field.
		while(in_str[in_ptr] == '\t'){
			// <ctype.h> functions need an unsigned char value.
			if((!isalpha((unsigned char)in_str[in_ptr+1])) || (!isalpha((unsigned char)in_str[in_ptr+4]))){
				// Malformed tag: give up on the rest of the line (stop at NUL as well as at newline).
				while(in_str[in_ptr] !='\n' && in_str[in_ptr] != '\0')in_ptr++;
				break;
			}
			in_ptr ++;

			//SUBREADprintf("EXTRA_TAGS : %c%c : %c\n",in_str[in_ptr+0], in_str[in_ptr+1], in_str[in_ptr+3]);

			int is_important_tag = (in_str[in_ptr+0] == 'N' && in_str[in_ptr+1] == 'H') ||
				(in_str[in_ptr+0] == 'H' && in_str[in_ptr+1] == 'I') ||
				(in_str[in_ptr+0] == 'R' && in_str[in_ptr+1] == 'G') ||
				(in_str[in_ptr+0] == 'N' && in_str[in_ptr+1] == 'M') ;
			int xxnch;
			if(in_str[in_ptr + 3] == 'Z' || in_str[in_ptr + 3] == 'H'){
				if(is_important_tag||!pairer -> tiny_mode){
					bin_tmp[bin_ptr+0] = in_str[in_ptr+0];
					bin_tmp[bin_ptr+1] = in_str[in_ptr+1];
					bin_tmp[bin_ptr+2] = in_str[in_ptr + 3];
					bin_ptr += 3;
				}
				in_ptr += 5;
				while(1){
					xxnch = *(in_str + in_ptr);
					if(xxnch == '\n' || xxnch == '\t' || xxnch == 0) break;
					if(is_important_tag||!pairer -> tiny_mode)
						*(bin_tmp + (bin_ptr++)) = xxnch;
					in_ptr ++;
				}
				if(is_important_tag||!pairer -> tiny_mode){
					*(bin_tmp + (bin_ptr++)) = 0;
				}
			}else if(in_str[in_ptr + 3] == 'i'){
				int tmpi = 0, tmpi_sign = 1;
				if(is_important_tag || !pairer -> tiny_mode){
					bin_tmp[bin_ptr+0] = in_str[in_ptr+0];
					bin_tmp[bin_ptr+1] = in_str[in_ptr+1];
					bin_tmp[bin_ptr+2] = 'i';
					bin_ptr += 3;
				}

				in_ptr += 5;

				while(1){
					xxnch = *(in_str + in_ptr);
					if(xxnch == '\n' || xxnch == '\t' || xxnch == 0) break;
					else if(xxnch == '-') tmpi_sign = -1;
					else tmpi = tmpi * 10 + xxnch - '0';
					in_ptr ++;
				}
				tmpi *= tmpi_sign;
				if(is_important_tag || !pairer -> tiny_mode){
					set_memory_int(bin_tmp+bin_ptr, tmpi);
					bin_ptr += 4;
				}
			}else if(in_str[in_ptr + 3] == 'f'){
				// fi counts every character of the value (needed to advance in_ptr);
				// ftxt_len counts only those that fit in ftxt, so an over-long value cannot overrun it.
				char ftxt[30];
				int fi=0, ftxt_len=0;
				ftxt[0]=0;
				while(1){
					xxnch = *(in_str + in_ptr + 5 + fi);
					if(xxnch== '\n' || xxnch == '\t'|| xxnch == 0) break;
					if(ftxt_len < (int)sizeof(ftxt) - 1){
						ftxt[ftxt_len++]=xxnch;
						ftxt[ftxt_len]=0;
					}
					fi++;
				}
				if(!pairer -> tiny_mode){
					float fv = atof(ftxt);
					bin_tmp[bin_ptr+0] = in_str[in_ptr+0];
					bin_tmp[bin_ptr+1] = in_str[in_ptr+1];
					bin_tmp[bin_ptr+2] = 'f';
					memcpy( bin_tmp + bin_ptr + 3, &fv, 4);
					bin_ptr += 7;
				}
				in_ptr += 5 + fi;
			}else if(in_str[in_ptr + 3] == 'B'){
				char elemtype = in_str[in_ptr + 5];
				int txi=0, eles=0;
				char ttxt[30], *elen_ptr = NULL;
				if(!pairer -> tiny_mode){
					bin_tmp[bin_ptr+0] = in_str[in_ptr+0];
					bin_tmp[bin_ptr+1] = in_str[in_ptr+1];
					bin_tmp[bin_ptr+2] = 'B';
					bin_tmp[bin_ptr+3] = elemtype;
					elen_ptr = bin_tmp+4 + bin_ptr;	// element count is filled in after the loop
					bin_ptr += 8;
				}
				in_ptr += 6;
				int elembytes_no = 4;
				if(elemtype == 'C' || elemtype == 'c') elembytes_no = 1;
				if(elemtype == 'S' || elemtype == 's') elembytes_no = 2;
				while(1){
					xxnch = *(in_str + in_ptr);
					if((!pairer -> tiny_mode)){
						if((xxnch ==',' || xxnch =='\n' || xxnch == '\t' || xxnch == 0) && txi > 0){
							//SUBREADprintf("ADD VAL : `%s`\n", ttxt);
							if(elemtype == 'f'){
								float fv = atof(ttxt);
								memcpy( bin_tmp + bin_ptr, &fv, 4);
							}else{
								int iv = atoi(ttxt);
								memcpy( bin_tmp + bin_ptr, &iv, elembytes_no);
							}
							bin_ptr+=elembytes_no;
							txi=0;
							eles++;
						}else{
							// Characters beyond the capacity of ttxt are dropped instead of overrunning it.
							if(xxnch!=',' && txi < (int)sizeof(ttxt) - 1){
								ttxt[txi++] = xxnch;
								ttxt[txi] = 0;
							}
						}
					}
					if(xxnch =='\n' || xxnch == '\t' || xxnch == 0)break;
					in_ptr ++;
				}
				if((!pairer -> tiny_mode)) memcpy(elen_ptr, & eles, 4);

			}else if(in_str[in_ptr + 3] == 'A'){
				if(!pairer -> tiny_mode){
					bin_tmp[bin_ptr+0] = in_str[in_ptr+0];
					bin_tmp[bin_ptr+1] = in_str[in_ptr+1];
					bin_tmp[bin_ptr+2] = 'A';
					bin_tmp[bin_ptr+3] = in_str[in_ptr+5];
					bin_ptr += 4;
				}
				in_ptr += 6;
			}else{
				// Unknown type code: skip the value.
				in_ptr += 5;
				while(1){
					xxnch = *(in_str + in_ptr);
					if(xxnch == '\n' || xxnch == '\t' || xxnch == 0) break;
					in_ptr++;
				}
			}
			// #warning "=============== COMMENT NEXT ====================="
			// SUBREADprintf("Z_len PTR = %d + %d\n", bin_ptr, thread_context -> input_buff_BIN_ptr);
		}

	}

	thread_context -> input_buff_SBAM_ptr += in_ptr + 1;

	// block_size excludes its own four bytes; the return value includes them.
	bin_ptr -= 4;
	set_memory_int(bin_tmp, bin_ptr);
	bin_ptr += 4;
	//memcpy(buf, bin_tmp, bin_ptr);

	return bin_ptr;
}
3419
/*
 * Returns the total size in bytes of the optional field that starts at `bin`:
 * 2 bytes of tag name + 1 byte of type code + the value.
 *
 * Value sizes by type code (bin[2]): i/I/f = 4, s/S = 2, c/C/A = 1,
 * Z/H = string length plus its NUL, B = 1 sub-type byte + 4-byte count + count elements
 * (2 bytes each for s/S, 4 for i/I/f, otherwise 1).
 * Any other type code is reported and hits assert(0).
 */
int SAP_pairer_skip_tag_body_len(char *bin){
	int value_len = 0;

	switch(bin[2]){
		case 'i':
		case 'I':
		case 'f':
			value_len = 4;
			break;

		case 's':
		case 'S':
			value_len = 2;
			break;

		case 'c':
		case 'C':
		case 'A':
			value_len = 1;
			break;

		case 'Z':
		case 'H':
			// characters up to the terminator, then the terminator itself
			for(value_len = 0; bin[value_len + 3]; value_len++);
			value_len += 1;
			break;

		case 'B': {
			char cell_type = tolower(bin[3]);
			memcpy(&value_len, bin + 4, 4);		// element count
			if(cell_type == 's') value_len *= 2;
			else if(cell_type == 'i' || cell_type == 'f') value_len *= 4;
			value_len += 4+1; // 32-bit count, 1 byte type
			break;
		}

		default:
			SUBREADprintf("UnknownTag=%c\n", bin[2]);
			assert(0);
			break;
	}

	return value_len + 3;
}
3443
/*
 * Scans the optional-field area `bin` (bin_len bytes) for the tag whose two-letter name is
 * tag_name[0..1].
 *
 * When the tag is found, *data_type receives its type code and *saved_value points at the
 * first byte of its value inside `bin`; neither output is touched otherwise.
 *
 * Returns 1 when the tag was found and 0 when it was not.
 */
int SAM_pairer_iterate_tags(unsigned char * bin, int bin_len, char * tag_name, char * data_type, char ** saved_value){
	int bin_cursor = 0;

	// A field header is 3 bytes (2-byte name + type code); never read one that is cut short.
	while(bin_cursor + 3 <= bin_len){
		if(bin[bin_cursor] == tag_name[0] && bin[bin_cursor+1] == tag_name[1]){
			(* data_type) = bin[bin_cursor+2];
			(* saved_value) = (char *)bin+bin_cursor+3;
			return 1;
		}

		int skip_content = SAP_pairer_skip_tag_body_len((char*)bin+bin_cursor);
		// A corrupted array count could make the step zero or negative; stop rather than loop or walk backwards.
		if(skip_content <= 0) break;
		bin_cursor += skip_content ;
	}
	return 0;
}
3469
/*
 * Looks up the integer-typed optional field `tag_name` in `bin` (bin_len bytes).
 *
 * *saved_value is always reset to 0 first. When the tag exists and has an integer type
 * (c, C, s, S, i or I), its value is stored in *saved_value and a non-zero value is
 * returned. The signed one- and two-byte types (c, s) are sign-extended; the unsigned
 * ones (C, S) are zero-extended. Multi-byte values are read in host byte order.
 *
 * Returns 0 when the tag is absent or is not of an integer type.
 */
int SAM_pairer_iterate_int_tags(unsigned char * bin, int bin_len, char * tag_name, int * saved_value){
	char * data_ptr = NULL;
	char data_type = 0;

	(*saved_value) = 0;
	int ret = SAM_pairer_iterate_tags(bin, bin_len, tag_name, &data_type, &data_ptr);
	//SUBREADprintf(" NEED %s , FOUND %d, TYPE %c\n", tag_name, ret, data_type);
	if(!ret) return ret;

	switch(data_type){
		case 'i':
		case 'I':
			memcpy(saved_value, data_ptr, 4);
			break;
		case 's': {
			short v16;
			memcpy(&v16, data_ptr, 2);
			(*saved_value) = v16;
			break;
		}
		case 'S': {
			unsigned short u16;
			memcpy(&u16, data_ptr, 2);
			(*saved_value) = u16;
			break;
		}
		case 'c':
			(*saved_value) = (signed char)data_ptr[0];
			break;
		case 'C':
			(*saved_value) = (unsigned char)data_ptr[0];
			break;
		default:
			return 0;
	}

	return ret;
}
3489
3490
3491
/*
 * Builds the pairing key of a binary alignment record in `full_name`.
 *
 * The key is the read name cut at its first '/' (searched in all but the last name byte),
 * followed by five fields, each introduced by the byte \027:
 * first-mate refID, first-mate position, second-mate refID, second-mate position, HI tag.
 * "First mate" is this record when FLAG bit 0x40 is set and its mate otherwise. An unmapped
 * side (FLAG 4 for this record, FLAG 8 for the mate) contributes refID -1 and position 0.
 * The HI field is -1 when the record carries no integer HI tag.
 *
 * *this_flag receives the FLAG of the record. The return value is the length of the key.
 * `pairer` and `thread_context` are not used.
 */
int SAM_pairer_get_read_full_name( SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context , unsigned char * bin, int bin_len , char * full_name, int * this_flag){
	unsigned int packed = 0;
	unsigned int self_chro = 0, self_pos = 0;
	unsigned int mate_chro = 0, mate_pos = 0;
	unsigned int name_bytes = 0, n_cigar, n_bases = 0;
	unsigned int first_chro, first_pos, second_chro, second_pos;
	int hit_index = -1;

	full_name[0] = 0;

	// Fixed-size part of the record.
	memcpy(&self_chro, bin + 4, 4);
	memcpy(&self_pos, bin + 8, 4);
	memcpy(&packed, bin + 12, 4);
	name_bytes = packed & 0xff;
	memcpy(&packed, bin + 16, 4);
	int flag_bits = (packed >> 16) & 0xffff;
	n_cigar = packed & 0xffff;
	(*this_flag) = flag_bits;
	memcpy(&mate_chro, bin + 24, 4);
	memcpy(&mate_pos, bin + 28, 4);
	memcpy(full_name, bin + 36, name_bytes);
	assert(name_bytes > 0);

	if(flag_bits & 4){	// this read is unmapped
		self_chro = -1;
		self_pos = 0;
	}

	if(flag_bits & 8){	// the mate is unmapped
		mate_chro = -1;
		mate_pos = 0;
	}

	// Order the two ends so that both mates produce the same key.
	int is_first_mate = (flag_bits & 0x40) == 0x40;
	first_chro = is_first_mate ? self_chro : mate_chro;
	first_pos = is_first_mate ? self_pos : mate_pos;
	second_chro = is_first_mate ? mate_chro : self_chro;
	second_pos = is_first_mate ? mate_pos : self_pos;

	memcpy(&n_bases, bin + 20, 4);
	//SUBREADprintf("LQ=%d, RL=%d, CIGAR_OPT=%d\n", l_seq, (l_seq+1)/2, cigar_opts);

	// Optional fields follow the name, CIGAR, packed SEQ and QUAL.
	unsigned int tags_start = 36+name_bytes+4*n_cigar+(n_bases+1)/2+n_bases;
	unsigned int tags_len = bin_len - tags_start;

	if(tags_len > 2){
		if(!SAM_pairer_iterate_int_tags(bin + tags_start, tags_len, "HI", &hit_index))
			hit_index = -1;
	}

	int slash_pos = 0;
	while(slash_pos < name_bytes - 1 && full_name[slash_pos] != '/')
		slash_pos++;

	int key_len = slash_pos + sprintf(full_name+slash_pos, "\027%d\027%u\027%d\027%u\027%d", first_chro, first_pos, second_chro, second_pos, hit_index);

	return key_len;
}
3562
SAM_pairer_multi_thread_header(void * pairer_vp,int thread_no,int is_text,unsigned int items,char * bin,unsigned int bin_len)3563 int SAM_pairer_multi_thread_header (void * pairer_vp, int thread_no, int is_text, unsigned int items, char * bin, unsigned int bin_len){
3564
3565 SAM_pairer_context_t * pairer = (SAM_pairer_context_t *) pairer_vp;
3566 SAM_pairer_writer_main_t * bam_main = (SAM_pairer_writer_main_t * )pairer -> appendix1;
3567 SAM_pairer_writer_thread_t * bam_thread = bam_main -> threads + thread_no;
3568 unsigned int BIN_block_cursor = 0, bin_cursor = 0;
3569 //SUBREADprintf("WRITE HEADER TYPE=%d; ITEMS=%d\n", is_text, items);
3570 if(is_text){
3571 memcpy( bam_thread -> BIN_buffer, "BAM\1", 4 );
3572 memcpy( bam_thread -> BIN_buffer + 4 , & items , 4 );
3573 BIN_block_cursor = 8;
3574 }else{
3575 memcpy( bam_thread -> BIN_buffer , & items , 4 );
3576 BIN_block_cursor = 4;
3577 }
3578 while( bin_cursor < bin_len ){
3579 int write_text_len = min(SAM_PAIRER_WRITE_BUFFER - BIN_block_cursor, bin_len - bin_cursor);
3580 // SUBREADprintf("WRITE TLEN=%d\n", write_text_len);
3581 memcpy(bam_thread -> BIN_buffer + BIN_block_cursor , bin + bin_cursor, write_text_len);
3582 bam_thread -> BIN_buffer_ptr = write_text_len + BIN_block_cursor;
3583
3584 SAM_pairer_multi_thread_compress(bam_main, bam_thread);
3585 bin_cursor += write_text_len;
3586 BIN_block_cursor = 0;
3587 }
3588
3589 bam_thread -> BIN_buffer_ptr = 0;
3590 return 0;
3591 }
3592
/*
 * Returns the offset, measured from the start of the record `bin1` (its block_size field),
 * at which the optional fields begin:
 * 36 fixed bytes + read name + CIGAR (4 bytes per op) + packed SEQ + QUAL.
 * The name length is the byte at offset 12, the CIGAR op count the two bytes at offset 16
 * and the sequence length the four bytes at offset 20.
 */
int SAM_pairer_get_tag_bin_start(char * bin1){
	int n_bases = 0;
	int n_cigar = 0;

	memcpy(&n_cigar, bin1 + 16, 2);
	memcpy(&n_bases, bin1 + 20, 4);

	int tag_offset = 36;
	tag_offset += (unsigned char)bin1[12];		// read name, NUL included
	tag_offset += n_bases + (n_bases + 1)/2;	// QUAL + 4-bit packed SEQ
	tag_offset += 4 * n_cigar;
	return tag_offset;
}
3601
/*
 * Appends the optional field <name0><name1> holding the positive integer `value` to
 * out_bin at offset tag_ptr, using the smallest of the encodings C (1 byte), S (2 bytes)
 * or I (4 bytes). Returns the offset just behind the field.
 */
static int SAM_pairer_dummy_append_int_tag(char * out_bin, int tag_ptr, char name0, char name1, int value){
	out_bin[tag_ptr++] = name0;
	out_bin[tag_ptr++] = name1;
	if(value < 128){
		unsigned char v8 = (unsigned char)value;
		out_bin[tag_ptr++] = 'C';
		memcpy(out_bin + tag_ptr, &v8, 1);
		tag_ptr += 1;
	}else if(value < 32767){
		unsigned short v16 = (unsigned short)value;
		out_bin[tag_ptr++] = 'S';
		memcpy(out_bin + tag_ptr, &v16, 2);
		tag_ptr += 2;
	}else{
		out_bin[tag_ptr++] = 'I';
		memcpy(out_bin + tag_ptr, &value, 4);
		tag_ptr += 4;
	}
	return tag_ptr;
}

/*
 * Builds in out_bin2 a placeholder record for the missing mate of the read in bin1.
 *
 * The placeholder has the same read name, swapped own/mate coordinates, the opposite
 * first/second-in-pair bit, swapped unmapped (4/8) and strand (0x10/0x20) bits, the paired
 * bit set, MAPQ 0, no CIGAR, a negated template length and a one-base sequence stored as
 * the bytes 0xff (SEQ) and 0x20 (QUAL). Positive HI and NH values of bin1 are copied, and
 * so is a Z-typed RG tag when need_RG_tag is non-zero.
 *
 *   rname    : not used.
 *   out_bin2 : must be large enough for 36 + name + 2 bytes plus the copied tags; the RG
 *              value is copied without a length limit, so the caller has to size the
 *              buffer accordingly.
 */
void SAM_pairer_make_dummy(char * rname, char * bin1, char * out_bin2, int need_RG_tag){
	char * realname = bin1 + 36;
	int block1len =-1;
	int len_name = (unsigned char)bin1[12] -1;	// without the terminating NUL
	int old_read_chro =-1;
	int old_read_pos =-1;
	int new_dummy_chro =-1;
	int new_dummy_pos =-1;

	memcpy(&block1len, bin1, 4);
	memcpy(&old_read_chro, bin1 + 4, 4);
	memcpy(&old_read_pos, bin1 + 8, 4);

	// The dummy sits where bin1 says its mate is.
	memcpy(&new_dummy_chro, bin1 + 24, 4);
	memcpy(&new_dummy_pos, bin1 + 28, 4);

	int HItag =-1;
	int NHtag =-1;

	int seq_len = -1;
	int cigar_opts = -1;
	memcpy(&seq_len, bin1 + 20,4);
	int old_read_FLAG = -1;
	memcpy(&old_read_FLAG, bin1 + 16, 4);
	cigar_opts = old_read_FLAG & 0xffff;

	char * RG_tag_val = NULL;
	// Offset of the optional fields of bin1.
	int bin1ptr = 36 + len_name +1 + seq_len + (seq_len+1)/2 + 4 * cigar_opts;
	//SUBREADprintf("MAKE_DUMMY: %s ; need_RG=%d, %d > %d\n", realname, need_RG_tag, block1len + 4 ,bin1ptr + 3);
	if( block1len + 4 > bin1ptr + 3 ){
		SAM_pairer_iterate_int_tags((unsigned char *)(bin1+bin1ptr),block1len + 4 - bin1ptr, "NH", &NHtag);
		SAM_pairer_iterate_int_tags((unsigned char *)(bin1+bin1ptr),block1len + 4 - bin1ptr, "HI", &HItag);
		if( need_RG_tag ){
			char RG_type=0;
			SAM_pairer_iterate_tags((unsigned char *)(bin1+bin1ptr),block1len + 4 - bin1ptr, "RG", &RG_type, &RG_tag_val);
			if(RG_type != 'Z') RG_tag_val = NULL;
			//SUBREADprintf("type=%c\tval=%s\n", RG_type, RG_tag_val);
		}
	}

	old_read_FLAG = 0xffff&(old_read_FLAG >>16);
	int mate_tlen = 0;
	memcpy(&mate_tlen, bin1 + 32, 4);

	if(old_read_chro<0) old_read_pos=-1;
	if(new_dummy_chro<0) new_dummy_pos=-1;


	int bin_mq_nl = (len_name+1);	// MAPQ 0, name length incl. NUL
	int new_dummy_FLAG = (old_read_FLAG&0x40)? 0x80:0x40;
	new_dummy_FLAG |= 1;

	// Dummy reads should always be unmapped!
	//if(old_read_FLAG & 8)new_dummy_FLAG |=4;

	if(old_read_FLAG & 4)new_dummy_FLAG |=8;
	if(old_read_FLAG & 8)new_dummy_FLAG |=4;
	if(old_read_FLAG & 0x10) new_dummy_FLAG |= 0x20;
	if(old_read_FLAG & 0x20) new_dummy_FLAG |= 0x10;
	new_dummy_FLAG = new_dummy_FLAG << 16;	// FLAG in the high half, 0 CIGAR ops in the low half

	memcpy(out_bin2+4, &new_dummy_chro,4);
	memcpy(out_bin2+8, &new_dummy_pos,4);
	memcpy(out_bin2+12, &bin_mq_nl, 4);
	memcpy(out_bin2+16, &new_dummy_FLAG, 4);

	new_dummy_FLAG = 1;	// reused as the sequence length (1)
	memcpy(out_bin2+20, &new_dummy_FLAG, 4);
	memcpy(out_bin2+24, &old_read_chro, 4);
	memcpy(out_bin2+28, &old_read_pos, 4);

	mate_tlen = -mate_tlen;
	memcpy(out_bin2+32, &mate_tlen, 4);
	memcpy(out_bin2+36, realname, len_name+1);
	out_bin2[36 + len_name+1] = 0xff;
	out_bin2[36 + len_name+2] = 0x20;

	// tag_ptr always points at the next free byte; every tag is written relative to it.
	int tag_ptr = 36 + len_name + 3;
	//SUBREADprintf("HI=%d\n", HItag);
	if(HItag>0)
		tag_ptr = SAM_pairer_dummy_append_int_tag(out_bin2, tag_ptr, 'H', 'I', HItag);
	if(NHtag>0)
		tag_ptr = SAM_pairer_dummy_append_int_tag(out_bin2, tag_ptr, 'N', 'H', NHtag);
	if(RG_tag_val){
		out_bin2[tag_ptr++]='R';
		out_bin2[tag_ptr++]='G';
		out_bin2[tag_ptr++]='Z';
		while(*RG_tag_val){
			out_bin2[tag_ptr++]=*(RG_tag_val++);
		}
		out_bin2[tag_ptr++]=0;
	}

	// block_size does not count its own four bytes.
	int all_len = tag_ptr - 4;
	memcpy(out_bin2,&all_len,4);
}
3731
/*
 * Puts the pairer back into its initial state so that the input can be processed again:
 * the global progress fields are cleared, every thread gets empty buffers, a fresh orphan
 * table and a reset inflate stream, and the notification and margin tables are re-created.
 */
void SAM_pairer_reset( SAM_pairer_context_t * pairer ) {
	int thread_no = 0;

	pairer -> is_finished = 0;
	pairer -> BAM_header_parsed = 0;
	pairer -> total_input_reads = 0;
	pairer -> input_chunk_no = 0;
	pairer -> merge_level_finished = 0;

	while(thread_no < pairer -> total_threads){
		SAM_pairer_thread_t * th = pairer -> threads + thread_no;

		// counters and buffer cursors
		th -> reads_in_SBAM = 0;
		th -> input_buff_BIN_used = 0;
		th -> input_buff_BIN_ptr = 0;
		th -> input_buff_SBAM_used = 0;
		th -> input_buff_SBAM_ptr = 0;
		th -> orphant_block_no = 0;
		th -> readno_in_chunk = 0;
		th -> immediate_last_read_full_name[0]=0;

		// drop the old orphan table and start with an empty one
		HashTableDestroy(th -> orphant_table);
		th -> orphant_table = HashTableCreate(pairer -> input_buff_SBAM_size / 100);
		HashTableSetHashFunction(th -> orphant_table, fc_chro_hash);
		HashTableSetKeyComparisonFunction(th -> orphant_table, fc_strcmp_chro);
		HashTableSetDeallocationFunctions(th -> orphant_table, free, free);

		inflateReset(&th -> strm);
		thread_no ++;
	}

	// shared table: unsorted notifications
	HashTableDestroy(pairer -> unsorted_notification_table);
	pairer -> unsorted_notification_table = HashTableCreate(2191);
	HashTableSetHashFunction(pairer -> unsorted_notification_table, fc_chro_hash);
	HashTableSetKeyComparisonFunction(pairer -> unsorted_notification_table, fc_strcmp_chro);
	HashTableSetDeallocationFunctions(pairer -> unsorted_notification_table, free, free);

	// shared table: record fragments left at chunk margins
	HashTableDestroy(pairer -> bam_margin_table);
	pairer -> bam_margin_table = HashTableCreate(2191);
	HashTableSetHashFunction(pairer -> bam_margin_table, fc_chro_hash);
	HashTableSetKeyComparisonFunction(pairer -> bam_margin_table, fc_strcmp_chro);
	HashTableSetDeallocationFunctions(pairer -> bam_margin_table, free, free);
}
SAM_pairer_writer_reset(void * pairer_vp)3769 void SAM_pairer_writer_reset( void * pairer_vp ) {
3770 SAM_pairer_context_t * pairer = (SAM_pairer_context_t *) pairer_vp;
3771 SAM_pairer_writer_main_t * bam_main = (SAM_pairer_writer_main_t * )pairer -> appendix1;
3772 int rlen = ftruncate(fileno(bam_main -> bam_fp), 0);
3773 if(rlen != 0)SUBREADprintf("ERROR: Cannot reset the output file.");
3774
3775 fclose(bam_main -> bam_fp);
3776 bam_main -> bam_fp = f_subr_open(bam_main -> bam_name, "wb");
3777 int x1;
3778 for(x1 = 0; x1 < pairer -> total_threads ; x1 ++){
3779 bam_main -> threads[x1].BIN_buffer_ptr = 0;
3780 deflateReset(&bam_main -> threads[x1].strm);
3781 }
3782
3783
3784 }
3785
SAM_pairer_multi_thread_output(void * pairer_vp,int thread_no,char * bin1,char * bin2)3786 int SAM_pairer_multi_thread_output(void * pairer_vp, int thread_no, char * bin1, char * bin2 ){
3787 SAM_pairer_context_t * pairer = (SAM_pairer_context_t *) pairer_vp;
3788 SAM_pairer_writer_main_t * bam_main = (SAM_pairer_writer_main_t * )pairer -> appendix1;
3789 SAM_pairer_writer_thread_t * bam_thread = bam_main -> threads + thread_no;
3790
3791 char dummy_bin2 [MAX_READ_NAME_LEN*2 + 180 ];
3792 if(bin2==NULL && bam_main -> has_dummy){
3793 SAM_pairer_make_dummy( "DUMMY", bin1, dummy_bin2, pairer -> need_read_group_tag );
3794 bin2 = dummy_bin2;
3795 }
3796
3797 int bin_len1, bin_len2 = 0;
3798 memcpy(&bin_len1, bin1, 4);
3799 bin_len1 +=4;
3800
3801 if(bin2) {
3802 memcpy(&bin_len2, bin2, 4);
3803 bin_len2 +=4;
3804 }
3805
3806 if( bin_len1 + bin_len2 >= SAM_PAIRER_WRITE_BUFFER){
3807 SUBREADprintf("ERROR: BAM Record larger than a BAM block.\n");
3808 return 1;
3809 }
3810
3811 if(bin_len1 + bin_len2 + bam_thread -> BIN_buffer_ptr >= SAM_PAIRER_WRITE_BUFFER){
3812 int ret = SAM_pairer_multi_thread_compress(bam_main, bam_thread);
3813 if(ret)return 1;
3814 }
3815 memcpy( bam_thread -> BIN_buffer + bam_thread -> BIN_buffer_ptr, bin1, bin_len1 );
3816 if(bin2)
3817 memcpy( bam_thread -> BIN_buffer + bam_thread -> BIN_buffer_ptr + bin_len1, bin2, bin_len2 );
3818 bam_thread -> BIN_buffer_ptr += bin_len1 + bin_len2;
3819 return 0;
3820 }
3821
/*
 * Try to pair a record with an orphan stored earlier by the same thread.
 *
 * When the thread's orphan table has no entry under read_full_name, private
 * copies of the name and of the record are stored there and orphant_space
 * grows by bin_len.  When an entry exists, (bin, stored mate) is handed to
 * pairer->output_function (if set), the entry is removed from the table and
 * orphant_space shrinks by bin_len, never dropping below zero.
 *
 * The `flags` argument is not used by this function.
 */
void SAM_pairer_do_read_test( SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context , int read_name_len, char * read_full_name, int bin_len, char * bin , int flags){
	unsigned char * stored_mate = HashTableGet(thread_context -> orphant_table, read_full_name);

	if(NULL == stored_mate){
		/* No mate yet: park this record as an orphan. */
		char * key_copy = malloc(read_name_len + 1);
		char * record_copy = malloc(bin_len);

		memcpy(key_copy, read_full_name, read_name_len);
		key_copy[read_name_len] = 0;
		memcpy(record_copy, bin , bin_len);

		HashTablePut(thread_context -> orphant_table, key_copy, record_copy);
		thread_context -> orphant_space += bin_len;
		return;
	}

	/* Mate found: emit the pair before the table entry is released. */
	if(pairer -> output_function)
		pairer -> output_function(pairer, thread_context -> thread_id, bin, (char*)stored_mate);
	HashTableRemove(thread_context -> orphant_table, read_full_name);

	if(thread_context -> orphant_space > bin_len)
		thread_context -> orphant_space -= bin_len;
	else
		thread_context -> orphant_space = 0;
}
3848
3849
/*
 * Record a read in pairer->unsorted_notification_table under two keys:
 *
 *   "B:<chunk_number>:<0|1>"    -> a private copy of the BAM record
 *   "C:<read_full_name>:<0|1>"  -> the string "<chunk_number, 10 digits> <0|1>"
 *
 * The 0/1 of the "B:" key and of the "C:" value is 1 when readno_in_chunk is
 * greater than zero; the 0/1 of the "C:" key is 1 when bit 0x80 of this_flags
 * is set.  Both insertions are made while holding
 * pairer->unsorted_notification_lock.  The table owns the inserted keys and
 * values afterwards.
 *
 * The chunk-info value is allocated with a size that matches what is written
 * into it (it used to be sized by bin_len, which is unrelated to the text and
 * would overflow for bin_len < 13).
 */
void SAM_pairer_register_matcher(SAM_pairer_context_t * pairer , unsigned int chunk_number, unsigned int readno_in_chunk, char * read_full_name , char * bin, int bin_len , int this_flags){
	/* 10 digits + ' ' + 1 digit + NUL = 13 bytes are needed. */
	const size_t chunk_info_size = 16;
	/* "B:" + up to 10 digits + ':' + 1 digit + NUL = 15 bytes are needed. */
	const size_t bin_key_size = 24;
	/* "C:" + name + ':' + 1 digit + NUL. */
	const size_t name_key_size = strlen(read_full_name) + 5;
	int not_first_in_chunk = (readno_in_chunk>0)?1:0;

	char * mem_bin = malloc(bin_len);
	memcpy(mem_bin, bin , bin_len);

	subread_lock_occupy(&pairer -> unsorted_notification_lock);

	char * mem_name = malloc(bin_key_size);
	snprintf(mem_name, bin_key_size, "B:%u:%d", chunk_number , not_first_in_chunk);
	HashTablePut(pairer -> unsorted_notification_table, mem_name, mem_bin);

	mem_bin = malloc(chunk_info_size);
	snprintf(mem_bin, chunk_info_size, "%010u %d", chunk_number, not_first_in_chunk);
	mem_name = malloc(name_key_size);
	snprintf(mem_name, name_key_size, "C:%s:%d", read_full_name , (this_flags & 0x80)?1:0);
	HashTablePut(pairer -> unsorted_notification_table, mem_name, mem_bin);

	subread_lock_release(&pairer -> unsorted_notification_lock);
}
3867
/*
 * Process one BAM record taken from a thread's input.
 *
 * Records that do not need pairing (single-end mode, or flag bit 0x1 not
 * set) go straight to pairer->output_function with a NULL mate.
 *
 * For paired records the previous unmatched record of the thread (the
 * "immediate last read") is consulted:
 *   - same full name: the two records are output as a pair and the slot is
 *     cleared;
 *   - different name: the previous record, if any, is passed on to the orphan
 *     table via SAM_pairer_do_read_test(), after optionally raising the
 *     unsorted notification or registering a matcher, and the current record
 *     takes over the slot.
 *
 * readno_in_chunk is incremented once per call in every case.
 */
void SAM_pairer_do_one_BIN(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context , char * bin, int bin_len){
	char read_full_name[ MAX_READ_NAME_LEN*2 +80 ]; // rname:chr_r1:pos_r1:chr_r2:pos_r2:HI_tag
	int this_flags = 0;
	int name_len = SAM_pairer_get_read_full_name(pairer, thread_context, (unsigned char *)bin, bin_len, read_full_name, & this_flags);
	int needs_pairing = (0 == pairer -> is_single_end_mode) && (1 == ( this_flags & 1 ));

	if(!needs_pairing){
		// nothing to pair: just write.
		if(pairer -> output_function)
			pairer -> output_function(pairer, thread_context -> thread_id, (char*) bin, NULL);
		thread_context -> readno_in_chunk ++;
		return;
	}

	if(0 == strcmp(read_full_name , thread_context -> immediate_last_read_full_name)){
		// the mate is the read seen just before this one.
		if(pairer -> output_function)
			pairer -> output_function(pairer, thread_context -> thread_id, (char*) bin, (char*)thread_context -> immediate_last_read_bin);
		thread_context -> immediate_last_read_full_name[0] = 0;
		thread_context -> readno_in_chunk ++;
		return;
	}

	if(thread_context -> immediate_last_read_full_name[0]){
		// the previous read was left without its mate next to it.
		if(thread_context -> readno_in_chunk>2){
			if(0 == pairer -> is_unsorted_notified){
				if(pairer -> unsorted_notification)
					pairer -> unsorted_notification(pairer , thread_context -> immediate_last_read_bin, (char *) bin);
				pairer -> is_unsorted_notified = 1;
			}
		}else if(1 == thread_context -> readno_in_chunk && !pairer -> is_unsorted_notified ) {
			SAM_pairer_register_matcher(pairer, thread_context -> chunk_number, thread_context -> readno_in_chunk - 1, thread_context -> immediate_last_read_full_name, thread_context -> immediate_last_read_bin, thread_context -> immediate_last_read_bin_len , thread_context -> immediate_last_read_flags );
		}

		SAM_pairer_do_read_test(pairer , thread_context , thread_context -> immediate_last_read_name_len , thread_context -> immediate_last_read_full_name , thread_context -> immediate_last_read_bin_len , thread_context -> immediate_last_read_bin, thread_context -> immediate_last_read_flags);
	}

	// the current read becomes the new "immediate last read".
	thread_context -> immediate_last_read_flags = this_flags;
	thread_context -> immediate_last_read_name_len = name_len;
	thread_context -> immediate_last_read_bin_len = bin_len;
	strcpy(thread_context -> immediate_last_read_full_name, read_full_name);
	memcpy(thread_context -> immediate_last_read_bin, bin, bin_len);

	thread_context -> readno_in_chunk ++;
}
3908
/*
 * Fetch the next BAM record of a thread and process it.
 *
 * Returns 0 when a record was obtained and handed to SAM_pairer_do_one_BIN().
 * Returns 1 when no record is available or the pairer is flagged as having a
 * bad format; in that case pairer->BAM_header_parsed is set to 1.
 */
int SAM_pairer_do_next_read( SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context ){
	unsigned char * record = NULL;
	int record_len = 0;

	if(!SAM_pairer_get_next_read_BIN(pairer, thread_context, &record, &record_len) || pairer -> is_bad_format){
		pairer -> BAM_header_parsed = 1;
		return 1;
	}

	SAM_pairer_do_one_BIN( pairer, thread_context,(char *)record, record_len );
	return 0;
}
3921
3922
// All orphans are written to temporary files, each about the size of the buffer.
// When the orphans held in memory outgrow the buffer size, they are sorted and saved to disk.
3925
/*
 * Swap callback for merge_sort().
 *
 * arr points to two parallel pointer arrays (arr[0]: names, arr[1]: BAM
 * records); the elements at positions l and r are swapped in both arrays so
 * that the two stay aligned.
 */
void SAM_pairer_sort_exchange(void * arr, int l, int r){
	unsigned char *** columns = (unsigned char ***) arr;
	int col;

	for(col = 0; col < 2; col++){
		unsigned char ** column = columns[col];
		unsigned char * held = column[l];

		column[l] = column[r];
		column[r] = held;
	}
}
3938
/*
 * Comparison callback for merge_sort(): orders elements l and r by their
 * names (the first of the two parallel arrays behind arr), using strcmp().
 * Returns the strcmp() result unchanged.
 */
int SAM_pairer_sort_compare(void * arr, int l, int r){
	char ** name_column = ((char ***) arr)[0];
	const char * left_name = name_column[l];
	const char * right_name = name_column[r];

	return strcmp(left_name, right_name);
}
3943
/*
 * Merge callback for merge_sort().
 *
 * Merges the two adjacent sorted runs [start, start+items) and
 * [start+items, start+items+items2) of the parallel name / record arrays
 * behind arr into one sorted run occupying the same positions.  On equal
 * names the element of the first run is taken first.
 */
void SAM_pairer_sort_merge( void * arr, int start, int items, int items2 ){
	unsigned char *** sort_data = (unsigned char ***) arr;
	int total = items + items2;
	int left_end = items + start;
	int right_end = items + items2 + start;
	int left = start, right = left_end, filled = 0;

	unsigned char ** merged_names = malloc(sizeof(char *) * total);
	unsigned char ** merged_bins = malloc(sizeof(char *) * total);

	while(left != left_end || right != right_end){
		int take_left, from;

		if(right == right_end)
			take_left = 1;
		else if(left >= left_end)
			take_left = 0;
		else
			take_left = SAM_pairer_sort_compare(arr, left, right) <= 0;

		from = take_left ? left++ : right++;
		merged_names[filled] = sort_data[0][from];
		merged_bins[filled] = sort_data[1][from];
		filled++;
	}
	assert(filled == items + items2);

	memcpy( sort_data[0] + start, merged_names, sizeof(char *) * total );
	memcpy( sort_data[1] + start, merged_bins, sizeof(char *) * total );
	free(merged_names);
	free(merged_bins);
}
3972
/*
 * Hash a NUL-terminated read name into the range [0, 39846617).
 *
 * SAM_pairer_osr_next_name() takes this value modulo the number of threads
 * to decide which thread handles a name, so the result must stay stable.
 */
unsigned int SAM_pairer_osr_hash(char * st){
	unsigned int ret = 0, ret2 = 0;
	const char * cursor;

	for(cursor = st; *cursor != 0; cursor++){
		int nch = *cursor;

		ret = (ret << 2) ^ nch;
		/* derived from the freshly updated `ret`, not from the old ret2 */
		ret2 = (ret << 3) ^ nch;
	}
	return (ret^ret2) % 39846617;
}
3982
/*
 * Advance an orphan file to the next record that belongs to a thread and
 * read its name.
 *
 * Record layout (as read here): 2-byte name length, the name, 4-byte record
 * length, then (record length + 4) bytes of data.
 *
 * fp          : orphan file positioned at the start of a record.
 * name        : receives the NUL-terminated name.
 * thread_no   : only records whose SAM_pairer_osr_hash(name) % all_threads
 *               equals thread_no are accepted; others are skipped.
 * all_threads : number of threads; a negative value accepts every record.
 *
 * Returns 1 when a record was found; the file is then positioned back at the
 * start of that record so that SAM_pairer_osr_next_bin() can read it.
 * Returns 0 at end of file or on a short/empty name.
 * Returns -1 when the file ends (or is corrupt) inside a record that was
 * being skipped.
 *
 * On the failure paths reached after a name was read, name[0] is set to 0.
 * Callers that only test the result with `!has` would otherwise treat the
 * -1 case as "a name is pending" and keep the name of a skipped record.
 */
int SAM_pairer_osr_next_name(FILE * fp , char * name, int thread_no, int all_threads){
	while(1){
		if(feof(fp)) return 0;

		/* The length field is two bytes wide; read it into a 2-byte object. */
		unsigned short name_len = 0;
		if(fread(&name_len, 1, 2, fp) < 2) return 0;
		if(name_len < 1) return 0;
		assert(name_len < 1024);

		if(fread(name, 1, name_len, fp) != name_len){
			name[0] = 0;
			return 0;
		}
		name[name_len] = 0;

		if(all_threads < 0 || SAM_pairer_osr_hash(name)% all_threads == thread_no )
		{
			/* Rewind to the start of the record for the reader of the bin. */
			fseeko(fp, -2 - (int)name_len, SEEK_CUR);
			return 1;
		}

		/* Not ours: jump over the record length field and the record. */
		int bin_len = 0;
		if(fread(&bin_len, 1, 4, fp) != 4 || bin_len < 0){
			name[0] = 0;
			return -1;
		}
		fseeko(fp, bin_len + 4, SEEK_CUR);
	}
}
4007
/*
 * Read the BAM record of the orphan-file entry at the current position.
 *
 * Entry layout: 2-byte name length, the name (skipped here), 4-byte record
 * length L, then L + 4 bytes of record data which are copied into bin.
 *
 * fp  : orphan file positioned at the start of an entry.
 * bin : destination buffer; the caller must provide room for L + 4 bytes
 *       (the function has no way to know the capacity).
 *
 * On any short read, or when the stored length is negative or too large to
 * add 4 to, the first four bytes of bin are set to zero (an empty record)
 * and the function returns immediately instead of reading on with an
 * invalid length.
 */
void SAM_pairer_osr_next_bin(FILE * fp, char * bin){
	const int empty_len = 0;
	unsigned short name_len = 0;
	int rlen = 0;

	/* Mark the output as empty up front; only a complete read replaces it. */
	memcpy(bin, &empty_len, 4);

	if(fread(&name_len, 1, 2, fp) < 2) return;
	assert(name_len < 1024);
	fseeko(fp, name_len, SEEK_CUR);

	if(fread(&rlen, 1, 4, fp) < 4) return;
	if(rlen < 0 || rlen > 0x7fffffff - 4) return;
	rlen += 4;

	if(fread(bin, 1, rlen, fp) < (size_t)rlen)
		memcpy(bin, &empty_len, 4);
}
4022
/*
 * Tell whether two chunk-info strings describe reads that sit next to each
 * other across a chunk boundary.
 *
 * Each string has the form "<chunk number, 10 digits> <0|1>" (the format
 * written by SAM_pairer_register_matcher()); the character at index 11 is
 * '0' for one kind of entry and something else for the other.
 *
 * Returns 1 when exactly one of the two strings has '0' at index 11 and the
 * chunk number of that string equals the chunk number of the other string
 * plus one; returns 0 otherwise, including when either pointer is NULL or
 * either string is too short to have a character at index 11.
 *
 * Chunk numbers are parsed with strtoul() so that values above INT_MAX are
 * handled (atoi() is undefined for them).
 */
int SAM_pairer_is_matched_chunks(char * c1, char * c2){
	if(c1==NULL || c2==NULL)return 0;
	if(strlen(c1) < 12 || strlen(c2) < 12)return 0;

	unsigned int i1 = (unsigned int) strtoul(c1, NULL, 10);
	unsigned int i2 = (unsigned int) strtoul(c2, NULL, 10);
	int start_1 = c1[11]=='0';
	int start_2 = c2[11]=='0';

	if(start_1+start_2!=1)return 0;
	if(start_1) i2++;else i1++;
	return i2==i1;
}
4035
4036
4037
4038
4039
4040
/*
 * Merge several name-sorted orphan files into one.
 *
 * The files in fps are read in parallel.  At each step the smallest pending
 * name is selected.  If a second file holds the same name, the two records
 * are a pair and go to pairer->output_function (thread number 0); otherwise
 * the record is still an orphan and is appended to the merged output in the
 * same entry layout as the inputs (2-byte name length, name, 4-byte record
 * length, record).
 *
 * The output is first written to "<tmp_file_prefix>-MERGE-TMP.tmp"; at the
 * end fname is removed and the temporary file is renamed to fname.
 *
 * pairer : pairer context (tmp_file_prefix and output_function are used).
 * fname  : name the merged file gets at the end.
 * fps    : the input files, opened for reading by the caller (not closed here).
 * fps_no : number of entries in fps.
 *
 * Returns 0 on success and 1 when the temporary output could not be created,
 * written or closed (callers treat this as "disk full").
 */
int merge_level_fps(SAM_pairer_context_t * pairer, char * fname, FILE ** fps, int fps_no){
	int max_name_len = MAX_READ_NAME_LEN*2 +80, x1, is_disk_full = 0;
	char tmp_fname[MAX_FILE_NAME_LENGTH+30];

	sprintf(tmp_fname, "%s-MERGE-TMP.tmp", pairer->tmp_file_prefix);
	FILE * out_fp = fopen(tmp_fname, "wb");
	/* Without an output file nothing can be merged; fname is left untouched. */
	if(NULL == out_fp) return 1;

	char * names = malloc( fps_no * max_name_len );
	char * bin_tmp1 = malloc(FC_LONG_READ_RECORD_HARDLIMIT);
	char * bin_tmp2 = malloc(FC_LONG_READ_RECORD_HARDLIMIT);

	// initialize the "current_first_name" for each orphan file
	for(x1 = 0 ; x1 < fps_no; x1++)
	{
		int has = SAM_pairer_osr_next_name( fps[x1] , names + max_name_len*x1 , -1 , -1);
		if(!has) *(names + max_name_len*x1)=0;
	}

	while(!is_disk_full){
		int min_name_fileno = -1;
		int min2_name_fileno = -1;

		// find the min_name in all FPs
		// and find the same min_name if there is any
		for(x1 = 0 ; x1 < fps_no; x1++){
			int has = *(names + max_name_len*x1);
			if(has){
				int strcv_12 = 1;
				if(min_name_fileno >=0) strcv_12 = strcmp(names+(min_name_fileno * max_name_len), names+(x1 * max_name_len));
				if(strcv_12 > 0){
					min_name_fileno = x1;
					min2_name_fileno = -1;
				}else if( strcv_12 == 0){
					min2_name_fileno = x1;
				}
			}
		}

		// an empty name in every slot means that all files are exhausted
		if(min_name_fileno < 0) break;

		char * min_name = names + max_name_len*min_name_fileno;
		SAM_pairer_osr_next_bin( fps[ min_name_fileno ] , bin_tmp1);

		if(min2_name_fileno>=0){
			// the mate was found in another file: output the pair
			SAM_pairer_osr_next_bin( fps[ min2_name_fileno ] , bin_tmp2);
			pairer -> output_function(pairer, 0, (char*) bin_tmp1, (char*)bin_tmp2);

			int read_has = SAM_pairer_osr_next_name( fps[min2_name_fileno], names + max_name_len*min2_name_fileno, -1, -1);
			if(!read_has) *(names + max_name_len*min2_name_fileno)=0;
		}else{
			// still an orphan: carry it over into the merged file
			unsigned short wlen = strlen( min_name );
			unsigned int rbinlen = 0;

			memcpy( &rbinlen, bin_tmp1 , 4);
			rbinlen += 4;
			// every write is checked; a short write at any point means that
			// the merged file is unusable
			if(fwrite( &wlen, 2, 1, out_fp ) < 1
			   || fwrite( min_name, 1, wlen, out_fp ) < wlen
			   || fwrite( bin_tmp1, 4, 1, out_fp ) < 1
			   || fwrite( bin_tmp1, 1, rbinlen, out_fp ) < rbinlen)
				is_disk_full = 1;
		}

		int read_has = SAM_pairer_osr_next_name( fps[min_name_fileno], min_name, -1, -1);
		if(!read_has) *min_name = 0;
	}

	// buffered data is flushed by fclose(); a failure there is a write failure
	if(fclose(out_fp) != 0) is_disk_full = 1;
	unlink(fname);
	rename(tmp_fname, fname);
	free(names);
	free(bin_tmp1);
	free(bin_tmp2);
	return is_disk_full;
}
/* Interval handed to usleep() (i.e. microseconds) by the pairer threads other
 * than thread 0 while they wait for the BAM header to be parsed. */
#define PAIRER_WAIT_TICK_TIME 10000
4141
/* Accessor: the file-open budget stored in pairer->max_file_open_number. */
int SAM_pairer_get_merge_max_fp(SAM_pairer_context_t * pairer){
	int open_file_budget = pairer -> max_file_open_number;

	return open_file_budget;
}
4146
/* Mutator: store `fon` as the file-open budget (pairer->max_file_open_number). */
void SAM_pairer_set_merge_max_fp(SAM_pairer_context_t * pairer, int fon){
	(*pairer).max_file_open_number = fon;
}
4150
4151
/*
 * Find out how many orphan files can be held open at once and, when that is
 * fewer than the rescue step needs, pre-merge the orphan files into a single
 * "<tmp_file_prefix>-LEVELMERGE.tmp" file.
 *
 * Steps:
 *  1. Count the "<prefix>-TH<thread>-BK<block>.tmp" files of every thread.
 *  2. Open five scratch "<prefix>-FTEST-<n>.tmp" files and then as many of
 *     the orphan files as possible, all at the same time; the number that
 *     could be opened, minus five, is stored with
 *     SAM_pairer_set_merge_max_fp().
 *  3. If that number is below (orphan files x threads), merge the orphan
 *     files in groups with merge_level_fps() and set
 *     pairer->merge_level_finished.
 *
 * Returns 0 on success and non-zero when a merge step failed to write its
 * output.
 *
 * NOTE(review): the FTEST scratch files are created but not removed in this
 * function -- confirm that they are cleaned up elsewhere.
 */
int SAM_pairer_probe_maxfp( SAM_pairer_context_t * pairer){
	int orphant_fp_no=0, is_disk_full = 0;
	int thno, bkno, x1;
	int thread_fps [ pairer -> total_threads ];	// number of orphan files found per thread
	char tmp_fname[MAX_FILE_NAME_LENGTH+50];

	memset(thread_fps, 0, sizeof(int) * pairer -> total_threads);
	for( thno = 0 ; thno < pairer -> total_threads ; thno ++ ){
		for( bkno = 0 ; ; bkno++){
			sprintf(tmp_fname, "%s-TH%02d-BK%06d.tmp", pairer->tmp_file_prefix, thno, bkno);
			FILE * in_fp = fopen(tmp_fname, "rb");
			if(NULL == in_fp) break;
			// a count, not the last index: a thread without any file must
			// stay distinguishable from a thread with exactly one file
			thread_fps[thno] = bkno + 1;
			fclose(in_fp);
			orphant_fp_no ++;
		}
	}

	int max_open_fps = 0, has_limit = 0;
	int orphant_fp_size = 50;
	FILE ** orphant_fps = malloc(sizeof(FILE *) * orphant_fp_size);

	for( bkno = 0 ; bkno < 5; bkno++){
		sprintf(tmp_fname, "%s-FTEST-%d.tmp", pairer->tmp_file_prefix, bkno);
		FILE * tfp = fopen(tmp_fname, "w");
		if(NULL == tfp){
			has_limit = 1;
			break;
		}
		orphant_fps[max_open_fps++] = tfp;
	}

	for( thno = 0 ; thno < pairer -> total_threads && !has_limit ; thno ++ ){
		for( bkno = 0 ; ; bkno++){
			sprintf(tmp_fname, "%s-TH%02d-BK%06d.tmp", pairer->tmp_file_prefix, thno, bkno);
			FILE * in_fp = fopen(tmp_fname, "rb");
			if(NULL == in_fp){
				// the file could be opened in the first pass, so failing now
				// means that the open-file limit was reached
				if( bkno < thread_fps[thno] ) has_limit = 1;
				break;
			}
			orphant_fps[max_open_fps++] = in_fp;
			if(max_open_fps >= orphant_fp_size - 1){
				FILE ** grown = realloc(orphant_fps, 2 * orphant_fp_size * sizeof(FILE *));
				if(NULL == grown){
					// cannot track more handles: stop probing here
					has_limit = 1;
					break;
				}
				orphant_fps = grown;
				orphant_fp_size *= 2;
			}
		}
	}

	for( bkno = 0 ;bkno < max_open_fps; bkno ++) fclose(orphant_fps[bkno]);

	SAM_pairer_set_merge_max_fp(pairer, max_open_fps - 5);

	int merge_max_fp = SAM_pairer_get_merge_max_fp(pairer);
	if( merge_max_fp < orphant_fp_no * pairer -> total_threads){
		// A merge round holds the previous merge result plus at least one new
		// file, so the working array needs two slots even when the probed
		// budget is smaller.
		int merge_width = merge_max_fp < 2 ? 2 : merge_max_fp;
		int processed_orphant = 0;
		int current_opened_fp_no = 0 ;
		FILE * level_merge_fps [ merge_width ];

		for( thno = 0 ; thno < pairer -> total_threads && !is_disk_full ; thno ++ ){
			for( bkno = 0 ; ; bkno++){
				sprintf(tmp_fname, "%s-TH%02d-BK%06d.tmp", pairer->tmp_file_prefix, thno, bkno);

				FILE * in_fp = fopen(tmp_fname, "rb");
				if(NULL == in_fp) break;

				level_merge_fps[current_opened_fp_no ++] = in_fp;
				processed_orphant ++;
				if(current_opened_fp_no >= merge_width || processed_orphant == orphant_fp_no){
					sprintf(tmp_fname, "%s-LEVELMERGE.tmp", pairer->tmp_file_prefix);

					is_disk_full |= merge_level_fps(pairer , tmp_fname, level_merge_fps, current_opened_fp_no);
					for(x1 = 0; x1 < current_opened_fp_no; x1++) fclose(level_merge_fps[x1]);
					current_opened_fp_no = 0;

					if(is_disk_full) break;

					if(processed_orphant < orphant_fp_no){
						// the merged file is the first input of the next round
						level_merge_fps[0] = fopen(tmp_fname, "rb");
						if(NULL == level_merge_fps[0]){
							is_disk_full = 1;
							break;
						}
						current_opened_fp_no = 1;
					}
				}
			}
		}

		// close whatever is still open if the loops ended early
		for(x1 = 0; x1 < current_opened_fp_no; x1++) fclose(level_merge_fps[x1]);
		pairer -> merge_level_finished = 1;
	}
	free(orphant_fps);
	return is_disk_full;
}
4245
SAM_pairer_rescure_orphants_max_FP(void * params)4246 void * SAM_pairer_rescure_orphants_max_FP(void * params){
4247 void ** param_ptr = (void **) params;
4248 SAM_pairer_context_t * pairer = param_ptr[0];
4249 int thread_no = (int)(param_ptr[1]-NULL);
4250 free(params);
4251
4252 srInt_64 died=0;
4253 int orphant_fp_no=0;
4254 int thno, bkno, x1;
4255 char tmp_fname[MAX_FILE_NAME_LENGTH+60];
4256
4257 int max_name_len = MAX_READ_NAME_LEN*2 +80, orphant_fp_size = 50;
4258 FILE ** orphant_fps = malloc(sizeof(FILE *) * orphant_fp_size);
4259
4260 if(0 == thread_no && pairer -> display_progress)
4261 SUBREADprintf("Finished scanning the input file. Processing unpaired reads.\n");
4262
4263 //SUBREADprintf("merged = %d\n", pairer -> merge_level_finished);
4264 if(pairer -> merge_level_finished){
4265 sprintf(tmp_fname, "%s-LEVELMERGE.tmp", pairer->tmp_file_prefix);
4266 FILE * in_fp = fopen(tmp_fname, "rb");
4267 orphant_fps[0] = in_fp;
4268 orphant_fp_no=1;
4269 }else{
4270 orphant_fp_no = 0;
4271 for( thno = 0 ; thno < pairer -> total_threads ; thno ++ ){
4272 for( bkno = 0 ; ; bkno++){
4273 sprintf(tmp_fname, "%s-TH%02d-BK%06d.tmp", pairer->tmp_file_prefix, thno, bkno);
4274
4275 FILE * in_fp = fopen(tmp_fname, "rb");
4276 if(NULL == in_fp) break;
4277 if(orphant_fp_no >= orphant_fp_size){
4278 orphant_fp_size *= 1.5;
4279 orphant_fps = realloc(orphant_fps, orphant_fp_size * sizeof(FILE *));
4280 }
4281 orphant_fps[orphant_fp_no++]=in_fp;
4282 }
4283 }
4284 }
4285
4286 char * names = malloc( orphant_fp_no * max_name_len );
4287 memset(names, 0, orphant_fp_no * max_name_len );
4288 char * bin_tmp1 , * bin_tmp2;
4289 bin_tmp1 = malloc(66000);
4290 bin_tmp2 = malloc(66000);
4291
4292
4293 for(x1 = 0 ; x1 < orphant_fp_no; x1++)
4294 {
4295 int has = SAM_pairer_osr_next_name( orphant_fps[x1] , names + max_name_len*x1 , thread_no , pairer-> total_threads);
4296 if(!has) *(names + max_name_len*x1)=0;
4297 }
4298
4299
4300 while(1){
4301 int min_name_fileno = -1;
4302 int min2_name_fileno = -1;
4303
4304 for(x1 = 0 ; x1 < orphant_fp_no; x1++){
4305 int has = *(names + max_name_len*x1);
4306 if(has){
4307 int strcv_12 = 1;
4308 if(min_name_fileno >=0) strcv_12 = strcmp(names+(min_name_fileno * max_name_len), names+(x1 * max_name_len));
4309 if(strcv_12 > 0){
4310 min_name_fileno = x1;
4311 min2_name_fileno = -1;
4312 }else if( strcv_12 == 0){
4313 min2_name_fileno = x1;
4314 }
4315 }
4316
4317 }
4318
4319 if(min_name_fileno >= 0){
4320 SAM_pairer_osr_next_bin( orphant_fps[ min_name_fileno ] , bin_tmp1);
4321
4322 if( min2_name_fileno >=0){
4323 SAM_pairer_osr_next_bin( orphant_fps[ min2_name_fileno ] , bin_tmp2);
4324 pairer -> output_function(pairer, thread_no, (char*) bin_tmp1, (char*)bin_tmp2);
4325
4326 if(0 && 0 == pairer -> is_unsorted_notified){
4327 char *name_tmp_1 = malloc(strlen(names+(min_name_fileno * max_name_len))+5), *name_tmp_2 = malloc(strlen(names+(min_name_fileno * max_name_len))+5);
4328 char * min1_chunk_info, * min2_chunk_info;
4329 sprintf(name_tmp_1, "C:%s:%d", names+(min_name_fileno * max_name_len), 0);
4330 sprintf(name_tmp_2, "C:%s:%d", names+(min2_name_fileno * max_name_len), 1);
4331 min1_chunk_info = HashTableGet( pairer -> unsorted_notification_table , name_tmp_1);
4332 min2_chunk_info = HashTableGet( pairer -> unsorted_notification_table , name_tmp_2);
4333 //#warning ">>>>>>> COMMENT NEXT LINE <<<<<<<<"
4334 //SUBREADprintf("RESCURE MATCHER: %s , %s == %s , %s, %s\n", name_tmp_1, name_tmp_2, min1_chunk_info, min2_chunk_info,
4335 // SAM_pairer_is_matched_chunks(min1_chunk_info, min2_chunk_info)?"MATCH":"XXXXX");
4336
4337 if(min1_chunk_info == NULL || min2_chunk_info == NULL || !SAM_pairer_is_matched_chunks(min1_chunk_info, min2_chunk_info)){
4338 sprintf(name_tmp_1, "B:%s:%d", names+(min_name_fileno * max_name_len), 0);
4339 if( pairer -> unsorted_notification ){
4340 SUBREADprintf("UNSORT3\n");
4341 //SUBREADprintf("FINAL STEP\n");
4342 pairer -> unsorted_notification(pairer , HashTableGet( pairer -> unsorted_notification_table , name_tmp_1), NULL);
4343 }
4344 pairer -> is_unsorted_notified = 1;
4345 }
4346 }
4347
4348 int read_has = SAM_pairer_osr_next_name( orphant_fps[min2_name_fileno], names + max_name_len*min2_name_fileno, thread_no, pairer-> total_threads);
4349 if(!read_has) *(names + max_name_len*min2_name_fileno)=0;
4350 }else{
4351 //#warning ">>>>>>> COMMENT NEXT LINE <<<<<<<<"
4352 //SUBREADprintf("FINAL_ORPHAN:%s\n" , names + max_name_len*min_name_fileno);
4353 pairer -> output_function(pairer, thread_no, (char*) bin_tmp1, NULL);
4354 died++;
4355 }
4356
4357 int read_has = SAM_pairer_osr_next_name( orphant_fps[min_name_fileno], names + max_name_len*min_name_fileno, thread_no, pairer-> total_threads);
4358 //#warning ">>>>>>> COMMENT NEXT BLOCK <<<<<<<<"
4359 if(0){
4360 if(!read_has) SUBREADprintf("FP %d FINISHED\n", min_name_fileno);
4361 }
4362 if(!read_has) *(names + max_name_len*min_name_fileno)=0;
4363 } else break;
4364 }
4365 free(names);
4366
4367 //#warning ">>>>>>> COMMENT NEXT LINE <<<<<<<<"
4368 //SUBREADprintf("finished_fps= %d\n", orphant_fp_no);
4369
4370 for(x1 = 0 ; x1 < orphant_fp_no; x1++)
4371 {
4372 fclose ( orphant_fps[x1] );
4373 }
4374 free( bin_tmp1 );
4375 free( bin_tmp2 );
4376 free(orphant_fps);
4377 pairer -> total_orphan_reads += died;
4378 return NULL;
4379 }
4380
4381
/*
 * Spill the orphan table of a thread to a new temporary file and empty it.
 *
 * All (name, record) entries of thread_context->orphant_table are collected,
 * sorted by name and written to
 * "<tmp_file_prefix>-TH<thread_id>-BK<orphant_block_no>.tmp"
 * (orphant_block_no is incremented).  Each entry is stored as: 2-byte name
 * length, the name, 4-byte record length, then the record (which itself
 * starts with the same 4-byte length).
 *
 * The table is emptied and orphant_space reset to 0 in every case -- also
 * when the file cannot be created, so that the error is returned to the
 * caller instead of tripping the emptiness assertion.
 *
 * Returns 0 on success, 1 when the file could not be opened, written or
 * closed (an error message is printed as well).
 */
int SAM_pairer_update_orphant_table(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context){
	unsigned int x2 = 0;
	unsigned char ** name_list, ** bin_list;
	name_list = malloc(sizeof(char*) * thread_context->orphant_table->numOfElements);
	bin_list = malloc(sizeof(char*) * thread_context->orphant_table->numOfElements);

	int x1, is_error = 0;
	for(x1 = 0; x1 < thread_context->orphant_table->numOfBuckets; x1 ++){
		KeyValuePair *pair = thread_context->orphant_table->bucketArray[x1];
		while (pair != NULL) {
			KeyValuePair *nextPair = pair->next;
			name_list [x2] = (unsigned char *)pair -> key;
			bin_list [x2] = pair -> value;
			x2++;
			pair = nextPair;
		}
	}

	assert(x2 == thread_context->orphant_table->numOfElements);
	unsigned char ** sort_data[2];
	sort_data[0]=name_list;
	sort_data[1]=bin_list;
	merge_sort(sort_data, thread_context->orphant_table->numOfElements, SAM_pairer_sort_compare, SAM_pairer_sort_exchange, SAM_pairer_sort_merge);

	char tmp_fname[MAX_FILE_NAME_LENGTH+40];
	sprintf(tmp_fname, "%s-TH%02d-BK%06d.tmp", pairer->tmp_file_prefix, thread_context -> thread_id, thread_context -> orphant_block_no++);
	FILE * tmp_fp = fopen(tmp_fname, "wb");
	if(NULL == tmp_fp) is_error = 1;

	for(x1 = 0; x1 < x2; x1 ++){
		if(tmp_fp){
			unsigned int bin_len;
			// the length field on disk is two bytes wide
			unsigned short namelen = strlen((char *)name_list[x1]);

			memcpy(&bin_len, bin_list[x1] , 4);

			is_error |= (fwrite(&namelen, 2, 1, tmp_fp) < 1);
			is_error |= (fwrite(name_list[x1], 1, namelen, tmp_fp) < namelen);
			is_error |= (fwrite(&bin_len, 4, 1, tmp_fp) < 1);
			is_error |= (fwrite(bin_list[x1], 1, bin_len + 4, tmp_fp) < bin_len + 4);
		}

		// the entry leaves the table whether or not it could be saved
		HashTableRemove(thread_context->orphant_table , name_list[x1]);
	}

	// buffered data reaches the disk in fclose(); a failure there is a write error
	if(tmp_fp && fclose(tmp_fp) != 0) is_error = 1;

	assert(thread_context -> orphant_table-> numOfElements == 0);
	free(name_list);
	free(bin_list);
	thread_context -> orphant_space = 0;
	if(is_error) SUBREADprintf("ERROR: unable to write into the temporary file. Please check the disk space in the output directory.\n");
	return is_error;
}
4437
4438
is_read_bin_ONE(char * bin,int bin_len,int max_refID,int * block_len)4439 int is_read_bin_ONE(char * bin, int bin_len, int max_refID, int * block_len){
4440 memcpy(block_len, bin, 4);
4441 if((*block_len) > MAX_BIN_RECORD_LENGTH - 4 || (*block_len) < 32) return -1;
4442 if((*block_len) > bin_len - 4) return -2;
4443 int refID, mate_refID;
4444 memcpy(&refID, bin + 4, 4);
4445 memcpy(&mate_refID, bin + 24, 4);
4446 if(refID != -1 && (refID< 0 || refID >=max_refID)) return -3;
4447 if(mate_refID != -1 && (mate_refID< 0 || mate_refID >=max_refID)) return -4;
4448 int l_seq;
4449 memcpy(&l_seq, bin + 20, 4);
4450 if(l_seq > bin_len*2 || l_seq > MAX_BIN_RECORD_LENGTH || l_seq < 0) return -5;
4451
4452 int min_mq_nl;
4453 memcpy(&min_mq_nl, bin + 12, 4);
4454 int name_len = min_mq_nl & 0xff;
4455 if(name_len < 1) return -20;
4456 int flag_nc;
4457 memcpy(&flag_nc, bin + 16, 4);
4458 int cigar_opts = flag_nc & 0xffff;
4459 // int flag = flag_nc >> 16;
4460 if(cigar_opts > 100) return -6;
4461
4462 int rname_cursor = 36;
4463 if(bin[rname_cursor] == '@') return -7;
4464 for(; rname_cursor< 36 + name_len - 1; rname_cursor ++){
4465 int nch = bin[rname_cursor];
4466 if(nch < 0x20 || nch >=0x7f) return -9;
4467 if(nch == '\t') return -8;
4468 }
4469
4470 if(bin[rname_cursor]!=0)return -10;
4471
4472 if((*block_len) < 32 + name_len + 4*cigar_opts + l_seq + (l_seq+1)/2) return -11;
4473
4474 int cigar_i;
4475 for(cigar_i = 0; cigar_i < cigar_opts ; cigar_i++){
4476 int cigar_v;
4477 memcpy(&cigar_v , bin + 36 + name_len + 4*cigar_i, 4);
4478 int cigar_op = cigar_v & 0xf;
4479 int cigar_value = cigar_v & 0xfffffff;
4480 if(cigar_op > 8) return -12;
4481
4482 if((cigar_op == 0 || cigar_op == 1 || cigar_op > 6) && (cigar_value < 1 || cigar_value > MAX_BIN_RECORD_LENGTH)){
4483
4484 //#warning ">>>>>> COMMENT NEXT LINE IN RELEASE <<<<<<"
4485 if(0){
4486 char * rname = bin + 36;
4487 SUBREADprintf("OP=%d, VAL=%d [%s]\n", cigar_op, cigar_value, rname);
4488 }
4489
4490 return -13;
4491 }
4492 }
4493
4494 int ext_cursor = 36 + name_len + 4*cigar_opts + l_seq + (l_seq+1)/2;
4495 if(ext_cursor < (*block_len) + 4){
4496 if(ext_cursor > (*block_len) + 4 - 4) return -17;
4497 if((!isalpha(bin[ext_cursor])) || bin[ext_cursor+1]>122 || bin[ext_cursor+1]<48 ||!isalpha(bin[ext_cursor+2])){
4498 // SUBREADprintf("TAGERR: %c%c%c\n", bin[ext_cursor], bin[ext_cursor+1], bin[ext_cursor+2]);
4499 return -16;
4500 }
4501 }
4502 return 1;
4503 }
4504
/* Upper bound on the number of consecutive records that is_read_bin()
 * validates at one candidate start position. */
#define TESTING_READS_FOR_START 3
/* Debug counter; in is_read_bin() it is only touched by a disabled
 * ("if(0)") trace statement. */
int tchecks=0;
4507
4508 // A block MUST have at least three reads as evidence; otherwise the BAM file is converted into the conservative format.
is_read_bin(char * bin,int bin_len,int max_refID)4509 int is_read_bin(char * bin, int bin_len, int max_refID){
4510 int testing_i;
4511 int bin_cursor = 0;
4512 for(testing_i = 0; testing_i < TESTING_READS_FOR_START; testing_i++){
4513 int block_len = 0;
4514 int rr = is_read_bin_ONE(bin + bin_cursor, bin_len - bin_cursor, max_refID, &block_len);
4515
4516 if(0) SUBREADprintf("CHECK_START # %d: RET=%d\n", ++tchecks, rr);
4517
4518 if(rr!=1) return rr;
4519 bin_cursor += block_len +4;
4520 if(bin_cursor == bin_len) return 1;
4521 }
4522 return 1;
4523 }
4524
/* Debug counter; its only reference in SAM_pairer_find_start() is inside a
 * commented-out trace statement. */
int tfinds = 0;
4526
/*
 * Locate the first plausible record start in a thread's BIN buffer.
 *
 * Offsets 0 .. min(MAX_BIN_RECORD_LENGTH, input_buff_BIN_used) - 1 are tried
 * in order with is_read_bin().  need_find_start is cleared, and
 * input_buff_BIN_ptr is set to the offset found (or to the scan limit when
 * nothing was found).
 *
 * When the start is found at an offset greater than zero, the bytes in front
 * of it are saved in pairer->bam_margin_table as a 4-byte length followed by
 * the bytes themselves, under the key "S<input_buff_SBAM_file_start>"; the
 * insertion is made while holding SAM_BAM_table_lock.
 *
 * Returns 1 when a start was found, 0 otherwise.
 */
int SAM_pairer_find_start(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context ){
	int scan_limit = min(MAX_BIN_RECORD_LENGTH, thread_context -> input_buff_BIN_used);
	int start_pos = 0;
	int is_found = 0;

	thread_context -> need_find_start = 0;

	while(start_pos < scan_limit){
		if(1==is_read_bin((char *)thread_context -> input_buff_BIN + start_pos, thread_context -> input_buff_BIN_used - start_pos , pairer -> BAM_n_ref)){
			is_found = 1;
			break;
		}
		start_pos++;
	}

	if(is_found && start_pos>0){
		// keep the bytes that precede the first record
		char * margin_key = malloc(22);
		char * margin_data = malloc(start_pos+4);

		memcpy(margin_data, &start_pos, 4);
		memcpy(margin_data+4, thread_context -> input_buff_BIN, start_pos);
		#ifdef __MINGW32__
		sprintf(margin_key,"S%lu", (unsigned long) thread_context -> input_buff_SBAM_file_start);
		#else
		sprintf(margin_key,"S%llu", thread_context -> input_buff_SBAM_file_start);
		#endif

		subread_lock_occupy(&pairer -> SAM_BAM_table_lock);
		HashTablePut(pairer -> bam_margin_table, margin_key, margin_data);
		subread_lock_release(&pairer -> SAM_BAM_table_lock);
	}

	thread_context -> input_buff_BIN_ptr = start_pos;
	return is_found;
}
4554
4555
SAM_pairer_thread_run(void * params)4556 void * SAM_pairer_thread_run( void * params ){
4557 void ** param_ptr = (void **) params;
4558 SAM_pairer_context_t * pairer = param_ptr[0];
4559 int thread_no = (int)(param_ptr[1]-NULL), is_disk_full = 0;
4560 free(params);
4561
4562 SAM_pairer_thread_t * thread_context = pairer -> threads + thread_no;
4563 int is_finished = 0;
4564 while(1){
4565 subread_lock_occupy(&pairer -> input_fp_lock);
4566 if(pairer -> BAM_header_parsed || thread_no == 0){
4567 thread_context -> need_find_start = pairer -> BAM_header_parsed;
4568 //SUBREADprintf("ABBO TH %d : FILL_BIN AT FILE %lld\n", thread_context -> thread_id, ftello(pairer -> input_fp ));
4569 SAM_pairer_fill_BIN_buff(pairer, thread_context, &is_finished);
4570 thread_context -> chunk_number = pairer -> input_chunk_no;
4571 pairer -> input_chunk_no ++;
4572 }
4573 subread_lock_release(&pairer -> input_fp_lock);
4574
4575 if(!pairer -> BAM_header_parsed && thread_no > 0) {
4576 usleep(PAIRER_WAIT_TICK_TIME);
4577 } else if(thread_context -> input_buff_SBAM_used>0) {
4578 unsigned int processed_reads = 0;
4579 while(1){
4580 int has_no_more = SAM_pairer_do_next_read(pairer, thread_context);
4581 if(has_no_more)break;
4582 processed_reads++;
4583 }
4584
4585 pairer -> total_input_reads += processed_reads;
4586 }
4587 if(pairer -> is_bad_format) break;
4588
4589 if(thread_context -> immediate_last_read_full_name[0]){
4590 SAM_pairer_register_matcher(pairer, thread_context -> chunk_number, thread_context -> readno_in_chunk - 1, thread_context -> immediate_last_read_full_name, thread_context -> immediate_last_read_bin, thread_context -> immediate_last_read_bin_len , thread_context -> immediate_last_read_flags);
4591 SAM_pairer_do_read_test(pairer , thread_context , thread_context -> immediate_last_read_name_len , thread_context -> immediate_last_read_full_name , thread_context -> immediate_last_read_bin_len , thread_context -> immediate_last_read_bin, thread_context -> immediate_last_read_flags);
4592 thread_context -> immediate_last_read_full_name[0] = 0;
4593 }
4594
4595 if(thread_context -> orphant_space > pairer -> input_buff_SBAM_size)
4596 if(!is_disk_full)is_disk_full |= SAM_pairer_update_orphant_table(pairer, thread_context);
4597
4598 if(is_finished){
4599 pairer -> BAM_header_parsed = 1;
4600 break;
4601 }
4602 }
4603
4604 if(thread_context -> orphant_table -> numOfElements > 0)
4605 if(!is_disk_full)is_disk_full |= SAM_pairer_update_orphant_table(pairer, thread_context);
4606
4607 pairer -> is_internal_error |= is_disk_full;
4608
4609 return NULL;
4610 }
4611
4612
4613 // This function returns 1 if the bin is EXACTLY a whole read.
/*
 * Verifies that `bin` (binlen bytes) is exactly one whole BAM record: it must
 * pass is_read_bin_ONE() and its block length plus the 4-byte length field
 * must equal binlen.
 *
 * thread_context is not used here.
 *
 * Returns 1 on success; otherwise prints an error message and returns -1.
 */
int SAM_pairer_verify_read_bin_ONE(SAM_pairer_context_t * pairer, SAM_pairer_thread_t * thread_context , char * bin, int binlen){
	int record_len = 9;
	int verdict = is_read_bin_ONE(bin, binlen, pairer -> BAM_n_ref, &record_len);

	if(verdict == 1 && record_len + 4 == binlen)
		return verdict;

	SUBREADprintf("ERROR: cannot retrieve a read from the BAM file: %d, %d\n", record_len+4, verdict);
	return -1;
}
4625
/*
 * HashTableIteration() callback over the BAM margin table.
 *
 * For an entry whose key starts with 'E', the entry with the same key but a
 * leading 'S' is looked up; both values are a 4-byte length followed by that
 * many bytes. The 'E' bytes followed by the 'S' bytes are concatenated,
 * verified to be exactly one record, and processed with
 * SAM_pairer_do_one_BIN() on thread 0's context. A failed verification sets
 * pairer->is_bad_format.
 *
 * tab->appendix1 must hold the pairer context. tab->appendix2 is used as a
 * balance counter: incremented for each 'E' key, decremented for any other
 * key.
 */
void SAM_pairer_finish_margins(void * kv, void * val , HashTable * tab){
	char * key = kv;

	if(key[0] != 'E'){
		tab -> appendix2 --;
		return;
	}

	// Build the partner key: same text with 'S' as the first character.
	char keyS [40];
	strcpy(keyS, key);
	keyS[0]='S';

	char * Sbin = HashTableGet(tab, keyS);
	assert(Sbin);
	char * Ebin = val;
	tab -> appendix2 ++;

	SAM_pairer_context_t * pairer = tab -> appendix1;
	SAM_pairer_thread_t * thread_context = pairer -> threads+0;
	thread_context -> readno_in_chunk = 0;

	int Elen = 0, Slen = 0;
	memcpy(&Elen, Ebin, 4);
	memcpy(&Slen, Sbin, 4);

	int joined_len = Elen + Slen;
	char * joined = malloc(joined_len);
	memcpy(joined, Ebin+4, Elen);
	memcpy(joined + Elen, Sbin+4, Slen);

	if(SAM_pairer_verify_read_bin_ONE(pairer , thread_context, joined, joined_len) != 1)
		pairer -> is_bad_format = 1;
	else
		SAM_pairer_do_one_BIN( pairer, thread_context, joined, joined_len);

	free(joined);
}
4659
/*
 * Processes every record stored in pairer->bam_margin_table.
 *
 * The table is iterated with SAM_pairer_finish_margins(), using thread 0's
 * context. Afterwards the held-back "immediate last read" of thread 0 is
 * registered and tested, and thread 0's orphan table is written out; the
 * result of that write is OR-ed into pairer->is_internal_error.
 *
 * Asserts that the table's appendix2 balance counter is back to NULL after
 * the iteration.
 */
void SAM_pairer_finish_margin_table( SAM_pairer_context_t * pairer){
	SAM_pairer_thread_t * first_thread = pairer -> threads+0;

	pairer -> bam_margin_table -> appendix1 = pairer;
	pairer -> bam_margin_table -> appendix2 = NULL;

	first_thread -> immediate_last_read_full_name[0] = 0;
	HashTableIteration(pairer -> bam_margin_table, SAM_pairer_finish_margins);

	if(first_thread -> immediate_last_read_full_name[0] != 0){
		SAM_pairer_register_matcher(pairer, first_thread -> chunk_number, first_thread -> readno_in_chunk - 1, first_thread -> immediate_last_read_full_name, first_thread -> immediate_last_read_bin, first_thread -> immediate_last_read_bin_len , first_thread -> immediate_last_read_flags);
		SAM_pairer_do_read_test(pairer , first_thread , first_thread -> immediate_last_read_name_len , first_thread -> immediate_last_read_full_name , first_thread -> immediate_last_read_bin_len , first_thread -> immediate_last_read_bin, first_thread -> immediate_last_read_flags);
		first_thread -> immediate_last_read_full_name[0] = 0;
	}

	pairer -> is_internal_error |= SAM_pairer_update_orphant_table(pairer, first_thread);
	assert(NULL == pairer -> bam_margin_table -> appendix2);
}
4677
4678 // not only run, but also finalise.
4679 // It returns 0 if no error.
/*
 * Starts one `worker` thread per pairer thread slot and waits for all of them.
 * Each worker receives a heap-allocated {pairer, thread number} pair, which
 * the worker itself frees.
 */
static void SAM_pairer_run_workers(SAM_pairer_context_t * pairer, void * (*worker)(void *)){
	int thread_no;

	for(thread_no = 0; thread_no < pairer -> total_threads; thread_no++){
		void ** launch_args = malloc(sizeof(void *) * 2);

		launch_args[0] = pairer;
		launch_args[1] = (void *)(NULL+thread_no);
		pthread_create(&(pairer -> threads[thread_no].thread_stab), NULL, worker, launch_args);
	}

	for(thread_no = 0; thread_no < pairer -> total_threads; thread_no++)
		pthread_join(pairer -> threads[thread_no].thread_stab, NULL);
}

/*
 * Runs one complete pass of the pairer and finalises it.
 *
 * Phase 1 runs SAM_pairer_thread_run on all threads. Unless the input was
 * flagged as badly formatted, the BAM margin table is then processed (BAM
 * input only) and SAM_pairer_probe_maxfp() is called. If the probe reports a
 * failure, an error is printed and pairer->is_internal_error is set;
 * otherwise phase 2 runs SAM_pairer_rescure_orphants_max_FP on all threads.
 *
 * The return value is always 0; problems are reported through
 * pairer->is_bad_format and pairer->is_internal_error.
 */
int SAM_pairer_run_once( SAM_pairer_context_t * pairer){
	SAM_pairer_run_workers(pairer, SAM_pairer_thread_run);

	if(0 != pairer -> is_bad_format) return 0;

	if(pairer -> input_is_BAM) SAM_pairer_finish_margin_table(pairer);

	if(SAM_pairer_probe_maxfp( pairer )){
		SUBREADprintf("ERROR: cannot write into the temporary file. Please check the disk space in the output directory.\n");
		pairer -> is_internal_error = 1;
		return 0;
	}

	SAM_pairer_run_workers(pairer, SAM_pairer_rescure_orphants_max_FP);
	return 0;
}
4720
/*
 * Reads the next gzip member (BGZF block) from `in` and inflates it.
 *
 * in     : input stream positioned at the start of a block.
 * binbuf : receives the decompressed bytes; must have room for 70000 bytes.
 * strm   : an initialised raw-inflate stream; it is reset before returning
 *          whenever inflation was attempted.
 *
 * Returns
 *   >= 0 : number of decompressed bytes written to binbuf;
 *   -1   : no block could be read (end of file, the four leading bytes are
 *          not 31,139,8,4, or the file ends inside the block);
 *   -2   : the block is malformed (no usable "BC" length subfield, or a
 *          compressed length outside 1..70000), memory could not be
 *          allocated, or inflate() did not reach the end of the stream.
 *
 * The trailing CRC32 and ISIZE fields are skipped, not verified.
 */
int fix_load_next_block(FILE * in, char * binbuf, z_stream * strm){
	enum { FIX_LOAD_BUFFER_SIZE = 70000 };
	static const int gzip_magic[4] = { 31, 139, 8, 4 };	// ID1, ID2, CM, FLG
	int x1, ret = 0;

	// All four bytes are consumed even if an early one mismatches.
	for(x1 = 0; x1 < 4; x1++)
		if(fgetc(in) != gzip_magic[x1]) ret = -1;
	if(ret != 0) return -1;

	// Skip MTIME (4 bytes), XFL and OS.
	for(x1 = 0; x1 < 6; x1++) fgetc(in);

	int xlen = fgetc(in);
	xlen += fgetc(in) * 256;
	int bsize = -1, xlen_ptr = 0;

	// Walk the extra subfields looking for "BC" (66,67), which carries the block size.
	while(xlen_ptr < xlen){
		int si1 = fgetc(in);
		int si2 = fgetc(in);
		int slen = fgetc(in);
		slen += fgetc(in) * 256;

		// A failed fgetc() makes slen negative and the loop would never
		// terminate reliably, so stop on a truncated file.
		if(feof(in) || ferror(in)) return -1;

		if(si1 == 66 && si2 == 67 && slen == 2){
			bsize = fgetc(in);
			bsize += 256*fgetc(in);
		}else{
			fseeko(in , slen, SEEK_CUR);
		}
		xlen_ptr += 4 + slen;
	}
	if(feof(in) || ferror(in)) return -1;

	// The stored block size is the total block length minus one; 19 accounts
	// for the fixed header and trailer bytes around the compressed data.
	int cdata_len = bsize - xlen - 19;
	if(bsize < 0 || cdata_len < 1 || cdata_len > FIX_LOAD_BUFFER_SIZE) return -2;

	char * bam_buf = malloc(cdata_len);
	if(bam_buf == NULL) return -2;

	int rlenv = fread(bam_buf, 1, cdata_len, in);
	if(rlenv < cdata_len){
		free(bam_buf);
		return -1;
	}
	fseeko(in, 8, SEEK_CUR);	// CRC32 + ISIZE

	strm -> avail_in = cdata_len;
	strm -> next_in = (unsigned char*)bam_buf;
	strm -> avail_out = FIX_LOAD_BUFFER_SIZE;
	strm -> next_out = (unsigned char*)binbuf;
	int ret_inf = inflate(strm, Z_FINISH);
	if(ret_inf == Z_STREAM_END)
		ret = FIX_LOAD_BUFFER_SIZE - strm -> avail_out;
	else
		ret = -2;
	inflateReset(strm);

	free(bam_buf);
	return ret;
}
4779
/*
 * Writes `bin` to `out` as one or more gzip members with a "BC" extra
 * subfield (BGZF blocks).
 *
 * out    : output stream.
 * bin    : bytes to store.
 * binlen : number of bytes; 0 writes a single empty block (the end-of-file
 *          marker), which is compressed with a private stream.
 * strm   : an initialised raw-deflate stream used for non-empty input; it is
 *          reset after every block.
 *
 * The whole remaining input is first offered to deflate() in one go. If the
 * compressed form does not fit into the 70000-byte block buffer, the input is
 * split into sections of at most 60000 bytes, each written as its own block.
 * Every block carries the CRC32 and the length (ISIZE) of the section it
 * actually contains.
 *
 * Returns 0 on success and 1 if a block could not be produced or written
 * (callers treat non-zero as "cannot write to the temporary file").
 *
 * NOTE(review): the 16-bit and 32-bit header fields are written with
 * fwrite() straight from int variables, which yields the little-endian
 * layout only on little-endian hosts.
 */
int fix_write_block(FILE * out, char * bin, int binlen, z_stream * strm){
	enum { FIX_WRITE_BUFFER_SIZE = 70000, FIX_SAFE_SECTION_LEN = 60000 };
	int is_end_mode = binlen == 0, written=0;
	int split_sections = 0;	// set once a section did not fit into one block

	while(1){
		if(binlen - written<1 && !is_end_mode) return 0;

		char * bam_buf = malloc(FIX_WRITE_BUFFER_SIZE);
		if(bam_buf == NULL) return 1;
		int x1, bam_len = 0, this_sec_len = 0, old_start = written;

		if(binlen - written > 0){
			int try_len = binlen - written;
			if(split_sections && try_len > FIX_SAFE_SECTION_LEN) try_len = FIX_SAFE_SECTION_LEN;

			while(1){
				strm -> avail_in = try_len;
				strm -> next_in = (unsigned char*)bin + written;
				strm -> avail_out = FIX_WRITE_BUFFER_SIZE;
				strm -> next_out = (unsigned char*)bam_buf;
				int zret = deflate(strm , Z_FINISH);
				bam_len = FIX_WRITE_BUFFER_SIZE - strm -> avail_out;
				this_sec_len = try_len - strm -> avail_in;
				deflateReset(strm);

				// Only a finished stream is a valid block payload.
				if(zret == Z_STREAM_END) break;

				if(try_len <= FIX_SAFE_SECTION_LEN){
					// Even a small section could not be compressed.
					free(bam_buf);
					return 1;
				}
				// Output buffer exhausted: retry with a section that is
				// guaranteed to fit, and keep splitting from now on.
				split_sections = 1;
				try_len = FIX_SAFE_SECTION_LEN;
			}
			written += this_sec_len;
		}else{
			// Empty block: compress nothing with a stream of its own.
			z_stream nstrm;
			nstrm.zalloc = Z_NULL;
			nstrm.zfree = Z_NULL;
			nstrm.opaque = Z_NULL;
			nstrm.avail_in = 0;
			nstrm.next_in = Z_NULL;

			deflateInit2(&nstrm, SAMBAM_COMPRESS_LEVEL_NORMAL, Z_DEFLATED,
				PAIRER_GZIP_WINDOW_BITS, PAIRER_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);

			nstrm.avail_in = 0;
			nstrm.next_in = (unsigned char*)bin;
			nstrm.avail_out = FIX_WRITE_BUFFER_SIZE;
			nstrm.next_out = (unsigned char*)bam_buf;
			deflate(&nstrm, Z_FINISH);
			bam_len = FIX_WRITE_BUFFER_SIZE - nstrm.avail_out;
			deflateEnd(&nstrm);
		}

		unsigned int crc0 = crc32(0, NULL, 0);
		unsigned int crc = crc32(crc0, (unsigned char *) bin + old_start, this_sec_len);

		// gzip header: ID1, ID2, CM, FLG(FEXTRA), MTIME(4 bytes)
		fputc(31, out);
		fputc(139, out);
		fputc(8, out);
		fputc(4, out);
		fputc(0, out);
		fputc(0, out);
		fputc(0, out);
		fputc(0, out);

		fputc(0, out);//XFL
		fputc(0xff, out);//OS

		x1 = 6;		// XLEN: one 6-byte subfield
		fwrite( &x1, 2, 1 , out );
		fputc( 66, out );	// 'B'
		fputc( 67, out );	// 'C'
		x1 = 2;		// SLEN
		fwrite( &x1, 2, 1 , out );
		x1 = bam_len + 19 + 6;	// total block size minus one
		fwrite( &x1, 2, 1 , out );
		int write_len = fwrite( bam_buf , 1,bam_len, out );

		fwrite( &crc, 4, 1, out );
		// ISIZE is the uncompressed size of THIS block, not of the whole input.
		fwrite( &this_sec_len, 4, 1, out );

		free(bam_buf);

		if(write_len < bam_len)return 1;
		if(binlen<1) return 0;
	}
	return 0;
}
4858
// Helper macros for SAM_pairer_fix_format(). They expand in place and rely on
// the locals of that function: in_bin, in_bin_ptr, in_bin_size, in_strm,
// old_fp, out_bin, out_bin_ptr, out_strm, new_fp, disk_is_full, nch, pairer.

// FIX_GET_NEXT_NCH: stores the next decompressed input byte in nch as a value
// in 0..255. When the current block is used up, further blocks are loaded with
// fix_load_next_block() until one yields data. If loading fails, in_bin_size
// becomes -1 and nch is set to -1 (now and on every later use); an error is
// printed only when the load result was below -1.
#define FIX_GET_NEXT_NCH { while(in_bin_ptr == in_bin_size){ \
	in_bin_ptr = 0; in_bin_size = 0;\
	int newsize = fix_load_next_block(old_fp, in_bin, &in_strm);\
	if(newsize < 0){ in_bin_size = -1; if(newsize<-1)SUBREADprintf("ERROR: failed to decompress the BAM file %s\n", pairer -> in_file_name) ;break;}else{in_bin_size = newsize;}\
} if(in_bin_size>0){nch = in_bin[in_bin_ptr++]; if(nch < 0)nch += 256; } else nch = -1; }

// FIX_FLASH_OUT: writes the pending output bytes (if any) with
// fix_write_block(), ORs its result into disk_is_full and empties the buffer.
#define FIX_FLASH_OUT { if(out_bin_ptr > 0)disk_is_full |= fix_write_block(new_fp, out_bin, out_bin_ptr, &out_strm); out_bin_ptr = 0; }

// FIX_APPEND_OUT(p, c): appends c bytes from p to the output buffer, flushing
// first when more than 60002 bytes are already pending.
// FIX_APPEND_READ(p, c): same append, but without the flush check.
#define FIX_APPEND_OUT(p, c) { if(out_bin_ptr > 60002){FIX_FLASH_OUT} ; memcpy(out_bin + out_bin_ptr, p, c); out_bin_ptr +=c ; }
#define FIX_APPEND_READ(p, c){ memcpy(out_bin + out_bin_ptr, p, c); out_bin_ptr +=c ; }
4869
/*
 * Re-blocks the input BAM file into a temporary file "<tmp_file_prefix>.fixbam".
 *
 * The input is rewound and read byte by byte through FIX_GET_NEXT_NCH. The
 * magic bytes, the header text and the reference (chromosome) table are copied
 * as they are. Alignment records are then copied one at a time, and the output
 * buffer is flushed after a record once it holds more than 60000 bytes, so a
 * flush always happens between two records.
 *
 * In pairer->tiny_mode each record is reduced while it is copied: the sequence
 * length is rewritten to min(1, seq_len), the sequence/quality bytes are
 * replaced by at most two 0xff bytes, and only the HI, NH, RG and NM tags
 * (with one of the types c, C, s, S, i, I, Z) are kept.
 *
 * If a record with seq_len >= pairer->long_read_minimum_length is met, the
 * conversion stops: the temporary file is deleted, pairer->long_cigar_mode and
 * pairer->tiny_mode are set to 1, and pairer->input_fp is left untouched.
 * Otherwise the old input is closed and pairer->input_fp is reopened on the
 * temporary file.
 *
 * Returns -1 if the input ends (or cannot be decompressed) in the middle of
 * the header or of a record; otherwise the OR of all fix_write_block()
 * results (non-zero means that writing failed).
 *
 * NOTE(review): every "return -1" below leaves new_fp open, in_bin/out_bin
 * allocated and both zlib streams initialised.
 * NOTE(review): the result of f_subr_open() and of both malloc() calls is not
 * checked, and FIX_APPEND_READ does not check the 20MB capacity of out_bin.
 */
int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
	FILE * old_fp = pairer -> input_fp;
	fseeko(old_fp, 0, SEEK_SET);
	char tmpfname [MAX_FILE_NAME_LENGTH+14], readname[256];

	sprintf(tmpfname, "%s.fixbam", pairer -> tmp_file_prefix);

	FILE * new_fp = f_subr_open(tmpfname, "wb");
	char * in_bin = malloc(1024*70);
	char * out_bin = malloc(20*1024*1024);

	z_stream in_strm;
	z_stream out_strm;
	in_strm.zalloc = Z_NULL;
	in_strm.zfree = Z_NULL;
	in_strm.opaque = Z_NULL;
	in_strm.avail_in = 0;
	in_strm.next_in = Z_NULL;

	inflateInit2(&in_strm, PAIRER_GZIP_WINDOW_BITS);

	out_strm.zalloc = Z_NULL;
	out_strm.zfree = Z_NULL;
	out_strm.opaque = Z_NULL;
	out_strm.avail_in = 0;
	out_strm.next_in = Z_NULL;

	// The temporary file is written without compression.
	deflateInit2(&out_strm, Z_NO_COMPRESSION, Z_DEFLATED,
		PAIRER_GZIP_WINDOW_BITS, PAIRER_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);

	int disk_is_full = 0;
	int in_bin_ptr = 0;
	int out_bin_ptr = 0;
	int in_bin_size = 0;
	int content_count = 0;
	int content_size = 0;
	int x1, nch = 0, is_longcigar = 0;

	// ====== The four magic bytes
	for(x1 = 0; x1 < 4; x1++){
		FIX_GET_NEXT_NCH; // BAM1
		if(nch < 0) return -1;
		FIX_APPEND_OUT(&nch, 1);
	}


	// ====== The header texts
	// 32-bit little-endian text length, followed by the text itself.
	content_size = 0;
	for(x1 = 0; x1 < 4; x1++){
		FIX_GET_NEXT_NCH;
		if(nch < 0) return -1;
		// SUBREADprintf("FIX: TLEN: %d\n", nch);
		content_size += (nch << (8 * x1));
	}
	FIX_APPEND_OUT(&content_size, 4);
	//SUBREADprintf("FIX: TXTLEN=%d\n", content_size);
	for(content_count = 0; content_count < content_size; content_count++){
		FIX_GET_NEXT_NCH;
		if(nch < 0) return -1;
		FIX_APPEND_OUT(&nch, 1);
		// fputc(nch, stderr);
	}
	FIX_FLASH_OUT;

	// ====== The chromosome table
	// 32-bit entry count; each entry is a 32-bit name length, the name, and
	// four more bytes (copied together with the name by the "+ 4" below).
	content_size = 0;
	for(x1 = 0; x1 < 4; x1++){
		FIX_GET_NEXT_NCH;
		if(nch < 0) return -1;
		content_size += (nch << (8 * x1));
	}
	FIX_APPEND_OUT(&content_size, 4);
	//SUBREADprintf("LONGFIX: CHROLEN=%d\n", content_size);
	for(content_count = 0; content_count < content_size; content_count++){
		int namelen = 0;
		for(x1 = 0; x1 < 4; x1++){
			FIX_GET_NEXT_NCH;
			if(nch < 0) return -1;
			namelen+= (nch << (8 * x1));
		}
		FIX_APPEND_READ(&namelen, 4);
		for(x1 = 0; x1 < namelen + 4; x1++){ // inc. length
			FIX_GET_NEXT_NCH;
			if(nch < 0) return -1;
			FIX_APPEND_READ(&nch, 1);
		}

		// Flush only between entries, never inside one.
		if(out_bin_ptr > 60003){
			FIX_FLASH_OUT;
		}
	}
	FIX_FLASH_OUT;

	// ===== The reads
	int seq_len = 0, name_len = 0, cigar_opts = 0;
	srInt_64 reads =0;
	pairer -> is_bad_format = 0;

	while(! is_longcigar){
		int block_size = 0, new_block_size;
		// Where this record's length field lands in out_bin; patched later in tiny mode.
		char * block_size_ptr = out_bin + out_bin_ptr;
		char * sqlen_ptr = NULL;
		seq_len = 0, name_len = 0, cigar_opts = 0;

		// block_length
		// Running out of input at the first byte of a record is the normal end.
		FIX_GET_NEXT_NCH;
		if(nch<0) break;
		block_size = nch;
		for(x1 = 1; x1 < 4; x1++){
			FIX_GET_NEXT_NCH;
			if(nch < 0) return -1;
			block_size += (nch << (8 * x1));
		}

		FIX_APPEND_READ(&block_size, 4);

		if(pairer -> tiny_mode){
			// block_remainder
			// x1 is the byte offset inside the record: 8 = read name length,
			// 12-13 = number of CIGAR operations, 16-19 = sequence length.
			int extag_new_len = 0;
			for(x1 = 0; x1 < block_size; x1++){
				FIX_GET_NEXT_NCH;
				if(nch < 0) return -1;
				if(x1 == 8) name_len = nch;
				else if(x1 >= 16 && x1 < 20){
					seq_len += ( nch << (8 * (x1 - 16)));
					if(x1 == 16) sqlen_ptr = out_bin + out_bin_ptr;
				}else if(x1 == 12 || x1 == 13){
					cigar_opts += ( nch << (8 * (x1 - 12)));
				}else if(seq_len > 1){
					// The first two bytes of the sequence/quality area become
					// 0xff; the rest of that area is not copied.
					if(x1 == 32 + name_len + 4 * cigar_opts || x1 == 32 + name_len + 4 * cigar_opts + 1){
						nch = 0xff;
					}else if(x1 > 32 + name_len + 4 * cigar_opts + 1 && x1 < 32 + name_len + 4 * cigar_opts + seq_len + (seq_len+1)/2){
						continue;
					}
				}

				//#warning "+===================== REMOVE -59999 IN NEXT LINE ================"
				//if(x1==32)SUBREADprintf("SEQ_LEN=%d, REC_LEN=%d\n", seq_len, block_size);
				// A long read: remember its name and abandon the conversion.
				// NOTE(review): nch is not checked for -1 while the name is read.
				if( x1 == 32 && seq_len >= pairer -> long_read_minimum_length){
					is_longcigar = 1;
					int x2;
					for(x2 = 0; x2 < name_len; x2++){
						FIX_GET_NEXT_NCH;
						readname[x2] = nch;
					}
					break;
				}

				// #warning "================ THIS BLOCK WAS DISABLED ON 03OCT2019; MAKE SURE IT WORKS ON LONG READS/LONG READ RECORDS =============="
				if(0 && x1 == 32 && block_size > 60000 ){
					print_in_box(80,0,0,"");
					print_in_box(80,0,0,"   ERROR: Alignment record is too long.");
					print_in_box(80,0,0,"          Please use the long read mode.");
					return -1;
				}

				// At the first byte after sequence and qualities the optional
				// tags start. nch already holds the first character of the
				// first tag name; for later tags it is read inside the loop.
				char etag_name0 = -1, etag_name1, etag_type;
				if(x1 == 32 + name_len + 4 * cigar_opts + seq_len + (seq_len+1)/2){
					while(x1 < block_size){
						int this_tag_output = 0;
						if(etag_name0 > 0){
							FIX_GET_NEXT_NCH;
							if(nch < 0) return -1;
						}
						etag_name0 = nch;
						FIX_GET_NEXT_NCH;
						if(nch < 0) return -1;
						etag_name1 = nch;
						FIX_GET_NEXT_NCH;
						if(nch < 0) return -1;
						etag_type = nch;
						x1 += 3;

						//SUBREADprintf("ETAG_NAME: %c%c (%c), x1 = %d < %d\n", etag_name0,etag_name1,etag_type, x1, block_size);

						// Only these four tags are copied to the output.
						if((( etag_name0 == 'H' && etag_name1 == 'I' ) ||
						    ( etag_name0 == 'N' && etag_name1 == 'H' ) ||
						    ( etag_name0 == 'R' && etag_name1 == 'G' ) ||
						    ( etag_name0 == 'N' && etag_name1 == 'M' )
						   ) && ( etag_type == 'c' || etag_type=='Z' || etag_type == 'C'||etag_type == 's'||etag_type == 'S'||etag_type == 'i'||etag_type == 'I')
						  ){
							FIX_APPEND_READ(&etag_name0,1);
							FIX_APPEND_READ(&etag_name1,1);
							FIX_APPEND_READ(&etag_type,1);
							this_tag_output = 1;
						//	SUBREADprintf("ADDED INTO BAM\n");
						}
						if(etag_type == 'Z'||etag_type =='H'){
							// Zero-terminated value: consume up to and including the 0 byte.
							if(this_tag_output) extag_new_len +=3;
							while(1){
								FIX_GET_NEXT_NCH;
								if(nch < 0) return -1;
								if(this_tag_output){
									assert(x1 < 20000);
									FIX_APPEND_READ(&nch, 1);
									extag_new_len++;
								}
								x1++;
								if(nch == 0)break;
							}
						}else if(etag_type == 'A'){
							// Single character value; never kept.
							FIX_GET_NEXT_NCH;
							if(nch < 0) return -1;
							x1++;
						}else if(etag_type =='B'){
							// Array value: element type, 32-bit count, then the elements; never kept.
							FIX_GET_NEXT_NCH;
							if(nch < 0) return -1;
							char array_type = nch;
							int x2, adlen = 1, aditems = 0;
							if(array_type == 's'||array_type == 'S')adlen = 2;
							if(array_type == 'i'||array_type == 'I'||array_type == 'f')adlen = 4;
							for(x2=0;x2<4; x2++) {
								FIX_GET_NEXT_NCH;
								if(nch < 0) return -1;
								aditems += nch << (8*x2);
							}
							x1 += 5 + aditems * adlen;
							for(x2 = 0; x2 < aditems * adlen; x2++){
								FIX_GET_NEXT_NCH;
								if(nch < 0) return -1;
							}
						}else{
							// Fixed-width numeric value of 1, 2 or 4 bytes.
							int dlen = 1;
							if(etag_type == 's'||etag_type == 'S') dlen = 2;
							if(etag_type == 'i'||etag_type == 'I' || etag_type == 'f') dlen = 4;
							if(this_tag_output) extag_new_len += dlen + 3;
							x1 += dlen;
							while(dlen > 0){
								FIX_GET_NEXT_NCH;
								if(nch < 0) return -1;
								if(this_tag_output)
									FIX_APPEND_READ(&nch, 1);
								dlen--;
							}
						}
					}
					break;
				}
				FIX_APPEND_READ(&nch, 1);
				//SUBREADprintf("WR[%d]: %d = %c, SL=%d, RNL=%d, COP=%d\n", out_bin_ptr, nch, nch, seq_len, name_len, cigar_opts);
			}

			if(!is_longcigar){
				// Patch the stored sequence length and the record length so
				// that they describe the reduced record.
				// NOTE(review): sqlen_ptr is still NULL here when block_size
				// was smaller than 17.
				seq_len = min(1, seq_len);
				sqlen_ptr[0]=seq_len; sqlen_ptr[1]=0, sqlen_ptr[2]=0; sqlen_ptr[3]=0;
				new_block_size = 32 + name_len + 4 * cigar_opts + seq_len + (seq_len+1)/2 + extag_new_len;
				//SUBREADprintf("ETAG_NLEN=%d, ETAGS=%d\n", new_block_size, extag_new_len);
				memcpy(block_size_ptr, &new_block_size, 4);
			}
		}else{
			// Normal mode: the record is copied byte for byte; the fields are
			// decoded only to detect long reads.
			for(x1 = 0; x1 < block_size; x1++){
				FIX_GET_NEXT_NCH;
				if(nch < 0) return -1;

				if(x1 == 8) name_len = nch;
				else if(x1 >= 16 && x1 < 20){
					seq_len += ( nch << (8 * (x1 - 16)));
					if(x1 == 16) sqlen_ptr = out_bin + out_bin_ptr;
				}else if(x1 == 12 || x1 == 13){
					cigar_opts += ( nch << (8 * (x1 - 12)));
				}

				if(x1 == 32 && seq_len >= pairer -> long_read_minimum_length){
					is_longcigar = 1;
					int x2;
					for(x2 = 0; x2 < name_len; x2++){
						FIX_GET_NEXT_NCH;
						readname[x2] = nch;
					}
					break;
				}

				FIX_APPEND_READ(&nch, 1);
			}
		}

		reads ++;
		if(out_bin_ptr > 60000){
		//	SUBREADprintf("WRIR3: TINY=%d\n", pairer -> tiny_mode);
			FIX_FLASH_OUT;
		}
	}
	FIX_FLASH_OUT;
	//SUBREADprintf("FIX READS=%llu\n", reads);
	// A call with length 0 writes the closing empty block.
	disk_is_full |= fix_write_block(new_fp, out_bin, 0, &out_strm);
	deflateEnd(&out_strm);
	inflateEnd(&in_strm);

	fclose(new_fp);

	free(in_bin);
	free(out_bin);

	if(is_longcigar){
		// Long reads found: discard the partial output and switch modes.
		unlink(tmpfname);
		pairer -> long_cigar_mode = 1;
		pairer -> tiny_mode = 1;
		// The notice below is disabled by the leading "0 &&".
		if(0 && ! pairer -> is_single_end_mode){
			print_in_box(80,0,0,"   Switch to long-read mode; reads, not read-pairs, will be counted.");
			print_in_box(80,0,0,"   Read name: %s", readname);
			print_in_box(80,0,0,"   It had %d cigar opts and %d bases, more than %d.", cigar_opts, seq_len, pairer -> long_read_minimum_length);
		}
	}else{
		// From now on the pairer reads the rewritten file.
		fclose(old_fp);
		pairer -> input_fp = f_subr_open(tmpfname, "rb");
	}

	if(disk_is_full)SUBREADprintf("ERROR: cannot write into the temporary file. Please check the empty space in the output directory.\n");
	return disk_is_full;
}
5179
5180
5181
5182 unsigned int nosort_tick_time = 100;
5183 #define NOSORT_SBAM_BUFF_SIZE 5000000
5184 #define NOSORT_BIN_BUFF_SIZE (2*5010000)
5185
5186
SAM_nosort_thread_run(void * params)5187 void * SAM_nosort_thread_run( void * params ){
5188 void ** param_ptr = (void **) params;
5189 SAM_pairer_context_t * pairer = param_ptr[0];
5190 int thread_no = (int)(param_ptr[1]-NULL);
5191 free(params);
5192
5193 SAM_pairer_thread_t * thread_context = pairer -> threads + thread_no;
5194
5195 char * read_ptr_1 = (char *)thread_context -> input_buff_BIN;
5196 char * read_ptr_2 = (char *)thread_context -> input_buff_BIN + NOSORT_BIN_BUFF_SIZE / 2;
5197
5198 while(1){
5199 int has_found = 0, to_quit = 0;
5200 subread_lock_occupy(&thread_context -> SBAM_lock);
5201
5202 // SUBREADprintf("CONSUME:RINS=%d, PTR=%d\n", thread_context -> reads_in_SBAM, thread_context -> input_buff_BIN_ptr );
5203
5204 if(thread_context -> reads_in_SBAM > 1){
5205 if(pairer -> input_is_BAM){
5206 int record_len, seq_len1 = 0, seq_len2 = 0;
5207 // SUBREADprintf("LOAD BY THREAD %d:", thread_no);
5208 memcpy(&record_len, thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr, 4);
5209 // SUBREADprintf("RLEN=%d\n", record_len);
5210 assert(record_len > 32 &&record_len < NOSORT_SBAM_BUFF_SIZE);
5211 memcpy(read_ptr_1 , thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr, 4 + record_len);
5212 memcpy(&seq_len1, thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr + 20, 4);
5213 thread_context -> input_buff_SBAM_ptr += record_len + 4;
5214
5215 memcpy(&record_len, thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr, 4);
5216 assert(record_len > 32 &&record_len < NOSORT_SBAM_BUFF_SIZE);
5217 memcpy(read_ptr_2 , thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr, 4 + record_len);
5218 memcpy(&seq_len2, thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr + 20, 4);
5219 thread_context -> input_buff_SBAM_ptr += record_len + 4;
5220 has_found = 1;
5221 thread_context -> reads_in_SBAM -= 2;
5222
5223 if(seq_len1 >= pairer -> long_read_minimum_length || seq_len2 >= pairer -> long_read_minimum_length)
5224 pairer -> long_cigar_mode = 1;
5225
5226 }else{
5227 thread_context -> input_buff_BIN_ptr = 0;
5228 int rret = reduce_SAM_to_BAM(pairer, thread_context , 0);
5229 thread_context -> reads_in_SBAM -- ;
5230 if(rret > 0){
5231 thread_context -> input_buff_BIN_ptr = NOSORT_BIN_BUFF_SIZE/2;
5232 rret = reduce_SAM_to_BAM(pairer, thread_context, 0);
5233 thread_context -> reads_in_SBAM -- ;
5234 if(rret > 0){
5235 has_found = 1;
5236 }
5237 }
5238 }
5239 }
5240 if(pairer -> is_finished) to_quit = 1;
5241 subread_lock_release(&thread_context -> SBAM_lock);
5242
5243 if(has_found)
5244 pairer -> output_function(pairer, thread_no, (char*) read_ptr_1,(char*) read_ptr_2);
5245 else{
5246 if(to_quit) break;
5247 usleep(nosort_tick_time);
5248 }
5249 }
5250
5251 return NULL;
5252 }
5253
/*
 * Reads the next compressed chunk of the BAM input and appends its
 * decompressed content to the shared buffer of the "no sort" reader.
 *
 * The buffers and cursors are taken from the pairer: appendix2 is the
 * compressed-chunk buffer, appendix3 the decompressed buffer, appendix4 and
 * appendix5 point to its "used" and "read position" counters. Bytes that have
 * not been consumed yet are moved to the front of the decompressed buffer
 * before the new data is appended, and the read position is reset to 0.
 *
 * Returns the number of newly decompressed bytes, or -1 if no chunk could be
 * read or decompressed. When PBam_get_next_zchunk() returns -2, an error is
 * printed and pairer->is_internal_error is set.
 */
int SAM_nosort_decompress_next_block(SAM_pairer_context_t * pairer){
	char * zipped = pairer -> appendix2;
	char * plain = pairer -> appendix3;
	int * plain_used = pairer -> appendix4;
	int * plain_ptr = pairer -> appendix5;
	unsigned int decompressed_len;

	int zipped_len = PBam_get_next_zchunk(pairer -> input_fp, zipped, NOSORT_SBAM_BUFF_SIZE, &decompressed_len);
	if(zipped_len < 0){
		if(zipped_len == -2){
			SUBREADputs("ERROR: the BAM format is broken.");
			pairer->is_internal_error = 1;
		}
		return -1;
	}

	// Keep the unread tail by sliding it to the start of the buffer.
	if(*plain_ptr < *plain_used){
		int leftover = *plain_used - *plain_ptr;
		memmove(plain, plain + *plain_ptr, leftover);
		*plain_used = leftover;
	}else{
		*plain_used = 0;
	}
	*plain_ptr = 0;

	int binlen = SamBam_unzip(plain + *plain_used, 65536, zipped , zipped_len, 0);
	if(binlen < 0) return -1;

	*plain_used += binlen;
	return binlen;
}
5289
// Helper macros for SAM_nosort_run_once(); they use its locals BIN_buff,
// BIN_buff_used, BIN_buff_ptr, nch, line_ptr, NOSORT_SAM_eof and pairer.
//
// NOSORT_BAM_next_nch: stores the next decompressed byte in nch, loading more
// blocks with SAM_nosort_decompress_next_block() while the buffer is used up.
// If loading fails, BIN_buff_used is set to -1 and nch to -1.
//
// NOSORT_BAM_next_u32(v): reads four bytes, least significant first, into v.
// v is set to -1 when the first byte cannot be read; the three following
// bytes are not checked.
#define NOSORT_BAM_next_nch { while( BIN_buff_used == BIN_buff_ptr ){int rlen = SAM_nosort_decompress_next_block(pairer); if(rlen < 0) { BIN_buff_used = -1 ; break;}} if(BIN_buff_used < 0) nch = -1; else nch = BIN_buff[BIN_buff_ptr++]; }
#define NOSORT_BAM_next_u32(v){ NOSORT_BAM_next_nch; if(nch < 0)v=-1;else{; v= nch; NOSORT_BAM_next_nch; v+=nch*256; NOSORT_BAM_next_nch; v+=nch*65536; NOSORT_BAM_next_nch; v+=nch*16777216;} }

// NOSORT_SAM_next_line: reads the next line of the SAM input into line_ptr
// (at most NOSORT_SBAM_BUFF_SIZE-1 characters); NOSORT_SAM_eof receives the
// fgets() result, which is NULL when no line could be read.
#define NOSORT_SAM_next_line {NOSORT_SAM_eof = fgets(line_ptr, NOSORT_SBAM_BUFF_SIZE, pairer -> input_fp);}
5294
5295 #if FEATURECOUNTS_BUFFER_SIZE < ( 12*1024*1024 )
5296 #error "FEATURECOUNTS_BUFFER_SIZE MUST BE GREATER THAN 12MB!."
5297 #endif
5298
5299 #define NOSORT_REFILL_LOWBAR ( 3 * 1024 * 1024 )
5300 #define NOSORT_REFILL_HIGHBAR ( 6 * 1024 * 1024 )
5301
SAM_nosort_run_once(SAM_pairer_context_t * pairer)5302 void SAM_nosort_run_once(SAM_pairer_context_t * pairer){
5303 int x1;
5304 for(x1 = 0; x1 < pairer -> total_threads ; x1++){
5305 // this 16-byte memory block is freed in the thread worker.
5306 void ** init_params = malloc(sizeof(void *) * 2);
5307
5308 init_params[0] = pairer;
5309 init_params[1] = (void *)(NULL+x1);
5310 pthread_create(&(pairer -> threads[x1].thread_stab), NULL, SAM_nosort_thread_run, init_params);
5311 }
5312
5313 char * SBAM_buff = malloc(NOSORT_SBAM_BUFF_SIZE);
5314 int nch;
5315 unsigned char * BIN_buff = malloc(NOSORT_BIN_BUFF_SIZE);
5316 char *NOSORT_SAM_eof=NULL;
5317 int BIN_buff_used = 0;
5318 int BIN_buff_ptr = 0;
5319
5320 pairer -> appendix2 = SBAM_buff;
5321 pairer -> appendix3 = BIN_buff;
5322 pairer -> appendix4 = &BIN_buff_used;
5323 pairer -> appendix5 = &BIN_buff_ptr;
5324
5325 if(pairer -> input_is_BAM){
5326 int x1;
5327 unsigned int bam_signature;
5328 NOSORT_BAM_next_u32(bam_signature);
5329 NOSORT_BAM_next_u32(pairer -> BAM_l_text);
5330 char * header_txt = malloc(max(1000000,pairer->BAM_l_text));
5331
5332 for(x1 = 0 ; x1 < pairer -> BAM_l_text; x1++){
5333 NOSORT_BAM_next_nch;
5334 header_txt [x1] = nch;
5335 }
5336
5337 int is_OK = pairer -> output_header(pairer, 0, 1, pairer -> BAM_l_text , header_txt , pairer -> BAM_l_text );
5338 NOSORT_BAM_next_u32(pairer -> BAM_n_ref);
5339 unsigned int ref_bin_len = 0;
5340 for(x1 = 0; x1 < pairer -> BAM_n_ref; x1++) {
5341 unsigned int l_name, l_ref, x2;
5342 NOSORT_BAM_next_u32(l_name);
5343 assert(l_name < 256);
5344 memcpy(header_txt + ref_bin_len, &l_name, 4);
5345 ref_bin_len += 4;
5346 for(x2 = 0; x2 < l_name; x2++){
5347 NOSORT_BAM_next_nch;
5348 header_txt[ref_bin_len++] = nch;
5349 }
5350 NOSORT_BAM_next_u32(l_ref);
5351 memcpy(header_txt + ref_bin_len, &l_ref, 4);
5352 ref_bin_len += 4;
5353
5354 assert(ref_bin_len < pairer -> BAM_l_text);
5355 }
5356
5357 is_OK = is_OK || pairer -> output_header(pairer, 0, 0, pairer -> BAM_n_ref , header_txt , ref_bin_len );
5358 free(header_txt);
5359
5360 if(is_OK){
5361 pairer -> is_incomplete_BAM = 1;
5362 return;
5363 }
5364
5365 while(1){
5366 if(pairer -> is_finished) break;
5367 int need_sleep = 1;
5368 for(x1 = 0; x1 < pairer -> total_threads ; x1++){
5369 if(pairer -> is_finished) break;
5370 SAM_pairer_thread_t * this_thread = pairer -> threads + x1;
5371 if(this_thread -> input_buff_SBAM_used - this_thread -> input_buff_SBAM_ptr < NOSORT_REFILL_LOWBAR && (this_thread -> input_buff_SBAM_used == 0 || this_thread -> input_buff_SBAM_ptr > 0)){
5372 subread_lock_occupy(&this_thread -> SBAM_lock);
5373 int to_be_add = NOSORT_REFILL_HIGHBAR - (this_thread -> input_buff_SBAM_used - this_thread -> input_buff_SBAM_ptr);
5374
5375 int x2, x3;
5376 if(this_thread -> input_buff_SBAM_ptr < this_thread -> input_buff_SBAM_used){
5377 for(x2 = 0; x2 < this_thread -> input_buff_SBAM_used - this_thread -> input_buff_SBAM_ptr; x2++)
5378 this_thread -> input_buff_SBAM[x2] = this_thread -> input_buff_SBAM[x2 + this_thread -> input_buff_SBAM_ptr];
5379 this_thread -> input_buff_SBAM_used -= this_thread -> input_buff_SBAM_ptr;
5380 }else this_thread -> input_buff_SBAM_used =0;
5381
5382 this_thread -> input_buff_SBAM_ptr = 0;
5383 for(x2 = 0 ; ; x2++){
5384 int record_len;
5385 NOSORT_BAM_next_u32(record_len);
5386 if(record_len < 32 || record_len > 500000){
5387 if(record_len!=-1)
5388 SUBREADprintf("Unexpected record length: %d.\n", record_len);
5389 pairer -> is_finished = 1;
5390 break;
5391 }
5392
5393 memcpy(this_thread -> input_buff_SBAM + this_thread -> input_buff_SBAM_used , &record_len, 4);
5394 this_thread -> input_buff_SBAM_used += 4;
5395 for(x3 =0; x3 < record_len; x3++){
5396 NOSORT_BAM_next_nch;
5397 this_thread -> input_buff_SBAM[this_thread -> input_buff_SBAM_used++] = nch;
5398 }
5399 this_thread -> reads_in_SBAM ++;
5400 if(x2 % 2 == 1 && to_be_add <= this_thread -> input_buff_SBAM_used + 20000 )break;
5401 }
5402 need_sleep = 0;
5403 subread_lock_release(&this_thread -> SBAM_lock);
5404 }
5405 }
5406 if(need_sleep) usleep(nosort_tick_time);
5407 }
5408 }else{ // if input is SAM
5409 char * line_ptr = SBAM_buff;
5410 char * header_start = NULL;
5411 int passed_read_SBAM_ptr = -1;
5412 unsigned int header_buffer_safe_size = 0;
5413 while(1){
5414 passed_read_SBAM_ptr = ftello(pairer -> input_fp);
5415 NOSORT_SAM_next_line;
5416 if(NOSORT_SAM_eof == NULL)break;
5417
5418 header_buffer_safe_size += strlen(line_ptr);
5419 if(NULL== header_start && line_ptr[0] == '@') header_start = line_ptr;
5420
5421 if(NULL == line_ptr){
5422 SUBREADprintf("FATAL: the header is too large to the buffer.\n");
5423 break;
5424 }else{
5425 //SUBREADprintf("LINELEN=%d, PTR=%d, FIRST=%c\n", line_len, thread_context -> input_buff_SBAM_ptr , line_ptr[0]);
5426 }
5427 if(line_ptr[0]!='@'){
5428 break;
5429 }
5430 }
5431
5432 fseeko(pairer -> input_fp, 0 , SEEK_SET);
5433 int header_bin_ptr = 0, header_contigs = 0;
5434 char * header_bin = malloc(header_buffer_safe_size);
5435
5436
5437 while(1){
5438 NOSORT_SAM_next_line;
5439 if(NOSORT_SAM_eof == NULL)break;
5440 if(line_ptr[0]!='@') break;
5441 if(memcmp(line_ptr, "@SQ\t",4)==0){
5442 unsigned int ct_len = 0, ctptr = 4, status = 0, sqname_len = 0;
5443 char * sqname = NULL;
5444 while(1){
5445 char ctnch = line_ptr[ctptr++];
5446 if( status == 0){
5447 if(ctnch=='S' && line_ptr[ctptr] == 'N' && line_ptr[ctptr+1] == ':'){
5448 ctptr += 2;
5449 status = 10;
5450 sqname = line_ptr + ctptr;
5451 }else if(ctnch=='L' && line_ptr[ctptr] == 'N' && line_ptr[ctptr+1] == ':'){
5452 ctptr += 2;
5453 status = 20;
5454 }else status = 30;
5455 }else if(status == 10 || status == 20 || status == 30){
5456 if(ctnch == '\t' || ctnch == '\n'){
5457 status = 0;
5458 if(ctnch == '\n') break;
5459 //break;
5460 }
5461 if(status == 10) sqname_len ++;
5462 else if(status == 20) ct_len = ct_len * 10 + ctnch - '0';
5463 }
5464 }
5465
5466
5467 sqname_len += 1;
5468 memcpy(header_bin + header_bin_ptr, &sqname_len, 4);
5469 header_bin_ptr += 4;
5470 memcpy(header_bin + header_bin_ptr, sqname, sqname_len-1);
5471 *(header_bin + header_bin_ptr + sqname_len - 1) = 0;
5472 char * mem_contig_name = malloc(sqname_len);
5473 strcpy(mem_contig_name , header_bin + header_bin_ptr);
5474 // SUBREADprintf("CONTIG %d : %s (len=%d = %d)\n", header_contigs, header_bin + header_bin_ptr , sqname_len, strlen(mem_contig_name));
5475 HashTablePut(pairer -> sam_contig_number_table , mem_contig_name, NULL + 1 + header_contigs);
5476 header_bin_ptr += sqname_len;
5477
5478 memcpy(header_bin + header_bin_ptr, &ct_len, 4);
5479 header_bin_ptr += 4;
5480 header_contigs++;
5481 }
5482 }
5483
5484 pairer -> BAM_header_parsed = 1;
5485 int is_OK = pairer -> output_header(pairer, 0, 0, header_contigs , header_bin , header_bin_ptr);
5486 free(header_bin);
5487
5488 if(is_OK){
5489 pairer -> is_incomplete_BAM = 1;
5490 return;
5491 }
5492
5493
5494
5495 fseeko(pairer -> input_fp, passed_read_SBAM_ptr, SEEK_SET);
5496
5497 line_ptr = SBAM_buff;
5498
5499 while(1){
5500 if(pairer -> is_finished) break;
5501 int need_sleep = 1;
5502 for(x1 = 0; x1 < pairer -> total_threads ; x1++){
5503 if(pairer -> is_finished) break;
5504 SAM_pairer_thread_t * this_thread = pairer -> threads + x1;
5505 if(this_thread -> input_buff_SBAM_used - this_thread -> input_buff_SBAM_ptr < NOSORT_REFILL_LOWBAR && (this_thread -> input_buff_SBAM_used == 0 || this_thread -> input_buff_SBAM_ptr > 0)){
5506 subread_lock_occupy(&this_thread -> SBAM_lock);
5507 int to_be_add = NOSORT_REFILL_HIGHBAR - (this_thread -> input_buff_SBAM_used - this_thread -> input_buff_SBAM_ptr);
5508
5509 int x2;
5510 if(this_thread -> input_buff_SBAM_ptr < this_thread -> input_buff_SBAM_used){
5511 for(x2 = 0; x2 < this_thread -> input_buff_SBAM_used - this_thread -> input_buff_SBAM_ptr; x2++)
5512 this_thread -> input_buff_SBAM[x2] = this_thread -> input_buff_SBAM[x2 + this_thread -> input_buff_SBAM_ptr];
5513 this_thread -> input_buff_SBAM_used -= this_thread -> input_buff_SBAM_ptr;
5514 }else this_thread -> input_buff_SBAM_used =0;
5515
5516 this_thread -> input_buff_SBAM_ptr = 0;
5517 for(x2 = 0 ; ; x2++){
5518 int record_len;
5519 NOSORT_SAM_next_line;
5520
5521 if(NULL==NOSORT_SAM_eof || line_ptr[0]==0){
5522 pairer -> is_finished = 1;
5523 break;
5524 }
5525
5526 record_len = strlen(line_ptr);
5527 // SUBREADprintf("1CHR=%c, ECHR=%d , RL=%d, RINS=%d, USED=%d, SIZE=%d\n", line_ptr[0], line_ptr[record_len - 1], record_len, this_thread -> reads_in_SBAM, this_thread -> input_buff_SBAM_used, pairer -> input_buff_SBAM_size);
5528 memcpy(this_thread -> input_buff_SBAM + this_thread -> input_buff_SBAM_used , line_ptr, record_len);
5529 this_thread -> input_buff_SBAM_used += record_len;
5530 this_thread -> reads_in_SBAM ++;
5531 if(x2 % 2 == 1 && to_be_add <= this_thread -> input_buff_SBAM_used + 20000 )break;
5532 }
5533 need_sleep = 0;
5534 subread_lock_release(&this_thread -> SBAM_lock);
5535 }
5536 }
5537 if(need_sleep) usleep(nosort_tick_time);
5538 }
5539 }
5540
5541 free(SBAM_buff);
5542 free(BIN_buff);
5543
5544
5545 for(x1 = 0; x1 < pairer -> total_threads ; x1++){
5546 pthread_join(pairer -> threads[x1].thread_stab, NULL);
5547 }
5548 }
5549
/* Append the byte held in `nch` to `bin_buffer` at index `binptr`.
 * When fewer than 10 bytes of headroom remain, the buffer is first grown to
 * 1.4 times `bin_buff_capacity`.
 * The macro expands in place, so `binptr`, `bin_buff_capacity`, `bin_buffer`
 * and `nch` must all be in scope at the point of use.
 * NOTE(review): the realloc() result is assigned straight back to bin_buffer
 * and is not checked for NULL. */
#define BINADD_NCHAR { if(binptr >= bin_buff_capacity - 10){\
	bin_buff_capacity = bin_buff_capacity * 14 / 10;\
	bin_buffer = realloc(bin_buffer, bin_buff_capacity);\
	} bin_buffer[binptr++] = nch;}
5554
5555
5556
5557 // only one thread; very large buffer size.
SAM_pairer_long_cigar_run(SAM_pairer_context_t * pairer)5558 int SAM_pairer_long_cigar_run(SAM_pairer_context_t * pairer){
5559 char *bin_buffer, *bam_buffer;
5560 FILE * old_fp = pairer -> input_fp;
5561 int bin_buff_capacity = 1000000, block_size = 0;
5562 char * in_bin = malloc(140000);
5563 bin_buffer = malloc(bin_buff_capacity);
5564 bam_buffer = malloc(70000);
5565
5566 z_stream in_strm;
5567 in_strm.zalloc = Z_NULL;
5568 in_strm.zfree = Z_NULL;
5569 in_strm.opaque = Z_NULL;
5570 in_strm.avail_in = 0;
5571 in_strm.next_in = Z_NULL;
5572
5573 inflateInit2(&in_strm, PAIRER_GZIP_WINDOW_BITS);
5574
5575 fseeko(old_fp, 0, SEEK_SET);
5576
5577 if(1){
5578 int disk_is_full = 0;
5579 int in_bin_ptr = 0;
5580 int out_bin_ptr = 0;
5581 int in_bin_size = 0;
5582 int content_count = 0;
5583 int content_size = 0;
5584 int is_finished = 0;
5585 int x1, nch = 0, binptr = 0;
5586
5587 disk_is_full = disk_is_full?out_bin_ptr:out_bin_ptr; // stupid code to avoid warning messages from GCC 7
5588
5589 for(x1 = 0; x1 < 4; x1++){
5590 FIX_GET_NEXT_NCH; // BAM1
5591 if(nch < 0) return -1;
5592 }
5593
5594 // ====== The header texts
5595 content_size = 0;
5596 binptr = 0;
5597 for(x1 = 0; x1 < 4; x1++){
5598 FIX_GET_NEXT_NCH;
5599 if(nch < 0) return -1;
5600 content_size += (nch << (8 * x1));
5601 }
5602 for(content_count = 0; content_count < content_size; content_count++){
5603 FIX_GET_NEXT_NCH;
5604 BINADD_NCHAR;
5605 if(nch < 0) return -1;
5606 }
5607
5608 pairer -> output_header (pairer , 0, 1, binptr, bin_buffer, binptr);
5609
5610 // ====== The chromosome table
5611 binptr = 0;
5612 content_size = 0;
5613 for(x1 = 0; x1 < 4; x1++){
5614 FIX_GET_NEXT_NCH;
5615 if(nch < 0) return -1;
5616 content_size += (nch << (8 * x1));
5617 }
5618
5619 for(content_count = 0; content_count < content_size; content_count++){
5620 block_size = 0;
5621 for(x1 = 0; x1 < 4; x1++){
5622 FIX_GET_NEXT_NCH;
5623 if(nch < 0) return -1;
5624 BINADD_NCHAR;
5625 block_size += (nch << (8 * x1));
5626 }
5627
5628 for(x1 = 0; x1 < block_size + 4; x1++){
5629 FIX_GET_NEXT_NCH;
5630 if(nch < 0) return -1;
5631 BINADD_NCHAR;
5632 }
5633 }
5634 pairer -> output_header (pairer , 0, 0, content_size, bin_buffer, binptr);
5635
5636 // go through the reads
5637 int reads = 0;
5638 while(1){
5639 binptr = 0;
5640 block_size = 0;
5641 for(x1 = 0; x1 < 4; x1++){
5642 FIX_GET_NEXT_NCH;
5643 if(x1 == 0 && nch < 0){
5644 is_finished=1;
5645 break;
5646 }
5647 if(nch < 0) return -1;
5648
5649 BINADD_NCHAR;
5650 block_size += (nch << (8 * x1));
5651 }
5652 if(is_finished)break;
5653
5654 for(x1 = 0; x1 < block_size; x1 ++){
5655 FIX_GET_NEXT_NCH;
5656 if(nch < 0) return -1;
5657 BINADD_NCHAR;
5658 }
5659
5660 pairer -> output_function(pairer, 0, bin_buffer, NULL);
5661 reads++;
5662 }
5663 }
5664
5665 free(bam_buffer);
5666 free(bin_buffer);
5667 free(in_bin);
5668
5669 return 0;
5670 }
5671
// Multiply the per-thread SAM/BAM input buffer size by five, set the binary
// buffer size to the larger of 1 MiB and the new SAM/BAM size, and realloc()
// both buffers of every thread to the new sizes.
void pairer_increase_SAMBAM_buffer(SAM_pairer_context_t * pairer){
	int thread_no = 0;

	pairer -> input_buff_SBAM_size *= 5;
	pairer -> input_buff_BIN_size = max(1024*1024, pairer -> input_buff_SBAM_size );

	while(thread_no < pairer -> total_threads){
		pairer -> threads[thread_no].input_buff_SBAM =
			realloc( pairer -> threads[thread_no].input_buff_SBAM, pairer -> input_buff_SBAM_size);
		pairer -> threads[thread_no].input_buff_BIN =
			realloc( pairer -> threads[thread_no].input_buff_BIN, pairer -> input_buff_BIN_size);
		thread_no++;
	}
}
5682
// Top-level driver of the pairer.
//
// With force_do_not_sort set, the input is processed once by
// SAM_nosort_run_once().  Otherwise SAM_pairer_run_once() is tried at most
// twice: when the first attempt flags a BAM input as badly formatted (and
// neither an internal error nor an incomplete BAM was flagged), the temporary
// files are deleted, SAM_pairer_fix_format() is applied, the pairer is reset,
// the buffers are enlarged and the run is repeated -- or handed over to
// SAM_pairer_long_cigar_run() when long_cigar_mode is set.
//
// Returns non-zero when the input ended up flagged as bad, incomplete or an
// internal error occurred; -1 when the format repair itself failed.
int SAM_pairer_run( SAM_pairer_context_t * pairer){
	if(pairer -> force_do_not_sort){
		SAM_nosort_run_once(pairer);
	}else{
		int attempt;
		for(attempt = 0; attempt < 2 ; attempt ++){
			int wants_repair;

			pairer -> is_final_run = attempt;
			SAM_pairer_run_once(pairer);

			wants_repair = pairer -> is_bad_format && pairer->input_is_BAM
				&& ( ! pairer -> is_internal_error ) && ( ! pairer -> is_incomplete_BAM );
			if(!wants_repair) break;

			// a repair must never be requested by the second (final) attempt
			assert(1 != attempt);
			delete_with_prefix(pairer -> tmp_file_prefix);
			pairer -> is_internal_error |= SAM_pairer_fix_format(pairer);

			if(pairer -> is_bad_format || pairer -> is_internal_error)
				return -1;

			SAM_pairer_reset(pairer);
			if(pairer -> reset_output_function)
				pairer -> reset_output_function(pairer);
			pairer_increase_SAMBAM_buffer(pairer);

			if(pairer -> long_cigar_mode)
				return SAM_pairer_long_cigar_run(pairer);
		}
	}

	return pairer -> is_bad_format || pairer -> is_internal_error || pairer -> is_incomplete_BAM;
}
5710
// Initialise a SAM sort writer.
//
//   writer      : zeroed and then filled in by this function.
//   output_file : path of the final output; opened for writing here.
//   tmp_path    : directory for the temporary files, or NULL to put them next
//                 to output_file (or in "./" when output_file has no '/').
//
// The temporary-file prefix is "<dir>/temp-sort-<pid>-<random string>-".
// SIGTERM and SIGINT are redirected to SAM_SORT_SIGINT_hook; the previous
// handlers are saved in old_sig_TERM / old_sig_INT.
//
// Returns 0 on success and -1 when the prefix does not fit into
// writer->tmp_path, when a probe file cannot be created under that prefix, or
// when output_file cannot be opened.
//
// Note: all_chunks_header_fp is closed again right after the probe but is left
// non-NULL in the writer.
int sort_SAM_create(SAM_sort_writer * writer, char * output_file, char * tmp_path)
{
	char tmp_fname[MAX_FILE_NAME_LENGTH+40], mac_rand[13];
	int written;
	memset(writer, 0, sizeof(SAM_sort_writer));

	old_sig_TERM = signal (SIGTERM, SAM_SORT_SIGINT_hook);
	old_sig_INT = signal (SIGINT, SAM_SORT_SIGINT_hook);

	mac_or_rand_str(mac_rand);

	// Every path below is built with snprintf() so that an over-long directory
	// is reported as a failure instead of overrunning the destination.
	// sizeof(writer->tmp_path) relies on tmp_path being a char array member: it
	// is written directly after the memset() above, so it cannot be a pointer.
	if(tmp_path == NULL){
		const char * last_slash = strrchr(output_file, '/');
		if(last_slash != NULL){
			// keep everything up to and including the last '/'
			int dir_len = (int)(last_slash - output_file) + 1;
			written = snprintf(writer -> tmp_path, sizeof(writer -> tmp_path), "%.*stemp-sort-%06u-%s-", dir_len, output_file, (unsigned int)getpid(), mac_rand);
		}else
			written = snprintf(writer -> tmp_path, sizeof(writer -> tmp_path), "./temp-sort-%06u-%s-", (unsigned int)getpid(), mac_rand);
	}else
		written = snprintf(writer -> tmp_path, sizeof(writer -> tmp_path), "%s/temp-sort-%06u-%s-", tmp_path, (unsigned int)getpid(), mac_rand);

	if(written < 0 || (size_t)written >= sizeof(writer -> tmp_path)) return -1;

	//#warning " >>>>>>>>>>>>>>>> REMOVE THE NEXT LINE <<<<<<<<<<<<<<<<<<<< "
	//SUBREADprintf("TMP_SORT=%s FROM %s\n", writer -> tmp_path, output_file);

	_SAMSORT_SNP_delete_temp_prefix = writer -> tmp_path;

	// probe that files can be created under the temporary prefix
	written = snprintf(tmp_fname, sizeof(tmp_fname), "%s%s", writer -> tmp_path, "headers.txt");
	if(written < 0 || (size_t)written >= sizeof(tmp_fname)) return -1;
	writer -> all_chunks_header_fp = f_subr_open(tmp_fname,"w");
	if(!writer -> all_chunks_header_fp) return -1;
	fclose(writer -> all_chunks_header_fp);
	unlink(tmp_fname);

	writer -> out_fp = f_subr_open(output_file,"w");
	if(!writer -> out_fp) return -1;

	return 0;
}
5748
// Look for the first integer optional field "\t<tag>:i:<digits>" in a SAM line
// and re-emit it.
//
//   read_line_buf : NUL-terminated SAM text to search.
//   tag           : tag name without separators (e.g. "HI" or "NH").
//   hi_tag_out    : receives "\t<tag>:i:<value>" when the tag is present, or an
//                   empty string when it is not.  The caller provides the
//                   storage; it must hold the tag, six extra characters, the
//                   decimal value and the terminating NUL.
//
// A tag that is present but followed by no digit yields the value 0.
void find_tag_out(char * read_line_buf, char * tag, char * hi_tag_out)
{
	size_t tag_len = strlen(tag);
	const char * scan = read_line_buf;
	const char * digits = NULL;
	int hi_tag = -1;

	// Match "\t<tag>:i:" at every tab in turn.  Comparing in place avoids a
	// fixed-size pattern buffer, and the digit offset follows the real tag
	// length instead of assuming a two-character tag.
	while((scan = strchr(scan, '\t')) != NULL)
	{
		if(strncmp(scan + 1, tag, tag_len) == 0 && strncmp(scan + 1 + tag_len, ":i:", 3) == 0)
		{
			digits = scan + 1 + tag_len + 3;
			break;
		}
		scan++;
	}

	if(digits)
	{
		hi_tag = 0;
		// isdigit() requires an unsigned char value
		for(; isdigit((unsigned char)*digits); digits++)
			hi_tag = hi_tag*10 + (*digits-'0');
	}

	if(hi_tag >=0)
		sprintf(hi_tag_out,"\t%s:i:%d", tag, hi_tag);
	else
		hi_tag_out[0] = 0;
}
5777
sort_SAM_finalise(SAM_sort_writer * writer)5778 int sort_SAM_finalise(SAM_sort_writer * writer)
5779 {
5780 int x1_chunk, x1_block, is_disk_full = 0;
5781 int xk1;
5782 for(xk1=0;xk1<SAM_SORT_BLOCKS;xk1++)
5783 {
5784 if(writer -> current_block_fp_array[xk1])
5785 fclose(writer -> current_block_fp_array[xk1]);
5786 }
5787 memset(writer -> current_block_fp_array, 0, sizeof(FILE *)*SAM_SORT_BLOCKS);
5788 writer -> current_chunk_size = 0;
5789 writer -> current_chunk++;
5790
5791 for(x1_block = 0; x1_block <SAM_SORT_BLOCKS; x1_block++){
5792 HashTable * first_read_name_table;
5793 first_read_name_table = HashTableCreate(SAM_SORT_BLOCK_SIZE / 100 );
5794 HashTableSetKeyComparisonFunction(first_read_name_table , fc_strcmp_chro);
5795 HashTableSetDeallocationFunctions(first_read_name_table , free, free);
5796 HashTableSetHashFunction(first_read_name_table, HashTableStringHashFunction);
5797
5798 for(x1_chunk = 0; x1_chunk < writer -> current_chunk; x1_chunk++)
5799 {
5800 char tmpfname[MAX_FILE_NAME_LENGTH+40];
5801 sprintf(tmpfname, "%sCHK%08d-BLK%03d.bin", writer -> tmp_path, x1_chunk , x1_block);
5802
5803 FILE * bbfp = f_subr_open(tmpfname,"rb");
5804 if(!bbfp) continue;
5805
5806 while(!feof(bbfp))
5807 {
5808 char * read_name = NULL;
5809 short flags;
5810 short read_name_len;
5811 short read_len;
5812 int ret = fread(&flags, 2,1 , bbfp);
5813 if(ret<1) break;
5814 ret = fread(&read_name_len, 2,1 , bbfp);
5815 if(ret<1) break;
5816
5817 if(flags & SAM_FLAG_SECOND_READ_IN_PAIR)
5818 fseeko(bbfp, read_name_len, SEEK_CUR);
5819 else
5820 {
5821 read_name = malloc(read_name_len+1);
5822 ret = fread(read_name, 1, read_name_len, bbfp);
5823 if(ret< read_name_len) break;
5824 read_name[read_name_len] = 0;
5825 }
5826 ret =fread(&read_len,2,1,bbfp);
5827 if(ret<1) break;
5828
5829 if(flags & SAM_FLAG_SECOND_READ_IN_PAIR)
5830 fseeko(bbfp, read_len, SEEK_CUR);
5831 else
5832 {
5833 char * new_line_mem = malloc(read_len+1);
5834 ret = fread(new_line_mem, 1, read_len, bbfp);
5835 if(ret<read_len) break;
5836
5837 new_line_mem[read_len] = 0;
5838
5839 if(read_len<2)
5840 {
5841 SUBREADprintf("Cannot determain read length from the tmp file.\n");
5842 assert(0);
5843 }
5844
5845
5846 if( new_line_mem[0]==0 || new_line_mem[1]==0)
5847 {
5848 SUBREADprintf("Cannot load read part from the tmp file.\n");
5849 assert(0);
5850 }
5851
5852
5853 char * old_line_mem = HashTableGet(first_read_name_table, read_name);
5854 if(old_line_mem)
5855 old_line_mem[0]=0xff;
5856 else
5857 HashTablePut(first_read_name_table, read_name, new_line_mem);
5858 //if( first_read_name_table -> numOfElements<4)printf("RV=%s\n", read_name);
5859 }
5860 }
5861
5862 fclose(bbfp);
5863 }
5864
5865 //printf("BLK=%d; CKS=%d; READS=%llu\n", x1_block, x1_chunk, first_read_name_table -> numOfElements);
5866 srInt_64 finished_second_reads = 0;
5867
5868 for(x1_chunk = 0; x1_chunk < writer -> current_chunk; x1_chunk++)
5869 {
5870 char tmpfname[MAX_FILE_NAME_LENGTH+40];
5871 sprintf(tmpfname, "%sCHK%08d-BLK%03d.bin", writer -> tmp_path, x1_chunk , x1_block);
5872
5873 // printf("START_BLOCK: %s\n", tmpfname);
5874
5875 FILE * bbfp = f_subr_open(tmpfname,"rb");
5876 if(!bbfp) continue;
5877
5878 char * read_line_buf = malloc(3000);
5879 char * read_name_buf = malloc(MAX_READ_NAME_LEN + MAX_CHROMOSOME_NAME_LEN * 2 + 26);
5880
5881 while(!feof(bbfp))
5882 {
5883 short flags;
5884 short read_name_len;
5885 short read_len;
5886 int ret = fread(&flags, 2,1 , bbfp);
5887 if(ret<1) break;
5888
5889 ret = fread(&read_name_len, 2,1 , bbfp);
5890 if(ret < 1) break;
5891
5892 if(read_name_len>=MAX_READ_NAME_LEN + MAX_CHROMOSOME_NAME_LEN * 2 + 26)
5893 SUBREADprintf("VERY_LONG_NAME(%d)\n", read_name_len);
5894 if(flags & SAM_FLAG_SECOND_READ_IN_PAIR)
5895 {
5896 ret = fread(read_name_buf, 1, read_name_len, bbfp);
5897 if(ret < read_name_len) break;
5898
5899 read_name_buf[read_name_len] = 0;
5900 } else fseeko(bbfp, read_name_len, SEEK_CUR);
5901 ret = fread(&read_len, 2,1 , bbfp);
5902 if(ret < 1) break;
5903
5904 if(flags & SAM_FLAG_SECOND_READ_IN_PAIR)
5905 {
5906 ret = fread(read_line_buf, 1, read_len, bbfp);
5907 if(ret < 1) break;
5908 read_line_buf[read_len] = 0;
5909 }
5910 else fseeko(bbfp, read_len, SEEK_CUR);
5911
5912
5913 if(flags & SAM_FLAG_SECOND_READ_IN_PAIR)
5914 {
5915 // printf("RRNAME:%s\n", read_name_buf);
5916
5917 char * first_read_text = HashTableGet(first_read_name_table, read_name_buf);
5918 strtok(read_name_buf,"\t");
5919 if(first_read_text && first_read_text[0]!=(char)0xff)
5920 {
5921 fputs(read_name_buf, writer->out_fp);
5922 putc('\t', writer->out_fp);
5923 fputs(first_read_text, writer->out_fp);
5924
5925 fputs(read_name_buf, writer->out_fp);
5926 putc('\t', writer->out_fp);
5927 int write_len = fputs(read_line_buf, writer->out_fp);
5928 if(write_len < 0) is_disk_full = 1;
5929
5930 read_name_buf[strlen(read_name_buf)]='\t';
5931 HashTableRemove(first_read_name_table, read_name_buf);
5932 finished_second_reads ++;
5933 }
5934 else{
5935
5936 int dummy_flags = 4 | 1, mate_flags = 0;
5937 char * dummy_mate_chr = NULL;
5938 char dummy_mate_chr_buf[120];
5939 unsigned int dummy_old_read_pos = 0, tmpi=0,dummy_char_strpos = 0;
5940 int tabs = 0;
5941 int read_cursor = 0;
5942
5943 for(read_cursor = 0;; read_cursor++)
5944 {
5945 char nch = read_line_buf[read_cursor];
5946 if(!nch) break;
5947 if(nch == '\t')
5948 {
5949 if(tabs == 0){
5950 mate_flags = tmpi;
5951 dummy_mate_chr = read_line_buf+read_cursor+1;
5952 }
5953 else if(tabs == 1)
5954 dummy_char_strpos = read_cursor;
5955 else if(tabs == 2)
5956 {
5957 dummy_old_read_pos = tmpi;
5958 break;
5959 }
5960 tmpi=0;
5961 tabs++;
5962 }else{
5963 if(tabs==0 || tabs == 2) tmpi = tmpi * 10 + (nch - '0');
5964 }
5965 }
5966
5967
5968 dummy_flags |= SAM_FLAG_FIRST_READ_IN_PAIR;
5969 if(mate_flags & SAM_FLAG_UNMAPPED) dummy_flags |= SAM_FLAG_MATE_UNMATCHED;
5970 if(mate_flags & SAM_FLAG_REVERSE_STRAND_MATCHED) dummy_flags |= SAM_FLAG_MATE_REVERSE_STRAND_MATCHED;
5971 if(mate_flags & SAM_FLAG_MATE_REVERSE_STRAND_MATCHED) dummy_flags |= SAM_FLAG_REVERSE_STRAND_MATCHED;
5972
5973 memcpy(dummy_mate_chr_buf, dummy_mate_chr, read_line_buf +dummy_char_strpos - dummy_mate_chr);
5974 dummy_mate_chr_buf[read_line_buf +dummy_char_strpos - dummy_mate_chr]=0;
5975
5976 char hi_tag_out[18];
5977 char nh_tag_out[18];
5978
5979 find_tag_out(read_line_buf, "HI", hi_tag_out);
5980 find_tag_out(read_line_buf, "NH", nh_tag_out);
5981
5982 // build a fake FIRST read for the mapped SECOND read.
5983 // note that the TLEN, MATE_POS and MATE_CHAR are incorrect for general use.
5984 fprintf(writer->out_fp, "%s\t%d\t*\t0\t0\t*\t%s\t%d\t0\tN\tI%s%s\n", read_name_buf, dummy_flags, dummy_mate_chr_buf, dummy_old_read_pos, nh_tag_out, hi_tag_out);
5985 fputs(read_name_buf, writer->out_fp);
5986 putc('\t', writer->out_fp);
5987 int write_len = fputs(read_line_buf, writer->out_fp);
5988 if(write_len < 0) is_disk_full = 1;
5989 writer -> unpaired_reads +=1;
5990 }
5991
5992 //else SUBREADprintf("WARNING: Unpaired read found in file:%s\n", read_name_buf);
5993 }
5994 }
5995
5996 fclose(bbfp);
5997 unlink(tmpfname);
5998 free(read_name_buf);
5999 free(read_line_buf);
6000 }
6001
6002
6003
6004 if(1)
6005 {
6006 writer -> unpaired_reads += first_read_name_table -> numOfElements;
6007
6008 KeyValuePair * cursor;
6009 int bucket;
6010
6011 // go through the hash table and write correct FIRST lines and dummy SECOND lines.
6012 for(bucket=0; bucket< first_read_name_table -> numOfBuckets; bucket++)
6013 {
6014 cursor = first_read_name_table -> bucketArray[bucket];
6015 while(1)
6016 {
6017 if (!cursor) break;
6018 char * first_read_text = (char *)cursor -> value;
6019 char * first_read_name = (char *)cursor -> key;
6020
6021 if(first_read_text[0]!=(char)0xff)
6022 {
6023 int dummy_flags = 4 | 1, mate_flags = 0;
6024 char * dummy_mate_chr = NULL;
6025 unsigned int dummy_old_read_pos = 0, tmpi=0, dummy_char_strpos = 0;
6026 int tabs = 0;
6027 int read_cursor = 0;
6028
6029 for(read_cursor = 0;; read_cursor++)
6030 {
6031 char nch = first_read_text[read_cursor];
6032 if(!nch) break;
6033 if(nch == '\t')
6034 {
6035 if(tabs == 0){
6036 mate_flags = tmpi;
6037 dummy_mate_chr = first_read_text+read_cursor+1;
6038 }
6039 else if(tabs == 1)
6040 dummy_char_strpos = read_cursor;
6041 else if(tabs == 2)
6042 {
6043 dummy_old_read_pos = tmpi;
6044 break;
6045 }
6046 tmpi=0;
6047 tabs++;
6048 }else{
6049 if(tabs==0 || tabs == 2) tmpi = tmpi * 10 + (nch - '0');
6050 }
6051 }
6052
6053 dummy_flags |= SAM_FLAG_SECOND_READ_IN_PAIR;
6054 if(mate_flags & SAM_FLAG_UNMAPPED) dummy_flags |= SAM_FLAG_MATE_UNMATCHED;
6055 if(mate_flags & SAM_FLAG_REVERSE_STRAND_MATCHED) dummy_flags |= SAM_FLAG_MATE_REVERSE_STRAND_MATCHED;
6056 if(mate_flags & SAM_FLAG_MATE_REVERSE_STRAND_MATCHED) dummy_flags |= SAM_FLAG_REVERSE_STRAND_MATCHED;
6057
6058 if((!first_read_text[0])||(!first_read_text[1]))
6059 {
6060 SUBREADprintf("unable to recover the first read : '%s' , flags = %d\n", first_read_name, mate_flags);
6061 assert(0);
6062 }
6063
6064 char nh_tag_out[18];
6065 char hi_tag_out[18];
6066 find_tag_out(first_read_text, "NH", nh_tag_out);
6067 find_tag_out(first_read_text, "HI", hi_tag_out);
6068
6069 strtok(first_read_name, "\t");
6070 fputs(first_read_name, writer->out_fp);
6071 putc('\t', writer->out_fp);
6072 fputs(first_read_text, writer->out_fp);
6073 first_read_text[dummy_char_strpos] = 0;
6074 fprintf(writer->out_fp, "%s\t%d\t*\t0\t0\t*\t%s\t%d\t0\tN\tI%s%s\n", first_read_name, dummy_flags, dummy_mate_chr, dummy_old_read_pos, nh_tag_out,hi_tag_out);
6075 }
6076 cursor = cursor->next;
6077 }
6078 }
6079
6080
6081 }
6082
6083 HashTableDestroy(first_read_name_table);
6084 }
6085 fclose(writer -> out_fp);
6086 signal (SIGTERM, old_sig_TERM);
6087 signal (SIGINT, old_sig_INT);
6088 return is_disk_full;
6089 }
6090
// Start a new chunk once the current one has grown beyond
// SAM_SORT_BLOCK_SIZE * SAM_SORT_BLOCKS: every open block file is closed, the
// file-pointer table is cleared, the chunk size is reset and the chunk counter
// is advanced.  Does nothing while the chunk is still within the limit.
void sort_SAM_check_chunk(SAM_sort_writer * writer)
{
	int block_no;

	if(!(writer -> current_chunk_size > SAM_SORT_BLOCK_SIZE * SAM_SORT_BLOCKS))
		return;

	for(block_no = 0; block_no < SAM_SORT_BLOCKS; block_no++)
	{
		if(writer -> current_block_fp_array[block_no] != NULL)
			fclose(writer -> current_block_fp_array[block_no]);
	}

	memset(writer -> current_block_fp_array, 0, sizeof(FILE *)*SAM_SORT_BLOCKS);
	writer -> current_chunk++;
	writer -> current_chunk_size = 0;
}
6106
6107 // the SAM_line includes "\n" at the tail!
6108 // line_len = strlen(SAM_line)
sort_SAM_add_line(SAM_sort_writer * writer,char * SAM_line,int line_len)6109 int sort_SAM_add_line(SAM_sort_writer * writer, char * SAM_line, int line_len)
6110 {
6111 int is_disk_full = 0;
6112 assert(writer -> all_chunks_header_fp);
6113 if(line_len<3) return 0;
6114 if(SAM_line[0]=='@'){
6115 int wlen = fputs(SAM_line, writer -> out_fp);
6116 if(wlen < 0){
6117 return -2;
6118 }
6119 }
6120 else
6121 {
6122 char read_name[MAX_READ_NAME_LEN + MAX_CHROMOSOME_NAME_LEN * 2 + 26];
6123 char chromosome_1_name[MAX_CHROMOSOME_NAME_LEN];
6124 char chromosome_2_name[MAX_CHROMOSOME_NAME_LEN];
6125 unsigned int pos_1, pos_2;
6126 int hi_tag,flags = 0, line_cursor = 0, field_cursor = 0, tabs=0;
6127 char * second_col_pos = NULL;
6128
6129 chromosome_1_name[0]=0;
6130 chromosome_2_name[0]=0;
6131 pos_1 = 0;
6132 pos_2 = 0;
6133 hi_tag = -1;
6134
6135 while(line_cursor < line_len)
6136 {
6137 char nch = SAM_line[line_cursor++];
6138 if(!nch)break;
6139
6140 if(nch == '\t')
6141 {
6142 field_cursor = 0;
6143 tabs++;
6144 if(tabs == 1) second_col_pos = SAM_line + line_cursor;
6145 if(tabs>7) break;
6146 }
6147 else if(tabs == 0)
6148 {
6149 read_name[field_cursor++] = nch;
6150 if(MAX_READ_NAME_LEN<=field_cursor){
6151 return -1;
6152 }
6153 read_name[field_cursor] = 0;
6154 }
6155 else if(tabs == 1)
6156 flags = flags*10+(nch-'0');
6157 else if(tabs == 2)
6158 {
6159 chromosome_1_name[field_cursor++] = nch;
6160 chromosome_1_name[field_cursor]=0;
6161 if(MAX_CHROMOSOME_NAME_LEN - 1 <= field_cursor) return -1;
6162 }
6163 else if(tabs == 3)
6164 pos_1 = pos_1 * 10 + (nch-'0');
6165 else if(tabs == 6)
6166 {
6167 chromosome_2_name[field_cursor++] = nch;
6168 chromosome_2_name[field_cursor] = 0;
6169 if(MAX_CHROMOSOME_NAME_LEN - 1 <= field_cursor) return -1;
6170 }
6171 else if(tabs == 7)
6172 pos_2 = pos_2 * 10 + (nch-'0');
6173
6174 }
6175 if(tabs <= 7) return -1;
6176
6177 //if(memcmp("V0112_0155:7:1101:4561:132881", read_name, 27)==0)
6178
6179 char * hi_tag_str = strstr(SAM_line,"\tHI:i:");
6180 if(hi_tag_str)
6181 {
6182 hi_tag = 0;
6183 for(line_cursor=6; ; line_cursor++)
6184 {
6185 char nch = hi_tag_str[line_cursor];
6186 if(!isdigit(nch)) break;
6187 hi_tag = hi_tag*10 + (nch-'0');
6188 }
6189 }
6190
6191 line_len = strlen(second_col_pos);
6192 sort_SAM_check_chunk(writer);
6193
6194 for(field_cursor = 0; read_name[field_cursor] ; field_cursor++)
6195 if(read_name[field_cursor] == '/') read_name[field_cursor] = 0;
6196
6197 if(chromosome_2_name[0]=='=')
6198 strcpy(chromosome_2_name, chromosome_1_name);
6199
6200
6201 // new read name format: OLD_READ_NAME\tCHR_R1:POS_R1:CHR_R2:POS_R2
6202
6203
6204 if(flags & SAM_FLAG_MATE_UNMATCHED)
6205 {
6206 if(chromosome_2_name[0] != '*')
6207 strcpy(chromosome_2_name , "*");
6208 pos_2 = 0;
6209 }
6210
6211
6212 if(flags & SAM_FLAG_UNMAPPED)
6213 {
6214 if(chromosome_1_name[0] != '*')
6215 strcpy(chromosome_1_name , "*");
6216 pos_1 = 0;
6217 }
6218
6219 char hi_key [13];
6220 if(hi_tag >=0)// && pos_1 && pos_2)
6221 sprintf(hi_key, ":%d", hi_tag);
6222 else
6223 hi_key[0]=0;
6224
6225 if(flags & SAM_FLAG_SECOND_READ_IN_PAIR)
6226 sprintf(read_name+strlen(read_name), "\t%s:%u:%s:%u%s",chromosome_2_name, pos_2, chromosome_1_name, pos_1, hi_key);
6227 else
6228 sprintf(read_name+strlen(read_name), "\t%s:%u:%s:%u%s",chromosome_1_name, pos_1, chromosome_2_name, pos_2, hi_key);
6229
6230 //if(memcmp("V0112_0155:7:1101:4561:132881", read_name, 27)==0)
6231 // printf("RRN=%s\n", read_name);
6232
6233 int read_name_len = strlen(read_name);
6234 srUInt_64 read_line_hash = sort_SAM_hash(read_name);
6235
6236 int block_id = read_line_hash % SAM_SORT_BLOCKS;
6237 if(!writer -> current_block_fp_array[block_id])
6238 {
6239 char tmpfname[MAX_FILE_NAME_LENGTH+40];
6240 sprintf(tmpfname,"%sCHK%08d-BLK%03d.bin", writer -> tmp_path , writer -> current_chunk , block_id);
6241 writer -> current_block_fp_array[block_id] = f_subr_open(tmpfname, "wb");
6242 }
6243
6244 if(line_len < 2)
6245 {
6246 SUBREADprintf("unable to put the first read.\n");
6247 assert(0);
6248 }
6249
6250 if(second_col_pos[0]==0 || second_col_pos[1]==0)
6251 {
6252 SUBREADprintf("unable to put the first read TEXT.\n");
6253 assert(0);
6254 }
6255
6256
6257 // printf("WRNAME:%s\n", read_name);
6258
6259 fwrite(&flags, 2, 1, writer -> current_block_fp_array[block_id]);
6260 fwrite(&read_name_len, 2, 1, writer -> current_block_fp_array[block_id]);
6261 fwrite(read_name, 1, read_name_len, writer -> current_block_fp_array[block_id]);
6262 fwrite(&line_len, 2, 1, writer -> current_block_fp_array[block_id]);
6263 int write_len = fwrite(second_col_pos, 1, line_len, writer -> current_block_fp_array[block_id]);
6264 if(write_len < line_len)is_disk_full = -2;
6265
6266 writer -> output_file_size += line_len;
6267 writer -> current_chunk_size += line_len;
6268 writer -> written_reads ++;
6269 }
6270
6271 return is_disk_full;
6272 }
6273
// Check one SAM alignment line against the "mates are adjacent" layout.
//
//   SAM_line      : the alignment line; only QNAME and FLAG are inspected.
//   tmp_read_name : scratch storage owned by the caller; it carries the name of
//                   an even-numbered read over to the following call.
//   tmp_flag      : receives the parsed FLAG value.
//   read_no       : 0-based index of the line among the alignment lines.
//
// The read name is cut at its first '/'.  Unpaired reads (FLAG bit 1 clear)
// always return 0.  Returns 1 when an even-numbered read is marked as second in
// pair, or when an odd-numbered read is marked as first in pair or has a name
// different from the one stored by the previous call; 0 otherwise.
int is_SAM_unsorted(char * SAM_line, char * tmp_read_name, short * tmp_flag, srInt_64 read_no)
{
	char read_name[MAX_READ_NAME_LEN];
	int flags = 0, name_len = 0, column = 0, pos;
	char * slash;

	read_name[0] = 0;

	for(pos = 0; SAM_line[pos]; pos++)
	{
		char ch = SAM_line[pos];

		if(ch == '\t')
		{
			// the second tab ends the FLAG column; nothing further is needed
			if(++column > 1) break;
			continue;
		}

		if(column == 0)
		{
			read_name[name_len++] = ch;
			assert(MAX_READ_NAME_LEN>name_len);
			read_name[name_len] = 0;
		}
		else
			flags = flags*10+(ch-'0');
	}

	//int is_second_read = (flags & 0x80) ? 1:0;
	slash = strchr(read_name, '/');
	if(slash != NULL) *slash = 0;

	*tmp_flag = flags;
	if((flags & 1) == 0) return 0;

	if(read_no % 2 != 0)
	{
		if(flags & SAM_FLAG_FIRST_READ_IN_PAIR) return 1;
		return strcmp(tmp_read_name, read_name) ? 1 : 0;
	}

	if(flags & SAM_FLAG_SECOND_READ_IN_PAIR) return 1;
	strcpy(tmp_read_name , read_name);
	return 0;
}
6320
// Probe `fname` with probe_file_type_EX() and condense the answer:
//   -1 : the file does not exist, is empty, or its type is unknown;
//    1 : the file was recognised as BAM;
//    0 : any other recognised type.
// is_first_read_PE and SAMBAM_header_size are passed through to the probe.
int is_certainly_bam_file(char * fname, int * is_first_read_PE, srInt_64 * SAMBAM_header_size)
{
	int probed = probe_file_type_EX(fname, is_first_read_PE, SAMBAM_header_size);
	int is_unusable = (probed == FILE_TYPE_NONEXIST)
		|| (probed == FILE_TYPE_EMPTY)
		|| (probed == FILE_TYPE_UNKNOWN);

	if(is_unusable) return -1;
	return (probed == FILE_TYPE_BAM) ? 1 : 0;
}
6331
6332
// Report whether `fname` refuses to be repositioned: the file is opened for
// reading and a seek to offset 0 is attempted.  Returns 1 when that seek
// fails, 0 when it succeeds or when the file cannot be opened at all.
int is_pipe_file(char * fname)
{
	int seek_failed = 0;
	FILE * probe = fopen(fname,"r");

	if(probe != NULL)
	{
		seek_failed = (fseeko(probe, 0, SEEK_SET) != 0);
		fclose(probe);
	}

	return seek_failed;
}
6343
warning_file_type(char * fname,int expected_type)6344 int warning_file_type(char * fname, int expected_type)
6345 {
6346 int ret_pipe_file = is_pipe_file(fname);
6347 if(ret_pipe_file)
6348 {
6349 print_in_box(80,0,0,"WARNING file '%s' is not a regular file.", fname);
6350 print_in_box(80,0,0," No alignment can be done on a pipe file.");
6351 print_in_box(80,0,0," If the FASTQ file is gzipped, please use gzFASTQinput option.");
6352 print_in_box(80,0,0,"");
6353 return 1;
6354 }
6355
6356 int read_type = probe_file_type(fname, NULL);
6357
6358 if(read_type == FILE_TYPE_NONEXIST)
6359 {
6360 SUBREADprintf("ERROR: unable to open file '%s'. File name might be incorrect, or you do not have the permission to read the file.\n", fname);
6361 return -1;
6362 }
6363 else if(read_type == FILE_TYPE_EMPTY)
6364 {
6365 SUBREADprintf("\nERROR: file '%s' is empty.\n\n", fname);
6366 return -1;
6367 }
6368 else if((expected_type == FILE_TYPE_FAST_ && (read_type!= FILE_TYPE_FASTQ && read_type!= FILE_TYPE_FASTA && read_type!= FILE_TYPE_GZIP_FASTQ && read_type!= FILE_TYPE_GZIP_FASTA))||
6369 (expected_type == FILE_TYPE_GZIP_FAST_ && read_type!= FILE_TYPE_GZIP_FASTA) ||
6370 (( expected_type != FILE_TYPE_GZIP_FAST_ && expected_type != FILE_TYPE_FAST_) && expected_type != read_type))
6371 {
6372 char * req_fmt = "SAM";
6373 if(expected_type==FILE_TYPE_BAM) req_fmt = "BAM";
6374 else if(expected_type==FILE_TYPE_FAST_) req_fmt = "FASTQ or FASTA";
6375 else if(expected_type==FILE_TYPE_GZIP_FAST_) req_fmt = "gzip FASTQ or FASTA";
6376
6377 char * real_fmt = "SAM";
6378 if(read_type==FILE_TYPE_BAM) real_fmt = "BAM";
6379 else if(read_type==FILE_TYPE_FASTA) real_fmt = "FASTA";
6380 else if(read_type==FILE_TYPE_FASTQ) real_fmt = "FASTQ";
6381 else if(read_type==FILE_TYPE_GZIP_FASTQ) real_fmt = "gzip FASTQ";
6382 else if(read_type==FILE_TYPE_GZIP_FASTA) real_fmt = "gzip FASTA";
6383
6384 print_in_box(80,0,0,"WARNING format issue in file '%s':", fname);
6385 print_in_box(80,0,0," The required format is : %s", req_fmt);
6386 if(read_type == FILE_TYPE_UNKNOWN)
6387 print_in_box(80,0,0," The file format is unknown.");
6388 else
6389 print_in_box(80,0,0," The real format seems to be : %s", real_fmt);
6390 print_in_box(80,0,0,"A wrong format may result in wrong results or crash the program.");
6391 print_in_box(80,0,0,"Please refer to the manual for file format options.");
6392 print_in_box(80,0,0,"If the file is in the correct format, please ignore this message.");
6393 print_in_box(80,0,0,"");
6394
6395 return 1;
6396 }
6397 return 0;
6398 }
6399
/*
 * Read the next non-blank line from a gzip stream.
 *
 * fp     : the stream handle handed to gzgets().
 * buf    : destination buffer for the line.
 * maxlen : capacity passed on to gzgets().
 *
 * Lines whose very first character is '\n' are skipped. A line that
 * starts with any other character (including '\r' or a blank) is returned.
 *
 * Returns whatever gzgets() returned for the first such line, or NULL as
 * soon as gzgets() returns NULL.
 */
char * gzgets_noempty(void * fp, char * buf, int maxlen)
{
	char * line;

	do {
		line = gzgets(fp, buf, maxlen);
	} while(line != NULL && line[0] == '\n');

	return line;
}
6410
6411
/*
 * Read the next non-blank line from a stdio stream.
 *
 * buf    : destination buffer for the line.
 * maxlen : capacity passed on to fgets().
 * fp     : stream to read from.
 *
 * Lines whose very first character is '\n' are skipped. A line that
 * starts with any other character (including '\r' or a blank) is returned.
 *
 * Returns the pointer returned by fgets() for the first such line, or NULL
 * as soon as fgets() returns NULL.
 */
char * fgets_noempty(char * buf, int maxlen, FILE * fp)
{
	char * line = fgets(buf, maxlen, fp);

	while(line != NULL && line[0] == '\n')
		line = fgets(buf, maxlen, fp);

	return line;
}
6422
is_comment_line(const char * l,int file_type,unsigned int lineno)6423 int is_comment_line(const char * l, int file_type, unsigned int lineno)
6424 {
6425 int tabs = 0, xk1 = 0;
6426 if(l[0]=='#') return 1;
6427
6428 if(isalpha(l[0]) && file_type == FILE_TYPE_RSUBREAD)
6429 {
6430 char target_chr[16];
6431 memcpy(target_chr, l, 16);
6432 for(xk1=0; xk1<16; xk1++)
6433 target_chr[xk1] = tolower(target_chr[xk1]);
6434
6435 if(memcmp(target_chr, "geneid\tchr\tstart",16)==0) return 1;
6436 }
6437
6438 xk1=0;
6439 while(l[xk1]) tabs += (l[xk1++] == '\t');
6440
6441 return tabs < ((file_type == FILE_TYPE_GTF)?8:4);
6442 }
6443
6444
6445
probe_file_type_fast(char * fname)6446 int probe_file_type_fast(char * fname){
6447 FILE * fp = f_subr_open(fname, "rb");
6448 if(!fp) return FILE_TYPE_NONEXIST;
6449
6450 int ret = FILE_TYPE_UNKNOWN;
6451 int nch;
6452 char *test_buf=malloc(5000);
6453
6454 nch = fgetc(fp);
6455
6456 if(feof(fp))
6457 ret = FILE_TYPE_EMPTY;
6458 else
6459 {
6460 if(nch == '@') // FASTQ OR SAM
6461 {
6462 char * rptr = fgets_noempty(test_buf, 4999, fp);
6463 int second_line_len = 0;
6464 if(rptr)
6465 {
6466 rptr = fgets_noempty(test_buf, 4999, fp);
6467 if(rptr)
6468 {
6469 second_line_len = strlen(test_buf);
6470 int tabs = 0, x1;
6471 for(x1=0;x1<4999;x1++)
6472 {
6473 if(test_buf[x1]=='\n' || !test_buf[x1]) break;
6474 if(test_buf[x1]=='\t'){
6475 tabs++;
6476 continue;
6477 }
6478
6479 if(tabs == 1)
6480 if(!isdigit(test_buf[x1]))break;
6481 }
6482 if(rptr[0]=='@' || tabs>7)
6483 ret = FILE_TYPE_SAM;
6484 }
6485 }
6486 if(ret == FILE_TYPE_UNKNOWN)
6487 {
6488 rptr = fgets_noempty(test_buf, 4999, fp);
6489 if(rptr[0] == '+')
6490 {
6491 rptr = fgets_noempty(test_buf, 4999, fp);
6492 if(rptr && second_line_len == strlen(test_buf))
6493 ret = FILE_TYPE_FASTQ;
6494 }
6495 }
6496 }
6497 else if(nch == '>') // FASTA
6498 {
6499 ret = FILE_TYPE_FASTA;
6500 }
6501 else if(nch == 31) // BAM OR GZ_FASTQ
6502 {
6503 nch = fgetc(fp);
6504 if(nch == 139)
6505 {
6506 fclose(fp);
6507 fp=NULL;
6508 gzFile zfp = gzopen(fname, "rb");
6509 if(zfp)
6510 {
6511 int rlen = gzread(zfp, test_buf,4);
6512 if(rlen == 4 && memcmp(test_buf,"BAM\1",4)==0)
6513 ret = FILE_TYPE_BAM;
6514 if(rlen == 4 && test_buf[0]=='@')
6515 ret = FILE_TYPE_GZIP_FASTQ;
6516 if(rlen == 4 && test_buf[0]=='>')
6517 ret = FILE_TYPE_GZIP_FASTA;
6518 gzclose(zfp);
6519 }
6520 }
6521 }
6522 else if(nch >= 0x20 && nch <= 0x7f) // SAM without headers
6523 {
6524 int tabs = 0, x1;
6525 char * rptr = fgets(test_buf, 4999, fp);
6526 if(rptr)
6527 for(x1=0;x1<4999;x1++)
6528 {
6529 if(test_buf[x1]=='\n' || !test_buf[x1]) break;
6530 if(test_buf[x1]=='\t'){
6531 tabs++;
6532 continue;
6533 }
6534 if(tabs == 1)
6535 if(!isdigit(test_buf[x1]))break;
6536 }
6537 if(tabs>7)
6538 ret = FILE_TYPE_SAM;
6539
6540 }
6541 }
6542
6543 if(fp)fclose(fp);
6544
6545 free(test_buf);
6546 return ret;
6547
6548 }
/*
 * Convenience form of probe_file_type_EX() for callers that do not need the
 * SAM/BAM header length: the third argument is passed as NULL.
 *
 * fname            : path of the file to inspect.
 * is_first_read_PE : forwarded unchanged to probe_file_type_EX().
 *
 * Returns the value returned by probe_file_type_EX().
 */
int probe_file_type(char * fname, int * is_first_read_PE)
{
	int detected_type = probe_file_type_EX(fname, is_first_read_PE, NULL);

	return detected_type;
}
/*
 * Guess the format of a read or alignment file and, for SAM/BAM input,
 * optionally report extra information taken from the SamBam reader.
 *
 * fname                : path of the file to inspect.
 * is_first_read_PE     : optional output (may be NULL). Written only when the
 *                        file is detected as SAM/BAM and the reader's
 *                        is_paired_end field is >= 10; it then receives
 *                        is_paired_end - 10.
 * SAMBAM_header_length : optional output (may be NULL). For SAM/BAM files it
 *                        receives the reader's header_length field.
 *
 * Returns one of:
 *   FILE_TYPE_NONEXIST   - the file could not be opened;
 *   FILE_TYPE_EMPTY      - end of file was hit on the very first byte;
 *   FILE_TYPE_SAM        - first byte is '@' and the third line either
 *                          starts with '@' or has more than 7 TABs, or the
 *                          first line is printable text with more than 7 TABs;
 *   FILE_TYPE_FASTQ      - first byte is '@', the next non-blank lines are a
 *                          '+' line and then a line exactly as long as the
 *                          sequence line;
 *   FILE_TYPE_FASTA      - first byte is '>', the header line holds only
 *                          characters in the range 32..127 and the next line
 *                          (if any) is non-empty and made only of A/T/G/C/N
 *                          (either case), '.' or the digits 0-3;
 *   FILE_TYPE_BAM / FILE_TYPE_GZIP_FASTQ / FILE_TYPE_GZIP_FASTA
 *                        - the file starts with the gzip magic bytes and its
 *                          decompressed content starts with "BAM\1", '@' or
 *                          '>' respectively;
 *   FILE_TYPE_UNKNOWN    - anything else, including failure to allocate the
 *                          scratch buffer.
 */
int probe_file_type_EX(char * fname, int * is_first_read_PE, srInt_64 * SAMBAM_header_length)
{
	FILE * fp = f_subr_open(fname, "rb");
	if(!fp) return FILE_TYPE_NONEXIST;

	char * test_buf = malloc(5000);
	if(!test_buf)
	{
		fclose(fp);
		return FILE_TYPE_UNKNOWN;
	}

	int ret = FILE_TYPE_UNKNOWN;
	int nch = fgetc(fp);

	if(feof(fp))
		ret = FILE_TYPE_EMPTY;
	else if(nch == '@') // FASTQ OR SAM
	{
		size_t second_line_len = 0;

		// The rest of the first line is only consumed, not inspected.
		char * rptr = fgets_noempty(test_buf, 4999, fp);
		if(rptr)
		{
			rptr = fgets_noempty(test_buf, 4999, fp);
			if(rptr)
			{
				int tabs = 0, x1;
				second_line_len = strlen(test_buf);

				// Count TABs; give up early if the 2nd field is not numeric.
				for(x1=0;x1<4999;x1++)
				{
					if(test_buf[x1]=='\n' || !test_buf[x1]) break;
					if(test_buf[x1]=='\t'){
						tabs++;
						continue;
					}

					if(tabs == 1 && !isdigit((unsigned char) test_buf[x1])) break;
				}
				if(rptr[0]=='@' || tabs>7)
					ret = FILE_TYPE_SAM;
			}
		}

		if(ret == FILE_TYPE_UNKNOWN)
		{
			// A truncated file makes this read return NULL; it must not be dereferenced.
			rptr = fgets_noempty(test_buf, 4999, fp);
			if(rptr && rptr[0] == '+')
			{
				rptr = fgets_noempty(test_buf, 4999, fp);
				if(rptr && second_line_len == strlen(test_buf))
					ret = FILE_TYPE_FASTQ;
			}
		}
	}
	else if(nch == '>') // FASTA
	{
		int x1, ch;
		char * rptr = fgets(test_buf, 4999, fp);
		if(rptr)
		{
			ret = FILE_TYPE_FASTA;

			// Header line: reject control characters and bytes above 127.
			for(x1=0;x1<4999;x1++)
			{
				if(test_buf[x1]=='\n' || !test_buf[x1]) break;
				ch = toupper((unsigned char) test_buf[x1]);
				if(ch < ' ' || ch > 127)
				{
					ret = FILE_TYPE_UNKNOWN;
					break;
				}
			}

			rptr = fgets(test_buf, 4999, fp);
			if(rptr && ret == FILE_TYPE_FASTA)
			{
				// First sequence line: bases, '.', or digits 0-3 only
				// (the digits are presumably colour-space calls -- TODO confirm).
				for(x1=0;x1<4999;x1++)
				{
					if(test_buf[x1]=='\n' || !test_buf[x1]) break;
					ch = toupper((unsigned char) test_buf[x1]);
					if(!(ch == 'A' || ch == 'T' || ch == 'G' || ch == 'C' || ch == 'N' || ch == '.' || (ch >= '0' && ch <= '3')))
					{
						ret = FILE_TYPE_UNKNOWN;
						break;
					}
				}

				// An empty sequence line is not accepted as FASTA.
				if(x1==0) ret = FILE_TYPE_UNKNOWN;
			}
		}
	}
	else if(nch == 31) // BAM OR GZ_FASTQ: 31,139 are the two gzip magic bytes
	{
		nch = fgetc(fp);
		if(nch == 139)
		{
			// Re-open through zlib to look at the decompressed content.
			fclose(fp);
			fp=NULL;
			gzFile zfp = gzopen(fname, "rb");
			if(zfp)
			{
				int rlen = gzread(zfp, test_buf,4);
				if(rlen == 4)
				{
					if(memcmp(test_buf,"BAM\1",4)==0)
						ret = FILE_TYPE_BAM;
					else if(test_buf[0]=='@')
						ret = FILE_TYPE_GZIP_FASTQ;
					else if(test_buf[0]=='>')
						ret = FILE_TYPE_GZIP_FASTA;
				}
				gzclose(zfp);
			}
		}
	}
	else if(nch >= 0x20 && nch <= 0x7f) // SAM without headers
	{
		int tabs = 0, x1;
		char * rptr = fgets(test_buf, 4999, fp);
		if(rptr)
			for(x1=0;x1<4999;x1++)
			{
				if(test_buf[x1]=='\n' || !test_buf[x1]) break;
				if(test_buf[x1]=='\t'){
					tabs++;
					continue;
				}
				if(tabs == 1 && !isdigit((unsigned char) test_buf[x1])) break;
			}
		if(tabs>7)
			ret = FILE_TYPE_SAM;
	}

	if(fp)fclose(fp);

	if((FILE_TYPE_BAM == ret || FILE_TYPE_SAM == ret) && (is_first_read_PE || SAMBAM_header_length))
	{
		SamBam_FILE * tpfp = SamBam_fopen(fname, (FILE_TYPE_BAM == ret)?SAMBAM_FILE_BAM:SAMBAM_FILE_SAM);

		// NOTE(review): SamBam_fopen is defined elsewhere; a NULL result is
		// assumed to mean the file could not be reopened, in which case the
		// optional outputs are left untouched.
		if(tpfp)
		{
			// Skip the '@' header lines and stop at the first alignment (or EOF).
			while(1)
			{
				char * tbr = SamBam_fgets(tpfp, test_buf, 4999, 0);
				if( is_first_read_PE && tpfp -> is_paired_end >= 10)
					(*is_first_read_PE) = tpfp -> is_paired_end - 10;
				if(tbr == NULL)break;
				if(tbr[0]=='@') continue;
				break;
			}

			if( SAMBAM_header_length) (*SAMBAM_header_length) = tpfp -> header_length;
			SamBam_fclose(tpfp);
		}
	}

	free(test_buf);
	return ret;
}
6710
/*
 * Print, in a message box, every key of t1 that has no counterpart in t2.
 *
 * t1, t2 : hash tables whose keys are used as C strings (chromosome names).
 * msg    : heading printed once, before the first missing name. It is
 *          printed verbatim and is never interpreted as a format string.
 *
 * A key of t1 counts as present in t2 when t2 contains
 *   - the key itself, or
 *   - the key with a leading "chr" removed (only for keys longer than 3), or
 *   - the key with "chr" prepended.
 * Nothing is printed when every key of t1 is matched.
 */
void warning_hash_hash(HashTable * t1, HashTable * t2, char * msg){
	int buck_i, shown = 0;
	for(buck_i = 0; buck_i < t1 -> numOfBuckets; buck_i++){
		KeyValuePair * cursor;
		for(cursor = t1 -> bucketArray[buck_i]; cursor; cursor = cursor -> next){
			char * t1chro = (char *) cursor -> key;
			int found = HashTableGet(t2, t1chro) != NULL;

			if(!found && strlen(t1chro)>3 && strncmp(t1chro, "chr", 3)==0)
				found = HashTableGet(t2, t1chro+3) != NULL;

			if(!found){
				char tmp_t1chro [MAX_CHROMOSOME_NAME_LEN+1];
				int needed = snprintf(tmp_t1chro, sizeof(tmp_t1chro), "chr%s", t1chro);

				// Only look the prefixed name up when it fitted completely;
				// a truncated name must not be matched against t2.
				if(needed > 0 && (size_t) needed < sizeof(tmp_t1chro))
					found = HashTableGet(t2, tmp_t1chro) != NULL;
			}

			if(found) continue;

			if(!shown){
				print_in_box(80,0,0,"");
				print_in_box(80,0,0,"%s", msg);
				shown = 1;
			}
			print_in_box(80,0,0,"   %s", t1chro);
		}
	}
	if(shown) print_in_box(80,0,0,"");
}
6738
6739
6740 #ifdef MAKE_INPUTTEST
main(int argc,char ** argv)6741 int main(int argc, char ** argv)
6742 {
6743 FILE * ifp;
6744 srInt_64 rno=0;
6745 short tmp_flags, is_sorted = 1;
6746 char buff[3000], tmp_rname[MAX_FILE_NAME_LENGTH];
6747
6748 ifp = f_subr_open(argv[1],"r");
6749 while(1)
6750 {
6751 char * rr = fgets(buff,2999, ifp);
6752 if(!rr) break;
6753 if(buff[0]=='@')continue;
6754 if(is_SAM_unsorted(buff, tmp_rname, &tmp_flags, rno))
6755 {
6756 printf("The input file is unsorted.\n");
6757 is_sorted = 0;
6758 break;
6759 }
6760 rno++;
6761 }
6762
6763 fclose(ifp);
6764
6765 //if(is_sorted) return 0;
6766
6767 ifp = f_subr_open(argv[1],"r");
6768 SAM_sort_writer writer;
6769 if(sort_SAM_create(&writer, argv[2], ".")){
6770 printf("ERROR: unable to create the writer.\n");
6771 return -1;
6772 }
6773
6774 while(1)
6775 {
6776 char * rr = fgets(buff,2999, ifp);
6777 if(!rr) break;
6778 int line_len = strlen(buff);
6779 sort_SAM_add_line(&writer, buff, line_len);
6780 }
6781 fclose(ifp);
6782 sort_SAM_finalise(&writer);
6783 printf("WRITTEN=%llu\nUNPAIR=%llu\n", writer.written_reads, writer.unpaired_reads);
6784 }
6785 #endif
6786 #ifdef MAKE_TYPETEST
6787
6788
main(int argc,char ** argv)6789 int main(int argc, char ** argv)
6790 {
6791 char * fn = argv[1];
6792 int type = probe_file_type(fn, NULL);
6793 switch(type)
6794 {
6795 case FILE_TYPE_FASTQ: printf("Type: FASTQ\n"); break;
6796 case FILE_TYPE_FASTA: printf("Type: FASTA\n"); break;
6797 case FILE_TYPE_SAM : printf("Type: SAM\n"); break;
6798 case FILE_TYPE_BAM : printf("Type: BAM\n"); break;
6799 default: printf("Unknown type.\n");
6800 }
6801 }
6802
6803 #endif
6804