1 /***************************************************************
2 
3    The Subread software package is free software package:
4    you can redistribute it and/or modify it under the terms
5    of the GNU General Public License as published by the
6    Free Software Foundation, either version 3 of the License,
7    or (at your option) any later version.
8 
9    Subread is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty
11    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 
13    See the GNU General Public License for more details.
14 
15    Authors: Drs Yang Liao and Wei Shi
16 
17   ***************************************************************/
18 
19 
20 #include <stdio.h>
21 #include <signal.h>
22 #include <dirent.h>
23 #include <string.h>
24 #include <stdlib.h>
25 #include <ctype.h>
26 #include <sys/types.h>
27 #ifndef __MINGW32__
28 #include <sys/resource.h>
29 #endif
30 #include <sys/stat.h>
31 #include <unistd.h>
32 #include <zlib.h>
33 #include <stdio.h>
34 #include <assert.h>
35 #include "input-files.h"
36 #include "input-blc.h"
37 #include "sambam-file.h"
38 #include "HelperFunctions.h"
39 #include "hashtable.h"
40 #include "seek-zlib.h"
41 #include "gene-algorithms.h"
42 #include "sublog.h"
43 
// Mutable global with external linkage, initialised to 15,000,000.
// NOTE(review): presumably the number of bases held by one index block; it is
// not referenced in the visible part of this file -- confirm against its users.
44 unsigned int BASE_BLOCK_LENGTH = 15000000;
45 
// Open `fname` with the stdio mode string `mode`.
//
// MinGW builds, and builds for targets that are none of the listed
// LP64 / MACOS / FreeBSD / DragonFly platforms, go through fopen64();
// every other build goes through plain fopen().
//
// Returns the stream handed back by the selected call (NULL on failure).
FILE * f_subr_open(const char * fname, const char * mode)
{
#if !defined(__MINGW32__) && (defined(__LP64__) || defined(_LP64) || defined(MACOS) || defined(__FreeBSD__) || defined(__DragonFly__))
	FILE * opened = fopen(fname, mode);
#else
	FILE * opened = fopen64(fname, mode);
#endif
	return opened;
}
// Rewrite a NUL-terminated quality string in place, lowering every character
// by 31 (the distance between an offset of 64 and an offset of 33).
void fastq_64_to_33(char * qs)
{
	char * cursor;

	for(cursor = qs; *cursor; cursor++)
		*cursor -= 31;
}
65 
// Thread entry point used for deferred deallocation.
//
// Waits 100000 microseconds and then frees `ptr`.
// The return value is always NULL.
void * delay_run(void * ptr){
	void * const thread_result = NULL;

	usleep(100000);	// grace period before the block is released
	free(ptr);
	return thread_result;
}
71 
delay_realloc(void * old_pntr,size_t old_size,size_t new_size)72 void * delay_realloc(void * old_pntr, size_t old_size, size_t new_size){
73 	pthread_t thread;
74 	void * new_ret = malloc(new_size);
75 	memcpy(new_ret, old_pntr, old_size);
76 	pthread_create(&thread, NULL, delay_run, old_pntr);
77 	return new_ret;
78 }
79 
// Duplicate a NUL-terminated string into newly allocated memory.
//
// in : string to copy; NULL yields NULL.
//
// Returns the copy, or NULL if `in` is NULL or the allocation fails.
// The caller is in charge of deallocation (free()).
char * memstrcpy(char * in){
	size_t ilen;
	char * ret;

	if(in == NULL)
		return NULL;

	// size_t avoids truncating the length of very long strings.
	ilen = strlen(in);
	ret = malloc(ilen + 1);
	if(ret == NULL)
		return NULL;

	memcpy(ret, in, ilen + 1);	// includes the terminator
	return ret;
}
88 
guess_reads_density(char * fname,int is_sam)89 double guess_reads_density(char * fname, int is_sam)
90 {
91 	return guess_reads_density_format(fname, is_sam, NULL, NULL, NULL);
92 }
93 
geinput_file_offset(gene_input_t * input)94 srInt_64 geinput_file_offset( gene_input_t * input){
95 	if(input -> file_type == GENE_INPUT_GZIP_FASTQ || input -> file_type == GENE_INPUT_GZIP_FASTA){
96 		if(((seekable_zfile_t*)input -> input_fp) -> blocks_in_chain<1)return 0;
97 		seekable_decompressed_block_t * ct = ((seekable_zfile_t*)input -> input_fp) -> block_rolling_chain+((seekable_zfile_t*)input -> input_fp) -> block_chain_current_no;
98 		return ct -> block_start_in_file_offset + ((seekable_zfile_t*)input -> input_fp) -> current_block_txt_read_ptr * 5/16; // compressed text ~= plain text * 28%
99 	}else{
100 		return ftello((FILE*)input ->input_fp);
101 	}
102 }
103 
guess_reads_density_format(char * fname,int is_sam,int * min_phred_score,int * max_phred_score,int * tested_reads)104 double guess_reads_density_format(char * fname, int is_sam, int * min_phred_score, int * max_phred_score, int * tested_reads)
105 {
106 	gene_input_t *ginp = malloc(sizeof(gene_input_t));
107 	srInt_64 fpos =0, fpos2 = 0;
108 	int i;
109 	int max_qual_chr = -1, min_qual_chr = 127;
110 	char buff[MAX_READ_LENGTH] , qbuf[MAX_READ_LENGTH];
111 
112 	float retv = 0;
113 
114 	if(is_sam == 0)
115 	{
116 		if(geinput_open(fname, ginp))retv= -1.0;
117 	}else if(is_sam == 1)
118 	{
119 		if(geinput_open_sam(fname, ginp,0))retv= -1.0;
120 	}else if(is_sam == 2)
121 	{
122 		if(geinput_open_sam(fname, ginp,1))retv= -1.0;
123 	}
124 
125 	if(retv > -0.1){
126 		geinput_next_read(ginp, NULL, buff, NULL);
127 
128 		fpos = geinput_file_offset(ginp);
129 		for(i=0; i<3000; i++)
130 		{
131 			if(geinput_next_read(ginp, NULL, buff, qbuf)<0) break;
132 			if(qbuf[0])
133 			{
134 				int xk=0;
135 				while(qbuf[xk])
136 				{
137 					min_qual_chr = min(min_qual_chr,qbuf[xk]);
138 					max_qual_chr = max(max_qual_chr,qbuf[xk++]);
139 				}
140 			}
141 			if(tested_reads)
142 				(*tested_reads) ++;
143 
144 		}
145 
146 		if(min_phred_score)
147 		{
148 			(*min_phred_score) = min_qual_chr;
149 			(*max_phred_score) = max_qual_chr;
150 
151 		}
152 		fpos2 = geinput_file_offset(ginp) - fpos;
153 		geinput_close(ginp);
154 
155 		retv= fpos2*1.0/i;
156 	}
157 
158 	free(ginp);
159 	return retv;
160 }
161 
is_gene_char(char c)162 int is_gene_char(char c)
163 {
164 	//if(c== 'M' || c == 'm' || c == 'U' || c == 'u' || c == 'A' || c=='a' || c=='G' || c=='g' || c=='C' || c=='c' || c=='T' || c=='t' || c=='N' || c=='n')
165 	if(c=='-' || c == '.' || c == 'N')
166 		return GENE_SPACE_BASE;
167 	if((c>='A' && c<'Z') || (c>='a' && c<='z'))
168 		return GENE_SPACE_BASE;
169 	if(c>='0' && c<'9')
170 		return GENE_SPACE_COLOR;
171 	return 0;
172 }
173 
guess_gene_bases(char ** files,int file_number)174 srInt_64 guess_gene_bases(char ** files, int file_number)
175 {
176 	int i;
177 	srInt_64 ret = 0;
178 
179 	for(i=0; i<file_number; i++)
180 	{
181 		char * fname = files[i];
182 		struct stat statbuf;
183 
184 		if (stat(fname , &statbuf))
185 		{
186 			//SUBREADprintf("guess_gene_bases NOT FOUND!!%s\n", fname);
187 			return -i-1;
188 		}
189 
190 		ret += statbuf.st_size;
191 		ret -= 150;
192 		if(ret<2)ret=2;
193 	}
194 	return ret * 70 / 71;
195 }
196 
// Fetch the next character from `input`: gzip FASTQ inputs are read with
// seekgz_next_char(), every other type with fgetc() on the underlying FILE.
// The argument is parenthesised at each use so that any pointer expression
// can be passed; note that it is still evaluated more than once.
#define geinput_getc(input)  ( (input) -> file_type == GENE_INPUT_GZIP_FASTQ ? \
	(seekgz_next_char((seekable_zfile_t*)(input) -> input_fp)) : \
	(fgetc((FILE*)(input) -> input_fp)) )
198 
// Ask the gzip reader to preload its buffer.
//
// Only gzip FASTQ inputs are forwarded to seekgz_preload_buffer() (with
// `read_lock`); for every other file type this does nothing.
//
// Returns the value of seekgz_preload_buffer(), or 0 when nothing was done.
int geinput_preload_buffer(gene_input_t * input, subread_lock_t * read_lock){
	if(input -> file_type != GENE_INPUT_GZIP_FASTQ)
		return 0;

	return seekgz_preload_buffer((seekable_zfile_t*)input -> input_fp, read_lock);
}
204 
205 
206 
207 // read the line EXCLUDE last \n
208 // returns bytes WITHOUT \n
read_line_noempty(int max_read_len,gene_input_t * input,char * buff,int must_upper)209 int read_line_noempty(int max_read_len, gene_input_t * input, char * buff, int must_upper)
210 {
211 	int ret =0;
212 
213 	if(input -> file_type == GENE_INPUT_GZIP_FASTQ || input -> file_type == GENE_INPUT_GZIP_FASTA){
214 		seekgz_preload_buffer((seekable_zfile_t*)input -> input_fp, NULL);
215 		ret = seekgz_gets((seekable_zfile_t*)input->input_fp, buff, MAX_READ_LENGTH-1);
216 		if(ret > 0){
217 			if(must_upper){
218 				int ii;
219 				for(ii=0; ii<ret-1;ii++) buff[ii] = toupper(buff[ii]);
220 			}
221 			buff[ret-1] =0;
222 			return ret - 1;
223 		} else return 0;
224 	}
225 	if(must_upper)
226 	{
227 		while(1)
228 		{
229 			char ch = geinput_getc(input);
230 			#ifdef __MINGW32__
231 			if(ch == '\r') continue;
232 			#endif
233 			if(ch == EOF) break;
234 			if(ch == '\n'){
235 					if(ret)
236 						break;
237 			}
238 			else if(ret < max_read_len-1)
239 				buff[ret++] = toupper(ch);
240 		}
241 	}
242 	else
243 	{
244 		while(1)
245 		{
246 			char ch = geinput_getc(input);
247 			#ifdef __MINGW32__
248 			if(ch == '\r') continue;
249 			#endif
250 			if (ch == EOF) break;
251 			if(ch == '\n'){
252 					if(ret)
253 						break;
254 			}
255 			else if(ret < max_read_len-1) buff[ret++] = ch;
256 		}
257 
258 	}
259 	buff[ret]=0;
260 	return ret;
261 }
262 
263 
264 
// Read one line from `fp` into `buff`, without the trailing '\n'.
//
// max_read_len : capacity of `buff`; at most max_read_len-1 characters are
//                stored, the remainder of an over-long line is consumed and
//                dropped.
// fp           : stream to read from.
// buff         : receives the NUL-terminated line.
// must_upper   : non-zero converts the stored characters with toupper().
//
// Unlike read_line_noempty(), an empty line is returned as an empty string.
// On MinGW builds '\r' is discarded.
//
// Returns the number of bytes stored (0 for an empty line or end of file).
int read_line(int max_read_len, FILE * fp, char * buff, int must_upper)
{
	int ret =0;

	while(1)
	{
		// int, not char: EOF must stay distinguishable from the byte 0xFF, and
		// the comparison must also work where plain char is unsigned.
		int ch = fgetc(fp);
		#ifdef __MINGW32__
		if(ch == '\r') continue;
		#endif
		if(ch == '\n' || ch == EOF) break;
		if(ret < max_read_len-1)
			buff[ret++] = (char)(must_upper ? toupper(ch) : ch);
	}
	buff[ret]=0;
	return ret;
}
298 
299 
300 
// Read the next non-empty line from `fp` into `buff`.
//
// max_read_len : capacity of `buff`; at most max_read_len-1 characters are
//                stored so that the terminator always fits (the same
//                convention as read_line()).
// fp           : stream to read from.
// buff         : receives the NUL-terminated line.
// must_upper   : non-zero converts the stored characters with toupper() and
//                additionally drops spaces and tabs.
//
// Leading '\n' characters are skipped; the line ends at the first '\n' seen
// after any other character, or at end of file. '\r' is never stored.
//
// Returns the number of bytes stored.
int read_line_back(int max_read_len, FILE * fp, char * buff, int must_upper)
{
	int ret =0;
	int started = 0;

	while(1)
	{
		// int, not char: EOF must stay distinguishable from the byte 0xFF, and
		// the comparison must also work where plain char is unsigned.
		int ch = fgetc(fp);

		if(ch == EOF) break;
		if(ch == '\n')
		{
			if(started) break;
			continue;
		}
		started = 1;

		if(ch == '\r' || ret >= max_read_len-1)
			continue;

		if(must_upper)
		{
			if(ch != ' ' && ch != '\t')
				buff[ret++] = (char)toupper(ch);
		}
		else
			buff[ret++] = (char)ch;
	}
	buff[ret]=0;
	return ret;
}
345 
// Read one line from the stream behind `input` into `buff` by delegating to
// read_line() with a limit of MAX_READ_LENGTH.
//
// conv_to_upper : forwarded to read_line() as its must_upper flag.
//
// Returns the length reported by read_line().
int geinput_readline(gene_input_t * input, char * buff, int conv_to_upper)
{
	int line_len = read_line(MAX_READ_LENGTH, input -> input_fp, buff, conv_to_upper);
	return line_len;
}
350 
is_read(char * in_buff)351 int is_read(char * in_buff)
352 {
353 	int p=0;
354 	char c;
355 	int space_type = GENE_SPACE_BASE;
356 	while((c=in_buff[p++])!='\0')
357 	{
358 		if(c!='\r' && c!='\n'){
359 			int x = is_gene_char(c);
360 			if (x == GENE_SPACE_COLOR)
361 				space_type = GENE_SPACE_COLOR;
362 			else if(!x)
363 				return 0;
364 		}
365 	}
366 	return space_type;
367 }
368 
// Tokenizer that splits on a whole multi-character delimiter string.
//
// str   : text to start splitting, or NULL to continue from `*next`.
// delim : delimiter string, matched as a unit with strstr().
// next  : caller-owned cursor; set to the text following the delimiter, or
//         to NULL once the last token has been returned.
//
// The input is modified in place: the first byte of each matched delimiter is
// overwritten with NUL. An empty delimiter never matches, so the remaining
// text is returned as one final token instead of being truncated.
//
// Returns the token, or NULL when `delim` or `next` is NULL or no text is left.
char *strtokmm(char *str, const char *delim, char ** next) {
    char *tok;
    char *m;

    if (delim == NULL || next == NULL) return NULL;

    tok = (str) ? str : (*next);
    if (tok == NULL) return NULL;

    // strstr() matches an empty needle at offset 0, which would blank the
    // token and leave the cursor where it was; treat it as "no delimiter".
    m = (delim[0] != '\0') ? strstr(tok, delim) : NULL;

    if (m) {
        (*next) = m + strlen(delim);
        *m = '\0';
    } else {
        (*next) = NULL;
    }

    return tok;
}
389 
// Set up `input` as a single-cell BAM source.
//
// rfnames is copied into input->filename and passed to input_scBAM_init().
// The file type becomes GENE_INPUT_SCRNA_BAM and the space type
// GENE_SPACE_BASE, whether or not the initialisation succeeded.
// reads_per_chunk and threads are accepted but not used here.
// NOTE(review): the copy into input->filename is unbounded.
//
// Returns the value of input_scBAM_init().
int geinput_open_scRNA_BAM(char * rfnames,  gene_input_t * input, int reads_per_chunk, int threads ){
	int init_status;

	strcpy(input->filename,rfnames);
	init_status = input_scBAM_init(&input -> scBAM_input, rfnames);

	input -> space_type = GENE_SPACE_BASE;
	input -> file_type = GENE_INPUT_SCRNA_BAM;
	return init_status;
}
397 
// Set up `input` as a single-cell FASTQ source.
//
// rfnames is copied into input->filename and passed to
// input_mFQ_init_by_one_string(). The file type becomes
// GENE_INPUT_SCRNA_FASTQ and the space type GENE_SPACE_BASE, whether or not
// the initialisation succeeded.
// reads_per_chunk and threads are accepted but not used here.
// NOTE(review): the copy into input->filename is unbounded.
//
// Returns the value of input_mFQ_init_by_one_string().
int geinput_open_scRNA_fqs(char * rfnames,  gene_input_t * input, int reads_per_chunk, int threads ){
	int init_status;

	strcpy(input->filename,rfnames);
	init_status = input_mFQ_init_by_one_string(&input -> scRNA_fq_input, rfnames);

	input -> space_type = GENE_SPACE_BASE;
	input -> file_type = GENE_INPUT_SCRNA_FASTQ;
	return init_status;
}
405 
// Set up `input` as a BCL source rooted at `dir_name`.
//
// cacheBCL_init() is called with reads_per_chunk and threads; dir_name is
// then copied into input->filename even if the initialisation failed.
// The file type and space type are only assigned on success.
// NOTE(review): the copy into input->filename is unbounded.
//
// Returns 0 on success, -1 if cacheBCL_init() reported a non-zero status.
int geinput_open_bcl( const char * dir_name,  gene_input_t * input, int reads_per_chunk, int threads){
	int init_status = cacheBCL_init(&input -> bcl_input , (char*) dir_name, reads_per_chunk, threads );

	strcpy(input->filename, dir_name);
	if(init_status != 0)
		return -1;

	input -> space_type = GENE_SPACE_BASE;
	input -> file_type = GENE_INPUT_BCL;
	return 0;
}
414 
// Open a SAM text file and position it at the first alignment record.
//
// filename    : path of the SAM file; also copied into input->filename.
// input       : handle to initialise.
// half_number : added to GENE_INPUT_SAM_SINGLE to form input->file_type.
//
// Header lines (starting with '@') are skipped. The first other line must
// contain at least 10 tab characters; the text after the 9th tab is the read
// sequence, which is classified with is_read() to set input->space_type.
// Unless the file type is GENE_INPUT_SAM_PAIR_2 the stream is rewound to the
// start of that line; for GENE_INPUT_SAM_PAIR_2 it stays after it.
// input->read_chunk_start records the resulting position.
//
// Returns 0 on success and 1 on failure. On every failure the stream is
// closed and input->input_fp is NULL, the same state a failed open leaves.
// NOTE(review): the copy into input->filename is unbounded.
int geinput_open_sam(const char * filename, gene_input_t * input, int half_number)
{
	char in_buff[3001];

	input->input_fp = f_subr_open(filename, "rb");

	strcpy(input->filename, filename);

	if(input->input_fp == NULL)
		return 1;
	input -> file_type = half_number + GENE_INPUT_SAM_SINGLE;

	while(1){
		srInt_64 current_pos = ftello(input -> input_fp);
		int rlen = read_line(3000, input->input_fp, in_buff, 0);
		int x, tab_no = 0;
		char *read_buf=NULL;

		if(rlen < 1) break;			// no alignment record found
		if(in_buff[0] == '@') continue;		// header line

		for(x=0; x<rlen; x++)
		{
			if(in_buff[x]!='\t') continue;
			tab_no ++;
			if(tab_no ==9) read_buf = in_buff+x+1;	// start of the sequence field
			if(tab_no ==10) in_buff[x]=0;		// terminate the sequence field
		}
		if (tab_no<10) break;			// not a complete record

		input->space_type = is_read(read_buf);
		if (GENE_INPUT_SAM_PAIR_2 != input -> file_type) fseeko(input -> input_fp , current_pos, SEEK_SET);
		// ftello keeps the full 64-bit offset where ftell would truncate it.
		input -> read_chunk_start = ftello(input -> input_fp);
		return 0;
	}

	// Failure after a successful open: do not leak the stream.
	fclose(input->input_fp);
	input->input_fp = NULL;
	return 1;
}
454 
// Open a read file and detect its format and colour/base space.
//
// filename : path of the file (rejected when longer than 298 characters);
//            copied into input->filename.
// input    : handle to initialise.
//
// A file starting with the bytes 31, 139 is treated as gzip: up to 1000
// lines are inspected, the first text line decides between
// GENE_INPUT_GZIP_FASTQ ('@') and GENE_INPUT_GZIP_FASTA, the second text line
// is classified with is_read(), and the gzip reader is then reopened so that
// reading starts from the beginning.
//
// Any other file is scanned line by line from the start:
//   - a first line accepted by is_read()  -> GENE_INPUT_PLAIN
//   - a line starting with '>'            -> GENE_INPUT_FASTA
//   - a line starting with '@'            -> GENE_INPUT_FASTQ
// and the stream is rewound to the beginning of the deciding line.
//
// input->space_type falls back to GENE_SPACE_BASE when nothing was detected,
// and input->read_chunk_start records the start offset.
//
// Returns 0 on success, non-zero on failure. On failure after the file was
// opened, the underlying resource is released and input->input_fp is NULL.
int geinput_open(const char * filename, gene_input_t * input)
{
	char in_buff[MAX_READ_LENGTH];
	int line_no = 0, ret = 0;
	int id1, id2;
	FILE * TMP_FP;

	if(strlen(filename)>298)
		return 1;

	input -> gzfa_last_name[0]=0;
	// Start from "unknown" so the fallback below never reads a stale value.
	input -> space_type = 0;
	strcpy(input->filename, filename);
	TMP_FP = f_subr_open(filename, "rb");

	if(TMP_FP == NULL)
		return 1;

	id1 = fgetc(TMP_FP);
	id2 = fgetc(TMP_FP);

	if(id1 == 31 && id2 == 139) {
		int fq_stat = 0;	// number of text lines seen so far

		fclose(TMP_FP);
		// Default for a gzip file in which no text line is found.
		input->file_type = GENE_INPUT_GZIP_FASTQ;
		input->input_fp = malloc(sizeof(seekable_zfile_t));
		if(input->input_fp == NULL)
			return 1;

		ret = seekgz_open(filename, input->input_fp, NULL );
		if(ret != 0){
			free(input->input_fp);
			input->input_fp = NULL;
			return ret;
		}

		for(line_no = 0; line_no < 1000; line_no++){
			int fl = seekgz_gets(input->input_fp, in_buff, 1000);
			if(fl < 1)break;	// EOF
			if(fl == 1)continue;	// empty line

			// The first TEXT line decides the type, even if blank lines precede it.
			if(fq_stat == 0)input->file_type = in_buff[0]=='@'? GENE_INPUT_GZIP_FASTQ: GENE_INPUT_GZIP_FASTA;
			if(fq_stat%4 == 1) // read text
			{
				input->space_type = is_read(in_buff);
				break;
			}
			fq_stat ++;
		}

		// Reopen so that subsequent reads start at the beginning.
		seekgz_close(input->input_fp);
		ret = seekgz_open(filename, input->input_fp, NULL);
		if(ret != 0){
			free(input->input_fp);
			input->input_fp = NULL;
			return ret;
		}
	}else{
		input->file_type = GENE_INPUT_FASTQ;
		input->input_fp = TMP_FP;
		fseeko(input->input_fp, 0, SEEK_SET);
		while (1){
			srInt_64 last_pos = ftello(input->input_fp);
			int rlen = read_line_noempty(MAX_READ_LENGTH, input, in_buff, 0);

			if (rlen<=0){
				// No recognisable content: release the stream instead of leaking it.
				fclose(TMP_FP);
				input->input_fp = NULL;
				return 1;
			}

			if(line_no==0)
			{
				int plain_space = is_read(in_buff);
				if(plain_space)
				{
					input->file_type = GENE_INPUT_PLAIN;
					input->space_type = plain_space;
					fseeko(input->input_fp,last_pos,SEEK_SET);
					break;
				}
			}
			if(in_buff[0]=='>')
			{
				input->file_type = GENE_INPUT_FASTA;
				// The line after the header is the sequence used for detection.
				read_line(MAX_READ_LENGTH, input->input_fp, in_buff, 0);
				input->space_type = is_read(in_buff);

				fseeko(input->input_fp,last_pos,SEEK_SET);
				break;
			}
			if(in_buff[0]=='@')
			{
				input->file_type = GENE_INPUT_FASTQ;
				read_line_noempty(MAX_READ_LENGTH, input, in_buff, 0);
				input->space_type = is_read(in_buff);
				fseeko(input->input_fp, last_pos,SEEK_SET);
				break;
			}
			line_no++;
		}
	}
	input -> read_chunk_start = geinput_file_offset(input);

	if(0 == input->space_type)input->space_type = GENE_SPACE_BASE;
	return ret;
}
543 
// Return the next base of a FASTA chromosome file.
//
// Line breaks, spaces and tabs are skipped. Return values:
//   upper-cased character : a character accepted by is_gene_char()
//   'N'  : an unknown character; a diagnostic showing the offending line is
//          printed and the stream is restored to just after that character
//   -1   : a '>' that follows a line break (the stream is moved back by the
//          number of line-break characters just skipped), or a '>' that
//          directly follows a line break at the byte before it ("empty
//          chromosome sequence")
//   -2   : end of input (including a read error, so callers cannot loop
//          forever on a failing stream)
//   -3   : the input is not GENE_INPUT_FASTA
int geinput_next_char(gene_input_t * input)
{
	int last_br = 0;	// line-break characters skipped in this call

	if(input->file_type != GENE_INPUT_FASTA)
	{
		SUBREADprintf("Only the FASTA format is accepted for input chromosome data.\n");
		return -3;
	}

	while (1)
	{
		// int, not char: EOF must stay distinguishable from the byte 0xFF.
		int nch = fgetc((FILE *)input->input_fp);

		if (nch == EOF)
			return -2;
		if (nch > 126)SUBREADprintf("\nUnrecognised char = #%d\n", nch);

		if (nch == '\r')
		{
			#ifndef __MINGW32__
			SUBREADprintf("The input FASTA file contains \\r characters. This should not result in any problem but we suggest to use UNIX-style line breaks.\n");
			#endif
			last_br ++;
			continue;
		}
		if (nch == '\n')
		{
			last_br ++;
			continue;
		}
		if (nch == ' ' || nch == '\t')
			continue;

		if (nch == '>' && last_br)
		{
			// if this is a new segment
			fseeko(input->input_fp, -last_br , SEEK_CUR);
			return -1;
		}

		if (is_gene_char(nch))
			return toupper(nch);

		{
			srInt_64 fpos = ftello(input->input_fp);
			int back_search_len =2;
			int is_empty_seq = 0;
			char out_buf[2000];

			// Walk backwards to the '\n' that precedes the offending line.
			while( fpos >= back_search_len )
			{
				int bc_nch;

				fseeko(input->input_fp, fpos - back_search_len, SEEK_SET);
				bc_nch = fgetc(input->input_fp);
				if(bc_nch=='\n')
				{
					if(nch == '>' && back_search_len==2) is_empty_seq=1;
					break;
				}
				back_search_len++;
			}

			if(NULL == fgets(out_buf, 1999,input->input_fp)) out_buf[0]=0;

			if(is_empty_seq)
			{
				if(strlen(out_buf)>0)
					out_buf[strlen(out_buf)-1]=0;
				SUBREADprintf ("\nEmpty chromosome sequence before '%s'. The file offset is %llu\n",out_buf, fpos);
				return -1;
			}

			#ifdef __MINGW32__
			SUBREADprintf ("\nUnknown character in the chromosome data: '%c' (ASCII:%02X), ignored. The file offset is %lu\n", nch, nch, fpos);
			#else
			SUBREADprintf ("\nUnknown character in the chromosome data: '%c' (ASCII:%02X), ignored. The file offset is %llu\n", nch, nch, fpos);
			#endif
			SUBREADprintf("%s", out_buf);
			// Indent the caret so that it points at the offending column.
			for(; back_search_len>2; back_search_len--)
				SUBREADprintf(" ");
			SUBREADprintf("^\n");

			fseeko(input->input_fp, fpos, SEEK_SET);
			return 'N';
		}
	}
}
639 
640 
// Peek at the next line of `input` without consuming it.
//
// The line is read with read_line() using a limit of 3000 into
// `linebuffer_3000`, and the stream is then moved back to where it was.
//
// Returns the line length, or -1 when read_line() stored nothing; in that
// case the stream position is not restored.
int geinput_readline_back(gene_input_t * input, char * linebuffer_3000)
{
	srInt_64 start_pos = ftello(input -> input_fp);
	int line_len = read_line(3000, input->input_fp, linebuffer_3000, 0);

	if(line_len >= 1)
	{
		fseeko(input -> input_fp, start_pos, SEEK_SET);
		return line_len;
	}
	return -1;
}
649 
// Both macros expect a variable `nch` and a `gene_input_t *` named `input`
// to be in scope at the point of use. Each expands to a brace-enclosed block,
// not a single statement, so it can be written without a trailing semicolon.

// Consume characters up to and including the next '\n', or until EOF.
#define SKIP_LINE { \
	nch=' '; \
	while(nch != EOF && nch != '\n') \
		nch = geinput_getc(input); \
}

// Like SKIP_LINE, but a '\n' only ends the skip once at least one other
// character has been consumed, so leading empty lines are skipped as well.
#define SKIP_LINE_NOEMPTY { \
	int content_line_l = 0; \
	nch=' '; \
	while(nch != EOF && (nch != '\n' ||! content_line_l)){ \
		nch = geinput_getc(input); \
		content_line_l += (nch != '\n'); \
	} \
}
652 
653 //#define SKIP_LINE { nch=' '; while(nch != EOF && nch != '\n') nch = geinput_getc(input); }
654 
// Count the reads remaining in `input` by counting lines, then restore the
// stream position.
//
// For SAM inputs (file_type >= GENE_INPUT_SAM_SINGLE) header lines starting
// with '@' are skipped first. Every following line that ends with '\n' is
// counted; a final line without a line break is not.
//
// Returns lines/4 for GENE_INPUT_FASTQ, lines/2 for GENE_INPUT_FASTA and the
// raw line count for every other type.
unsigned int read_numbers(gene_input_t * input)
{
	unsigned int ret = 0;
	// int, not char: SKIP_LINE compares nch with EOF, which must stay
	// distinguishable from the byte 0xFF and work where char is unsigned.
	int nch;
	srInt_64 fpos = ftello(input->input_fp);

	if(input->file_type >= GENE_INPUT_SAM_SINGLE)
	{
		while(1)
		{
			nch = fgetc(input->input_fp);
			if(nch != '@') break;
			SKIP_LINE
		}
	}

	while(1)
	{
		SKIP_LINE
		if(nch==EOF) break;
		ret ++;
	}
	fseeko(input->input_fp, fpos, SEEK_SET);

	if (input->file_type == GENE_INPUT_FASTQ) return ret/4;
	if (input->file_type == GENE_INPUT_FASTA) return ret/2;
	return ret;
}
682 
// Save the current reading position of `input` into `pos`.
//
// Single-cell BAM and single-cell FASTQ inputs delegate to scBAM_tell() and
// input_mFQ_tell(). Gzip FASTQ/FASTA inputs store the seekgz position together
// with the pending FASTA record name. All remaining types store the ftello()
// offset. BCL inputs are not supported: the assertion in that branch always
// fails when assertions are enabled.
void geinput_tell(gene_input_t * input, gene_inputfile_position_t * pos){
	if(input -> file_type == GENE_INPUT_SCRNA_BAM){
		scBAM_tell(&input -> scBAM_input, &pos -> scBAM_position);
		return;
	}
	if(input -> file_type == GENE_INPUT_SCRNA_FASTQ){
		input_mFQ_tell(&input -> scRNA_fq_input, &pos -> mFQ_position);
		return;
	}
	if(input -> file_type == GENE_INPUT_BCL){
		assert(input -> file_type != GENE_INPUT_BCL);
		return;
	}
	if(input -> file_type == GENE_INPUT_GZIP_FASTQ || input -> file_type == GENE_INPUT_GZIP_FASTA){
		seekgz_tell(( seekable_zfile_t *)input -> input_fp, &pos -> seekable_gzip_position);
		if(input -> gzfa_last_name[0])
			strcpy(pos -> gzfa_last_name, input -> gzfa_last_name);
		else
			pos -> gzfa_last_name[0]=0;
		return;
	}

	pos -> simple_file_position = ftello((FILE *)input -> input_fp);
}
698 
// Move `input` back to a position previously saved in `pos`.
//
// Single-cell BAM and single-cell FASTQ inputs delegate to scBAM_seek() and
// input_mFQ_seek(). Gzip FASTQ/FASTA inputs restore the seekgz position
// together with the pending FASTA record name. All remaining types fseeko()
// to the stored offset. BCL inputs are not supported: the assertion in that
// branch always fails when assertions are enabled.
void geinput_seek(gene_input_t * input, gene_inputfile_position_t * pos){
	if(input -> file_type == GENE_INPUT_SCRNA_BAM){
		scBAM_seek(&input -> scBAM_input, &pos -> scBAM_position);
		return;
	}
	if(input -> file_type == GENE_INPUT_SCRNA_FASTQ){
		input_mFQ_seek(&input -> scRNA_fq_input, &pos -> mFQ_position);
		return;
	}
	if(input -> file_type == GENE_INPUT_BCL){
		assert(input -> file_type != GENE_INPUT_BCL);
		return;
	}
	if(input -> file_type == GENE_INPUT_GZIP_FASTQ || input -> file_type == GENE_INPUT_GZIP_FASTA){
		seekgz_seek(( seekable_zfile_t *)input -> input_fp, &pos -> seekable_gzip_position);
		if(pos -> gzfa_last_name[0])
			strcpy(input -> gzfa_last_name, pos -> gzfa_last_name);
		else
			input -> gzfa_last_name[0]=0;
		return;
	}

	fseeko((FILE *)input -> input_fp, pos -> simple_file_position, SEEK_SET);
}
714 
// Trim bases from both ends of a read (and its quality string) in place.
//
// read_text : NUL-terminated read of length `rlen`; modified in place.
// qual_text : matching quality string, or NULL. An empty quality string is
//             treated like NULL: there is nothing to shift, and shifting it
//             would copy bytes from beyond its terminator.
// rlen      : length of the read before trimming.
// t_5       : number of leading characters to drop (negative counts as 0).
// t_3       : number of trailing characters to drop (negative counts as 0).
//
// Returns the length after trimming. When nothing would remain, both strings
// are set to empty and 0 is returned.
int trim_read_inner(char * read_text, char * qual_text, int rlen, short t_5, short t_3)
{
	int head = t_5 > 0 ? t_5 : 0;
	int tail = t_3 > 0 ? t_3 : 0;
	int kept = rlen - head - tail;

	if(qual_text && qual_text[0] == 0)
		qual_text = NULL;

	if(kept <= 0)
	{
		read_text[0]=0;
		if(qual_text)qual_text[0]=0;
		return 0;
	}

	if(head > 0)
	{
		// Source and destination overlap, hence memmove.
		memmove(read_text, read_text + head, (size_t)kept);
		if(qual_text)
			memmove(qual_text, qual_text + head, (size_t)kept);
	}

	read_text[kept]=0;
	if(qual_text)qual_text[kept]=0;

	return kept;
}
749 
tell_current_line_no(gene_input_t * input)750 srInt_64 tell_current_line_no(gene_input_t * input){
751 	srInt_64 fpos = ftello(input->input_fp);
752 	fseeko(input->input_fp,0,SEEK_SET);
753 	srInt_64 ret = 0, fscanpos = 0;
754 	while(1)
755 	{
756 		char nch = fgetc(input->input_fp);
757 		if(nch == EOF) return -1;
758 		if(nch == '\n') ret ++;
759 		fscanpos ++;
760 		if(fscanpos >= fpos){
761 			fseeko(input->input_fp, fpos, SEEK_SET);
762 			return ret;
763 		}
764 	}
765 }
766 
// Fetch the next read from `input` without trimming: forwards to
// geinput_next_read_trim() with both trim lengths set to 0 and no
// secondary-alignment flag pointer.
//
// Returns the value of geinput_next_read_trim().
int geinput_next_read(gene_input_t * input, char * read_name, char * read_string, char * quality_string)
{
	const short no_trim = 0;
	int read_len = geinput_next_read_trim( input, read_name, read_string,  quality_string, no_trim, no_trim, NULL);

	return read_len;
}
771 
772 // returns read length if OK
geinput_next_read_trim(gene_input_t * input,char * read_name,char * read_string,char * quality_string,short trim_5,short trim_3,int * is_secondary)773 int geinput_next_read_trim(gene_input_t * input, char * read_name, char * read_string, char * quality_string, short trim_5, short trim_3, int * is_secondary)
774 {
775 	if(input -> file_type == GENE_INPUT_BCL) {
776 		int rv = cacheBCL_next_read(&input -> bcl_input, read_name, read_string, quality_string, NULL);
777 		if(rv<=0) return -1;
778 		if(trim_5 || trim_3) rv = trim_read_inner(read_string, quality_string, rv, trim_5, trim_3);
779 		return rv;
780 	} else if(input -> file_type == GENE_INPUT_SCRNA_FASTQ) {
781 		int rv = input_mFQ_next_read(&input -> scRNA_fq_input, read_name, read_string, quality_string);
782 		if(rv<=0) return -1;
783 		if(trim_5 || trim_3) rv = trim_read_inner(read_string, quality_string, rv, trim_5, trim_3);
784 		return rv;
785 	} else if(input -> file_type == GENE_INPUT_SCRNA_BAM) {
786 		int rv = scBAM_next_read(&input -> scBAM_input, read_name, read_string, quality_string);
787 		if(rv<=0) return -1;
788 		if(trim_5 || trim_3) rv = trim_read_inner(read_string, quality_string, rv, trim_5, trim_3);
789 		return rv;
790 	} else if(input -> file_type == GENE_INPUT_PLAIN) {
791 		int ret = read_line(MAX_READ_LENGTH, input->input_fp, read_string, 0);
792 		if(quality_string) *quality_string=0;
793 
794 		if(ret <3)return -1;
795 
796 		if(trim_5 || trim_3) ret = trim_read_inner(read_string, NULL, ret, trim_5, trim_3);
797 		return ret;
798 	} else if(input->file_type >= GENE_INPUT_SAM_SINGLE) {
799 		char in_buff [3001];
800 		int tabs;
801 		int current_str_pos;
802 		int i;
803 		int ret = -1;
804 		int need_reverse;
805 		char mask_buf[5];
806 
807 
808 
809 		while(1)
810 		{
811 			//	int is_second_map = 0;
812 				int linelen = read_line(3000, input->input_fp, in_buff, 0);
813 				if(linelen <1)return -1;
814 				if(read_name)
815 					*read_name = 0;
816 				if(quality_string)
817 					*quality_string = 0;
818 				*read_string = 0;
819 				need_reverse = 0;
820 				current_str_pos = 0;
821 				ret = -1;
822 				tabs=0;
823 
824 				for(i=0; i<linelen+1; i++)
825 				{
826 					if(in_buff[i]=='\t'|| i ==linelen)
827 					{
828 						if(tabs == 0 && read_name)read_name[current_str_pos] = 0;
829 						if(tabs == 1)
830 						{
831 							mask_buf[current_str_pos] = 0;
832 							int flags = atoi(mask_buf) ;
833 							if(is_secondary && (flags & SAM_FLAG_SECONDARY_MAPPING))
834 							{
835 								(*is_secondary) = 1;
836 							}
837 							need_reverse = ( flags & SAM_FLAG_REVERSE_STRAND_MATCHED )?1:0;
838 
839 						}
840 						if(tabs == 9){
841 							read_string[current_str_pos] = 0;
842 							ret = current_str_pos;
843 						}
844 						if(tabs == 10 && quality_string){
845 							quality_string[current_str_pos] = 0;
846 							break;
847 						}
848 
849 						current_str_pos = 0 ;
850 						tabs +=1;
851 					}
852 					else
853 					{
854 						if(tabs == 9)// read
855 							read_string[current_str_pos++] = in_buff[i];
856 						else if(tabs == 10 && quality_string)// quality string
857 							quality_string[current_str_pos++] = in_buff[i];
858 						else if(tabs == 0 && read_name)// name
859 							read_name[current_str_pos++] = in_buff[i];
860 						else if(tabs == 1)
861 							mask_buf[current_str_pos++] = in_buff[i];
862 					}
863 				}
864 				if(input->file_type > GENE_INPUT_SAM_SINGLE)
865 					// skip a line if not single-end
866 					read_line(1, input->input_fp, in_buff, 0);
867 
868 				break;
869 				//printf("Repeated read skipped : %s\n", read_name);
870 		}
871 
872 		if(need_reverse)
873 		{
874 			if(quality_string)
875 				reverse_quality(quality_string, ret);
876 			reverse_read(read_string, ret, input->space_type);
877 		}
878 		if(trim_5 || trim_3) ret = trim_read_inner(read_string, quality_string, ret, trim_5, trim_3);
879 		return ret;
880 	} else if(input->file_type == GENE_INPUT_GZIP_FASTA) {
881 		// it is currently at ">"
882 		int tr = 0, ret = 0;
883 		char rbuf [MAX_READ_LENGTH+2];
884 		if(input -> gzfa_last_name [0] == 0){
885 			ret = read_line_noempty(MAX_READ_NAME_LEN, input, rbuf, 0);
886 			if(ret <1) return -1;
887 			if(read_name)strcpy(read_name, rbuf+1);
888 		}
889 		else if(read_name)strcpy(read_name, input -> gzfa_last_name);
890 		ret=0;
891 
892 		while(1){
893 			tr = read_line_noempty(MAX_READ_LENGTH, input, rbuf, 0);
894 			if(tr<1) {
895 				if(ret<1) return -1;
896 				break;
897 			}else{
898 				if(rbuf[0]=='>'){
899 					strcpy(input -> gzfa_last_name, rbuf+1);
900 					break;
901 				}else{
902 					strcpy(read_string+ret, rbuf);
903 					ret += tr; // read_line_noempty have no \n
904 				}
905 				read_string[ret]=0;
906 			}
907 		}
908 		if(trim_5 || trim_3) ret = trim_read_inner(read_string, quality_string, ret, trim_5, trim_3);
909 		return ret;
910 	} else if(input->file_type == GENE_INPUT_FASTA) {
911 		int ret;
912 		if(quality_string) (*quality_string)=0;
913 		#ifdef __MINGW32__
914 		assert(0);
915 		#endif
916 		while(1) // fetch read name
917 		{
918 			ret = read_line(MAX_READ_LENGTH, input->input_fp, read_string, 0);
919 			if(ret <1)
920 			{
921 				sublog_printf(SUBLOG_STAGE_RELEASED,SUBLOG_LEVEL_DEBUG, "The input file normally exhausted.");
922 				return -1;
923 			}
924 
925 			int cursor = 0;
926 			while(read_string[cursor])
927 			{
928 				if(cursor >=2 &&(read_string[cursor] == ' ' || read_string[cursor] == '\t'))
929 				{
930 					read_string [cursor] = 0;
931 					break;
932 				}
933 				cursor++;
934 			}
935 
936 			if(read_string[0]=='>'){
937 				if (read_name != NULL)
938 					strncpy(read_name, read_string+1, MAX_READ_NAME_LEN);
939 				break;
940 			}
941 			else
942 				sublog_printf(SUBLOG_STAGE_RELEASED,SUBLOG_LEVEL_FATAL,"The input file may be broken.");
943 		}
944 		ret = 0;
945 		while(1) // fetch read text
946 		{
947 			char nch = 0;
948 			ret += read_line(MAX_READ_LENGTH-ret, input->input_fp, read_string+ret, 1);
949 
950 			nch = fgetc(input->input_fp);
951 
952 			if(nch!=EOF)
953 				fseeko(input->input_fp, -1, SEEK_CUR);
954 
955 			if(nch == '>'||nch<1 || nch == EOF)
956 				break;
957 		}
958 //		printf("LOAD R=|%s|\nRETV=%d\n", read_string, ret);
959 		if(ret <1)return -1;
960 		if(trim_5 || trim_3) ret = trim_read_inner(read_string, quality_string, ret, trim_5, trim_3);
961 		return ret;
962 
963 	} else if(input->file_type == GENE_INPUT_FASTQ || input->file_type == GENE_INPUT_GZIP_FASTQ) {
964 		char nch = 0;
965 		int ret;
966 
967 		//if(input->file_type == GENE_INPUT_GZIP_FASTQ)seekgz_preload_buffer(input, NULL);
968 		//READ NAME
969 		if (read_name == NULL)
970 		{
971 			SKIP_LINE_NOEMPTY;
972 			if(nch == EOF) return -1;
973 		}
974 		else
975 		{
976 			do{
977 				nch = geinput_getc(input);
978 				//SUBREADprintf("B4_READ_NAME: %d '%c'\n", nch,nch);
979 			} while (nch == '\n');
980 			if(nch==EOF) return -1;
981 
982 			if(nch != '@') {
983 				if(input->file_type == GENE_INPUT_FASTQ){
984 					srInt_64 lineno = tell_current_line_no(input);
985 					SUBREADprintf("ERROR: a format issue %d is found on the %lld-th line in input file '%s'.\nProgram aborted.\n", nch, lineno, input -> filename);
986 				} else {
987 					SUBREADprintf("ERROR: a format issue %d is found on the input file '%s'.\nProgram aborted.\n", nch, input -> filename);
988 					SUBREADprintf("The lines after the error point:\n");
989 					read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
990 					SUBREADprintf("%s\n", read_string);
991 					read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
992 					SUBREADprintf("%s\n", read_string);
993 					read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
994 					SUBREADprintf("%s\n", read_string);
995 					read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
996 					SUBREADprintf("%s\n", read_string);
997 				}
998 				return -1;
999 			}
1000 
1001 			read_line_noempty(MAX_READ_NAME_LEN, input, read_name, 0);
1002 
1003 			int cursor = 1;
1004 			while(read_name[cursor])
1005 			{
1006 				if(read_name[cursor] == ' ' || read_name[cursor] == '\t')
1007 				{
1008 					read_name [cursor] = 0;
1009 					break;
1010 				}
1011 				cursor++;
1012 			}
1013 		}
1014 		//if(input->file_type == GENE_INPUT_GZIP_FASTQ)seekgz_preload_buffer(input, NULL);
1015 		// READ LINE
1016 		ret = read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
1017 		//SUBREADprintf("READ_SHOULD_ATGC [len=%d] : '''%s'''\n", ret, read_string);
1018 
1019 		// SKIP "+"
1020 		do{
1021 			nch = geinput_getc(input);
1022 		} while( nch == '\n' );
1023 		if(nch != '+'){
1024 			if(input->file_type == GENE_INPUT_FASTQ){
1025 				srInt_64 lineno = tell_current_line_no(input);
1026 				SUBREADprintf("ERROR: a format issue %c is found on the %lld-th line in input file '%s'.\nProgram aborted.\n", nch, lineno, input -> filename);
1027 			}else{
1028 				SUBREADprintf("ERROR: a format issue %d  (should be +) is found on the input file '%s'.\nProgram aborted.\n", nch, input -> filename);
1029 				read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
1030 				SUBREADprintf("%s\n", read_string);
1031 				read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
1032 				SUBREADprintf("%s\n", read_string);
1033 				read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
1034 				SUBREADprintf("%s\n", read_string);
1035 				read_line_noempty(MAX_READ_LENGTH, input, read_string, 0);
1036 				SUBREADprintf("%s\n", read_string);
1037 			}
1038 			return -1;
1039 		}
1040 		SKIP_LINE;
1041 
1042 		// QUAL LINE
1043 		if (quality_string)
1044 			read_line_noempty(MAX_READ_LENGTH, input, quality_string, 0);
1045 		else
1046 			SKIP_LINE_NOEMPTY;
1047 
1048 
1049 
1050 		#ifdef MODIFIED_READ_LEN
1051 		{
1052 			int modified_start = 0;
1053 			if(modified_start)
1054 			{
1055 				int i;
1056 				for(i=0;i<MODIFIED_READ_LEN; i++)
1057 				{
1058 					read_string[i] = read_string[i+modified_start];
1059 					if(quality_string)
1060 						quality_string[i] = quality_string[i+modified_start];
1061 				}
1062 			}
1063 			read_string[MODIFIED_READ_LEN]=0;
1064 			if(quality_string)
1065 				quality_string[MODIFIED_READ_LEN]=0;
1066 			ret = MODIFIED_READ_LEN;
1067 		}
1068 		#endif
1069 
1070 //		printf("LOAD R=|%s|\nRETV=%d\n", read_string, ret);
1071 
1072 		if(trim_5 || trim_3) ret = trim_read_inner(read_string, quality_string, ret, trim_5, trim_3);
1073 		return ret;
1074 
1075 	}else return -1;
1076 }
1077 
/*
 * Close the reader held by `input`.
 * The close routine is chosen from input->file_type; every file type that
 * is not listed explicitly is treated as a stdio stream and closed with
 * fclose().
 */
void geinput_close(gene_input_t * input)
{
	if(input -> file_type == GENE_INPUT_SCRNA_BAM)
	{
		input_scBAM_close(&input -> scBAM_input);
		return;
	}

	if(input -> file_type == GENE_INPUT_SCRNA_FASTQ)
	{
		input_mFQ_close(&input -> scRNA_fq_input);
		return;
	}

	if(input -> file_type == GENE_INPUT_BCL)
	{
		cacheBCL_close(&input -> bcl_input);
		return;
	}

	/* Both gzipped flavours share the seekable-zlib reader. */
	if(input -> file_type == GENE_INPUT_GZIP_FASTQ || input -> file_type == GENE_INPUT_GZIP_FASTA)
	{
		seekgz_close((seekable_zfile_t * ) input->input_fp);
		return;
	}

	fclose((FILE*)input->input_fp);
}
1091 
/*
 * Lookup table used by reverse_read(): it is indexed with the character code
 * of a base and yields the character written into the reverse strand.
 * Entries that do not correspond to a recognised base letter are 'N'
 * (the string ends with two spaces).
 * NOTE(review): presumably 'A'<->'T' and 'C'<->'G' with 'U' mapped to 'A';
 * the exact index layout was not verified here -- confirm before editing.
 */
char * __converting_char_table = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTNGNNNCNNNNNNNNNNNNAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTNGNNNCNNNNNNNNNNNNAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN  ";
1093 
reverse_read(char * InBuff,int read_len,int space_type)1094 void reverse_read(char * InBuff, int read_len, int space_type)
1095 {
1096 	int i;
1097 
1098 	if(space_type == GENE_SPACE_COLOR)
1099 	{
1100 		int start_pos = 0;
1101 		char last_base = InBuff[0];
1102 
1103 		//printf("CLRLEN0=%d\nS0=%s\n", read_len, InBuff);
1104 		if(isalpha(last_base))
1105 		{
1106 			read_len ++;
1107 
1108 			for (i=1; i<read_len; i++)
1109 			{
1110 				int new_int = InBuff[i];
1111 				int new_base = 0;
1112 				if(new_int == '0')
1113 					new_base=last_base;
1114 				else if(new_int == '1')
1115 				{
1116 					if(last_base == 'A')new_base = 'C';
1117 					else if(last_base == 'G')new_base = 'T';
1118 					else if(last_base == 'T')new_base = 'G';
1119 					else new_base = 'A';
1120 				}
1121 				else if(new_int == '2')
1122 				{
1123 					if(last_base == 'A')new_base = 'G';
1124 					else if(last_base == 'G')new_base = 'A';
1125 					else if(last_base == 'T')new_base = 'C';
1126 					else new_base = 'T';
1127 				}
1128 				else
1129 				{
1130 					if(last_base == 'A')new_base = 'T';
1131 					else if(last_base == 'G')new_base = 'C';
1132 					else if(last_base == 'T')new_base = 'A';
1133 					else new_base = 'G';
1134 				}
1135 				last_base = new_base;
1136 			//	putchar(last_base);
1137 			}
1138 			//puts("");
1139 			InBuff[0] = *(__converting_char_table+last_base);
1140 			start_pos = 1;
1141 		}
1142 		else read_len--;
1143 
1144 		for (i=0; i<(read_len - start_pos)/2; i++)
1145 		{
1146 			int rll1 = read_len - 1 - i;
1147 			char tmp = InBuff[rll1];
1148 			InBuff[rll1] = InBuff[i + start_pos];
1149 			InBuff[i + start_pos] = tmp;
1150 		}
1151 	}
1152 	else
1153 	{
1154 		for (i=0; i<read_len/2; i++)
1155 		{
1156 			int rll1 = read_len - 1 - i;
1157 			unsigned char tmp = InBuff[rll1];
1158 
1159 			InBuff[rll1] = *(__converting_char_table+InBuff[i]);
1160 			InBuff[i] = *(__converting_char_table+tmp);
1161 
1162 		}
1163 		if(i*2 == read_len-1)
1164 		{
1165 			InBuff[i] = *(__converting_char_table+InBuff[i]);
1166 		}
1167 	}
1168 
1169 }
1170 
1171 
1172 
/*
 * Reverse the first read_len characters of a quality string in place.
 * A NULL or empty buffer is left untouched.
 */
void reverse_quality(char * InBuff, int read_len)
{
	int head, tail;

	if(InBuff == NULL || InBuff[0] == 0 || read_len < 2) return;

	/* Walk inwards from both ends, swapping as we go. */
	for(head = 0, tail = read_len - 1; head < tail; head++, tail--)
	{
		char swapped = InBuff[head];
		InBuff[head] = InBuff[tail];
		InBuff[tail] = swapped;
	}
}
1186 
1187 
genekey2intX(char * key,int space_type)1188 int genekey2intX(char * key,int space_type)
1189 {
1190 	int i;
1191 	int ret;
1192 
1193 	ret = 0;
1194 	if(space_type == GENE_SPACE_BASE)
1195 		for (i=30; i>=0; i-=2)
1196 		{
1197 			char kv = *(key++);
1198 			ret |= base2int(kv)<<i;
1199 		}
1200 	else
1201 		for (i=0; i<16; i++)
1202 		{
1203 			ret = ret << 2;
1204 			ret |= color2int (key[i]);
1205 		}
1206 
1207 //	printf("RET=%u\n",ret);
1208 
1209 	return ret;
1210 }
1211 
1212 
genekey2int(char * key,int space_type)1213 int genekey2int(char *key,int space_type)
1214 {
1215 	int i;
1216 	int ret;
1217 
1218 	ret = 0;
1219 	if(space_type == GENE_SPACE_BASE)
1220 		for (i=30; i>=0; i-=2)
1221 		{
1222 			char c1 = *(key++);
1223 			ret |= base2int(c1)<<i;
1224 		}
1225 	else
1226 		for (i=0; i<16; i++)
1227 		{
1228 			ret = ret << 2;
1229 			ret |= color2int (key[i]);
1230 		}
1231 	return ret;
1232 }
1233 
/*
 * Encode 16 bases as 16 two-bit colours packed into 32 bits.
 * Each colour is chars2color(previous base, current base); the previous base
 * of key[0] is `last_base`. The first colour ends up in the most significant
 * bits.
 *
 * The value is accumulated in an unsigned int: the sixteenth shift of a
 * signed int would overflow whenever the first colour is 2 or 3, which is
 * undefined behaviour. The conversion back to int keeps the same bit pattern
 * on two's-complement implementations.
 */
int genekey2color(char last_base, char key [])
{
	int i;
	unsigned int packed = 0;
	char last_char = last_base;

	for (i=0; i<16; i++)
	{
		char next_char = key[i];

		packed = (packed << 2) + (unsigned int)chars2color(last_char, next_char);

		last_char = next_char;
	}

	return (int)packed;
}
1251 
/*
 * Decode a colour-space read in place.
 * read_buffer[0] is taken as the starting base; every following character
 * in [1, read_len) is a colour that is replaced by the base it leads to
 * from the previously decoded base. Colour '0' keeps the base; any
 * character other than '0', '1' or '2' is handled like colour '3'.
 */
void colorread2base(char * read_buffer, int read_len)
{
	/* Rows: previous base A, G, T, anything else.
	 * Columns: colour '1', colour '2', any other non-'0' colour. */
	static const char transitions[4][3] = {
		{'C', 'G', 'T'},
		{'T', 'A', 'C'},
		{'G', 'C', 'A'},
		{'A', 'T', 'G'}
	};
	char prev = read_buffer[0];
	int pos;

	for(pos = 1; pos < read_len; pos++)
	{
		char colour = read_buffer[pos];

		if(colour != '0')
		{
			int row = (prev == 'A') ? 0 : (prev == 'G') ? 1 : (prev == 'T') ? 2 : 3;
			int col = (colour == '1') ? 0 : (colour == '2') ? 1 : 2;
			prev = transitions[row][col];
		}
		read_buffer[pos] = prev;
	}
}
1289 
/*
 * Return the base reached from base `c1` through colour `clr`.
 * Colour '0' returns c1 unchanged; colours '1'..'3' use the tables below,
 * where any c1 other than 'A', 'T' or 'G' falls into the last column.
 * Any other colour character yields 'N'.
 */
char color2char(char clr, char c1)
{
	/* Each row lists the result for c1 == 'A', 'T', 'G', anything else. */
	const char * row;

	switch(clr)
	{
		case '0':
			return c1;
		case '1':
			row = "CGTA";
			break;
		case '2':
			row = "GCAT";
			break;
		case '3':
			row = "TACG";
			break;
		default:
			return 'N';
	}

	switch(c1)
	{
		case 'A': return row[0];
		case 'T': return row[1];
		case 'G': return row[2];
		default:  return row[3];
	}
}
1317 
/*
 * Return the colour (0..3) of the transition from base c1 to base c2.
 * Each base is mapped to a two-bit code (A=0, C=1, G=2, anything else=3,
 * which covers 'T' and 'U'); the colour is the XOR of the two codes.
 */
int chars2color(char c1, char c2)
{
	int from_code = (c1 == 'A') ? 0 : (c1 == 'C') ? 1 : (c1 == 'G') ? 2 : 3;
	int to_code   = (c2 == 'A') ? 0 : (c2 == 'C') ? 1 : (c2 == 'G') ? 2 : 3;

	return from_code ^ to_code;
}
1351 
find_subread_end(int len,int TOTAL_SUBREADS,int subread)1352 int find_subread_end(int len, int TOTAL_SUBREADS, int subread)
1353 {
1354 	if(len<= EXON_LONG_READ_LENGTH)
1355 	{
1356 		int subread_step =  ((len<<16) - (19<<16))/(TOTAL_SUBREADS -1);
1357 		return ((subread_step*(subread))>>16)+15;
1358 	}
1359 	else
1360 	{
1361 		int subread_step;
1362 
1363 		subread_step = 6<<16;
1364 		if(((len - 18)<<16) / subread_step > 62)
1365 			subread_step = ((len - 18)<<16)/62;
1366 		return ((subread_step*(subread))>>16)+15;
1367 	}
1368 }
1369 
fix_cigar_SAM14(char * cig)1370 void fix_cigar_SAM14(char * cig){
1371 	int tmpi = 0, ci = 0, tmpM = 0, wi = 0;
1372 	char ncig[EXON_MAX_CIGAR_LEN];
1373 
1374 	if(cig[0]=='*'){
1375 		return;
1376 	}
1377 	while(1){
1378 		int nch = cig[ci];
1379 		if(isdigit(nch)) tmpi = tmpi * 10 + nch - '0';
1380 		else{
1381 			if(nch == '=' || nch == 'X' || nch == 'M'){
1382 				tmpM += tmpi;
1383 			}else{
1384 				if(tmpM > 0){
1385 					wi += sprintf(ncig + wi, "%dM", tmpM);
1386 					tmpM = 0;
1387 				}
1388 				if(0 == nch) break;
1389 				else wi += sprintf(ncig + wi, "%d%c", tmpi, nch);
1390 			}
1391 			tmpi = 0;
1392 		}
1393 		ci++;
1394 	}
1395 	memcpy(cig, ncig, wi+1);
1396 }
1397 
/*
 * Split one tab-separated SAM line into its leading fields.
 *
 * Returns 0 if the line is a mapped read, -1 if the line is in a wrong
 * format (fewer than ten tabs) and 1 if the read is unmapped (flag bit 4).
 *
 * Fields are numbered from 0 in the order they appear on the line:
 *   0 -> read_name          1 -> *flags             2 -> chro
 *   3 -> *pos               4 -> *mapping_quality   5 -> cigar
 *   8 -> *pair_dist         9 -> sequence (*rl receives its length)
 *  10 -> quality_string
 * Fields 6 and 7 are skipped. Numeric fields are accumulated digit by digit
 * without validation; a '-' in field 8 is ignored, so *pair_dist receives
 * the magnitude only. *mapping_quality is forced to 0 for unmapped reads.
 *
 * In the fields after field 10, a field whose first two characters are "IH"
 * resets *repeated to 0 and then accumulates the characters from offset 5
 * onwards as a decimal number; *repeated is not touched otherwise.
 *
 * For mapped reads the CIGAR is passed through fix_cigar_SAM14() before
 * returning. None of the output buffers is bounds-checked here; the caller
 * must supply buffers large enough for the line.
 * When -1 is returned the scalar outputs are not written.
 */
int parse_SAM_line(char * sam_line, char * read_name, int * flags, char * chro, unsigned int * pos, char * cigar, int * mapping_quality, unsigned int * pair_dist, char * sequence, char * quality_string, int * rl, int * repeated)
{
	char cc;
	int ci = 0, k=0, field=0, ret_quality = 0, ret_flag = 0, ret_pairdist=0;
	unsigned int ret_pos = 0;
	int is_rep = 0;

	while( (cc = sam_line[k]) )
	{
		if(cc=='\t')
		{
			/* End of a field: terminate the string it was collected into. */
			field++;
			k++;
			if(field == 1)read_name[ci]=0;
			else if(field == 3)chro[ci]=0;
			else if(field == 6)cigar[ci]=0;
			else if(field == 10)
			{
				sequence[ci]=0;
				(*rl) = ci;
			}
			else if(field == 11)quality_string[ci]=0;
			ci=0;
			is_rep = 0;
			continue;
		}
		if(field == 9)
			sequence[ci++] = cc;
		else if(field == 10)
			quality_string[ci++] = cc;
		else if(field == 0)
			read_name[ci++] = cc;
		else if(field == 1)
			ret_flag = ret_flag*10 + (cc-'0');
		else if(field == 8)
		{
			if(cc!='-')
				ret_pairdist = ret_pairdist*10 + (cc-'0');
		}
		else if(field == 2)
		{
			chro[ci++] = cc;
		}
		else if(field == 3)
			ret_pos = ret_pos * 10 + (cc-'0');
		else if(field == 4)
			ret_quality = ret_quality * 10 + (cc-'0');
		else if(field == 5)
			cigar[ci++] = cc;
		else if(field > 10)
		{
			/* Recognise "IH" followed by three more characters and a number. */
			if(cc == 'I' && ci==0) is_rep = 1;
			if(cc != 'H' && ci==1 ) is_rep = 0;
			if(is_rep && ci == 4) *repeated = 0;
			if(is_rep && ci>4)
				(*repeated)=(*repeated)*10+(cc-'0');
			ci++;

		}
		k++;

	}

	/*
	 * The line ended inside the quality field (no optional fields follow).
	 * Always terminate it, even when it is empty: otherwise the caller would
	 * read whatever the buffer held before.
	 */
	if(field == 10)quality_string[ci]=0;
	else if(field < 10) return -1;

	if(ret_flag & 4)
		(*mapping_quality) = 0;
	else
		(*mapping_quality) = ret_quality;
	(*pos) = ret_pos;
	(*flags) = ret_flag;
	(*pair_dist) = ret_pairdist;
	if(((*flags) & 4) == 4) return 1;

	fix_cigar_SAM14(cigar);
	return 0;

}
1482 
1483 
1484 // This function returns 0 if the block is determined.
1485 // The block is undeterminable if the chromosome name is not in known_chromosomes, or the position is larger than the known length.
1486 // Pos is in terms of [1, ... , max_length]
get_read_block(char * chro,unsigned int pos,char * temp_file_suffix,chromosome_t * known_chromosomes,unsigned int * max_base_position)1487 int get_read_block(char *chro, unsigned int pos, char *temp_file_suffix, chromosome_t *known_chromosomes, unsigned int * max_base_position)
1488 {
1489 	int chro_no;
1490 	unsigned int max_known_chromosome=0;
1491 
1492 	for(chro_no=0;known_chromosomes[chro_no].chromosome_name[0]; chro_no++)
1493 	{
1494 		if(strcmp(chro , known_chromosomes[chro_no].chromosome_name) == 0)
1495 		{
1496 			max_known_chromosome = known_chromosomes[chro_no].known_length;
1497 			break;
1498 		}
1499 		//if(chro_no > 1)
1500 		//	printf("TOO MANY CHROS:%d\n", chro_no);
1501 	}
1502 	if(!known_chromosomes[chro_no].chromosome_name[0]) return 1;
1503 	if(pos >= known_chromosomes[chro_no].known_length) return 1;
1504 
1505 	int block_no = (pos-1) / BASE_BLOCK_LENGTH;
1506 	sprintf(temp_file_suffix , "%s-%04u.bin", chro, block_no);
1507 	if(max_base_position)*max_base_position=min((block_no+1)*BASE_BLOCK_LENGTH, max_known_chromosome);
1508 
1509 	return 0;
1510 }
1511 
/*
 * Return an append-mode stream for temp_file_name, reusing a stream cached
 * in fp_table when there is one.
 *
 * fp_table maps file names to one of:
 *   - an open FILE *  : the stream is returned as is;
 *   - NULL + 1        : the file was seen before but is not kept open;
 *   - nothing         : the file has not been seen yet.
 * fp_table->appendix1 holds the maximum number of entries that may keep an
 * open stream, stored as a pointer offset from NULL.
 *
 * *close_immediately is set to 1 when the returned stream is not cached and
 * must be closed by the caller after use, and to 0 otherwise.
 * Returns NULL when the key cannot be allocated or the file cannot be opened.
 *
 * The key is allocated only when a new entry is inserted (the table then
 * owns it) and is sized from the file name, so names of any length are
 * handled and nothing leaks when the file is not cached or fails to open.
 */
FILE * get_temp_file_pointer(char *temp_file_name, HashTable* fp_table, int * close_immediately)
{
	FILE * temp_file_pointer = (FILE *) HashTableGet(fp_table, temp_file_name);
	*close_immediately = 0;

	if(temp_file_pointer == NULL || temp_file_pointer == NULL + 1) {
		int need_put = (temp_file_pointer == NULL );
		char *key_name = NULL;

		if(need_put){
			key_name = (char *)SUBREAD_malloc(strlen(temp_file_name) + 1);
			if(!key_name)
				return NULL;
			strcpy(key_name, temp_file_name);
		}

		temp_file_pointer = f_subr_open(temp_file_name,"ab");

		if(!temp_file_pointer){
			SUBREADprintf("File cannot be opened: '%s'.\nPlease increase the maximum open files by command 'ulimit -n'.\nThis number should be set to at least 500 for human genome, and more chromosomes require more opened files.\n\n", temp_file_name);
			free(key_name);
			return NULL;
		}

		int maximum_open_file =  fp_table -> appendix1 - NULL;
		if( fp_table -> numOfElements < maximum_open_file && need_put)
			HashTablePut(fp_table, key_name ,temp_file_pointer);
		else{
			/* Too many cached streams: remember the name only and let the caller close. */
			if(need_put)
				HashTablePut(fp_table, key_name , NULL + 1);
			*close_immediately = 1;
		}
	}

	return temp_file_pointer;
}
1543 
/*
 * Value deallocator for the file-pointer table: closes the stream unless the
 * value is NULL or the NULL+1 placeholder that marks a file not kept open.
 */
void my_fclose(void * fp)
{
	if(!fp || fp == NULL+1)
		return;

	fclose((FILE *)fp);
}
1549 
/*
 * Key comparison callback for hash tables keyed by C strings:
 * forwards both keys to strcmp() and returns its result.
 */
int my_strcmp(const void * s1, const void * s2)
{
	return strcmp((char*)s1, (char*)s2);
}
1556 
write_read_block_file(FILE * temp_fp,unsigned int read_number,char * read_name,int flags,char * chro,unsigned int pos,char * cigar,int mapping_quality,char * sequence,char * quality_string,int rl,int is_sequence_needed,char strand,unsigned short read_pos,unsigned short read_len,unsigned short mapped_seg)1557 int write_read_block_file(FILE *temp_fp , unsigned int read_number, char *read_name, int flags, char * chro, unsigned int pos, char *cigar, int mapping_quality, char *sequence , char *quality_string, int rl , int is_sequence_needed, char strand, unsigned short read_pos, unsigned short read_len, unsigned short mapped_seg)
1558 {
1559 	base_block_temp_read_t datum;
1560 	memset(&datum,0,sizeof(datum));
1561 	datum.record_type = 100;
1562 	datum.read_number = read_number;
1563 	datum.pos = pos;
1564 	datum.flags = flags;
1565 	datum.strand = strand;
1566 	datum.read_pos = read_pos;
1567 	datum.read_len = read_len;
1568 	datum.mapping_quality = mapping_quality;
1569 	datum.mapped_segment_in_read = mapped_seg;
1570 
1571 	if(rl < 1|| rl > MAX_READ_LENGTH)
1572 	{
1573 
1574 		SUBREADprintf("READ IS TOO LONG:%d\n", rl);
1575 		return -1;
1576 	}
1577 
1578 	fwrite(&datum, sizeof(datum), 1, temp_fp);
1579 	if(is_sequence_needed)
1580 	{
1581 		unsigned short srl = rl&0xffff;
1582 		int wlen = fwrite(&srl, sizeof(short),1, temp_fp);
1583 		if(wlen != 1) return -1;
1584 		wlen = fwrite(sequence , 1, rl,temp_fp );
1585 		if(wlen != rl) return -1;
1586 		wlen = fwrite(quality_string , 1, rl,temp_fp );
1587 		if(wlen != rl) return -1;
1588 	}
1589 	return 0;
1590 }
1591 
1592 
/*
 * Read the header of a SAM/BAM file and append one entry to
 * known_chromosomes for every "@SQ" line.
 *
 * The table is terminated by an entry with an empty chromosome_name; each
 * new entry goes into the first empty slot and the following slot is marked
 * empty. For an "@SQ" line, the second tab-separated field minus its first
 * three characters becomes the chromosome name, and the third field minus
 * its first three characters is parsed as the decimal length (i.e. the line
 * is expected to look like "@SQ<TAB>SN:name<TAB>LN:length<TAB>...").
 * The name is terminated only when a tab follows it.
 * Reading stops at the first line that does not start with '@'.
 *
 * Returns 0 on success and -1 when the file cannot be opened or the table
 * already holds XOFFSET_TABLE_SIZE entries. The reader is closed on every
 * path.
 */
int get_known_chromosomes(char * in_SAM_file, chromosome_t * known_chromosomes)
{
	int i, is_first_read_PE, ret = 0;
	int is_BAM = is_certainly_bam_file(in_SAM_file,  &is_first_read_PE, NULL);
	SamBam_FILE * fp = SamBam_fopen(in_SAM_file,is_BAM?SAMBAM_FILE_BAM:SAMBAM_FILE_SAM);

	if(!fp)
	{
		SUBREADprintf("SAM file does not exist or is not accessible: '%s'\n", in_SAM_file);
		return -1;
	}

	while(1)
	{
		char line_buffer [3000];
		char * is_ret = SamBam_fgets(fp, line_buffer, 2999, 0);
		if(!is_ret) break;
		int linelen = strlen(line_buffer);

		if(line_buffer[0]=='@')
		{
			int chro_numb=0, field = 0, ci=0, ciw = 0;
			if(line_buffer[1]!='S' || line_buffer[2]!='Q' || line_buffer[3]!='\t' ) continue;

			/* Find the first free slot without scanning past the table. */
			while(chro_numb < XOFFSET_TABLE_SIZE && known_chromosomes[chro_numb].chromosome_name[0]!=0) chro_numb++;
			if(chro_numb > XOFFSET_TABLE_SIZE-1)
			{
				SUBREADprintf("FATAL ERROR: the number of chromosomes excessed %d\n", XOFFSET_TABLE_SIZE);
				ret = -1;
				break;
			}
			known_chromosomes[chro_numb].known_length = 0;
			for(i=0; i< linelen; i++)
			{
				char cc = line_buffer[i];

				if(cc == '\r' || cc=='\n') continue;

				if(cc == '\t')
				{
					if(field == 1)
						known_chromosomes[chro_numb].chromosome_name[ciw]=0;
					ci = 0;
					ciw = 0;
					field ++;
				}
				else if(field == 1)
				{
					/* skip the three-character tag prefix */
					if(ci >2)
						known_chromosomes[chro_numb].chromosome_name[ciw++]=cc;
					ci++;
				}
				else if(field == 2)
				{
					/* skip the three-character tag prefix */
					if(ci >2)
						known_chromosomes[chro_numb].known_length = known_chromosomes[chro_numb].known_length * 10 + (cc - '0');
					ci++;
				}
			}
			/* Keep the table terminated, as break_SAM_file() does. */
			if(chro_numb < XOFFSET_TABLE_SIZE-1) known_chromosomes[chro_numb+1].chromosome_name[0]=0;
		}
		else
			break;
	}
	SamBam_fclose(fp);
	return ret;
}
1652 
/*
 * Record one indel observed at (chro, chro_pos) in the event table.
 * Indels longer than 100 bases are ignored.
 *
 * Layout of the table, as used here:
 *   key "chro\tpos"       -> number of distinct indels known at this position
 *                            (stored as a pointer offset from NULL);
 *   key "chro\tpos\tN"    -> N-th indel at this position, packed as
 *                            ((0x80 + indels) & 0xff) | (event id << 8);
 *   appendix1[event id]   -> copy of the -indels bytes of ins_seq, written
 *                            only when indels < 0;
 *   appendix2[event id]   -> occurrence counter (unsigned short, capped at 65000);
 *   counter1              -> capacity of both arrays;
 *   counter2              -> number of event ids handed out.
 *
 * If the same indel length is already known at this position its counter is
 * incremented; otherwise a new event id is allocated (up to 0xffff00 ids).
 *
 * All memory needed for a new event is obtained before the table is
 * modified; if any allocation fails the event is dropped and the table is
 * left consistent.
 */
void add_cigar_indel_event(HashTable * event_table_ptr, char * chro, unsigned int chro_pos, int indels , char * ins_seq)
{
	if(abs(indels)>100) return;

	char count_token[100], event_token[100];
	unsigned int count_token_len, event_token_len;
	int x1;

	count_token_len = snprintf(count_token, 99,"%s\t%u", chro, chro_pos);

	int exist_indel_count = HashTableGet(event_table_ptr, count_token) - NULL;
	unsigned short * app2_ptr =  event_table_ptr->appendix2;

	/* Is this indel length already recorded at this position? */
	for(x1 = 0; x1< exist_indel_count; x1++)
	{
		snprintf(event_token, 99,"%s\t%u\t%d", chro, chro_pos, x1);
		srInt_64 t64v =  (HashTableGet(event_table_ptr, event_token)-NULL);
		srInt_64 indel_len = (t64v&0xff) - 0x80;
		if(indel_len == indels){
			unsigned int known_event_id = 0xffffff&(t64v >> 8) ;
			if(app2_ptr[known_event_id]<65000)
				app2_ptr[known_event_id] +=1;
			return;
		}
	}

	if(!(event_table_ptr->counter2<0xffff00)) return;

	unsigned int event_space_max_size = event_table_ptr-> counter1;
	unsigned int indel_event_id = event_table_ptr->counter2;

	if(indel_event_id >= event_space_max_size)
	{
		/* Double both arrays; keep the old pointers valid if realloc fails. */
		void * grown_seqs = realloc(event_table_ptr->appendix1 , sizeof(char *) * event_space_max_size*2);
		if(!grown_seqs) return;
		event_table_ptr->appendix1 = grown_seqs;

		void * grown_counts = realloc(event_table_ptr->appendix2 , sizeof(short) * event_space_max_size*2);
		if(!grown_counts) return;
		event_table_ptr->appendix2 = grown_counts;

		/* Zero the newly added half of the counters (byte offset). */
		memset((char *)grown_counts + sizeof(short) * event_space_max_size, 0, sizeof(short) * event_space_max_size);
		event_table_ptr-> counter1 = event_space_max_size*2;
		app2_ptr =  grown_counts;
	}

	/* Allocate everything the new event needs before touching the table. */
	char * token_1 = NULL;
	if(exist_indel_count<1)
	{
		token_1 = malloc(count_token_len+1);
		if(!token_1) return;
		strcpy(token_1, count_token);
	}

	event_token_len=snprintf(event_token, 99,"%s\t%u\t%d", chro, chro_pos, exist_indel_count);
	char * token_2 = malloc(event_token_len+1);
	if(!token_2)
	{
		free(token_1);
		return;
	}
	strcpy(token_2, event_token);

	char * ins_seq_2 = NULL;
	if(indels<0)
	{
		ins_seq_2 = malloc(-indels);
		if(!ins_seq_2)
		{
			free(token_1);
			free(token_2);
			return;
		}
		memcpy(ins_seq_2, ins_seq, -indels);
	}

	/* Commit: hand out the id, bump the per-position count, store the event. */
	event_table_ptr->counter2 ++;

	if(token_1)
		HashTablePut(event_table_ptr, token_1, NULL+1);
	else
		HashTablePutReplace(event_table_ptr, count_token, NULL+exist_indel_count+1, 0);

	srInt_64 indel_event_id_long = indel_event_id;
	app2_ptr[indel_event_id] +=1;

	HashTablePut(event_table_ptr, token_2, NULL + ((0xff & (0x80 + indels)) | ((indel_event_id_long&0xffffff) << 8)));
	if(ins_seq_2)
	{
		char ** app1_ptrptr = event_table_ptr->appendix1;
		app1_ptrptr[indel_event_id] = ins_seq_2;
	}
}
1721 
/*
 * Free an indel event table: every key is released, then the two appendix
 * arrays, then the table itself. For keys that contain exactly three tab
 * characters, the sequence stored in appendix1 under the event id taken from
 * bits 8..31 of the value is released as well.
 */
void destroy_cigar_event_table(HashTable * event_table)
{
	char ** seq_tab = event_table->appendix1;
	int bucket;

	for(bucket=0; bucket<event_table -> numOfBuckets; bucket++)
	{
		KeyValuePair * cursor;

		for(cursor = event_table -> bucketArray[bucket]; cursor; cursor = cursor->next)
		{
			char * token = (char *)cursor -> key;
			char * scan;
			int tabs = 0;

			for(scan = token; *scan; scan++)
				tabs += (*scan == '\t');

			if(tabs==3)
			{
				srInt_64 tmpv = cursor -> value - NULL;
				unsigned int event_id = (tmpv>>8)&0xffffff;
				free(seq_tab[event_id]);
			}
			free(token);
		}
	}

	free(event_table->appendix1);
	free(event_table->appendix2);
	HashTableDestroy(event_table);
}
1756 
/*
 * Scan a VCF file and append one SNP record to the matching temporary block
 * file for every usable line.
 *
 * Lines starting with '#' and lines containing the text "INDEL" are skipped.
 * The first five tab-separated columns are read as chromosome, position,
 * (ignored), reference allele and alternative allele(s); lines with fewer
 * columns are skipped. A line counts as a SNP when the alternative allele --
 * or, for a comma-separated list, any one of them -- has the same length as
 * the reference allele. Positions that get_read_block() cannot place are
 * skipped.
 *
 * Each record is a VCF_temp_read_t with record_type 200 and type
 * CHRO_EVENT_TYPE_SNP, written to "<temp_file_prefix><block suffix>" through
 * get_temp_file_pointer(); streams that are not cached in fp_table are
 * closed right after the write.
 */
void break_VCF_file(char * vcf_file, HashTable * fp_table, char * temp_file_prefix, chromosome_t* known_chromosomes)
{
	autozip_fp vzfp;
	int vret = autozip_open(vcf_file, &vzfp);
	char temp_file_suffix[MAX_CHROMOSOME_NAME_LEN+20];
	int close_now = 0;

	if(vret < 0)
	{
		SUBREADprintf("The specified VCF does not exist.\n");
		return;
	}

	char * linebuf = malloc(3000);
	char * tmpfname = malloc(MAX_FILE_NAME_LENGTH);

	if(!linebuf || !tmpfname)
	{
		SUBREADprintf("Out of memory when loading the VCF file.\n");
		free(linebuf);
		free(tmpfname);
		autozip_close(&vzfp);
		return;
	}

	while(1)
	{
		char * tok_tmp;
		int aretc = autozip_gets(&vzfp, linebuf, 2999);
		if(aretc < 1) break;
		if(linebuf[0]=='#') continue;
		if(strstr(linebuf, "INDEL")) continue;

		/*
		 * strtok_r() reports a missing column through its return value;
		 * the save pointer is not a reliable indicator.
		 */
		char * chro = strtok_r(linebuf, "\t", &tok_tmp);
		if(!chro) continue;
		char * pos_str = strtok_r(NULL, "\t", &tok_tmp);
		if(!pos_str) continue;

		if(!strtok_r(NULL, "\t", &tok_tmp)) continue;// name

		char * ref_seq = strtok_r(NULL, "\t", &tok_tmp);
		if(!ref_seq) continue;
		char * alt_seq = strtok_r(NULL, "\t", &tok_tmp);
		if(!alt_seq) continue;

		int is_snp = 0;
		if(strstr(alt_seq,","))
		{
			char * com_tmp = NULL;
			char * com_sec = strtok_r(alt_seq, ",", &com_tmp);
			while(com_sec)
			{
				if(strlen(com_sec)==strlen(ref_seq))
				{
					is_snp=1;
					break;
				}

				com_sec = strtok_r(NULL,  ",", &com_tmp);
			}

		}else if(strlen(ref_seq) == strlen(alt_seq)) is_snp=1;

		if(!is_snp)continue;
		unsigned int max_section_pos;
		int vcf_pos = atoi(pos_str);

		if(get_read_block(chro, vcf_pos , temp_file_suffix, known_chromosomes, &max_section_pos))continue;
		snprintf(tmpfname, MAX_FILE_NAME_LENGTH, "%s%s", temp_file_prefix , temp_file_suffix);
		FILE * temp_fp = get_temp_file_pointer(tmpfname, fp_table, &close_now);
		if(temp_fp)
		{
			VCF_temp_read_t datum;
			/* Zero the record first so no uninitialised bytes reach the file. */
			memset(&datum, 0, sizeof(datum));
			datum.record_type = 200;
			datum.pos = vcf_pos;
			datum.type = CHRO_EVENT_TYPE_SNP;
			fwrite(&datum, sizeof(VCF_temp_read_t), 1, temp_fp);
			if(close_now) fclose(temp_fp);
		}
	}

	free(linebuf);
	free(tmpfname);
	autozip_close(&vzfp);
}
1834 
break_SAM_file(char * in_SAM_file,int is_BAM_file,char * temp_file_prefix,unsigned int * real_read_count,int * block_count,chromosome_t * known_chromosomes,int is_sequence_needed,int base_ignored_head_tail,gene_value_index_t * array_index,gene_offset_t * offsets,srInt_64 * all_mapped_bases,HashTable * event_table,char * VCF_file,srInt_64 * all_mapped_reads,int do_fragment_filtering,int push_to_read_head,int use_softclipped_bases)1835 int break_SAM_file(char * in_SAM_file, int is_BAM_file, char * temp_file_prefix, unsigned int * real_read_count, int * block_count, chromosome_t * known_chromosomes, int is_sequence_needed, int base_ignored_head_tail, gene_value_index_t *array_index, gene_offset_t * offsets, srInt_64 * all_mapped_bases, HashTable * event_table, char * VCF_file, srInt_64 * all_mapped_reads, int do_fragment_filtering, int push_to_read_head, int use_softclipped_bases )
1836 {
1837 	int i, is_first_read=1, is_error = 0;
1838 	HashTable * fp_table;
1839 	unsigned int read_number = 0;
1840 	char line_buffer [3000];
1841 	SamBam_FILE  * sambam_reader;
1842 
1843 	sambam_reader = SamBam_fopen(in_SAM_file, is_BAM_file?SAMBAM_FILE_BAM:SAMBAM_FILE_SAM);
1844 
1845 	if(!sambam_reader){
1846 		SUBREADprintf("SAM file does not exist or is not accessible: '%s'\n", in_SAM_file);
1847 		return 1;
1848 	}
1849 	if(push_to_read_head)assert(is_sequence_needed==0);
1850 
1851 	fp_table = HashTableCreate( 11011 );
1852 	HashTableSetDeallocationFunctions(fp_table, free, my_fclose);
1853 	HashTableSetKeyComparisonFunction(fp_table, my_strcmp);
1854 	HashTableSetHashFunction(fp_table,HashTableStringHashFunction);
1855 
1856 	char * fns = malloc(200);
1857 	fns[0]=0;
1858 	exec_cmd("ulimit -n", fns, 200);
1859 	int max_open_file = atoi(fns);
1860 	//SUBREADprintf("SYS FILE LIMIT=%d\n", max_open_file);
1861 	free(fns);
1862 
1863 	max_open_file = max(100, max_open_file);
1864 	max_open_file = min(3000, max_open_file);
1865 
1866 	fp_table -> appendix1 = NULL + max_open_file  * 2/ 3;
1867 
1868 	if(event_table!=NULL && event_table->appendix1==NULL)
1869 	{
1870 		event_table->appendix1 = malloc(sizeof(char *) * 100);
1871 		event_table->appendix2 = malloc(sizeof(unsigned short) * 100);
1872 		memset(event_table->appendix2, 0, sizeof(unsigned short) * 100);
1873 		event_table->counter1 = 100;
1874 		event_table->counter2 = 0;
1875 	}
1876 
1877 	while(1)
1878 	{
1879 		//srInt_64 file_position = ftello(fp);
1880 		//int linelen = read_line(2999, fp, line_buffer, 0);
1881 		char * is_ret = SamBam_fgets(sambam_reader, line_buffer, 2999, 1);
1882 
1883 		if(!is_ret) break;
1884 
1885 		if(line_buffer[0]=='@')
1886 		{
1887 			int chro_numb=0, field = 0, ci=0, ciw = 0;
1888 			if(line_buffer[1]!='S' || line_buffer[2]!='Q' || line_buffer[3]!='\t' ) continue;
1889 
1890 			while(known_chromosomes[chro_numb].chromosome_name[0]!=0) chro_numb++;
1891 
1892 			if(chro_numb > XOFFSET_TABLE_SIZE-1)
1893 			{
1894 				SUBREADprintf("FATAL ERROR: the number of chromosomes excessed %d\n", XOFFSET_TABLE_SIZE);
1895 				return -1;
1896 			}
1897 
1898 			known_chromosomes[chro_numb].known_length = 0;
1899 			for(i=0; ; i++)
1900 			{
1901 				char cc = line_buffer[i];
1902 				if(!cc) break;
1903 
1904 				if(cc == '\r' || cc=='\n') continue;
1905 
1906 				if(cc == '\t')
1907 				{
1908 					if(field == 1)
1909 						known_chromosomes[chro_numb].chromosome_name[ciw]=0;
1910 					ci = 0;
1911 					ciw = 0;
1912 					field ++;
1913 				}
1914 				else if(field == 1)
1915 				{
1916 					if(ci >2)
1917 						known_chromosomes[chro_numb].chromosome_name[ciw++]=cc;
1918 					ci++;
1919 				}
1920 				else if(field == 2)
1921 				{
1922 					if(ci >2)
1923 						known_chromosomes[chro_numb].known_length = known_chromosomes[chro_numb].known_length * 10 + (cc - '0');
1924 					ci++;
1925 				}
1926 			}
1927 			if(chro_numb < XOFFSET_TABLE_SIZE-1) known_chromosomes[chro_numb+1].chromosome_name[0]=0;
1928 		}
1929 		else
1930 		{
1931 			char read_name[MAX_READ_NAME_LEN], chro[MAX_CHROMOSOME_NAME_LEN], cigar[EXON_MAX_CIGAR_LEN], sequence[MAX_READ_LENGTH+1], quality_string[MAX_READ_LENGTH+1];
1932 			int flags = 0, mapping_quality = 0, rl=0;
1933 			char is_negative_strand = 0;
1934 			unsigned int pos = 0, pairdist = 0;
1935 			char temp_file_suffix[MAX_FILE_NAME_LENGTH];
1936 			char temp_file_name[MAX_FILE_NAME_LENGTH];
1937 			FILE * temp_fp;
1938 			int repeated = -1, close_now = 0;
1939 
1940 			if(is_first_read)
1941 			{
1942 				is_first_read=0;
1943 
1944 				if(VCF_file && VCF_file[0])
1945 					break_VCF_file(VCF_file, fp_table, temp_file_prefix, known_chromosomes);
1946 			}
1947 
1948 
1949 			//SUBREADprintf("ARRI_0=%p ; OFFS=%p ; EVT=%p\n%s\n",array_index, offsets, event_table, line_buffer);
1950 			int line_parse_result = parse_SAM_line(line_buffer, read_name, &flags, chro, &pos, cigar, & mapping_quality, &pairdist, sequence, quality_string, &rl, &repeated);
1951 			if(line_parse_result<0)SUBREADprintf("WRONG LINE FORMAT: %s\n", line_buffer);
1952 
1953 			if(strlen(quality_string)<2)
1954 			{
1955 				int xk1;
1956 				for(xk1=0; xk1<rl; xk1++)
1957 				{
1958 					quality_string[xk1]='I';
1959 				}
1960 				quality_string[xk1]=0;
1961 			}
1962 
1963 			if(line_parse_result || (flags & SAM_FLAG_UNMAPPED)){
1964 				read_number ++;
1965 				continue;
1966 			}
1967 
1968 			if(do_fragment_filtering && (flags & SAM_FLAG_PAIRED_TASK) && (pairdist ==0 || pairdist > 500000)){
1969 				read_number ++;
1970 				continue;
1971 			}
1972 
1973 			if(do_fragment_filtering && array_index)
1974 			{
1975 				int mismatch = 0;
1976 
1977 				unsigned int linear_pos = linear_gene_position(offsets , chro, pos)-1;
1978 				float match_rate = final_mapping_quality(array_index, linear_pos, sequence, quality_string, cigar, FASTQ_PHRED33,  & mismatch,  rl, NULL, NULL);
1979 				if(mismatch>8 || match_rate < 160)
1980 				{
1981 					read_number ++;
1982 					continue;
1983 				}
1984 			}
1985 
1986 			is_negative_strand = (flags & SAM_FLAG_REVERSE_STRAND_MATCHED)?1:0;
1987 			if((flags & 4) ==0 && all_mapped_reads)(*all_mapped_reads)++;
1988 
1989 
1990 			if(is_sequence_needed == 2)
1991 			{
1992 
1993 			}
1994 			else if(is_sequence_needed == 1)
1995 			{
1996 				int read_cursor = 0;
1997 				int is_first_S = 1;
1998 				unsigned int chromosome_cursor = pos;
1999 				int j, tmpv=0;
2000 				char cc;
2001 				unsigned short M_parts=0;
2002 
2003 				for(j=0; cigar[j]; j++)
2004 				{
2005 					cc = cigar[j];
2006 					if(cc>='0' && cc<='9') tmpv= tmpv*10+(cc-'0');
2007 					else if(cc == 'S'||cc == 'M')
2008 					{
2009 						if(cc == 'M') is_first_S = 0;
2010 
2011 						if(cc == 'M' || use_softclipped_bases)
2012 						{
2013 							unsigned int insertion_cursor = chromosome_cursor - ((cc=='S' && is_first_S)?tmpv:0);
2014 							unsigned int insertion_end = chromosome_cursor + ((cc=='S' && is_first_S)?0:tmpv);
2015 							// DO INSERTION
2016 							while(insertion_cursor < insertion_end && read_cursor < (rl - base_ignored_head_tail))
2017 							{
2018 								unsigned int max_section_pos, insert_length;
2019 								int need_write = 1;
2020 
2021 								if(get_read_block(chro, insertion_cursor , temp_file_suffix, known_chromosomes, &max_section_pos))break;
2022 								insert_length = min(max_section_pos + 1, insertion_end) - insertion_cursor;
2023 								if(insert_length<1) break;
2024 
2025 								if(base_ignored_head_tail)
2026 								{
2027 									if(read_cursor+insert_length < base_ignored_head_tail)
2028 										need_write = 0;
2029 									else if(read_cursor < base_ignored_head_tail)
2030 									{
2031 										int ignored_length = base_ignored_head_tail - read_cursor;
2032 										insert_length = read_cursor + insert_length - base_ignored_head_tail;
2033 
2034 										read_cursor = base_ignored_head_tail;
2035 										insertion_cursor += ignored_length;
2036 									}
2037 
2038 									if(read_cursor >= (rl - base_ignored_head_tail))
2039 										need_write = 0;
2040 									else if(read_cursor +insert_length >= (rl - base_ignored_head_tail))
2041 										insert_length = (rl - base_ignored_head_tail) - read_cursor;
2042 								}
2043 //								printf("INST: RL=%d; INSL=%d; READ_CUR=%d; IGNORE=%d\n", rl, insert_length, read_cursor , base_ignored_head_tail);
2044 
2045 //#warning " ======= DEBUG OUT ========="
2046 								if(0 && FIXLENstrcmp("SRR768163.14829906", read_name) == 0)
2047 									SUBREADprintf("INST: RL=%d; NEED=%d; INSL=%d; READ_CUR=%d; IGNORE=%d; RN=%s\nWRT AT %u (one-based): %s\n\n", rl, need_write, insert_length, read_cursor , base_ignored_head_tail, read_name, insertion_cursor, sequence + read_cursor);
2048 
2049 								if(0 && strcmp(chro, "chr12") == 0 && insertion_cursor <= 114788620  && insertion_cursor + insert_length > 114788620){
2050 									int read_pos0 = 114788620 - insertion_cursor + read_cursor;
2051 									SUBREADprintf("INST_114788620 : %s : val=%c ; NEED=%d\n", read_name, sequence[read_pos0], need_write);
2052 								}
2053 
2054 								if(need_write  && insert_length > 0 && sequence[0]!='*') {
2055 									sprintf(temp_file_name, "%s%s", temp_file_prefix , temp_file_suffix);
2056 									temp_fp = get_temp_file_pointer(temp_file_name, fp_table, &close_now);
2057 									if(!temp_fp) return -1;
2058 									if(all_mapped_bases)
2059 										(*all_mapped_bases) += insert_length;
2060 
2061 									is_error |= write_read_block_file(temp_fp , read_number, read_name, flags, chro, insertion_cursor, cigar, mapping_quality, sequence + read_cursor , quality_string + read_cursor, insert_length , 1, is_negative_strand, read_cursor, rl, M_parts);
2062 									if(close_now) fclose(temp_fp);
2063 								}
2064 								insertion_cursor += insert_length;
2065 								read_cursor += insert_length;
2066 							}
2067 							if(M_parts < 65535)M_parts ++;
2068 						}
2069 						else
2070 							read_cursor += tmpv;
2071 
2072 						if(!is_first_S)
2073 							chromosome_cursor += tmpv;
2074 
2075 						tmpv=0;
2076 					}
2077 					else if(cc == 'D' || cc == 'N')
2078 					{
2079 						// the left edge ( last WANTED base ) is chromosome_cursor-1
2080 						// the indel length is tmpv;
2081 						// now we add this into the event table.
2082 						if(event_table && cc=='D')
2083 							add_cigar_indel_event(event_table, chro, chromosome_cursor-1, tmpv, NULL);
2084 						chromosome_cursor += tmpv;
2085 						tmpv = 0;
2086 					}
2087 					else if(cc == 'I' )
2088 					{
2089 						// the left edge ( last WANTED base ) is chromosome_cursor-1
2090 						// the indel length is -tmpv;
2091 						// now we add this into the event table.
2092 						if(event_table &&  sequence[0]!='*')
2093 							add_cigar_indel_event(event_table, chro, chromosome_cursor-1, -tmpv, sequence + read_cursor);
2094 						read_cursor += tmpv;
2095 						tmpv = 0;
2096 					}
2097 					else	tmpv = 0;
2098 
2099 				}
2100 
2101 			}else{ // NO sequence is needed : no CIGAR is parsed.
2102 				int cgi, cc;
2103 				int pushback = 0;
2104 
2105 				for(cgi=0; cigar[cgi]; cgi++){
2106 					cc = cigar[cgi];
2107 					if(cc >='0' && cc<='9') pushback = pushback*10 + cc-'0';
2108 					else{
2109 						if(cc!='S') pushback=0;
2110 						break;
2111 					}
2112 				}
2113 
2114 				assert(pos>=pushback);
2115 				pos -= pushback;
2116 
2117 				if(get_read_block(chro, pos, temp_file_suffix, known_chromosomes, NULL)) {
2118 					read_number ++;
2119 					continue;
2120 				}
2121 				sprintf(temp_file_name, "%s%s", temp_file_prefix , temp_file_suffix);
2122 
2123 				temp_fp = get_temp_file_pointer(temp_file_name, fp_table, &close_now);
2124 				is_error |= write_read_block_file(temp_fp , read_number, read_name, flags, chro, pos, cigar, mapping_quality, sequence , quality_string, rl , is_sequence_needed, is_negative_strand, 0,rl, 0);
2125 				if(close_now)fclose(temp_fp);
2126 			}
2127 			read_number ++;
2128 		}
2129 	}
2130 
2131 	if(block_count)
2132 		(*block_count) = fp_table->numOfElements;
2133 	HashTableDestroy(fp_table);
2134 	SamBam_fclose(sambam_reader);
2135 	if(real_read_count)
2136 		(*real_read_count) = read_number;
2137 	if(is_error){
2138 		SUBREADprintf("ERROR: cannot write into the temporary files. Please check the disk space in the temp directory.\n");
2139 	}
2140 	return is_error;
2141 }
2142 
/*
 * Tests whether a linear genome offset falls inside an annotated exon.
 *
 * output_genes : gene table; scanning stops at the first gene whose end_offset is
 *                zero, and within a gene at the first exon whose exon_ends value
 *                is zero (the terminators written by load_exon_annotation()).
 * offset       : linear position to look up.
 * is_start     : non-zero to test for an exact hit on an exon start, zero to test
 *                for an exact hit on an exon end.
 *
 * Returns 2 when offset sits exactly on the requested exon edge, 1 when it is only
 * enclosed by an exon, and 0 when no exon contains it.
 */
int is_in_exon_annotations(gene_t *output_genes, unsigned int offset, int is_start)
{
	int i,j;

	for(i=0; i< MAX_ANNOTATION_EXONS; i++)
	{
		if(!output_genes[i].end_offset) break;

		// Skip genes whose overall span does not cover the offset.
		if(output_genes[i].end_offset < offset || output_genes[i].start_offset > offset)
			continue;

		for(j=0; j< MAX_EXONS_PER_GENE; j++)
		{
			// A zero end closes this gene's exon list; the slots behind it were never
			// written by the loader, so they must not be inspected.
			if(!output_genes[i].exon_ends[j]) break;

			if(output_genes[i].exon_ends[j] >= offset && output_genes[i].exon_starts[j] <= offset)
			{
				if(output_genes[i].exon_starts[j] == offset && is_start) return 2;	// 2==exactly matched
				if(output_genes[i].exon_ends[j] == offset && !is_start)	return 2;
				return 1;	// 1==enclosed
			}
		}
	}
	return 0;	//0==exon not found
}
2165 
/*
 * Loads a tab-delimited exon annotation file into a freshly allocated gene table.
 *
 * Each data line is expected to hold: gene name, chromosome name, exon start and
 * exon end. Lines whose first character is not a digit are skipped as titles.
 * Consecutive lines with the same gene name are collected into one gene_t; the
 * exon list of a gene is closed with exon_ends == 0 and the slot after the last
 * gene is marked with end_offset == 0. Exons whose chromosome cannot be resolved
 * by linear_gene_position() are dropped. Reading stops at the first empty line or
 * at end of file.
 *
 * annotation_file_name : path of the annotation file.
 * output_genes         : receives a malloc'ed array of MAX_ANNOTATION_EXONS genes.
 *                        It stays allocated when -1 is returned because a gene has
 *                        too many exons; releasing it is left to the caller.
 * offsets              : chromosome offset table used to linearise positions.
 *
 * Returns 0 on success and -1 when the file cannot be opened, the table cannot be
 * allocated, or a gene has MAX_EXONS_PER_GENE or more exons.
 */
int load_exon_annotation(char * annotation_file_name, gene_t ** output_genes, gene_offset_t* offsets)
{
	enum { ANNOTATION_LINE_MAX = 1200 };
	int line_len, gene_number = 0, exons = 0;
	char old_gene_name[MAX_GENE_NAME_LEN];
	FILE * fp = f_subr_open(annotation_file_name, "rb");

	if(!fp)
	{
		SUBREADprintf("Cannot open the exon annotation file: %s\n", annotation_file_name);
		return -1;
	}
	(*output_genes) = malloc(sizeof(gene_t)*MAX_ANNOTATION_EXONS);
	if(!*output_genes)
	{
		SUBREADprintf("Cannot allocate memory for the exon table. \n");
		fclose(fp);
		return -1;
	}


	old_gene_name[0]=0;
	(*output_genes)[0].end_offset = 0;
	(*output_genes)[0].start_offset = 0xffffffff;
	while(gene_number < MAX_ANNOTATION_EXONS)
	{
		char buff[ANNOTATION_LINE_MAX], this_gene_name[MAX_GENE_NAME_LEN], chromosome_name[MAX_CHROMOSOME_NAME_LEN];
		int i = 0, j=0;
		unsigned int exon_location;

		line_len = read_line(ANNOTATION_LINE_MAX, fp, buff, 0);

		if(line_len>0)	//Not EOF
		{
			if(!isdigit((unsigned char)buff[0]))	// it is a title line or something else
				continue;

			// Gene name: stop at the delimiter, the end of the text or the end of the
			// buffer; characters that do not fit into the name buffer are dropped.
			for(i=0; i < ANNOTATION_LINE_MAX && buff[i] && buff[i] != '\t' &&  buff[i] != '\n'; i++)
				if(j < MAX_GENE_NAME_LEN - 1) this_gene_name[j++] = buff[i];
			this_gene_name[j] = 0;
		}

		if(line_len<=0 || (exons && old_gene_name[0] && strcmp(this_gene_name , old_gene_name)))	// it is a new gene
		{
			strncpy((*output_genes)[gene_number].gene_name , old_gene_name, MAX_GENE_NAME_LEN);
			(*output_genes)[gene_number].exon_ends[exons] = 0;
			gene_number++;
			exons = 0;
			// Only prepare the next slot when it exists; the table may just have been filled.
			if(gene_number < MAX_ANNOTATION_EXONS)
			{
				(*output_genes)[gene_number].end_offset = 0;
				(*output_genes)[gene_number].start_offset = 0xffffffff;
			}
		}

		if(line_len<=0 || gene_number >= MAX_ANNOTATION_EXONS) break;


		// copy chromosome name
		if(i < ANNOTATION_LINE_MAX && (buff[i] == '\t' || buff[i] == '\n')) i++;
		j = 0;
		for(; i < ANNOTATION_LINE_MAX && buff[i] && buff[i] != '\t' &&  buff[i] != '\n'; i++)
			if(j < MAX_CHROMOSOME_NAME_LEN - 1) chromosome_name[j++] = buff[i];
		chromosome_name[j] = 0;

		// start location
		exon_location = 0;
		if(i < ANNOTATION_LINE_MAX && (buff[i] == '\t' || buff[i] == '\n')) i++;
		for(; i < ANNOTATION_LINE_MAX && buff[i] && buff[i] != '\t' &&  buff[i] != '\n'; i++)
			if(isdigit((unsigned char)buff[i]))
				exon_location = exon_location*10 + buff[i] - '0';

		(*output_genes)[gene_number].exon_starts[exons] = linear_gene_position(offsets, chromosome_name , exon_location-1);
		if( (*output_genes)[gene_number].exon_starts[exons] == 0xffffffff)
			continue;

		if((*output_genes)[gene_number].start_offset > (*output_genes)[gene_number].exon_starts[exons])
			(*output_genes)[gene_number].start_offset = (*output_genes)[gene_number].exon_starts[exons];

		// end location
		exon_location = 0;
		if(i < ANNOTATION_LINE_MAX && (buff[i] == '\t' || buff[i] == '\n')) i++;
		for(; i < ANNOTATION_LINE_MAX && buff[i] && buff[i] != '\t' &&  buff[i] != '\n'; i++)
			if(isdigit((unsigned char)buff[i]))
				exon_location = exon_location*10 + buff[i] - '0';

		(*output_genes)[gene_number].exon_ends[exons] = linear_gene_position(offsets, chromosome_name , exon_location);

		if((*output_genes)[gene_number].end_offset <  (*output_genes)[gene_number].exon_ends[exons])
			(*output_genes)[gene_number].end_offset =  (*output_genes)[gene_number].exon_ends[exons];

		exons ++;
		if(exons >= MAX_EXONS_PER_GENE)
		{
			SUBREADprintf("The number of exons excesses the limit. Please increase the value of MAX_EXONS_PER_GENE in subread.h.\n");
			fclose(fp);
			return -1;
		}

		// this_gene_name is always terminated inside MAX_GENE_NAME_LEN, so the copy is too.
		strncpy(old_gene_name, this_gene_name , MAX_GENE_NAME_LEN);
	}
	fclose(fp);
	return 0;
}
2260 
/*
 * Reports whether `path` can be opened for reading.
 * Returns 1 when f_subr_open() succeeds (the handle is closed again), 0 otherwise.
 */
int does_file_exist(char * path)
{
	FILE * probe = f_subr_open(path, "rb");

	if(probe == NULL)
		return 0;

	fclose(probe);
	return 1;
}
2270 
sort_SAM_hash(char * str)2271 srUInt_64 sort_SAM_hash(char * str)
2272 {
2273 	srUInt_64 hash = 5381;
2274 	int c, xk1=0;
2275 
2276 	while (1)
2277 	{
2278 		c = str[xk1++];
2279 		if(!c)break;
2280 		hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
2281 	}
2282 	return hash;
2283 }
2284 
2285 
void do_SIGINT_remove(char * prefix, int param);

// Temp-file prefixes that the two hooks below pass to do_SIGINT_remove().
// Both start out as NULL.
char * _SAMSORT_SNP_delete_temp_prefix = NULL;
char * _REPAIRER_delete_temp_prefix = NULL;

// Hook taking a signal number: forwards it together with the SAM-sort prefix.
void SAM_SORT_SIGINT_hook(int param)
{
	char * registered_prefix = _SAMSORT_SNP_delete_temp_prefix;

	do_SIGINT_remove(registered_prefix, param);
}

// Hook taking a signal number: forwards it together with the repairer prefix.
void REPAIR_SIGINT_hook(int param)
{
	char * registered_prefix = _REPAIRER_delete_temp_prefix;

	do_SIGINT_remove(registered_prefix, param);
}
2295 
delete_with_prefix(char * prefix)2296 void delete_with_prefix(char * prefix){
2297 	if(prefix != NULL)
2298 	{
2299 		int xk1, last_slash = -1;
2300 		char del2[MAX_FILE_NAME_LENGTH], del_suffix[MAX_FILE_NAME_LENGTH], del_name[MAX_FILE_NAME_LENGTH];
2301 		for(xk1=0; prefix[xk1]; xk1++)
2302 		{
2303 			if(prefix[xk1]=='/') last_slash = xk1;
2304 			else if(prefix[xk1]=='\\')
2305 			{
2306 				SUBREADprintf("The file name is unknown.\n");
2307 				return;
2308 			}
2309 		}
2310 		if(last_slash>=0)
2311 		{
2312 			memcpy(del2, prefix, last_slash);
2313 			del2[last_slash] = 0;
2314 			strcpy(del_suffix , prefix + last_slash + 1);
2315 		}
2316 		else
2317 		{
2318 			strcpy(del2,".");
2319 			strcpy(del_suffix , prefix);
2320 		}
2321 
2322 		//#warning ">>>>>>>> COMMENT THIS OUT <<<<<<<<<<<<<<<<<<<<<"
2323 		//SUBREADprintf("SCANDEL: %s, PREFIX %s, SUFFIX %s\n", del2, prefix, del_suffix);
2324 		if(strlen(del_suffix)>8)
2325 		{
2326 			DIR	   *d;
2327 			struct dirent *dir;
2328 
2329 			d = opendir(del2);
2330 			if (d)
2331 			{
2332 				while ((dir = readdir(d)) != NULL)
2333 				{
2334 					if(strstr(dir->d_name, del_suffix))
2335 					{
2336 						strcpy(del_name, del2);
2337 						strcat(del_name, "/");
2338 						strcat(del_name, dir->d_name);
2339 						unlink(del_name);
2340 
2341 			//			#warning ">>>>>>>> COMMENT THIS OUT <<<<<<<<<<<<<<<<<<<<<"
2342 			//			SUBREADprintf("DEL: %s\n", del_name);
2343 						//test fix
2344 					}
2345 				}
2346 				closedir(d);
2347 			}
2348 		}
2349 
2350 	}
2351 
2352 }
2353 
/*
 * Shared body of the signal hooks. In MAKE_STANDALONE builds it deletes the
 * temporary files matching `prefix`, prints a notice and exits with `param` as
 * the exit status; in other builds it does nothing.
 */
void do_SIGINT_remove(char * prefix, int param)
{
#if defined(MAKE_STANDALONE)
	delete_with_prefix(prefix);
	SUBREADprintf("\n\nReceived a terminal signal. The temporary files were removed.\n");
	exit(param);
#endif
}
2361 
2362 
// Return values of signal() for SIGTERM / SIGINT, saved by SAM_pairer_create()
// and handed back to signal() by SAM_pairer_destroy().
void * old_sig_TERM = NULL;
void * old_sig_INT = NULL;
2364 
/*
 * Initialises a multi-threaded BAM writer.
 *
 * bam_main    : writer context; it is zeroed first.
 * all_threads : number of per-thread compression slots to allocate.
 * has_dummy   : stored in the context as is.
 * BAM_input   : not used by this function.
 * c_level     : zlib compression level given to every per-thread deflate stream.
 * out_file    : output path, opened with mode "wb"; its name is kept in
 *               bam_main -> bam_name (truncated if it does not fit).
 *
 * Returns 0 on success. Returns 1 when the file cannot be opened, the thread
 * table cannot be allocated or a deflate stream cannot be initialised; in the
 * last two cases everything acquired so far is released again.
 */
int SAM_pairer_writer_create( SAM_pairer_writer_main_t * bam_main , int all_threads , int has_dummy, int BAM_input, int c_level, char * out_file){
	int x1;

	memset(bam_main, 0, sizeof(SAM_pairer_writer_main_t));
	bam_main -> bam_fp = f_subr_open(out_file, "wb");
	if(NULL == bam_main -> bam_fp) return 1;
	snprintf(bam_main -> bam_name, sizeof(bam_main -> bam_name), "%s", out_file);

	bam_main -> threads = malloc(all_threads * sizeof(SAM_pairer_writer_thread_t));
	if(NULL == bam_main -> threads){
		fclose(bam_main -> bam_fp);
		bam_main -> bam_fp = NULL;
		return 1;
	}
	bam_main -> all_threads = all_threads;
	bam_main -> has_dummy = has_dummy;
	bam_main -> compression_level = c_level;
	subread_init_lock(&bam_main -> output_fp_lock);

	for(x1 = 0; x1 < all_threads ; x1 ++){
		bam_main -> threads[x1].BIN_buffer_ptr = 0;
		bam_main -> threads[x1].strm.zalloc = Z_NULL;
		bam_main -> threads[x1].strm.zfree = Z_NULL;
		bam_main -> threads[x1].strm.opaque = Z_NULL;
		bam_main -> threads[x1].strm.avail_in = 0;
		bam_main -> threads[x1].strm.next_in = Z_NULL;

		if(Z_OK != deflateInit2(&bam_main -> threads[x1].strm, bam_main -> compression_level, Z_DEFLATED,
			PAIRER_GZIP_WINDOW_BITS, PAIRER_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY)){
			// Unwind: only the streams before x1 were initialised.
			while(--x1 >= 0)
				deflateEnd(&bam_main -> threads[x1].strm);
			subread_destroy_lock(&bam_main -> output_fp_lock);
			free(bam_main -> threads);
			bam_main -> threads = NULL;
			fclose(bam_main -> bam_fp);
			bam_main -> bam_fp = NULL;
			return 1;
		}
	}
	return 0;
}
2391 
/*
 * Writes the 18-byte gzip member header of one BGZF block to `writer`.
 *
 * The header carries a zero modification time and a single 6-byte extra field
 * holding the 'B','C' subfield; its 16-bit value is compressed_size + 25, i.e.
 * the size of the whole block (this header, the compressed data and the 8-byte
 * CRC32/ISIZE trailer written by the caller) minus one.
 *
 * All multi-byte fields are emitted byte by byte in little-endian order, so the
 * output does not depend on the byte order of the host.
 *
 * writer          : destination stream.
 * compressed_size : number of compressed data bytes that will follow the header.
 */
void SAM_pairer_write_BAM_header(FILE * writer, int compressed_size)
{
	unsigned int block_size_minus_1 = (unsigned int)(compressed_size + 19 + 6);
	unsigned char header[18] = {
		31, 139, 8, 4,		// the four magic characters: ID1, ID2, CM, FLG
		0, 0, 0, 0,		// modification time (not recorded)
		0, 0xff,		// Extra flags and OS
		6, 0,			// Extra length
		66, 67,			// SI1 and SI2 magic numbers
		2, 0,			// SLEN
		0, 0			// block size minus one, filled in below
	};

	header[16] = (unsigned char)(block_size_minus_1 & 0xff);
	header[17] = (unsigned char)((block_size_minus_1 >> 8) & 0xff);

	fwrite(header, 1, sizeof header, writer);
}
2422 
2423 
2424 
/*
 * Compresses the pending bytes of one writer thread into a single BGZF block and
 * appends that block (header, compressed data, CRC32, uncompressed length) to the
 * output file while holding the output lock.
 *
 * When the thread has no pending bytes an empty block is produced with a
 * temporary deflate stream; otherwise the thread's own stream is reset and used.
 *
 * On success the thread's BIN_buffer_ptr is reset to 0 and 0 is returned.
 * Returns 1, without writing anything and without touching BIN_buffer_ptr, when
 * the work buffer cannot be allocated or zlib does not finish the stream
 * (deflate() called with Z_FINISH reports completion as Z_STREAM_END).
 */
int SAM_pairer_multi_thread_compress(SAM_pairer_writer_main_t * bam_main ,  SAM_pairer_writer_thread_t * bam_thread)
{
	#define BAM_compressed_space 65536
	char * BAM_compressed = malloc(BAM_compressed_space);
	int ret, have = 0;

	if(NULL == BAM_compressed){
		SUBREADprintf("ERROR: Cannot compress a BAM block : %d\n", Z_MEM_ERROR);
		return 1;
	}

	if(bam_thread -> BIN_buffer_ptr>0){
		deflateReset(&bam_thread -> strm);
		bam_thread -> strm.avail_in = bam_thread -> BIN_buffer_ptr;
		bam_thread -> strm.next_in = bam_thread -> BIN_buffer;
		bam_thread -> strm.avail_out = BAM_compressed_space;
		bam_thread -> strm.next_out = (unsigned char *)BAM_compressed;
		ret = deflate( &bam_thread -> strm , Z_FINISH);

		have = BAM_compressed_space - bam_thread -> strm.avail_out;
		assert(bam_thread -> strm.avail_in == 0);
	}else{
		z_stream nstrm;
		nstrm.zalloc = Z_NULL;
		nstrm.zfree = Z_NULL;
		nstrm.opaque = Z_NULL;
		nstrm.avail_in = 0;
		nstrm.next_in = Z_NULL;

		ret = deflateInit2(&nstrm, SAMBAM_COMPRESS_LEVEL_NORMAL, Z_DEFLATED,
			PAIRER_GZIP_WINDOW_BITS, PAIRER_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);

		if(ret == Z_OK){
			nstrm.avail_in = 0;
			nstrm.next_in = bam_thread -> BIN_buffer;
			nstrm.avail_out = BAM_compressed_space;
			nstrm.next_out = (unsigned char *)BAM_compressed;
			ret = deflate(&nstrm, Z_FINISH);
			have = BAM_compressed_space - nstrm.avail_out;
			deflateEnd(&nstrm);
		}
	}

	// Anything but Z_STREAM_END means the block in BAM_compressed is incomplete.
	if(ret != Z_STREAM_END){
		SUBREADprintf("ERROR: Cannot compress a BAM block : %d\n", ret);
		free(BAM_compressed);
		return 1;
	}

	unsigned int crc0 = crc32(0, NULL, 0);
	unsigned int CRC32 = crc32(crc0, (unsigned char *)  bam_thread -> BIN_buffer ,bam_thread -> BIN_buffer_ptr);

	// The four pieces of one block must reach the file without interleaving.
	subread_lock_occupy( &bam_main -> output_fp_lock );
	SAM_pairer_write_BAM_header( bam_main -> bam_fp , have);
	fwrite(BAM_compressed,1, have, bam_main -> bam_fp );
	fwrite(&CRC32 , 4, 1, bam_main -> bam_fp);
	fwrite( &bam_thread -> BIN_buffer_ptr , 4, 1, bam_main -> bam_fp);
	subread_lock_release( &bam_main -> output_fp_lock );

	bam_thread -> BIN_buffer_ptr = 0;

	free(BAM_compressed);
	return 0;
}
2483 
2484 
2485 
/*
 * Flushes and tears down a writer created by SAM_pairer_writer_create().
 *
 * Every thread slot with pending bytes is compressed and written. After the last
 * slot has been flushed it is compressed once more while empty, which appends an
 * empty block to the file. Finally the deflate streams, the output lock, the
 * file handle and the thread table are released.
 */
void SAM_pairer_writer_destroy( SAM_pairer_writer_main_t * bam_main ) {
	int x1 = 0;

	while(x1 < bam_main -> all_threads){
		SAM_pairer_writer_thread_t * slot = bam_main -> threads + x1;
		int is_last_slot = (x1 == bam_main -> all_threads - 1);

		if(slot -> BIN_buffer_ptr > 0)
			SAM_pairer_multi_thread_compress(bam_main, slot);

		if(is_last_slot){
			assert(0 == bam_main -> threads[x1].BIN_buffer_ptr);
			SAM_pairer_multi_thread_compress(bam_main, slot);
		}

		deflateEnd(&slot -> strm);
		x1 ++;
	}

	subread_destroy_lock(&bam_main -> output_fp_lock);
	fclose(bam_main -> bam_fp);
	free(bam_main -> threads);
}
2503 
// Stores the given callback in pairer -> unsorted_notification.
// The callback takes the pairer (as void *) and two record buffers; when it is
// invoked is decided by code outside this function.
void SAM_pairer_set_unsorted_notification(SAM_pairer_context_t * pairer, void (* unsorted_notification) (void * pairer, char * bin1, char * bin2)){
	pairer -> unsorted_notification = unsorted_notification;
}
2507 
2508 
SAM_pairer_warning_file_open_limit()2509 int SAM_pairer_warning_file_open_limit(){
2510 #ifndef __MINGW32__
2511 	struct rlimit limit_st;
2512 	getrlimit(RLIMIT_NOFILE, & limit_st);
2513 
2514 	if(min(limit_st.rlim_cur, limit_st.rlim_max  ) < MIN_FILE_POINTERS_ALLOWED){
2515 		SUBREADprintf(" ERROR: the maximum file open number (%d) is too low. Please increase this number to a number larger than 50 by using the 'ulimit -n' command.\n\n",(int)(min(limit_st.rlim_cur, limit_st.rlim_max)));
2516 		return 1;
2517 	}
2518 #endif
2519 	return 0;
2520 }
2521 
2522 // Tiny_Mode only write the following information:
2523 // Name   Flag   Chro   Pos   Mapq   Cigar   MateChro   MatePos   Tlen  N  I  NH:i:xx  HI:i:xx
2524 // Tiny_Mode does not work when output and input are both in BAM format
2525 // in_format can be either
2526 // bin_buff_size_per_thread is in Mega-Bytes.
2527 // It returns 0 if no error
SAM_pairer_create(SAM_pairer_context_t * pairer,int all_threads,int bin_buff_size_per_thread,int BAM_input,int is_Tiny_Mode,int is_single_end_mode,int force_do_not_sort,int need_read_group_tag,int display_progress,char * in_file,void (* reset_output_function)(void * pairer),int (* output_header_function)(void * pairer,int thread_no,int is_text,unsigned int items,char * bin,unsigned int bin_len),int (* output_function)(void * pairer,int thread_no,char * bin1,char * bin2),char * tmp_path,void * appendix1,int long_read_minimum_length)2528 int SAM_pairer_create(SAM_pairer_context_t * pairer, int all_threads, int bin_buff_size_per_thread, int BAM_input, int is_Tiny_Mode, int is_single_end_mode, int force_do_not_sort, int need_read_group_tag, int display_progress, char * in_file, void (* reset_output_function) (void * pairer), int (* output_header_function) (void * pairer, int thread_no, int is_text, unsigned int items, char * bin, unsigned int bin_len), int (* output_function) (void * pairer, int thread_no, char * bin1, char * bin2), char * tmp_path, void * appendix1, int long_read_minimum_length) {
2529 
2530 	memset(pairer, 0, sizeof(SAM_pairer_context_t));
2531 
2532 	if(in_file[0]=='<'){
2533 		in_file++;
2534 		strncpy(pairer -> in_file_name, "<STDIN>", MAX_FILE_NAME_LENGTH);
2535 	}else
2536 		strncpy(pairer -> in_file_name, in_file, MAX_FILE_NAME_LENGTH);
2537 
2538 	pairer -> input_fp = f_subr_open(in_file, "rb");
2539 	if(NULL == pairer -> input_fp) return 1;
2540 
2541 	SAM_pairer_warning_file_open_limit();
2542 
2543 	pairer -> input_is_BAM = BAM_input;
2544 	pairer -> tiny_mode = is_Tiny_Mode;
2545 	pairer -> reset_output_function = reset_output_function;
2546 	pairer -> output_function = output_function;
2547 	pairer -> output_header = output_header_function;
2548 	pairer -> display_progress = display_progress;
2549 	pairer -> is_single_end_mode = is_single_end_mode;
2550 	pairer -> force_do_not_sort = force_do_not_sort;
2551 	pairer -> need_read_group_tag = need_read_group_tag;
2552 	pairer -> long_read_minimum_length = long_read_minimum_length;
2553 
2554 	subread_init_lock(&pairer -> unsorted_notification_lock);
2555 	subread_init_lock(&pairer -> input_fp_lock);
2556 	subread_init_lock(&pairer -> SAM_BAM_table_lock);
2557 
2558 	pairer -> total_threads = all_threads;
2559 	if(pairer ->input_is_BAM){
2560 		pairer -> input_buff_SBAM_size = bin_buff_size_per_thread * 1024 * 1024;
2561 	}else{
2562 		pairer -> input_buff_SBAM_size = max(bin_buff_size_per_thread * 1024 * 1024 + FC_LONG_READ_RECORD_HARDLIMIT ,  3*FC_LONG_READ_RECORD_HARDLIMIT/2);
2563 	}
2564 
2565 	pairer -> input_buff_BIN_size = max(1024*1024, pairer -> input_buff_SBAM_size );
2566 
2567 	pairer -> appendix1 = appendix1;
2568 
2569 	old_sig_TERM = signal (SIGTERM, REPAIR_SIGINT_hook);
2570 	old_sig_INT = signal (SIGINT, REPAIR_SIGINT_hook);
2571 
2572 	strcpy(pairer -> tmp_file_prefix, tmp_path);
2573 	_REPAIRER_delete_temp_prefix = pairer -> tmp_file_prefix;
2574 	pairer -> threads = malloc(all_threads * sizeof(SAM_pairer_thread_t));
2575 	memset(pairer -> threads, 0, all_threads * sizeof(SAM_pairer_thread_t));
2576 
2577 	if(pairer ->input_is_BAM){
2578 		pairer ->bam_margin_table = HashTableCreate(2191);
2579 		HashTableSetHashFunction(pairer -> bam_margin_table, fc_chro_hash);
2580 		HashTableSetKeyComparisonFunction(pairer -> bam_margin_table, fc_strcmp_chro);
2581 		HashTableSetDeallocationFunctions(pairer -> bam_margin_table, free, free);
2582 	}else{
2583 		pairer -> sam_contig_number_table = HashTableCreate(21907);
2584 		HashTableSetHashFunction(pairer -> sam_contig_number_table, fc_chro_hash);
2585 		HashTableSetKeyComparisonFunction(pairer -> sam_contig_number_table, fc_strcmp_chro);
2586 		HashTableSetDeallocationFunctions(pairer -> sam_contig_number_table, free, NULL);
2587 	}
2588 
2589 	pairer -> unsorted_notification_table = HashTableCreate(2191);
2590 	HashTableSetHashFunction(pairer -> unsorted_notification_table, fc_chro_hash);
2591 	HashTableSetKeyComparisonFunction(pairer -> unsorted_notification_table, fc_strcmp_chro);
2592 	HashTableSetDeallocationFunctions(pairer -> unsorted_notification_table, free, free);
2593 
2594 	int x1;
2595 
2596 	for(x1 = 0; x1 < all_threads ; x1++){
2597 		pairer -> threads[x1].thread_id = x1;
2598 		pairer -> threads[x1].reads_in_SBAM = 0;
2599 		pairer -> threads[x1].input_buff_SBAM = malloc(pairer -> input_buff_SBAM_size);
2600 		pairer -> threads[x1].input_buff_BIN_capacity = pairer -> input_buff_BIN_size;
2601 		pairer -> threads[x1].input_buff_BIN = malloc(pairer -> threads[x1].input_buff_BIN_capacity );
2602 
2603 		pairer -> threads[x1].input_buff_BIN_used = 0;
2604 		pairer -> threads[x1].orphant_table = HashTableCreate(pairer -> input_buff_SBAM_size / 100);
2605 		HashTableSetHashFunction(pairer -> threads[x1].orphant_table, fc_chro_hash);
2606 		HashTableSetKeyComparisonFunction(pairer -> threads[x1].orphant_table, fc_strcmp_chro);
2607 		HashTableSetDeallocationFunctions(pairer -> threads[x1].orphant_table, free, free);
2608 		pairer -> threads[x1].strm.zalloc = Z_NULL;
2609 		pairer -> threads[x1].strm.zfree = Z_NULL;
2610 		pairer -> threads[x1].strm.opaque = Z_NULL;
2611 		pairer -> threads[x1].strm.avail_in = 0;
2612 		pairer -> threads[x1].strm.next_in = Z_NULL;
2613 
2614 		inflateInit2(&pairer -> threads[x1].strm, PAIRER_GZIP_WINDOW_BITS);
2615 
2616 		if(force_do_not_sort)
2617 			subread_init_lock(&pairer -> threads[x1].SBAM_lock);
2618 	}
2619 	return 0;
2620 }
2621 
// Hash-table visitor for debugging: prints the key as a string together with the
// first four bytes of the stored object read as an int. `tab` is not used.
void SAM_pairer_print_keys(void * key, void * hashed_obj, HashTable * tab)
{
	int stored_len = 0;
	char * key_text = (char*)key;

	memcpy(&stored_len, hashed_obj, 4);
	SUBREADprintf("ESKY = %s   LEN = %d\n", key_text, stored_len);
}
2627 
/*
 * Releases everything owned by a pairer built with SAM_pairer_create():
 * per-thread inflate streams, buffers, optional SBAM locks and orphan tables,
 * the context-wide hash tables and locks, the temporary files matching the
 * stored prefix, the input file and the thread table. The SIGTERM / SIGINT
 * dispositions saved at creation time are put back last.
 */
void SAM_pairer_destroy(SAM_pairer_context_t * pairer){
	srInt_64 all_orphants = 0;	// only of interest to the debug print at the end
	int x1 = 0;

	while(x1 < pairer -> total_threads){
		SAM_pairer_thread_t * th = pairer -> threads + x1;

		inflateEnd(&th -> strm);
		free(th -> input_buff_BIN);
		free(th -> input_buff_SBAM);

		if(pairer -> force_do_not_sort){
			subread_destroy_lock(&th -> SBAM_lock);
		}

		all_orphants += th -> orphant_table -> numOfElements;
		HashTableDestroy(th -> orphant_table);
		x1 ++;
	}

	// Exactly one of the two format-specific tables exists.
	if(pairer -> input_is_BAM){
		HashTableDestroy(pairer -> bam_margin_table);
	}else{
		HashTableDestroy(pairer -> sam_contig_number_table);
	}
	HashTableDestroy(pairer -> unsorted_notification_table);

	subread_destroy_lock(&pairer -> unsorted_notification_lock);
	subread_destroy_lock(&pairer -> input_fp_lock);
	subread_destroy_lock(&pairer -> SAM_BAM_table_lock);

	delete_with_prefix(pairer -> tmp_file_prefix);
	fclose(pairer -> input_fp);
	free(pairer -> threads);

	signal (SIGTERM, old_sig_TERM);
	signal (SIGINT, old_sig_INT);
	//SUBREADprintf("All orphans=%llu frags\n", all_orphants);
}
2662 
// Reads the compressed payload of one gzip block of a BAM file into inbuff.
// Always assume that fp is at the start of a BAM GZ block.
//
// The 12-byte gzip header is checked for the two gzip magic bytes, the extra
// field is walked to find the 'B','C' subfield holding the block size, and
// (block size - extra length - 19) payload bytes are copied to inbuff. The
// 8 trailing bytes (CRC and ISIZE) are skipped.
//
// max_read_len is not consulted: the caller has to provide room for a whole block.
// Returns the number of payload bytes, or -1 on a short read or a malformed header.
int SAM_pairer_read_BAM_block(FILE * fp, int max_read_len, char * inbuff) {
	unsigned char gz_header_12 [12];
	unsigned short xlen = 0, bsize = 0;
	int extra_consumed, payload_len, payload_got;

	if((int)fread(gz_header_12, 1, 12, fp ) < 12)
		return -1;

	if(!(gz_header_12[0]==31 && gz_header_12[1]==139))
		return -1;

	memcpy(&xlen, gz_header_12 + 10, 2);

	for(extra_consumed = 0; extra_consumed < xlen; ){
		unsigned char sub_header[4];
		unsigned short slen = 0;

		if((int)fread(sub_header, 1, 4, fp) < 4){
			SUBREADprintf("BAD GZ BAM 6LEN\n");
			return -1;
		}
		memcpy(&slen, sub_header + 2 , 2);
		extra_consumed += 4;

		if(sub_header[0]==66 && sub_header[1]==67 && slen == 2){
			// the subfield that carries the block size
			if((int)fread(&bsize, 2, 1, fp) < 1){
				SUBREADprintf("BAD GZ BAM XLEN\n");
				return -1;
			}
		}else{
			// any other subfield is skipped
			fseeko(fp, slen, SEEK_CUR);
		}
		extra_consumed += slen;
	}

	if(bsize < 1 || bsize < xlen + 19){
		SUBREADprintf("BAD GZ BAM BSIZE\n");
		return -1;
	}

	payload_len = bsize - xlen - 19;
	payload_got = fread(inbuff, 1, payload_len, fp);

	// seek over CRC and ISIZE
	fseeko(fp, 8, SEEK_CUR);

	if(payload_got < payload_len) return -1;
	return payload_got;
}
2712 
// Free space, in bytes, that must be left in a thread's SBAM buffer before
// SAM_pairer_fill_BIN_buff() reads another BAM block into it.
// NOTE(review): presumably chosen to exceed the 64 KiB upper bound of a BGZF block - confirm.
#define MIN_BAM_BLOCK_SIZE 66000
2714 
SAM_pairer_read_SAM_MB(FILE * fp,int max_read_len,char * inbuff)2715 int SAM_pairer_read_SAM_MB( FILE * fp, int max_read_len, char * inbuff ){
2716 	int ret = 0;
2717 
2718 	if(feof(fp)) return 0;
2719 
2720 	while(1){
2721 		if(ret >= max_read_len - FC_LONG_READ_RECORD_HARDLIMIT || feof(fp))break;
2722 		int rlen = fread(inbuff +ret , 1, max_read_len - FC_LONG_READ_RECORD_HARDLIMIT - ret , fp);
2723 		if(rlen > 0){
2724 			int x1;
2725 			for(x1 = 0; x1 < min(200, rlen); x1++)
2726 				if(*(inbuff+ret+x1)<8 || *(inbuff+ret+x1)> 127){
2727 					SUBREADprintf("NOT_SAM_ACTUALLY\n");
2728 					return -1;
2729 				}
2730 			ret += rlen;
2731 		}
2732 	}
2733 	if(!feof(fp)){
2734 		int nch;
2735 		while(1){
2736 			nch = fgetc(fp);
2737 			if(nch < 0 || nch == '\n'){
2738 				break;
2739 			}else{
2740 				inbuff[ret++]=nch;
2741 			}
2742 		}
2743 	}
2744 	if(inbuff[ret-1] != '\n') inbuff[ret++]='\n';
2745 	inbuff[ret] = 0;
2746 
2747 	return ret;
2748 }
2749 
/*
 * Refills the SBAM buffer of one thread from the input file and resets the
 * thread's read cursors.
 *
 * For BAM files: whole blocks are appended until fewer than MIN_BAM_BLOCK_SIZE
 * bytes are free, the file ends, or a block cannot be read. The file positions
 * before and after the load are recorded in the thread context. If a block read
 * fails at end of file after a previous block of more than 2 bytes, the input is
 * flagged as bad / incomplete and an error is printed.
 * For SAM files: one chunk of full lines is loaded with SAM_pairer_read_SAM_MB().
 *
 * *is_finished is set to 1 when no further data can be expected.
 */
void SAM_pairer_fill_BIN_buff(SAM_pairer_context_t * pairer ,  SAM_pairer_thread_t * thread_context , int * is_finished){
	int bytes_loaded = 0;
	int previous_block_len = -1;

	if(!pairer -> input_is_BAM){ // is_SAM
		bytes_loaded = SAM_pairer_read_SAM_MB(pairer -> input_fp , pairer -> input_buff_SBAM_size , thread_context -> input_buff_SBAM);
		if(bytes_loaded < 1) *is_finished = 1;
	}else{
		thread_context -> input_buff_SBAM_file_start = ftello(pairer -> input_fp);

		for(;;){
			int block_len;

			if( feof(pairer -> input_fp)){
				*is_finished = 1;
				break;
			}
			// stop while there is still room for one more complete block
			if(pairer -> input_buff_SBAM_size - bytes_loaded < MIN_BAM_BLOCK_SIZE)
				break;

			block_len = SAM_pairer_read_BAM_block( pairer -> input_fp , pairer -> input_buff_SBAM_size - bytes_loaded , thread_context -> input_buff_SBAM + bytes_loaded);

			if(block_len < 0){
				if(feof(pairer -> input_fp) && previous_block_len != -1 ){
					pairer -> is_bad_format |= (previous_block_len > 2);
					pairer -> is_incomplete_BAM |= (previous_block_len > 2);
					if(pairer -> is_incomplete_BAM)SUBREADprintf("ERROR: the BAM file seems incomplete : this %d, last %d.\n", block_len , previous_block_len );
				}
				*is_finished = 1;
				break;
			}

			bytes_loaded += block_len;
			previous_block_len = block_len;
		}

		thread_context -> input_buff_SBAM_file_end = ftello(pairer -> input_fp);
	}

	thread_context -> input_buff_SBAM_used = bytes_loaded;
	thread_context -> input_buff_SBAM_ptr = 0;
	thread_context -> input_buff_BIN_used = 0;
	thread_context -> input_buff_BIN_ptr = 0;
	thread_context -> readno_in_chunk = 0;
}
2796 
// Forward declaration; called from SAM_pairer_fetch_BAM_block() below.
int SAM_pairer_find_start(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context );

// BAM_next_nch : read the next inflated byte into `nch`.
// The expansion site must have `pairer`, `thread_context` and an integer `nch` in scope.
// While the BIN buffer is exhausted (ptr >= used), SAM_pairer_fetch_BAM_block() is called
// to inflate more data; if it returns non-zero, `nch` is set to -1. Otherwise `nch`
// receives input_buff_BIN[input_buff_BIN_ptr] and the cursor advances by one.
#define BAM_next_nch { \
	int retXX = 0; while(thread_context -> input_buff_BIN_ptr >= thread_context -> input_buff_BIN_used){retXX = SAM_pairer_fetch_BAM_block(pairer, thread_context);  if(retXX) break;}\
	if(retXX) nch=-1; else nch = thread_context -> input_buff_BIN[thread_context -> input_buff_BIN_ptr++];}

// SAM_next_line : step to the next text line of the SBAM buffer.
// The expansion site must have `thread_context`, `line_ptr` and `line_len` in scope.
// If no unread bytes remain, `line_ptr` becomes NULL (and `line_len` is left untouched).
// Otherwise `line_ptr` points at the current position, `line_len` is the number of bytes
// before the next '\n' (or before the end of the used data), and input_buff_SBAM_ptr is
// moved past the line and its terminator (line_len + 1). The line is NOT NUL-terminated.
#define SAM_next_line {\
	if( thread_context -> input_buff_SBAM_used <= thread_context -> input_buff_SBAM_ptr ){ line_ptr = NULL;}else{\
	line_ptr = thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr;line_len = 0;\
	while(line_len + thread_context -> input_buff_SBAM_ptr < thread_context -> input_buff_SBAM_used){ int ccch = thread_context -> input_buff_SBAM[ thread_context -> input_buff_SBAM_ptr + line_len ]; if(ccch == '\n')break; line_len ++;}\
	thread_context -> input_buff_SBAM_ptr += line_len+1;}}
2807 
/*
 * Inflate the next compressed block from the thread's SBAM buffer and append
 * the result to the thread's BIN buffer.
 *
 * Bytes of the BIN buffer that were already consumed (before input_buff_BIN_ptr)
 * are discarded first, so unread bytes move to the front and input_buff_BIN_ptr
 * becomes 0. The BIN buffer is enlarged when fewer than 128KB are free.
 *
 * Returns 0 when a block was inflated. Returns 1 when the SBAM buffer holds no
 * more unread data, when the BIN buffer would have to grow beyond 1GB, when
 * memory cannot be allocated, or when zlib reports an error (the last two also
 * raise pairer->is_bad_format and pairer->is_internal_error).
 *
 * The buffer pointer and its recorded capacity are only updated together after
 * a successful realloc(), so every failure path leaves them consistent.
 */
int SAM_pairer_fetch_BAM_block(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context){
	if(thread_context -> input_buff_SBAM_used <=  thread_context -> input_buff_SBAM_ptr){
		return 1;
	}

	// Drop the consumed prefix of the BIN buffer. The regions can overlap, hence memmove.
	int remained_BIN =  thread_context -> input_buff_BIN_used - thread_context -> input_buff_BIN_ptr;
	if( remained_BIN > 0) {
		memmove(thread_context -> input_buff_BIN, thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_ptr, remained_BIN);
		thread_context -> input_buff_BIN_used = remained_BIN;
	} else thread_context -> input_buff_BIN_used = 0;

	thread_context -> input_buff_BIN_ptr = 0;

	inflateReset(&thread_context -> strm);

	int lin, lout;

	lin=thread_context -> strm.avail_in = (unsigned int)(thread_context -> input_buff_SBAM_used - thread_context -> input_buff_SBAM_ptr);
	thread_context -> strm.next_in = (unsigned char *)thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr;

	if( thread_context -> input_buff_BIN_capacity < thread_context -> input_buff_BIN_used + 128*1024){
		// Work on a local value: the context is only touched once the new buffer exists.
		long long wanted_capacity = (long long)thread_context -> input_buff_BIN_used + 128*1024;
		long long new_capacity = (long long)(max(thread_context -> input_buff_BIN_used, thread_context -> input_buff_BIN_capacity )*1.5);
		if(new_capacity < wanted_capacity) new_capacity = wanted_capacity;	// always provide the headroom tested above

		if(new_capacity > 1024*1024*1024){
			SUBREADprintf("ERROR: buffer size larger than 1GB\n");
			return 1;
		}

		void * grown_buff = realloc( thread_context -> input_buff_BIN , (size_t)new_capacity);
		if(grown_buff == NULL){
			// the old buffer is still valid and still owned by the thread context
			SUBREADprintf("ERROR: cannot allocate memory for the BAM buffer.\n");
			pairer -> is_bad_format = 1;
			pairer -> is_internal_error = 1;
			return 1;
		}
		thread_context -> input_buff_BIN = grown_buff;
		thread_context -> input_buff_BIN_capacity = new_capacity;
	}
	lout=thread_context -> strm.avail_out = thread_context -> input_buff_BIN_capacity - thread_context -> input_buff_BIN_used;
	thread_context -> strm.next_out = (unsigned char *)thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_used;

	int ret = inflate(&thread_context ->strm, Z_FINISH);
	if(ret == Z_OK || ret == Z_STREAM_END)
	{
		int have = lout - thread_context ->strm.avail_out;	// inflated bytes produced
		int used_BAM = lin - thread_context -> strm.avail_in;	// compressed bytes consumed

		thread_context -> input_buff_BIN_used += have;
		thread_context -> input_buff_SBAM_ptr += used_BAM;

		if(thread_context -> need_find_start){
			int test_read_bin = SAM_pairer_find_start(pairer, thread_context);
			if(test_read_bin<1 && thread_context -> input_buff_BIN_used >= 32  ){
				pairer -> is_bad_format = 1;
				SUBREADprintf("ERROR: cannot find the start of the next BAM block.\n");
			}
		}
	} else {
		if(ret == Z_BUF_ERROR){
			SUBREADprintf("Cannot parse the input BAM file. If the BAM file contains long reads, please run featureCounts on the long-read mode.\n");
		}else{
			SUBREADprintf("GZIP ERROR:%d\n", ret);
		}
		pairer -> is_bad_format = 1;
		pairer -> is_internal_error = 1;
		return 1;
	}

	return 0;
}
2876 
// BAM_next_u32(v) : read four bytes with BAM_next_nch and combine them into `v`,
// least-significant byte first (weights 1, 256, 256^2, 256^3).
// Needs the same names in scope as BAM_next_nch (`pairer`, `thread_context`, `nch`).
// NOTE(review): BAM_next_nch sets `nch` to -1 when no more data can be fetched; this
// macro does not test for that, so `v` is not meaningful after a failed fetch.
#define BAM_next_u32(v) {\
 (v) = 0; unsigned int poww = 1 ;  \
  BAM_next_nch; (v) += nch*poww; poww *= 256;\
  BAM_next_nch; (v) += nch*poww; poww *= 256;\
  BAM_next_nch; (v) += nch*poww; poww *= 256;\
  BAM_next_nch; (v) += nch*poww;\
}
2884 
/*
 * Shrink a BAM record in place by replacing its sequence and quality fields
 * with a single placeholder base.
 *
 * bin_where points at the record (including its 4-byte length prefix) and
 * *bin_len is its total size. Records whose stored sequence length is 0 or 1
 * are left untouched. Otherwise the sequence length is rewritten to 1, two
 * 0xff bytes take the place of the sequence and quality data, everything that
 * followed the quality string is moved up behind them, and both the length
 * prefix and *bin_len are updated. `pairer` and `thread_context` are not used.
 */
void SAM_pairer_reduce_BAM_bin(SAM_pairer_context_t * pairer, SAM_pairer_thread_t * thread_context,  unsigned char * bin_where, int * bin_len){
	unsigned int bases = 0, name_word = 0, cigar_word = 0;

	memcpy(&bases, bin_where + 20, 4);
	if(bases<=1) return;

	memcpy(&name_word, bin_where + 12, 4);
	memcpy(&cigar_word, bin_where + 16, 4);
	unsigned int name_bytes = name_word & 0xff;	// low byte  : read-name length
	unsigned int cigar_count = cigar_word & 0xffff;	// low 16bit : number of CIGAR operations

	// write_at : first byte after the two placeholder bytes
	// read_at  : first byte after the original sequence and quality fields
	int write_at = 36+name_bytes+4*cigar_count + 2;
	int read_at = 36+name_bytes+4*cigar_count + (1+bases) / 2 + bases;

	bin_where[write_at-2]=0xff;
	bin_where[write_at-1]=0xff;

	unsigned int placeholder_len = 1;
	memcpy(bin_where + 20, &placeholder_len, 4);

	// byte-wise forward copy of the trailing part of the record
	for(; read_at < (*bin_len); read_at++, write_at++)
		bin_where[write_at]=bin_where[read_at];

	// the stored record length excludes the 4-byte prefix; *bin_len includes it
	(* bin_len) = write_at - 4;
	memcpy(bin_where, bin_len, 4);
	(* bin_len) = write_at;
}
2912 
// 20MB. Referenced by the record-length test in SAM_pairer_get_next_read_BIN(); that
// part of the test is currently switched off there by a leading `0 &&`.
#define MAX_BIN_RECORD_LENGTH ( 20*1024*1024)
// Forward declaration; defined further down in this file.
int reduce_SAM_to_BAM(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context, int include_sequence);
// Forward declaration; the definition is not in this part of the file.
int is_read_bin(char * bin, int bin_len, int max_refID);
2916 
/*
 * Hand out the next alignment record of the current chunk in BAM binary layout.
 *
 * On success returns 1; *bin_where then points at the record (starting with its
 * 4-byte length prefix) inside the thread's BIN buffer and *bin_len is the size
 * including that prefix. Returns 0 when the chunk holds no further complete
 * record, when the input is flagged as badly formatted, or when a header
 * callback reports a failure (non-zero return).
 *
 * If the header has not been parsed yet it is consumed first and passed to
 * pairer->output_header (when set): once with the header text (is_text = 1) and
 * once with the binary reference table (is_text = 0).
 *
 * BAM input: an incomplete record left at the end of the chunk is stored in
 *            pairer->bam_margin_table under the key "E<file offset of chunk end>".
 * SAM input: the header text in the SBAM buffer is overwritten in place by the
 *            binary reference table, and each @SQ name is registered in
 *            pairer->sam_contig_number_table; reads are converted one at a time
 *            by reduce_SAM_to_BAM().
 */
int SAM_pairer_get_next_read_BIN( SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context , unsigned char ** bin_where, int * bin_len ) {
	if( pairer -> input_is_BAM ){
		int nch = 0;
		while(1){
			if(!pairer -> BAM_header_parsed){
				int x1;
				unsigned int bam_signature;
				BAM_next_u32(bam_signature);
				BAM_next_u32(pairer -> BAM_l_text);
				char * header_txt = NULL;
				size_t header_txt_capacity = 0;	// allocated size of header_txt, always tracked

				if(pairer->BAM_l_text>0){
					header_txt_capacity = max(1000000,pairer->BAM_l_text);
					header_txt = malloc(header_txt_capacity);
					if(header_txt == NULL){
						SUBREADprintf("ERROR: cannot allocate memory for the BAM header.\n");
						pairer -> is_bad_format = 1;
						pairer -> is_internal_error = 1;
						return 0;
					}
				}

				for(x1 = 0 ; x1 < pairer -> BAM_l_text; x1++){
					BAM_next_nch;
					header_txt [x1] = nch;
				}
				int is_OK = 0;	// despite the name: non-zero means a header callback failed
				if(pairer -> output_header)pairer -> output_header(pairer, thread_context -> thread_id, 1, pairer -> BAM_l_text , header_txt , pairer -> BAM_l_text );

				// The reference table is assembled in the same buffer, reusing it from offset 0.
				BAM_next_u32(pairer -> BAM_n_ref);
				unsigned int ref_bin_len = 0;
				for(x1 = 0; x1 < pairer -> BAM_n_ref; x1++) {
					unsigned int l_name, l_ref, x2;
					BAM_next_u32(l_name);
					assert(l_name < 256);

					// One entry takes 4 (l_name) + l_name + 4 (l_ref) bytes. The table can be
					// larger than the header text (e.g. a header without @SQ lines), so the
					// buffer is grown whatever its origin.
					size_t needed = (size_t)ref_bin_len + l_name + 8;
					if(header_txt == NULL || needed > header_txt_capacity){
						size_t new_capacity = header_txt ? header_txt_capacity : 3000000;
						while(new_capacity < needed) new_capacity *= 2;
						char * grown_txt = realloc(header_txt, new_capacity);
						if(grown_txt == NULL){
							SUBREADprintf("ERROR: cannot allocate memory for the BAM header.\n");
							free(header_txt);
							pairer -> is_bad_format = 1;
							pairer -> is_internal_error = 1;
							return 0;
						}
						header_txt = grown_txt;
						header_txt_capacity = new_capacity;
					}

					memcpy(header_txt + ref_bin_len, &l_name, 4);
					ref_bin_len += 4;
					for(x2 = 0; x2 < l_name; x2++){
						BAM_next_nch;
						header_txt[ref_bin_len++] = nch;
					}
					BAM_next_u32(l_ref);
					memcpy(header_txt + ref_bin_len, &l_ref, 4);
					ref_bin_len += 4;
				}

				is_OK = is_OK || (pairer -> output_header?pairer -> output_header(pairer, thread_context -> thread_id, 0, pairer -> BAM_n_ref , header_txt ,  ref_bin_len ):0);

				free(header_txt);
				if(is_OK){
					pairer -> is_incomplete_BAM = 1;
					return 0;
				}

				pairer -> BAM_header_parsed = 1;
				SAM_pairer_fetch_BAM_block(pairer, thread_context);
			}

			if(pairer -> is_bad_format) return 0;

			// Inflate further blocks until one complete record sits in the BIN buffer.
			unsigned int record_len = 0xffffffff;
			while(1){
				if(thread_context -> input_buff_BIN_ptr <= thread_context -> input_buff_BIN_used - 4)
					memcpy(&record_len, thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_ptr, 4);

				if(record_len < 0xfffffff0 && thread_context -> input_buff_BIN_ptr +4 + record_len <= thread_context -> input_buff_BIN_used){
					break;
				}

				int ret_fetch = SAM_pairer_fetch_BAM_block(pairer, thread_context); // if ret != 0 then load next big chunk of BAM.
				if(ret_fetch){
					if(thread_context -> input_buff_BIN_used > thread_context -> input_buff_BIN_ptr){
						// Keep the unfinished tail of this chunk: 4-byte size followed by the bytes.
						char * margin_key = malloc(40);
						char * margin_data = malloc(thread_context -> input_buff_BIN_used - thread_context -> input_buff_BIN_ptr+4);
						int margin_size = thread_context -> input_buff_BIN_used - thread_context -> input_buff_BIN_ptr;
						memcpy(margin_data, &margin_size, 4);
						memcpy(margin_data+4,  thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_ptr, thread_context -> input_buff_BIN_used - thread_context -> input_buff_BIN_ptr);
						#ifdef __MINGW32__
						sprintf(margin_key,"E%lu",  (unsigned long)thread_context -> input_buff_SBAM_file_end);
						#else
						sprintf(margin_key,"E%llu", thread_context -> input_buff_SBAM_file_end);
						#endif
						subread_lock_occupy(&pairer -> SAM_BAM_table_lock);

						HashTablePut(pairer -> bam_margin_table, margin_key, margin_data);
						subread_lock_release(&pairer -> SAM_BAM_table_lock);
					}
					return 0;
				}
			}

			if(!pairer -> is_bad_format){
				unsigned int  seq_len = 0;
				thread_context -> input_buff_BIN_ptr += 4;
				memcpy(&seq_len, thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_ptr + 16, 4);

	//			#warning "=========== CHECK IF '0 && ' IS CORRECT ==========="
				if(record_len < 32 || (0 && record_len > min(MAX_BIN_RECORD_LENGTH,60000))|| seq_len >= pairer -> long_read_minimum_length){
					if(seq_len >= pairer -> long_read_minimum_length) pairer -> is_single_end_mode = 1;
					SUBREADprintf("ERROR: sequence length in the BAM record is out of the expected region: %d, %d\n", record_len , seq_len );
					pairer -> is_bad_format = 1;
					return 0;
				}

				(* bin_where) = thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_ptr - 4;
				(* bin_len) = record_len + 4;

				thread_context -> input_buff_BIN_ptr += record_len;
			}
			return 1;
		}
	} else { // if input is SAM
		char *line_ptr;
		int line_len=0, passed_read_SBAM_ptr = -1;
		if(!pairer -> BAM_header_parsed){
			char * header_start = NULL;
			int header_len = 0;

			// First pass: measure the '@' lines and remember where the reads begin.
			while(1){
				SAM_next_line;

				// The buffer may end inside the header: test for NULL before looking at
				// the line. In that case no read is left in this chunk.
				if(NULL == line_ptr){
					passed_read_SBAM_ptr = thread_context -> input_buff_SBAM_used;
					break;
				}
				if(NULL == header_start && line_ptr[0] == '@') header_start = line_ptr;

				if(line_ptr[0]=='@'){
					header_len += 1+line_len;
				}else{
					passed_read_SBAM_ptr = line_ptr - thread_context -> input_buff_SBAM;
					break;
				}
			}

			// non-zero return of a header callback means failure (see is_OK below)
			int is_OK = pairer -> output_header ? pairer -> output_header(pairer, thread_context -> thread_id, 1, header_len , header_start , header_len) : 0;

			// Second pass: turn every @SQ line into a binary reference entry, written over
			// the header text starting at header_start.
			thread_context -> input_buff_SBAM_ptr = 0;
			int header_bin_ptr = 0, header_contigs = 0;
			while(1){
				SAM_next_line;
				if(line_ptr == NULL || line_ptr[0]!='@') break;
				if(memcmp(line_ptr, "@SQ\t",4)==0){
					// status : 0 = at the start of a field, 10 = inside SN:, 20 = inside LN:, 30 = inside another field
					unsigned int ct_len = 0, ctptr = 4, status = 0, sqname_len = 0;
					char * sqname = NULL;
					while(1){
						char ctnch = line_ptr[ctptr++];
						// The end of the line ends the scan in every state, so an empty
						// field cannot push the scan into the following line.
						if(ctnch == '\n' || ctnch == 0) break;
						if( status == 0){
							if(ctnch=='S' && line_ptr[ctptr] == 'N' && line_ptr[ctptr+1] == ':'){
								ctptr += 2;
								status = 10;
								sqname = line_ptr + ctptr;
							}else if(ctnch=='L' && line_ptr[ctptr] == 'N' && line_ptr[ctptr+1] == ':'){
								ctptr += 2;
								status = 20;
							}else	status = 30;
						}else if(status == 10 || status == 20 || status == 30){
							if(ctnch == '\t') status = 0;
							else if(status == 10) sqname_len ++;
							else if(status == 20) ct_len = ct_len * 10 + ctnch - '0';
						}
					}


					sqname_len += 1;	// count the terminating NUL
					memcpy(header_start + header_bin_ptr, &sqname_len, 4);
					header_bin_ptr += 4;
					// source and destination both live in the header text and may overlap
					if(sqname != NULL) memmove(header_start + header_bin_ptr, sqname, sqname_len-1);
					*(header_start + header_bin_ptr + sqname_len - 1) = 0;
					char * mem_contig_name = malloc(sqname_len);
					strcpy(mem_contig_name , header_start + header_bin_ptr);
					// contig numbers are stored as (pointer value - 1) so that NULL means "absent"
					HashTablePut(pairer -> sam_contig_number_table , mem_contig_name, NULL + 1 + header_contigs);
					header_bin_ptr += sqname_len;

					memcpy(header_start + header_bin_ptr, &ct_len, 4);
					header_bin_ptr += 4;
					header_contigs++;
				}
			}

			is_OK = is_OK || (pairer -> output_header ? pairer -> output_header(pairer, thread_context -> thread_id, 0, header_contigs , header_start , header_bin_ptr) : 0);
			pairer -> BAM_header_parsed = 1;
			if(is_OK){
				pairer -> is_incomplete_BAM = 1;
				return 0;
			}
		}

		// after header parsing, continue from the first read line
		if(passed_read_SBAM_ptr >=0)
			thread_context -> input_buff_SBAM_ptr = passed_read_SBAM_ptr;

		if( thread_context -> input_buff_SBAM_ptr < thread_context -> input_buff_SBAM_used ){
			thread_context -> input_buff_BIN_ptr = 0;
			*bin_len = reduce_SAM_to_BAM(pairer, thread_context,!pairer -> tiny_mode);
			*bin_where = (unsigned char *)thread_context -> input_buff_BIN;

			return ((*bin_len) > 0 && !pairer->is_bad_format)?1:0;
		}
		return 0;
	}
	return 0;
}
3133 
/*
 * Called when a read refers to a reference name that is missing from the SAM
 * header. It reports the name and then hits assert(0).
 *
 * The statements after the assert only run when assertions are compiled out:
 * they build one binary reference entry (4-byte name length including the NUL,
 * the name, a 4-byte zero length), and, under SAM_BAM_table_lock, either return
 * the number already stored for `ref` or emit the entry through
 * pairer->output_header and register `ref` under the next free number.
 */
int online_register_contig(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context, char * ref){
	SUBREADprintf("ERROR: Unable to find chromosome '%s' in the SAM header.\n", ref);
	assert(0);

	int name_bytes = strlen(ref) + 1;		// name plus terminating NUL
	char * ref_entry = malloc(name_bytes + 19);

	memcpy(ref_entry, &name_bytes, 4);
	memcpy(ref_entry + 4, ref, name_bytes);
	memset(ref_entry + 4 + name_bytes, 0, 4);	// reference length is written as 0

	subread_lock_occupy(&pairer -> SAM_BAM_table_lock);

	// table values are (contig number + 1) stored as pointers; a miss yields -1 here
	int contig_no = HashTableGet(pairer->sam_contig_number_table, ref) - NULL - 1;
	if(contig_no < 0){
		contig_no = pairer->sam_contig_number_table->numOfElements;
		pairer -> output_header(pairer, thread_context -> thread_id, 0, 1 , ref_entry , 8+name_bytes);

		char * stored_name = malloc(name_bytes+1);
		memcpy(stored_name, ref, name_bytes);
		stored_name[name_bytes]=0;
		HashTablePut(pairer->sam_contig_number_table, stored_name, NULL + contig_no + 1);
	}

	subread_lock_release(&pairer -> SAM_BAM_table_lock);
	free(ref_entry);
	return contig_no;
}
3158 
// set_memory_int(ptr, iii) : store the low 32 bits of `iii` at `ptr`, least-significant
// byte first. Both arguments are fully parenthesised so that expressions such as
// `a | b` or `buf + 4` can be passed safely. Each argument is evaluated more than
// once, so it must not have side effects.
#define set_memory_int(ptr, iii)  { *(ptr) = (iii)&0xff; *((ptr)+1) = ((iii)>>8)&0xff;  *((ptr)+2) = ((iii)>>16)&0xff; *((ptr)+3) = ((iii)>>24)&0xff; }
3160 
/*
 * Convert the SAM text line at input_buff_SBAM + input_buff_SBAM_ptr into a
 * BAM-layout record written at input_buff_BIN + input_buff_BIN_ptr.
 *
 * include_sequence: when non-zero the bases and qualities are stored; otherwise
 *                   the record gets sequence length 1 and two 0xff bytes instead.
 *                   It is forced to 0 (and tiny_mode, long_cigar_mode and
 *                   is_single_end_mode are raised on `pairer`) when the read is at
 *                   least pairer->long_read_minimum_length bases long.
 *
 * Returns the number of bytes written, including the 4-byte length prefix, or -1
 * when the line has fewer than 11 tab-separated columns. On success
 * input_buff_SBAM_ptr is moved to the start of the following line.
 *
 * Side effects: several tab characters of the input line are overwritten with
 * NUL so that the name, reference, CIGAR, mate reference and sequence can be used
 * as C strings. In tiny_mode only the NH, HI, RG and NM tags of types Z/H/i are
 * kept; other tags are parsed and dropped.
 *
 * Tag values of type 'f' and the elements of 'B' arrays are parsed through
 * 30-byte scratch buffers; longer text is truncated instead of overflowing them.
 */
int reduce_SAM_to_BAM(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context, int include_sequence){
	int column_no = 0, in_ptr = 0;
	char * in_str = thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr;
	char * read_name = NULL, * ref = NULL, * mate_ref = NULL, * cigar = NULL, * seq = NULL, * qual = NULL;
	int flag = 0, pos = 0, mapq = 0, old_read_pos = 0, tlen = 0, l_read_name = 0, tlen_sign = 1, l_seq = 0;

	// Pass 1: split the 11 mandatory columns; numeric columns are accumulated digit by digit.
	read_name = in_str;
	while(1){
		int nch = in_str[in_ptr];
		if(nch == '\n' || nch == '\0') {
			break;
		}else if(nch == '\t'){
			if(column_no == 0 || column_no == 2 || column_no == 5 || column_no == 6 || column_no == 9)
				in_str[in_ptr] = 0;
			column_no ++;
			if(column_no == 2) ref = in_str + in_ptr + 1;
			else if(column_no == 5) cigar = in_str + in_ptr + 1;
			else if(column_no == 6) mate_ref  = in_str + in_ptr + 1;
			else if(column_no == 9) seq = in_str + in_ptr + 1;
			else if(column_no == 10) qual = in_str + in_ptr + 1;
			else if(column_no == 11) break;	// in_ptr stays on the tab in front of the first tag
		}else{
			if(column_no == 0) l_read_name ++;
			else if(column_no == 1) flag = flag *10 + nch - '0';
			else if(column_no == 3) pos = pos *10 + nch - '0';
			else if(column_no == 4) mapq = mapq *10 + nch - '0';
			else if(column_no == 7) old_read_pos = old_read_pos *10 + nch - '0';
			else if(column_no == 9) l_seq ++;
			else if(column_no == 8){
				if(nch == '-') tlen_sign = -1;
				else tlen = tlen *10 + nch - '0';
			}
		}

		in_ptr++;
	}
	if(column_no < 10){
		return -1;
	}
	l_read_name++;	// the stored name length includes the terminating NUL

	char * bin_tmp = (char *)thread_context -> input_buff_BIN + thread_context -> input_buff_BIN_ptr;

	// table values are (contig number + 1) stored as pointers; a miss yields -1
	int refID = HashTableGet(pairer->sam_contig_number_table, ref) - NULL - 1;
	if(refID < 0 && ref[0]!='*')
		refID = online_register_contig(pairer, thread_context, ref);
	set_memory_int(bin_tmp + 4, refID);

	pos -= 1;	// SAM positions are 1-based, the binary record is 0-based
	set_memory_int(bin_tmp + 8, pos);

	int mapq_nl = mapq << 8 | l_read_name;
	set_memory_int(bin_tmp + 12, mapq_nl);

	int coverage;
	int cigar_ops = SamBam_compress_cigar(cigar, (int *)(bin_tmp + 36 + l_read_name), &coverage, 65535);
	int flag_nc = flag << 16 | cigar_ops;
	set_memory_int(bin_tmp + 16, flag_nc);



	int seq_len = qual - seq - 1;

	if(seq_len >=pairer -> long_read_minimum_length ){
		pairer -> is_single_end_mode = 1;
		include_sequence = 0;
		pairer -> tiny_mode = 1;
		pairer -> long_cigar_mode = 1;
	}

	if(include_sequence){
		set_memory_int(bin_tmp + 20, l_seq); // SEQ_LEN
	}else	set_memory_int(bin_tmp + 20, 1);

	int mate_refID = refID;
	if(mate_ref[0]!='=' || mate_ref[1]!=0)
		mate_refID = HashTableGet(pairer->sam_contig_number_table, mate_ref) - NULL - 1;

	if(mate_refID < 0 && mate_ref[0]!='*')
		mate_refID = online_register_contig(pairer, thread_context, mate_ref);

	set_memory_int(bin_tmp + 24, mate_refID);

	old_read_pos -= 1;
	set_memory_int(bin_tmp + 28, old_read_pos);

	tlen = tlen * tlen_sign;
	set_memory_int(bin_tmp + 32, tlen);

	memcpy(bin_tmp + 36, read_name, l_read_name);
	int bin_ptr = 36 + l_read_name + 4 * cigar_ops;

	if(include_sequence){
		int xk1, nch;
		SamBam_read2bin(seq  , bin_tmp +  bin_ptr);
		bin_ptr += (l_seq + 1) / 2;
		for(xk1=0; xk1 < l_seq; xk1++){
			nch = qual[xk1];
			bin_tmp[bin_ptr++] = nch - 33;	// Phred+33 text to raw quality
		}
	}else{
		bin_tmp[bin_ptr ++] = 0xff;
		bin_tmp[bin_ptr ++] = 0xff;
	}

	if(column_no == 11)	// has extra tags
	{
		// Every iteration starts with in_ptr on the tab in front of "TG:t:value".
		while(in_str[in_ptr] == '\t'){
			// <ctype.h> functions need an unsigned char value
			if((!isalpha((unsigned char)in_str[in_ptr+1])) || (!isalpha((unsigned char)in_str[in_ptr+4]))){
				while(in_str[in_ptr] !='\n')in_ptr++;
				break;
			}
			in_ptr ++;

			int is_important_tag =  (in_str[in_ptr+0] == 'N' && in_str[in_ptr+1] == 'H') ||
						(in_str[in_ptr+0] == 'H' && in_str[in_ptr+1] == 'I') ||
						(in_str[in_ptr+0] == 'R' && in_str[in_ptr+1] == 'G') ||
						(in_str[in_ptr+0] == 'N' && in_str[in_ptr+1] == 'M') ;
			int xxnch;
			if(in_str[in_ptr + 3] == 'Z' || in_str[in_ptr + 3] == 'H'){
				if(is_important_tag||!pairer -> tiny_mode){
					bin_tmp[bin_ptr+0] = in_str[in_ptr+0];
					bin_tmp[bin_ptr+1] = in_str[in_ptr+1];
					bin_tmp[bin_ptr+2] = in_str[in_ptr + 3];
					bin_ptr += 3;
				}
				in_ptr += 5;
				while(1){
					xxnch = *(in_str + in_ptr);
					if(xxnch == '\n' || xxnch == '\t' || xxnch == 0) break;
					if(is_important_tag||!pairer -> tiny_mode)
						*(bin_tmp + (bin_ptr++)) = xxnch;
					in_ptr ++;
				}
				if(is_important_tag||!pairer -> tiny_mode){
					*(bin_tmp + (bin_ptr++)) = 0;
				}
			}else if(in_str[in_ptr + 3] == 'i'){
				int tmpi = 0, tmpi_sign = 1;
				if(is_important_tag || !pairer -> tiny_mode){
					bin_tmp[bin_ptr+0] = in_str[in_ptr+0];
					bin_tmp[bin_ptr+1] = in_str[in_ptr+1];
					bin_tmp[bin_ptr+2] = 'i';
					bin_ptr += 3;
				}

				in_ptr += 5;

				while(1){
					xxnch = *(in_str + in_ptr);
					if(xxnch == '\n' || xxnch == '\t' || xxnch == 0) break;
					else if(xxnch == '-') tmpi_sign = -1;
					else tmpi = tmpi * 10 + xxnch - '0';
					in_ptr ++;
				}
				tmpi *= tmpi_sign;
				if(is_important_tag || !pairer -> tiny_mode){
					set_memory_int(bin_tmp+bin_ptr, tmpi);
					bin_ptr += 4;
				}
			}else if(in_str[in_ptr + 3] == 'f'){
				char ftxt[30];
				int fi=0;		// characters of the value consumed from the input
				int ftxt_len = 0;	// characters kept in ftxt (bounded by its size)
				ftxt[0] = 0;
				while(1){
					xxnch = *(in_str + in_ptr + 5 + fi);
					if(xxnch== '\n' || xxnch == '\t'|| xxnch == 0) break;
					if(ftxt_len < (int)sizeof(ftxt) - 1){
						ftxt[ftxt_len++]=xxnch;
						ftxt[ftxt_len]=0;
					}
					fi++;
				}
				if(!pairer -> tiny_mode){
					float fv = atof(ftxt);
					bin_tmp[bin_ptr+0] = in_str[in_ptr+0];
					bin_tmp[bin_ptr+1] = in_str[in_ptr+1];
					bin_tmp[bin_ptr+2] = 'f';
					memcpy( bin_tmp + bin_ptr + 3, &fv, 4);
					bin_ptr += 7;
				}
				in_ptr += 5 + fi;
			}else if(in_str[in_ptr + 3] == 'B'){
				char elemtype = in_str[in_ptr + 5];
				int txi=0, eles=0;
				char ttxt[30], *elen_ptr = NULL;
				ttxt[0] = 0;
				if(!pairer -> tiny_mode){
					bin_tmp[bin_ptr+0] = in_str[in_ptr+0];
					bin_tmp[bin_ptr+1] = in_str[in_ptr+1];
					bin_tmp[bin_ptr+2] = 'B';
					bin_tmp[bin_ptr+3] = elemtype;
					elen_ptr = bin_tmp+4 + bin_ptr;	// the element count is filled in after the loop
					bin_ptr += 8;
				}
				in_ptr += 6;
				int elembytes_no = 4;
				if(elemtype == 'C' || elemtype == 'c') elembytes_no = 1;
				if(elemtype == 'S' || elemtype == 's') elembytes_no = 2;
				while(1){
					xxnch = *(in_str + in_ptr);
					if((!pairer -> tiny_mode)){
						if((xxnch ==',' || xxnch =='\n' || xxnch == '\t' || xxnch == 0) && txi > 0){
							// a separator closes the element collected in ttxt
							if(elemtype == 'f'){
								float fv = atof(ttxt);
								memcpy( bin_tmp + bin_ptr, &fv, 4);
							}else{
								int iv = atoi(ttxt);
								memcpy( bin_tmp + bin_ptr, &iv, elembytes_no);
							}
							bin_ptr+=elembytes_no;
							txi=0;
							eles++;
						}else{
							// never write past the end of ttxt
							if(xxnch!=',' && txi < (int)sizeof(ttxt) - 1){
								ttxt[txi++] = xxnch;
								ttxt[txi] = 0;
							}
						}
					}
					if(xxnch =='\n' || xxnch == '\t' || xxnch == 0)break;
					in_ptr ++;
				}
				if((!pairer -> tiny_mode)) memcpy(elen_ptr, & eles, 4);

			}else if(in_str[in_ptr + 3] == 'A'){
				if(!pairer -> tiny_mode){
					bin_tmp[bin_ptr+0] = in_str[in_ptr+0];
					bin_tmp[bin_ptr+1] = in_str[in_ptr+1];
					bin_tmp[bin_ptr+2] = 'A';
					bin_tmp[bin_ptr+3] = in_str[in_ptr+5];
					bin_ptr += 4;
				}
				in_ptr += 6;
			}else{
				// unsupported type letter: skip the value
				in_ptr += 5;
				while(1){
					xxnch = *(in_str + in_ptr);
					if(xxnch == '\n' || xxnch == '\t' || xxnch == 0) break;
					in_ptr++;
				}
			}
		}

	}

	thread_context -> input_buff_SBAM_ptr += in_ptr + 1;

	// the stored record length excludes the 4-byte prefix; the return value includes it
	bin_ptr -= 4;
	set_memory_int(bin_tmp, bin_ptr);
	bin_ptr += 4;

	return bin_ptr;
}
3419 
/*
 * Size in bytes of the binary tag that starts at `bin`: two name bytes, one
 * type byte and the value.
 *
 *   i I f   : 4-byte value
 *   s S     : 2-byte value
 *   c C A   : 1-byte value
 *   Z H     : NUL-terminated text (the NUL is counted)
 *   B       : element-type byte, 32-bit element count, then the elements
 *             (2 bytes each for s/S, 4 bytes each for i/I/f, otherwise 1 byte)
 *
 * Any other type letter is reported and hits assert(0).
 */
int SAP_pairer_skip_tag_body_len(char *bin){
	int body_bytes = 0;

	switch(bin[2]){
		case 'i':
		case 'I':
		case 'f':
			body_bytes = 4;
			break;

		case 's':
		case 'S':
			body_bytes = 2;
			break;

		case 'c':
		case 'C':
		case 'A':
			body_bytes = 1;
			break;

		case 'Z':
		case 'H':
			body_bytes = strlen(bin + 3) + 1;
			break;

		case 'B': {
			char element_type = tolower(bin[3]);
			memcpy(&body_bytes, bin + 4, 4);	// number of elements
			if(element_type == 's')
				body_bytes *= 2;
			else if(element_type == 'i' || element_type == 'f')
				body_bytes *= 4;
			body_bytes += 4+1; // 32-bit count, 1 byte type
			break;
		}

		default:
			SUBREADprintf("UnknownTag=%c\n", bin[2]);
			assert(0);
	}

	return body_bytes+3;
}
3443 
/*
 * Look for the tag called tag_name (two characters) in a block of binary tags.
 *
 * bin / bin_len : start and size of the tag area of a record.
 * On a match, *data_type receives the tag's type byte, *saved_value points at
 * the first byte of its value inside `bin`, and 1 is returned. Returns 0 when
 * the tag is not present; the two outputs are then left untouched.
 *
 * A tag header is three bytes (name + type), so scanning stops as soon as fewer
 * than three bytes remain; trailing bytes are never read past bin_len.
 */
int SAM_pairer_iterate_tags(unsigned char * bin, int bin_len, char * tag_name, char * data_type, char ** saved_value){
	int bin_cursor = 0;

	while(bin_cursor + 3 <= bin_len){
		unsigned char * tag = bin + bin_cursor;

		if(tag[0] == tag_name[0] && tag[1] == tag_name[1]){
			(* data_type) = tag[2];
			(* saved_value) = (char *)tag + 3;
			return 1;
		}

		// not the wanted tag: jump over its name, type and value
		bin_cursor += SAP_pairer_skip_tag_body_len((char *)tag);
	}
	return 0;
}
3469 
/*
 * Fetch the value of an integer tag from a block of binary tags.
 *
 * *saved_value is first set to 0. If tag_name is found and its type is one of
 * i/I (32 bit), s/S (16 bit) or c/C (8 bit), the value is stored in *saved_value
 * and a non-zero value is returned. Returns 0 when the tag is absent or has a
 * different type.
 *
 * The signed types 'c' and 's' are sign-extended and the unsigned types 'C' and
 * 'S' are zero-extended, so that for example -1 stored as 'c' is read as -1 and
 * not as 255.
 */
int SAM_pairer_iterate_int_tags(unsigned char * bin, int bin_len, char * tag_name, int * saved_value){
	char * data_ptr = NULL;
	char data_type = 0;

	(*saved_value) = 0;
	int ret = SAM_pairer_iterate_tags(bin, bin_len, tag_name, &data_type, &data_ptr);
	if(!ret) return ret;

	switch(data_type){
		case 'i':
		case 'I':
			memcpy(saved_value, data_ptr, 4);
			break;
		case 's': {
			short v16 = 0;
			memcpy(&v16, data_ptr, 2);
			(*saved_value) = v16;
			break;
		}
		case 'S': {
			unsigned short u16 = 0;
			memcpy(&u16, data_ptr, 2);
			(*saved_value) = u16;
			break;
		}
		case 'c':
			(*saved_value) = (signed char)data_ptr[0];
			break;
		case 'C':
			(*saved_value) = (unsigned char)data_ptr[0];
			break;
		default:
			// found, but not an integer tag
			return 0;
	}

	return ret;
}
3489 
3490 
3491 
/*
 * Build the key that identifies a read pair from one binary record.
 *
 * bin / bin_len : the record, starting with its 4-byte length prefix.
 * full_name     : output buffer. It receives the read name cut at the first '/'
 *                 (if there is one), followed by five fields that are each
 *                 introduced by the byte \027: reference number and position of
 *                 the first mate, reference number and position of the second
 *                 mate, and the value of the HI tag (-1 when the record has none).
 * this_flag     : receives the FLAG field of the record.
 *
 * A record without flag bit 0x40 is treated as the second mate, so its own
 * location goes into the second pair of fields and its mate's location into the
 * first. Locations of unmapped reads (flag 4) or unmapped mates (flag 8) are
 * replaced by reference -1 and position 0.
 *
 * Returns the length of the text written to full_name. `pairer` and
 * `thread_context` are not used.
 */
int SAM_pairer_get_read_full_name( SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context , unsigned char * bin, int bin_len , char * full_name, int * this_flag){
	unsigned int own_ref = 0, mate_ref = 0;
	unsigned int own_pos = 0, mate_pos = 0;
	unsigned int name_word = 0, flag_word = 0, base_count = 0;
	int hit_index = -1;

	full_name[0]=0;

	// fixed-size part of the record
	memcpy(&own_ref, bin + 4, 4);
	memcpy(&own_pos, bin + 8, 4);
	memcpy(&name_word, bin + 12, 4);
	memcpy(&flag_word, bin + 16, 4);
	memcpy(&base_count, bin + 20, 4);
	memcpy(&mate_ref, bin + 24, 4);
	memcpy(&mate_pos, bin + 28, 4);

	unsigned int name_bytes = name_word & 0xff;
	unsigned int cigar_count = flag_word & 0xffff;
	int flag_bits = (flag_word >> 16)&0xffff;
	(*this_flag) = flag_bits;

	memcpy(full_name, bin+36, name_bytes);
	assert(name_bytes > 0);

	if(flag_bits & 4){	// this read is unmapped
		own_ref = -1;
		own_pos = 0;
	}

	if(flag_bits & 8){	// the mate is unmapped
		mate_ref = -1;
		mate_pos = 0;
	}

	int is_first_mate = ((flag_bits & 0x40) == 0x40);
	unsigned int first_ref  = is_first_mate ? own_ref  : mate_ref;
	unsigned int first_pos  = is_first_mate ? own_pos  : mate_pos;
	unsigned int second_ref = is_first_mate ? mate_ref : own_ref;
	unsigned int second_pos = is_first_mate ? mate_pos : own_pos;

	// tags follow the name, the CIGAR, the packed bases and the qualities
	unsigned int tags_offset = 36+name_bytes+4*cigar_count+(base_count+1)/2+base_count;
	unsigned int tags_bytes = bin_len - tags_offset;

	if(tags_bytes > 2 && !SAM_pairer_iterate_int_tags(bin + tags_offset, tags_bytes, "HI", &hit_index))
		hit_index = -1;

	// the key is appended at the first '/', or over the terminating NUL of the name
	int cut_at = 0;
	while(cut_at < name_bytes - 1 && full_name[cut_at] != '/')
		cut_at++;

	return cut_at + sprintf(full_name+cut_at, "\027%d\027%u\027%d\027%u\027%d", first_ref, first_pos, second_ref, second_pos, hit_index);
}
3562 
SAM_pairer_multi_thread_header(void * pairer_vp,int thread_no,int is_text,unsigned int items,char * bin,unsigned int bin_len)3563 int SAM_pairer_multi_thread_header (void * pairer_vp, int thread_no, int is_text, unsigned int items, char * bin, unsigned int bin_len){
3564 
3565 	SAM_pairer_context_t * pairer = (SAM_pairer_context_t *) pairer_vp;
3566 	SAM_pairer_writer_main_t * bam_main = (SAM_pairer_writer_main_t * )pairer -> appendix1;
3567 	SAM_pairer_writer_thread_t * bam_thread = bam_main -> threads + thread_no;
3568 	unsigned int BIN_block_cursor = 0, bin_cursor = 0;
3569 	//SUBREADprintf("WRITE HEADER TYPE=%d; ITEMS=%d\n", is_text, items);
3570 	if(is_text){
3571 		memcpy( bam_thread -> BIN_buffer, "BAM\1", 4 );
3572 		memcpy( bam_thread -> BIN_buffer + 4 , & items , 4 );
3573 		BIN_block_cursor = 8;
3574 	}else{
3575 		memcpy( bam_thread -> BIN_buffer , & items , 4 );
3576 		BIN_block_cursor = 4;
3577 	}
3578 	while( bin_cursor  < bin_len ){
3579 		int write_text_len = min(SAM_PAIRER_WRITE_BUFFER - BIN_block_cursor, bin_len - bin_cursor);
3580 	//	SUBREADprintf("WRITE TLEN=%d\n", write_text_len);
3581 		memcpy(bam_thread -> BIN_buffer + BIN_block_cursor , bin + bin_cursor, write_text_len);
3582 		bam_thread -> BIN_buffer_ptr = write_text_len + BIN_block_cursor;
3583 
3584 		SAM_pairer_multi_thread_compress(bam_main, bam_thread);
3585 		bin_cursor += write_text_len;
3586 		BIN_block_cursor = 0;
3587 	}
3588 
3589 	bam_thread -> BIN_buffer_ptr = 0;
3590 	return 0;
3591 }
3592 
/*
 * Returns the offset, counted from the start of bin1, of the first byte after
 * the fixed fields, read name, CIGAR, packed sequence and qualities of a BAM
 * record -- i.e. where the optional fields start.
 *
 * Read from bin1: byte 12 (read name length), two bytes at offset 16 (number
 * of CIGAR operations) and four bytes at offset 20 (sequence length).
 */
int SAM_pairer_get_tag_bin_start(char * bin1){
	int l_seq = 0;
	int n_cigar = 0;

	memcpy(&n_cigar, bin1 + 16, 2);
	memcpy(&l_seq, bin1 + 20,4);

	int name_bytes = (unsigned char)bin1[12];
	int packed_seq_bytes = (l_seq+1)/2;	// two bases per byte, rounded up
	int cigar_bytes = 4 * n_cigar;

	return 36 + name_bytes + l_seq + packed_seq_bytes + cigar_bytes;
}
3601 
/*
 * Appends one integer optional field (two-letter tag, type byte, value) to
 * the record being built in out_bin2, starting at offset tag_ptr.
 *
 * The value is stored as type 'C' (1 byte) when it is below 128, as 'S'
 * (2 bytes) when it is below 32767 and as 'I' (4 bytes) otherwise; the bytes
 * are copied from the start of the host representation of tag_value, as the
 * rest of this file does.
 *
 * Returns the offset of the first byte after the field.
 */
static int SAM_pairer_make_dummy_put_int_tag(char * out_bin2, int tag_ptr, char tag1, char tag2, int tag_value){
	int value_bytes;

	out_bin2[tag_ptr++] = tag1;
	out_bin2[tag_ptr++] = tag2;

	if(tag_value < 128){
		out_bin2[tag_ptr++] = 'C';
		value_bytes = 1;
	}else if(tag_value < 32767){
		out_bin2[tag_ptr++] = 'S';
		value_bytes = 2;
	}else{
		out_bin2[tag_ptr++] = 'I';
		value_bytes = 4;
	}

	memcpy(out_bin2 + tag_ptr, &tag_value, value_bytes);
	return tag_ptr + value_bytes;
}

/*
 * Builds a placeholder ("dummy") mate record for the BAM record bin1 and
 * stores it, including its 4-byte length prefix, in out_bin2.
 *
 * The dummy carries the same read name as bin1. Its own refID/position are
 * the mate refID/position of bin1 and its mate refID/position are bin1's own
 * ones; the template length is negated. Its FLAG has the paired bit set, the
 * opposite first/second-in-pair bit, and the unmapped (0x4/0x8) and strand
 * (0x10/0x20) bits of bin1 swapped between "self" and "mate". It has no
 * CIGAR, and a one-base sequence (bytes 0xff for the sequence and 0x20 for
 * the quality). Positive HI and NH tag values of bin1 are copied over, and so
 * is a string-typed RG tag when need_RG_tag is non-zero.
 *
 * rname       : not used by this function.
 * bin1        : source record, starting at its 4-byte block-size field.
 * out_bin2    : output buffer. NOTE(review): no size is passed in and
 *               nothing is checked here, so the caller has to provide room
 *               for the read name, the tags and the whole RG value.
 * need_RG_tag : non-zero to look up and copy the RG tag.
 */
void SAM_pairer_make_dummy(char * rname, char * bin1, char * out_bin2, int need_RG_tag){
	char * realname = bin1 + 36;
	int block1len =-1;
	int len_name = (unsigned char)bin1[12] -1;	// name length without its terminating NUL
	int old_read_chro =-1;
	int old_read_pos =-1;
	int new_dummy_chro =-1;
	int new_dummy_pos =-1;

	memcpy(&block1len, bin1, 4);
	memcpy(&old_read_chro, bin1 + 4, 4);
	memcpy(&old_read_pos, bin1 + 8, 4);

	memcpy(&new_dummy_chro, bin1 + 24, 4);
	memcpy(&new_dummy_pos, bin1 + 28, 4);

	int HItag =-1;
	int NHtag =-1;

	int seq_len = -1;
	int cigar_opts = -1;
	memcpy(&seq_len, bin1 + 20,4);
	int old_read_FLAG = -1;
	memcpy(&old_read_FLAG, bin1 + 16, 4);
	cigar_opts = old_read_FLAG & 0xffff;

	char * RG_tag_val = NULL;
	int bin1ptr = 36 + len_name +1 + seq_len + (seq_len+1)/2 + 4 * cigar_opts;
	//SUBREADprintf("MAKE_DUMMY: %s ; need_RG=%d, %d > %d\n", realname, need_RG_tag,  block1len + 4  ,bin1ptr + 3);
	// Optional fields of bin1 are examined only when at least one minimal
	// field can follow the mandatory part of the record.
	if( block1len + 4 > bin1ptr + 3 ){
		SAM_pairer_iterate_int_tags((unsigned char *)(bin1+bin1ptr),block1len + 4 - bin1ptr, "NH", &NHtag);
		SAM_pairer_iterate_int_tags((unsigned char *)(bin1+bin1ptr),block1len + 4 - bin1ptr, "HI", &HItag);
		if( need_RG_tag ){
			char RG_type=0;
			SAM_pairer_iterate_tags((unsigned char *)(bin1+bin1ptr),block1len + 4 - bin1ptr, "RG", &RG_type, &RG_tag_val);
			if(RG_type != 'Z') RG_tag_val = NULL;
			//SUBREADprintf("type=%c\tval=%s\n", RG_type, RG_tag_val);
		}
	}

	old_read_FLAG = 0xffff&(old_read_FLAG >>16);
	int mate_tlen = 0;
	memcpy(&mate_tlen, bin1 + 32, 4);

	if(old_read_chro<0) old_read_pos=-1;
	if(new_dummy_chro<0) new_dummy_pos=-1;


	int bin_mq_nl = (len_name+1);
	int new_dummy_FLAG = (old_read_FLAG&0x40)? 0x80:0x40;
	new_dummy_FLAG |= 1;

	// Dummy reads should always be unmapped!
	//if(old_read_FLAG & 8)new_dummy_FLAG |=4;

	if(old_read_FLAG & 4)new_dummy_FLAG |=8;
	if(old_read_FLAG & 8)new_dummy_FLAG |=4;
	if(old_read_FLAG & 0x10) new_dummy_FLAG |= 0x20;
	if(old_read_FLAG & 0x20) new_dummy_FLAG |= 0x10;
	new_dummy_FLAG = new_dummy_FLAG << 16;	// FLAG in the upper half; the lower half (CIGAR count) stays 0

	memcpy(out_bin2+4, &new_dummy_chro,4);
	memcpy(out_bin2+8, &new_dummy_pos,4);
	memcpy(out_bin2+12, &bin_mq_nl, 4);
	memcpy(out_bin2+16, &new_dummy_FLAG, 4);

	new_dummy_FLAG = 1;	// variable reused: this is the sequence length (1) written at offset 20
	memcpy(out_bin2+20, &new_dummy_FLAG, 4);
	memcpy(out_bin2+24, &old_read_chro, 4);
	memcpy(out_bin2+28, &old_read_pos, 4);

	mate_tlen = -mate_tlen;
	memcpy(out_bin2+32, &mate_tlen, 4);
	memcpy(out_bin2+36, realname, len_name+1);
	out_bin2[36 + len_name+1] = 0xff;	// the single packed sequence byte
	out_bin2[36 + len_name+2] = 0x20;	// the single quality byte

	// Optional fields are appended one after another; tag_ptr always points
	// at the next free byte, so the record length follows directly from it.
	int tag_ptr = 36 + len_name + 3;
	//SUBREADprintf("HI=%d\n", HItag);
	if(HItag>0)
		tag_ptr = SAM_pairer_make_dummy_put_int_tag(out_bin2, tag_ptr, 'H', 'I', HItag);
	if(NHtag>0)
		tag_ptr = SAM_pairer_make_dummy_put_int_tag(out_bin2, tag_ptr, 'N', 'H', NHtag);
	if(RG_tag_val){
		out_bin2[tag_ptr++]='R';
		out_bin2[tag_ptr++]='G';
		out_bin2[tag_ptr++]='Z';
		while(*RG_tag_val){
			out_bin2[tag_ptr++]=*(RG_tag_val++);
		}
		out_bin2[tag_ptr++]=0;
	}

	// The length prefix does not count its own four bytes.
	int all_len = tag_ptr - 4;
	memcpy(out_bin2,&all_len,4);
}
3731 
/*
 * Puts a pairer back into its start-of-run state: clears the progress
 * counters and flags, clears the per-thread buffer cursors, resets each
 * thread's inflate stream, and replaces every hash table (per-thread orphant
 * tables, the unsorted-notification table and the BAM margin table) with a
 * freshly created, empty one configured the same way.
 */
void SAM_pairer_reset( SAM_pairer_context_t * pairer ) {
	int thread_i = 0;

	// Run-level state.
	pairer -> merge_level_finished = 0;
	pairer -> input_chunk_no = 0;
	pairer -> total_input_reads = 0;
	pairer -> BAM_header_parsed = 0;
	pairer -> is_finished = 0;

	// Per-thread state.
	while(thread_i < pairer -> total_threads){
		pairer -> threads[thread_i].immediate_last_read_full_name[0]=0;
		pairer -> threads[thread_i].readno_in_chunk = 0;
		pairer -> threads[thread_i].orphant_block_no = 0;
		pairer -> threads[thread_i].input_buff_SBAM_ptr = 0;
		pairer -> threads[thread_i].input_buff_SBAM_used = 0;
		pairer -> threads[thread_i].input_buff_BIN_ptr = 0;
		pairer -> threads[thread_i].input_buff_BIN_used = 0;
		pairer -> threads[thread_i].reads_in_SBAM = 0;

		// Drop the old orphant table together with its contents and start a new one.
		HashTableDestroy(pairer -> threads[thread_i].orphant_table);
		pairer -> threads[thread_i].orphant_table = HashTableCreate(pairer -> input_buff_SBAM_size / 100);
		HashTableSetHashFunction(pairer -> threads[thread_i].orphant_table, fc_chro_hash);
		HashTableSetKeyComparisonFunction(pairer -> threads[thread_i].orphant_table, fc_strcmp_chro);
		HashTableSetDeallocationFunctions(pairer -> threads[thread_i].orphant_table, free, free);

		inflateReset(&pairer -> threads[thread_i].strm);
		thread_i ++;
	}

	// Table used for the unsorted-input notification.
	HashTableDestroy(pairer -> unsorted_notification_table);
	pairer -> unsorted_notification_table = HashTableCreate(2191);
	HashTableSetHashFunction(pairer -> unsorted_notification_table, fc_chro_hash);
	HashTableSetKeyComparisonFunction(pairer -> unsorted_notification_table, fc_strcmp_chro);
	HashTableSetDeallocationFunctions(pairer -> unsorted_notification_table, free, free);

	// BAM margin table.
	HashTableDestroy(pairer ->bam_margin_table);
	pairer ->bam_margin_table = HashTableCreate(2191);
	HashTableSetHashFunction(pairer -> bam_margin_table, fc_chro_hash);
	HashTableSetKeyComparisonFunction(pairer -> bam_margin_table, fc_strcmp_chro);
	HashTableSetDeallocationFunctions(pairer -> bam_margin_table, free, free);
}
SAM_pairer_writer_reset(void * pairer_vp)3769 void SAM_pairer_writer_reset( void * pairer_vp ) {
3770 	SAM_pairer_context_t * pairer = (SAM_pairer_context_t *) pairer_vp;
3771 	SAM_pairer_writer_main_t * bam_main = (SAM_pairer_writer_main_t * )pairer -> appendix1;
3772 	int rlen = ftruncate(fileno(bam_main -> bam_fp), 0);
3773 	if(rlen != 0)SUBREADprintf("ERROR: Cannot reset the output file.");
3774 
3775 	fclose(bam_main -> bam_fp);
3776 	bam_main -> bam_fp = f_subr_open(bam_main -> bam_name, "wb");
3777 	int x1;
3778 	for(x1 = 0; x1 < pairer -> total_threads ; x1 ++){
3779 		bam_main -> threads[x1].BIN_buffer_ptr = 0;
3780 		deflateReset(&bam_main -> threads[x1].strm);
3781 	}
3782 
3783 
3784 }
3785 
SAM_pairer_multi_thread_output(void * pairer_vp,int thread_no,char * bin1,char * bin2)3786 int SAM_pairer_multi_thread_output(void * pairer_vp, int thread_no, char * bin1, char * bin2 ){
3787 	SAM_pairer_context_t * pairer = (SAM_pairer_context_t *) pairer_vp;
3788 	SAM_pairer_writer_main_t * bam_main = (SAM_pairer_writer_main_t * )pairer -> appendix1;
3789 	SAM_pairer_writer_thread_t * bam_thread = bam_main -> threads + thread_no;
3790 
3791 	char dummy_bin2 [MAX_READ_NAME_LEN*2 + 180 ];
3792 	if(bin2==NULL && bam_main -> has_dummy){
3793 		SAM_pairer_make_dummy( "DUMMY", bin1, dummy_bin2, pairer -> need_read_group_tag );
3794 		bin2 = dummy_bin2;
3795 	}
3796 
3797 	int bin_len1, bin_len2 = 0;
3798 	memcpy(&bin_len1, bin1, 4);
3799 	bin_len1 +=4;
3800 
3801 	if(bin2) {
3802 		memcpy(&bin_len2, bin2, 4);
3803 		bin_len2 +=4;
3804 	}
3805 
3806 	if( bin_len1 + bin_len2 >= SAM_PAIRER_WRITE_BUFFER){
3807 		SUBREADprintf("ERROR: BAM Record larger than a BAM block.\n");
3808 		return 1;
3809 	}
3810 
3811 	if(bin_len1 + bin_len2 + bam_thread -> BIN_buffer_ptr >= SAM_PAIRER_WRITE_BUFFER){
3812 		int ret = SAM_pairer_multi_thread_compress(bam_main, bam_thread);
3813 		if(ret)return 1;
3814 	}
3815 	memcpy( bam_thread -> BIN_buffer + bam_thread -> BIN_buffer_ptr, bin1, bin_len1 );
3816 	if(bin2)
3817 		memcpy( bam_thread -> BIN_buffer + bam_thread -> BIN_buffer_ptr + bin_len1, bin2, bin_len2 );
3818 	bam_thread -> BIN_buffer_ptr += bin_len1 + bin_len2;
3819 	return 0;
3820 }
3821 
/*
 * Tries to pair a record with an orphant stored earlier by the same thread.
 *
 * If the thread's orphant table already holds an entry under read_full_name,
 * the record and the stored one are passed to the output function (when one
 * is set), the entry is removed and the orphant space counter is lowered by
 * bin_len (not below zero). Otherwise copies of the key and of the record
 * are stored in the table and the counter grows by bin_len.
 *
 * `flags` is not used by this function.
 */
void SAM_pairer_do_read_test( SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context , int read_name_len, char * read_full_name, int bin_len, char * bin , int flags){
	unsigned char * mate_bin = HashTableGet(thread_context -> orphant_table, read_full_name);

	if(NULL == mate_bin){
		// No mate seen yet: keep private copies until the mate turns up.
		char * key_copy = malloc(read_name_len + 1);
		memcpy(key_copy, read_full_name, read_name_len);
		key_copy[read_name_len] = 0;

		char * bin_copy = malloc(bin_len);
		memcpy(bin_copy, bin , bin_len);

		HashTablePut(thread_context -> orphant_table, key_copy, bin_copy);
		thread_context -> orphant_space += bin_len;
		//#warning "============= COMMENT NEXT =================="
		//SUBREADprintf("Orphant_created [%d]: %s ; BINLEN=%d, OPSIZE=%d\n", thread_context -> thread_id, read_full_name, bin_len, thread_context -> orphant_space);
		return;
	}

	// Mate found: emit the pair, then forget the stored record.
	if(pairer -> output_function)
		pairer -> output_function(pairer, thread_context -> thread_id, bin, (char*)mate_bin);
	HashTableRemove(thread_context -> orphant_table, read_full_name);

	if(thread_context -> orphant_space > bin_len)
		thread_context -> orphant_space -= bin_len;
	else
		thread_context -> orphant_space = 0;
	//SUBREADprintf("Mate_found: %s\n", read_full_name);
}
3848 
3849 
/*
 * Records a chunk-boundary record in the pairer's unsorted-notification
 * table. Two entries are added while holding unsorted_notification_lock:
 *
 *   "B:<chunk_number>:<0|1>"       -> a private copy of the record `bin`
 *   "C:<read_full_name>:<0|1>"     -> the string "<chunk_number, 10 digits> <0|1>"
 *
 * The 0/1 of the "B:" key and of the "C:" value is 1 when readno_in_chunk is
 * greater than zero; the 0/1 of the "C:" key is 1 when bit 0x80 is set in
 * this_flags. All keys and values are heap allocated and handed over to the
 * table.
 */
void SAM_pairer_register_matcher(SAM_pairer_context_t * pairer , unsigned int chunk_number, unsigned int readno_in_chunk, char * read_full_name , char * bin, int bin_len , int this_flags){

	char * mem_bin = malloc(bin_len);
	memcpy(mem_bin, bin , bin_len);
	subread_lock_occupy(&pairer -> unsorted_notification_lock);
	char * mem_name = malloc(24);
	sprintf(mem_name, "B:%u:%d", chunk_number , (readno_in_chunk>0)?1:0);
	HashTablePut(pairer -> unsorted_notification_table, mem_name, mem_bin);

	// The chunk-info string always has 12 characters plus the terminating
	// NUL, so its buffer is sized for that text instead of for the
	// (unrelated) length of the record.
	mem_bin = malloc(16);
	snprintf(mem_bin, 16, "%010u %d", chunk_number, (readno_in_chunk>0)?1:0);
	// "C:" + name + ":" + one digit + NUL
	mem_name = malloc(strlen(read_full_name) + 5);
	sprintf(mem_name, "C:%s:%d", read_full_name , (this_flags & 0x80)?1:0);

	HashTablePut(pairer -> unsorted_notification_table, mem_name, mem_bin);
	subread_lock_release(&pairer -> unsorted_notification_lock);
}
3867 
/*
 * Processes one BAM record read by a pairer thread.
 *
 * Records that are not flagged as paired (FLAG bit 1 clear), and all records
 * in single-end mode, go straight to the output function with no mate.
 * For paired records the function compares the pairing key of this record
 * with the key of the record remembered from the previous call:
 *   - same key: both records are output as a pair and the remembered record
 *     is cleared;
 *   - different key: the remembered record (if any) is passed to
 *     SAM_pairer_do_read_test(), which pairs it with a stored orphant or
 *     stores it, and the current record becomes the remembered one.
 * The per-thread counter readno_in_chunk is incremented on every call.
 */
void SAM_pairer_do_one_BIN(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context , char * bin, int bin_len){
	char read_full_name[ MAX_READ_NAME_LEN*2 +80 ];	// rname:chr_r1:pos_r1:chr_r2:pos_r2:HI_tag
	int this_flags=0;
	int name_len = SAM_pairer_get_read_full_name(pairer, thread_context, (unsigned char *)bin, bin_len, read_full_name, & this_flags);

	if(pairer -> is_single_end_mode == 0 && ( this_flags & 1 ) == 1){ // if the reads are PE
		if(strcmp(read_full_name , thread_context -> immediate_last_read_full_name) == 0){
			// The previous record is the mate of this one: emit both now.
			if(pairer -> output_function)
				pairer -> output_function(pairer, thread_context -> thread_id, (char*) bin, (char*)thread_context -> immediate_last_read_bin);
			// An empty name marks "no remembered record".
			thread_context -> immediate_last_read_full_name[0] = 0;
		}else{

			if(thread_context -> immediate_last_read_full_name[0]){
				// The remembered record was not followed by its mate.
				if(thread_context -> readno_in_chunk>2){
					// Past the start of the chunk this triggers the unsorted
					// notification callback, at most once per pairer.
					if(pairer -> is_unsorted_notified == 0){
						if(pairer -> unsorted_notification){
							//SUBREADprintf("READ_%d : UNSORT1 : %s != %s\n", thread_context -> readno_in_chunk,  thread_context -> immediate_last_read_full_name , read_full_name);
							pairer -> unsorted_notification(pairer , thread_context -> immediate_last_read_bin, (char *) bin);
						}
						pairer -> is_unsorted_notified = 1;
					}
				}else if(thread_context -> readno_in_chunk == 1 && !pairer -> is_unsorted_notified ) {
					// The remembered record is the first one of the chunk: it is
					// registered in the notification table instead.
					SAM_pairer_register_matcher(pairer, thread_context -> chunk_number, thread_context -> readno_in_chunk - 1, thread_context -> immediate_last_read_full_name,  thread_context -> immediate_last_read_bin,  thread_context -> immediate_last_read_bin_len , thread_context -> immediate_last_read_flags );
				}

				// Pair the remembered record with a stored orphant, or store it.
				SAM_pairer_do_read_test(pairer , thread_context , thread_context -> immediate_last_read_name_len , thread_context -> immediate_last_read_full_name , thread_context -> immediate_last_read_bin_len , thread_context -> immediate_last_read_bin, thread_context -> immediate_last_read_flags);
			}

			// Remember the current record for the comparison in the next call.
			thread_context -> immediate_last_read_bin_len = bin_len;
			thread_context -> immediate_last_read_name_len = name_len;
			thread_context -> immediate_last_read_flags = this_flags;
			strcpy(thread_context -> immediate_last_read_full_name, read_full_name);
			memcpy(thread_context -> immediate_last_read_bin, bin, bin_len);
		}
	}else{ // else just write.
		if(pairer -> output_function)
			pairer -> output_function(pairer, thread_context -> thread_id, (char*) bin, NULL);
	}
	thread_context -> readno_in_chunk ++;
}
3908 
/*
 * Fetches the next record of a thread and processes it.
 *
 * Returns 0 after a record has been processed. Returns 1 when no record was
 * obtained or the pairer is marked as having a bad format; in that case
 * BAM_header_parsed is set to 1.
 */
int SAM_pairer_do_next_read( SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context ){
	unsigned char * bin = NULL;
	int bin_len = 0;
	int has_next_read = SAM_pairer_get_next_read_BIN(pairer, thread_context, &bin, &bin_len);

	if(!has_next_read || pairer -> is_bad_format){
		pairer -> BAM_header_parsed = 1;
		return 1;
	}

	SAM_pairer_do_one_BIN( pairer, thread_context,(char *)bin, bin_len );
	return 0;
}
3921 
3922 
// All orphants are written into files, each about the size of the buffer.
// When the stored orphants grow larger than buffer_size, they are sorted and saved to disk.
3925 
/*
 * Swap callback for sorting: `arr` points to two parallel pointer lists
 * (arr[0] and arr[1]); entries l and r are exchanged in both of them.
 */
void SAM_pairer_sort_exchange(void * arr, int l, int r){
	unsigned char *** columns = (unsigned char ***) arr;
	int col;

	for(col = 0; col < 2; col++){
		unsigned char ** list = columns[col];
		unsigned char * held = list[r];

		list[r] = list[l];
		list[l] = held;
	}
}
3938 
/*
 * Compare callback for sorting: orders entries l and r by the strings held
 * in the first list of `arr` (arr[0]); the result is that of strcmp().
 */
int SAM_pairer_sort_compare(void * arr, int l, int r){
	char ** names = ((char ***) arr)[0];
	char * left_name = names[l];
	char * right_name = names[r];

	return strcmp(left_name, right_name);
}
3943 
/*
 * Merge callback for sorting: combines two adjacent runs of the parallel
 * lists in `arr` -- entries [start, start+items) and the following items2
 * entries -- into one run ordered by SAM_pairer_sort_compare(). When two
 * entries compare equal, the one from the first run is taken first.
 */
void SAM_pairer_sort_merge( void * arr, int start, int items, int items2 ){
	unsigned char *** sort_data = (unsigned char ***) arr;
	int total = items + items2;
	int left = start;
	int left_end = items + start;
	int right = items + start;
	int right_end = items + items2 + start;
	int filled = 0;

	unsigned char ** merged_names = malloc(sizeof(char *) * total);
	unsigned char ** merged_bins = malloc(sizeof(char *) * total);

	while(left != left_end || right != right_end){
		int take_left;

		if(right == right_end)
			take_left = 1;	// second run exhausted
		else if(left == left_end)
			take_left = 0;	// first run exhausted
		else
			take_left = SAM_pairer_sort_compare(arr, left, right) <= 0;

		if(take_left){
			merged_names[filled] = sort_data[0][left];
			merged_bins[filled] = sort_data[1][left];
			left ++;
		}else{
			merged_names[filled] = sort_data[0][right];
			merged_bins[filled] = sort_data[1][right];
			right ++;
		}
		filled ++;
	}
	assert(filled == items + items2);

	// Copy the merged order back over the two runs.
	memcpy( sort_data[0] + start, merged_names, sizeof(char *) * total );
	memcpy( sort_data[1] + start, merged_bins, sizeof(char *) * total );
	free(merged_names);
	free(merged_bins);
}
3972 
/*
 * Hashes a NUL-terminated string to a value below 39846617. The value is
 * used in this file to decide which thread handles a read name.
 */
unsigned int SAM_pairer_osr_hash(char * st){
	unsigned int primary = 0, secondary = 0;
	char * cursor;

	for(cursor = st; *cursor != 0; cursor++){
		int nch = *cursor;

		primary = (primary << 2) ^ nch;
		// derived from the value just updated, not from its own previous value
		secondary = (primary << 3) ^ nch;
	}

	return (primary ^ secondary) % 39846617;
}
3982 
/*
 * Positions `fp` on the next entry of an orphant file that belongs to the
 * given thread and copies the name of that entry into `name`.
 *
 * Entries are laid out as: 2-byte name length, name, 4-byte record length,
 * then the record (4 bytes more than the stored length). An entry belongs to
 * the thread when all_threads is negative or when the hash of its name
 * modulo all_threads equals thread_no; other entries are skipped.
 *
 * Returns 1 with `fp` rewound to the start of the matching entry, 0 when the
 * end of the file (or an incomplete / empty name) is reached, and -1 when the
 * length of an entry being skipped cannot be read.
 */
int SAM_pairer_osr_next_name(FILE * fp , char * name, int thread_no, int all_threads){
	for(;;){
		int name_len = 0;
		int record_len = 0;
		int got;

		if(feof(fp)) return 0;

		got = fread(&name_len, 1, 2, fp);
		if(got < 2 || name_len < 1) return 0;
		assert(name_len < 1024);

		got = fread(name, 1, name_len, fp);
		if(got != name_len) return 0;
		name[name_len]=0;

		if(all_threads < 0 || SAM_pairer_osr_hash(name)% all_threads == thread_no  )
		{
			// Step back so that the caller reads the whole entry again.
			fseeko(fp, -2-name_len, SEEK_CUR);
			return 1;
		}

		// Entry of another thread: jump over its record.
		got = fread(&record_len, 1, 4, fp);
		if(got!=4) return -1;
		record_len +=4;
		fseeko(fp, record_len, SEEK_CUR);
	}
	return 0;
}
4007 
/*
 * Reads the record of the orphant-file entry at the current position of
 * `fp` into `bin`.
 *
 * The entry is: 2-byte name length, name (skipped here), 4-byte record
 * length, then the record itself, which is 4 bytes longer than the stored
 * length. On any short read, or when the stored length is negative, the
 * first four bytes of `bin` are set to zero and nothing else is promised
 * about its contents.
 *
 * NOTE(review): the size of `bin` is not known here; the caller has to pass
 * a buffer that can hold the largest record stored in the file.
 */
void SAM_pairer_osr_next_bin(FILE * fp, char * bin){
	int rlen =0;

	// Start from the "failed" state; a complete read overwrites it.
	memset(bin, 0, 4);

	if(fread(&rlen, 1, 2, fp) < 2) return;
	assert(rlen < 1024);
	fseeko(fp, rlen, SEEK_CUR);

	rlen =0;
	if(fread(&rlen, 1, 4, fp) < 4) return;

	// A negative length can only come from a damaged file. Used as a byte
	// count it would turn into an enormous read request.
	if(rlen < 0) return;

	rlen +=4;
	if(fread(bin, 1, rlen, fp) < (size_t)rlen) memset(bin, 0, 4);
}
4022 
/*
 * Tells whether two chunk-info strings describe neighbouring chunks.
 *
 * Each string starts with a chunk number (parsed with atoi) and carries a
 * flag character at index 11. The result is 1 only when exactly one of the
 * two strings has the flag '0' and the chunk number of that string is one
 * greater than the chunk number of the other string. A NULL argument gives 0.
 */
int SAM_pairer_is_matched_chunks(char * c1, char * c2){
	if(!c1 || !c2) return 0;

	unsigned int chunk_1 = (unsigned int) atoi(c1);
	unsigned int chunk_2 = (unsigned int) atoi(c2);
	int c1_flag_is_0 = (c1[11]=='0');
	int c2_flag_is_0 = (c2[11]=='0');

	// Exactly one of the two must carry the '0' flag.
	if(c1_flag_is_0 == c2_flag_is_0) return 0;

	if(c1_flag_is_0) return chunk_2 + 1 == chunk_1;
	return chunk_1 + 1 == chunk_2;
}
4035 
4036 
4037 
4038 
4039 
4040 
/*
 * Merges `fps_no` open orphant files into a single file named `fname`.
 *
 * The current entry name of every input file is tracked and, in each round,
 * the smallest name is selected. If another input file is currently at the
 * same name, the two records are passed to pairer->output_function as a
 * pair; otherwise the entry is copied (2-byte name length, name, 4-byte
 * record length, record) to "<tmp_file_prefix>-MERGE-TMP.tmp". At the end
 * `fname` is removed and the temporary file is renamed to `fname`.
 * NOTE(review): selecting the smallest current name only pairs everything if
 * each input file is sorted by name -- confirm against the code that writes
 * these files.
 *
 * Returns 0 on success and non-zero when the merged file could not be
 * written completely (the value the caller treats as "disk full"); this
 * includes not being able to create the temporary file or to allocate the
 * working buffers, in which case `fname` is left untouched.
 */
int merge_level_fps(SAM_pairer_context_t * pairer, char * fname, FILE ** fps, int fps_no){
	char * bin_tmp1 , * bin_tmp2;
	int max_name_len = MAX_READ_NAME_LEN*2 +80, x1, is_disk_full = 0;

	char tmp_fname[MAX_FILE_NAME_LENGTH+30];
	sprintf(tmp_fname, "%s-MERGE-TMP.tmp", pairer->tmp_file_prefix);

	char * names = malloc(  fps_no  * max_name_len );

	bin_tmp1 = malloc(FC_LONG_READ_RECORD_HARDLIMIT);
	bin_tmp2 = malloc(FC_LONG_READ_RECORD_HARDLIMIT);
	FILE * out_fp = fopen(tmp_fname, "wb");

	// Without the output file or the buffers nothing can be merged; report
	// it through the return value instead of crashing further down.
	if(NULL == out_fp || NULL == bin_tmp1 || NULL == bin_tmp2 || (fps_no > 0 && NULL == names)){
		if(out_fp){
			fclose(out_fp);
			unlink(tmp_fname);
		}
		free(names);
		free(bin_tmp1);
		free(bin_tmp2);
		return 1;
	}


	// initialize the "current_first_name" for each orphan file

	for(x1 = 0 ; x1 < fps_no; x1++)
	{
		int has = SAM_pairer_osr_next_name( fps[x1] , names + max_name_len*x1 , -1 , -1);
		if(!has) *(names + max_name_len*x1)=0;
	}


	while(1){
		int min_name_fileno = -1;
		int min2_name_fileno = -1;

		// find the min_name in all FPs
		// and find the same min_name if there is any

		for(x1 = 0 ; x1 < fps_no; x1++){
			int has = *(names + max_name_len*x1);	// an empty name marks an exhausted file
			if(has){
				int strcv_12 = 1;
				if(min_name_fileno >=0) strcv_12 = strcmp(names+(min_name_fileno * max_name_len), names+(x1 * max_name_len));
				if(strcv_12 > 0){
					min_name_fileno = x1;
					min2_name_fileno = -1;
				}else if( strcv_12 == 0){
					min2_name_fileno = x1;
				}
			}

		}


		if(min_name_fileno >= 0 && !is_disk_full){
			SAM_pairer_osr_next_bin( fps[ min_name_fileno ] , bin_tmp1);

			if(min2_name_fileno>=0){
				// Two files are at the same name: the records form a pair.
				SAM_pairer_osr_next_bin( fps[ min2_name_fileno ] , bin_tmp2);
				pairer -> output_function(pairer, 0, (char*) bin_tmp1, (char*)bin_tmp2);

				// Disabled on purpose by the leading "0 &&".
				if(0 && 0 == pairer -> is_unsorted_notified){
					char * name_tmp_1 = malloc(strlen(names+(min_name_fileno * max_name_len))+5), *name_tmp_2 = malloc(strlen(names+(min_name_fileno * max_name_len))+5);
					char * min1_chunk_info, * min2_chunk_info;
					sprintf(name_tmp_1, "C:%s:%d", names+(min_name_fileno * max_name_len), 0);
					sprintf(name_tmp_2, "C:%s:%d", names+(min2_name_fileno * max_name_len), 1);
					min1_chunk_info = HashTableGet( pairer -> unsorted_notification_table , name_tmp_1);
					min2_chunk_info = HashTableGet( pairer -> unsorted_notification_table , name_tmp_2);
					if(min1_chunk_info == NULL || min2_chunk_info == NULL || !SAM_pairer_is_matched_chunks(min1_chunk_info, min2_chunk_info)){
						sprintf(name_tmp_1, "B:%s:%d", names+(min_name_fileno * max_name_len), 0);
						if( pairer -> unsorted_notification ){
							//SUBREADprintf("FINAL STEP\n");
							//SUBREADprintf("UNSORT2\n");
							pairer -> unsorted_notification(pairer ,  HashTableGet( pairer -> unsorted_notification_table , name_tmp_1), NULL);
						}
						pairer -> is_unsorted_notified = 1;
					}
					free(name_tmp_1);
					free(name_tmp_2);
				}

				int read_has = SAM_pairer_osr_next_name( fps[min2_name_fileno],  names + max_name_len*min2_name_fileno, -1, -1);
				if(!read_has) *(names + max_name_len*min2_name_fileno)=0;
			}else{
				// Still unpaired: carry the entry over into the merged file.
				unsigned short wlen;
				unsigned int rbinlen = 0;
				wlen = strlen( names+(min_name_fileno * max_name_len) );
				fwrite( &wlen, 2, 1,out_fp );
				fwrite( names+(min_name_fileno * max_name_len), 1, wlen, out_fp );
				memcpy( &rbinlen, bin_tmp1 , 4);
				rbinlen += 4;
				// The record length is stored once as the entry's length field
				// and once more as the first four bytes of the record itself.
				fwrite( bin_tmp1, 4, 1, out_fp );
				int write_len = fwrite( bin_tmp1, 1, rbinlen, out_fp );
				if(write_len < rbinlen)is_disk_full = 1;
			}
			int read_has = SAM_pairer_osr_next_name( fps[min_name_fileno],  names + max_name_len*min_name_fileno, -1, -1);
			if(!read_has) *(names + max_name_len*min_name_fileno)=0;
		} else break;
	}

	// Buffered data is written out on close, so a failure here also means
	// that the merged file is incomplete.
	if(fclose(out_fp) != 0) is_disk_full = 1;
	unlink(fname);
	rename(tmp_fname, fname);
	free(names);
	free(bin_tmp1);
	free(bin_tmp2);
	return is_disk_full;
}
// Length of one waiting tick of the pairer; presumably microseconds (a usleep argument) -- TODO confirm at the places where it is used.
#define PAIRER_WAIT_TICK_TIME 10000
4141 
/*
 * Returns the pairer's max_file_open_number, i.e. the value stored by
 * SAM_pairer_set_merge_max_fp().
 */
int SAM_pairer_get_merge_max_fp(SAM_pairer_context_t * pairer){
	int fp_limit = pairer -> max_file_open_number;

	return fp_limit;
}
4146 
/*
 * Stores `fon` as the pairer's max_file_open_number.
 */
void SAM_pairer_set_merge_max_fp(SAM_pairer_context_t * pairer, int fon){
	(*pairer).max_file_open_number = fon;
}
4150 
4151 
/*
 * Finds out how many files can be kept open at once and, if that is not
 * enough for the later processing of the orphant files, merges the
 * per-thread orphant files ("<prefix>-TH<thread>-BK<block>.tmp") into the
 * single file "<prefix>-LEVELMERGE.tmp".
 *
 * Steps:
 *   1. Count the existing orphant files of every thread.
 *   2. Open up to five scratch files plus all orphant files simultaneously;
 *      the number that could be opened, minus five, is stored with
 *      SAM_pairer_set_merge_max_fp().
 *   3. If that limit is below (number of orphant files * number of threads),
 *      merge the orphant files group by group with merge_level_fps() and
 *      set merge_level_finished.
 *
 * Returns non-zero when merge_level_fps() reported a write failure.
 */
int SAM_pairer_probe_maxfp( SAM_pairer_context_t * pairer){
	int orphant_fp_no=0, is_disk_full = 0;
	int thno, bkno, x1;
	int thread_fps [ pairer -> total_threads ];	// per thread: highest block number found in step 1
	char tmp_fname[MAX_FILE_NAME_LENGTH+50];

	memset(thread_fps, 0, sizeof(int) * pairer -> total_threads);
	// Step 1: block files are numbered consecutively, so the first name that
	// cannot be opened ends the list of a thread.
	for( thno = 0 ; thno < pairer -> total_threads ; thno ++ ){
		for( bkno = 0 ; ; bkno++){
			sprintf(tmp_fname, "%s-TH%02d-BK%06d.tmp", pairer->tmp_file_prefix,  thno, bkno);
			FILE * in_fp = fopen(tmp_fname, "rb");
			if(NULL == in_fp) break;
			thread_fps[thno] = bkno;
			fclose(in_fp);
			orphant_fp_no ++;
		}
	}

	int max_open_fps = 0, has_limit = 0;
	int orphant_fp_size = 50;
	FILE ** orphant_fps = malloc(sizeof(FILE *) * orphant_fp_size);

	// Step 2a: five scratch files are opened first; their count is subtracted
	// again when the limit is stored below.
	// NOTE(review): the "-FTEST-" files are closed but not removed in this
	// function -- confirm that they are cleaned up elsewhere.
	for( bkno = 0 ; bkno < 5; bkno++){
		sprintf(tmp_fname, "%s-FTEST-%d.tmp", pairer->tmp_file_prefix, bkno);
		FILE * tfp = fopen(tmp_fname, "w");
		if(NULL == tfp){
			has_limit = 1;
			break;
		}
		orphant_fps[max_open_fps++] = tfp;
	}
	//#warning ">>>>>>> COMMENT NEXT LINE <<<<<<<<"
	// Step 2b: open the orphant files on top, keeping every handle open.
	for( thno = 0 ; thno < pairer -> total_threads ; thno ++ ){
		if(has_limit) break;
		for( bkno = 0 ; ; bkno++){
			sprintf(tmp_fname, "%s-TH%02d-BK%06d.tmp", pairer->tmp_file_prefix,  thno, bkno);
			FILE * in_fp = fopen(tmp_fname, "rb");
			if(NULL == in_fp){
				// A file that could be opened in step 1 cannot be opened now:
				// the limit of simultaneously open files has been reached.
				if( bkno <= thread_fps[thno] ) has_limit = 1;
				break;
			}
			orphant_fps[max_open_fps++] = in_fp;
			if(max_open_fps >= orphant_fp_size - 1){
				orphant_fp_size *= 2;
				orphant_fps = realloc(orphant_fps, orphant_fp_size * sizeof(FILE *));
			}
		}
	}

	for( bkno = 0 ;bkno < max_open_fps; bkno ++) fclose(orphant_fps[bkno]);

	SAM_pairer_set_merge_max_fp(pairer, max_open_fps - 5);

	//#warning ">>>>>>> COMMENT NEXT LINE <<<<<<<<"
	//SUBREADprintf("Needed FPS = %d, Ulimit FPS = %d, Has_Limit = %d  \n", orphant_fp_no, max_open_fps, has_limit);

	// Step 3: level merge, needed only when the limit is too low.
	if( SAM_pairer_get_merge_max_fp(pairer) < orphant_fp_no * pairer -> total_threads){
		int processed_orphant = 0;
		int current_opened_fp_no = 0 ;
		// NOTE(review): the array length is the stored limit (max_open_fps - 5);
		// it is not checked here to be positive.
		FILE * level_merge_fps [ SAM_pairer_get_merge_max_fp(pairer) ];
		for( thno = 0 ; thno < pairer -> total_threads ; thno ++ ){
			for( bkno = 0 ; ; bkno++){
				char tmp_fname[MAX_FILE_NAME_LENGTH+50];
				sprintf(tmp_fname, "%s-TH%02d-BK%06d.tmp", pairer->tmp_file_prefix,  thno, bkno);

				FILE * in_fp = fopen(tmp_fname, "rb");
				if(NULL == in_fp) break;

	//			#warning ">>>> COMMENT DEBUG OUTPUT <<<<"
	//			SUBREADprintf("Adding temp file:%s\n", tmp_fname);
				level_merge_fps[current_opened_fp_no ++] = in_fp;
				processed_orphant ++;
				// Merge as soon as the group is full or the last file was added.
				if(current_opened_fp_no >= SAM_pairer_get_merge_max_fp(pairer) || processed_orphant == orphant_fp_no){
					sprintf(tmp_fname, "%s-LEVELMERGE.tmp", pairer->tmp_file_prefix);

	//				#warning ">>>> COMMENT DEBUG OUTPUT <<<<"
	//				SUBREADprintf("Merging temp files\n");
					is_disk_full |= merge_level_fps(pairer , tmp_fname, level_merge_fps, current_opened_fp_no);
					for(x1 = 0; x1 < current_opened_fp_no; x1++) fclose(level_merge_fps[x1]);

					// The merged file becomes the first input of the next group.
					if(processed_orphant < orphant_fp_no){
						level_merge_fps[0] = fopen(tmp_fname, "rb");
						current_opened_fp_no = 1;
					}
					// NOTE(review): this leaves only the inner loop; the loop
					// over the threads goes on -- confirm that this is intended.
					if(is_disk_full) break;
				}
			}
		}
		pairer -> merge_level_finished = 1;
	}
	free(orphant_fps);
	return is_disk_full;
}
4245 
SAM_pairer_rescure_orphants_max_FP(void * params)4246 void * SAM_pairer_rescure_orphants_max_FP(void * params){
4247 	void ** param_ptr = (void **) params;
4248 	SAM_pairer_context_t * pairer = param_ptr[0];
4249 	int thread_no = (int)(param_ptr[1]-NULL);
4250 	free(params);
4251 
4252 	srInt_64 died=0;
4253 	int orphant_fp_no=0;
4254 	int thno, bkno, x1;
4255 	char tmp_fname[MAX_FILE_NAME_LENGTH+60];
4256 
4257 	int max_name_len = MAX_READ_NAME_LEN*2 +80, orphant_fp_size = 50;
4258 	FILE ** orphant_fps = malloc(sizeof(FILE *) * orphant_fp_size);
4259 
4260 	if(0 == thread_no && pairer -> display_progress)
4261 		SUBREADprintf("Finished scanning the input file. Processing unpaired reads.\n");
4262 
4263 	//SUBREADprintf("merged = %d\n", pairer -> merge_level_finished);
4264 	if(pairer -> merge_level_finished){
4265 		sprintf(tmp_fname, "%s-LEVELMERGE.tmp", pairer->tmp_file_prefix);
4266 		FILE * in_fp = fopen(tmp_fname, "rb");
4267 		orphant_fps[0] = in_fp;
4268 		orphant_fp_no=1;
4269 	}else{
4270 		orphant_fp_no = 0;
4271 		for( thno = 0 ; thno < pairer -> total_threads ; thno ++ ){
4272 			for( bkno = 0 ; ; bkno++){
4273 				sprintf(tmp_fname, "%s-TH%02d-BK%06d.tmp", pairer->tmp_file_prefix,  thno, bkno);
4274 
4275 				FILE * in_fp = fopen(tmp_fname, "rb");
4276 				if(NULL == in_fp) break;
4277 				if(orphant_fp_no >= orphant_fp_size){
4278 					orphant_fp_size *= 1.5;
4279 					orphant_fps = realloc(orphant_fps, orphant_fp_size * sizeof(FILE *));
4280 				}
4281 				orphant_fps[orphant_fp_no++]=in_fp;
4282 			}
4283 		}
4284 	}
4285 
4286 	char * names = malloc( orphant_fp_no * max_name_len );
4287 	memset(names, 0, orphant_fp_no * max_name_len );
4288 	char * bin_tmp1 , * bin_tmp2;
4289 	bin_tmp1 = malloc(66000);
4290 	bin_tmp2 = malloc(66000);
4291 
4292 
4293 	for(x1 = 0 ; x1 < orphant_fp_no; x1++)
4294 	{
4295 		int has = SAM_pairer_osr_next_name( orphant_fps[x1] , names + max_name_len*x1 , thread_no , pairer-> total_threads);
4296 		if(!has) *(names + max_name_len*x1)=0;
4297 	}
4298 
4299 
4300 	while(1){
4301 		int min_name_fileno = -1;
4302 		int min2_name_fileno = -1;
4303 
4304 		for(x1 = 0 ; x1 < orphant_fp_no; x1++){
4305 			int has = *(names + max_name_len*x1);
4306 			if(has){
4307 				int strcv_12 = 1;
4308 				if(min_name_fileno >=0) strcv_12 = strcmp(names+(min_name_fileno * max_name_len), names+(x1 * max_name_len));
4309 				if(strcv_12 > 0){
4310 					min_name_fileno = x1;
4311 					min2_name_fileno = -1;
4312 				}else if( strcv_12 == 0){
4313 					min2_name_fileno = x1;
4314 				}
4315 			}
4316 
4317 		}
4318 
4319 		if(min_name_fileno >= 0){
4320 			SAM_pairer_osr_next_bin( orphant_fps[ min_name_fileno ] , bin_tmp1);
4321 
4322 			if( min2_name_fileno >=0){
4323 				SAM_pairer_osr_next_bin( orphant_fps[ min2_name_fileno ] , bin_tmp2);
4324 				pairer -> output_function(pairer, thread_no, (char*) bin_tmp1, (char*)bin_tmp2);
4325 
4326 				if(0 && 0 == pairer -> is_unsorted_notified){
4327 					char *name_tmp_1 = malloc(strlen(names+(min_name_fileno * max_name_len))+5), *name_tmp_2 = malloc(strlen(names+(min_name_fileno * max_name_len))+5);
4328 					char * min1_chunk_info, * min2_chunk_info;
4329 					sprintf(name_tmp_1, "C:%s:%d", names+(min_name_fileno * max_name_len), 0);
4330 					sprintf(name_tmp_2, "C:%s:%d", names+(min2_name_fileno * max_name_len), 1);
4331 					min1_chunk_info = HashTableGet( pairer -> unsorted_notification_table , name_tmp_1);
4332 					min2_chunk_info = HashTableGet( pairer -> unsorted_notification_table , name_tmp_2);
4333 					//#warning ">>>>>>> COMMENT NEXT LINE <<<<<<<<"
4334 					//SUBREADprintf("RESCURE MATCHER:  %s , %s ==  %s , %s, %s\n", name_tmp_1, name_tmp_2, min1_chunk_info, min2_chunk_info,
4335 					//	SAM_pairer_is_matched_chunks(min1_chunk_info, min2_chunk_info)?"MATCH":"XXXXX");
4336 
4337 					if(min1_chunk_info == NULL || min2_chunk_info == NULL || !SAM_pairer_is_matched_chunks(min1_chunk_info, min2_chunk_info)){
4338 						sprintf(name_tmp_1, "B:%s:%d", names+(min_name_fileno * max_name_len), 0);
4339 						if( pairer -> unsorted_notification ){
4340 							SUBREADprintf("UNSORT3\n");
4341 							//SUBREADprintf("FINAL STEP\n");
4342 							pairer -> unsorted_notification(pairer ,  HashTableGet( pairer -> unsorted_notification_table , name_tmp_1), NULL);
4343 						}
4344 						pairer -> is_unsorted_notified = 1;
4345 					}
4346 				}
4347 
4348 				int read_has = SAM_pairer_osr_next_name( orphant_fps[min2_name_fileno],  names + max_name_len*min2_name_fileno, thread_no,  pairer-> total_threads);
4349 				if(!read_has) *(names + max_name_len*min2_name_fileno)=0;
4350 			}else{
4351 				//#warning ">>>>>>> COMMENT NEXT LINE <<<<<<<<"
4352 				//SUBREADprintf("FINAL_ORPHAN:%s\n" , names + max_name_len*min_name_fileno);
4353 				pairer -> output_function(pairer, thread_no, (char*) bin_tmp1, NULL);
4354 				died++;
4355 			}
4356 
4357 			int read_has = SAM_pairer_osr_next_name( orphant_fps[min_name_fileno],  names + max_name_len*min_name_fileno, thread_no, pairer-> total_threads);
4358 			//#warning ">>>>>>> COMMENT NEXT BLOCK <<<<<<<<"
4359 			if(0){
4360 					if(!read_has) SUBREADprintf("FP %d FINISHED\n", min_name_fileno);
4361 				}
4362 			if(!read_has) *(names + max_name_len*min_name_fileno)=0;
4363 		} else break;
4364 	}
4365 	free(names);
4366 
4367 	//#warning ">>>>>>> COMMENT NEXT LINE <<<<<<<<"
4368 	//SUBREADprintf("finished_fps= %d\n", orphant_fp_no);
4369 
4370 	for(x1 = 0 ; x1 < orphant_fp_no; x1++)
4371 	{
4372 		fclose ( orphant_fps[x1] );
4373 	}
4374 	free( bin_tmp1 );
4375 	free( bin_tmp2 );
4376 	free(orphant_fps);
4377 	pairer -> total_orphan_reads += died;
4378 	return NULL;
4379 }
4380 
4381 
/*
 * Move every entry of one thread's orphan table into a new temporary file
 * named "<tmp_file_prefix>-TH<thread>-BK<block>.tmp", sorted with merge_sort()
 * using the SAM_pairer_sort_* callbacks.
 *
 * Each record is written as:
 *   2 bytes  : the first two bytes of the int holding the key length
 *   n bytes  : the key (read name), without a terminator
 *   4 bytes  : the length stored in the first 4 bytes of the value
 *   len+4    : the value itself, including its 4-byte length prefix
 *
 * The table is always emptied and orphant_space reset, even when the file
 * cannot be opened or written, so the table stays consistent on failure.
 *
 * Returns 0 on success and non-zero when the data could not be saved.
 */
int SAM_pairer_update_orphant_table(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context){
	unsigned int x2 = 0;
	unsigned char ** name_list, ** bin_list;
	name_list = malloc(sizeof(char*) * thread_context->orphant_table->numOfElements);
	bin_list  = malloc(sizeof(char*) * thread_context->orphant_table->numOfElements);
	if(thread_context->orphant_table->numOfElements > 0 && (NULL == name_list || NULL == bin_list)){
		free(name_list);
		free(bin_list);
		SUBREADprintf("ERROR: out of memory when saving the unpaired reads.\n");
		return 1;
	}

	// Collect the key/value pointers of every entry; the table still owns them.
	int x1, is_error = 0;
	for(x1 = 0; x1 < thread_context->orphant_table->numOfBuckets; x1 ++){
		KeyValuePair *pair = thread_context->orphant_table->bucketArray[x1];
		while (pair != NULL) {
			KeyValuePair *nextPair = pair->next;
			name_list [x2] = (unsigned char *)pair -> key;
			bin_list [x2] = pair -> value;
			x2++;
			pair = nextPair;
		}
	}

	assert(x2 == thread_context->orphant_table->numOfElements);
	unsigned char ** sort_data[2];
	sort_data[0]=name_list;
	sort_data[1]=bin_list;
	merge_sort(sort_data, thread_context->orphant_table->numOfElements, SAM_pairer_sort_compare, SAM_pairer_sort_exchange, SAM_pairer_sort_merge);

	char tmp_fname[MAX_FILE_NAME_LENGTH+40];
	snprintf(tmp_fname, sizeof tmp_fname, "%s-TH%02d-BK%06d.tmp", pairer->tmp_file_prefix, thread_context -> thread_id, thread_context -> orphant_block_no++);
	FILE * tmp_fp = fopen(tmp_fname, "wb");
	if(NULL == tmp_fp) is_error = 1;

	unsigned int rec_i;
	for(rec_i = 0; rec_i < x2; rec_i ++){
		// Writing stops at the first failure, but every entry is still removed below.
		if(tmp_fp && !is_error){
			unsigned int bin_len;

			memcpy(&bin_len, bin_list[rec_i] , 4);
			int namelen = strlen((char *)name_list[rec_i]);
			size_t bin_bytes = (size_t)bin_len + 4;

			is_error |= (fwrite(&namelen, 2, 1, tmp_fp) < 1);
			is_error |= (fwrite(name_list[rec_i], 1, namelen, tmp_fp) < (size_t)namelen);
			is_error |= (fwrite(&bin_len, 4, 1, tmp_fp) < 1);
			is_error |= (fwrite(bin_list[rec_i], 1, bin_bytes, tmp_fp) < bin_bytes);
		}

		HashTableRemove(thread_context->orphant_table , name_list[rec_i]);
	}

	// fclose() flushes the stdio buffer; a full disk may only be reported here.
	if(tmp_fp && fclose(tmp_fp) != 0) is_error = 1;

	assert(thread_context -> orphant_table-> numOfElements == 0);
	free(name_list);
	free(bin_list);
	thread_context -> orphant_space = 0;
	if(is_error) SUBREADprintf("ERROR: unable to write into the temporary file. Please check the disk space in the output directory.\n");
	return is_error;
}
4437 
4438 
is_read_bin_ONE(char * bin,int bin_len,int max_refID,int * block_len)4439 int is_read_bin_ONE(char * bin, int bin_len, int max_refID, int * block_len){
4440 	memcpy(block_len, bin, 4);
4441 	if((*block_len) > MAX_BIN_RECORD_LENGTH - 4 || (*block_len) < 32) return -1;
4442 	if((*block_len) > bin_len - 4) return -2;
4443 	int refID, mate_refID;
4444 	memcpy(&refID, bin + 4, 4);
4445 	memcpy(&mate_refID, bin + 24, 4);
4446 	if(refID != -1 && (refID< 0 || refID >=max_refID)) return -3;
4447 	if(mate_refID != -1 && (mate_refID< 0 || mate_refID >=max_refID)) return -4;
4448 	int l_seq;
4449 	memcpy(&l_seq, bin + 20, 4);
4450 	if(l_seq > bin_len*2 || l_seq > MAX_BIN_RECORD_LENGTH || l_seq  < 0) return -5;
4451 
4452 	int min_mq_nl;
4453 	memcpy(&min_mq_nl, bin + 12, 4);
4454 	int name_len = min_mq_nl & 0xff;
4455 	if(name_len < 1) return -20;
4456 	int flag_nc;
4457 	memcpy(&flag_nc, bin + 16, 4);
4458 	int cigar_opts = flag_nc & 0xffff;
4459 //	int flag = flag_nc >> 16;
4460 	if(cigar_opts > 100) return -6;
4461 
4462 	int rname_cursor = 36;
4463 	if(bin[rname_cursor] == '@') return -7;
4464 	for(; rname_cursor< 36 + name_len - 1; rname_cursor ++){
4465 		int nch = bin[rname_cursor];
4466 		if(nch < 0x20 || nch >=0x7f) return -9;
4467 		if(nch == '\t') return -8;
4468 	}
4469 
4470 	if(bin[rname_cursor]!=0)return -10;
4471 
4472 	if((*block_len) <  32 + name_len + 4*cigar_opts + l_seq + (l_seq+1)/2) return -11;
4473 
4474 	int cigar_i;
4475 	for(cigar_i = 0; cigar_i < cigar_opts ; cigar_i++){
4476 		int cigar_v;
4477 		memcpy(&cigar_v , bin + 36 + name_len + 4*cigar_i, 4);
4478 		int cigar_op = cigar_v & 0xf;
4479 		int cigar_value = cigar_v & 0xfffffff;
4480 		if(cigar_op > 8) return -12;
4481 
4482 		if((cigar_op == 0 || cigar_op == 1 || cigar_op > 6) && (cigar_value < 1 || cigar_value > MAX_BIN_RECORD_LENGTH)){
4483 
4484 			//#warning ">>>>>> COMMENT NEXT LINE IN RELEASE <<<<<<"
4485 			if(0){
4486 				char * rname = bin + 36;
4487 				SUBREADprintf("OP=%d, VAL=%d [%s]\n", cigar_op, cigar_value, rname);
4488 			}
4489 
4490 			return -13;
4491 		}
4492 	}
4493 
4494 	int ext_cursor = 36 + name_len + 4*cigar_opts + l_seq + (l_seq+1)/2;
4495 	if(ext_cursor < (*block_len) + 4){
4496 		if(ext_cursor > (*block_len) + 4 - 4) return -17;
4497 		if((!isalpha(bin[ext_cursor])) || bin[ext_cursor+1]>122 || bin[ext_cursor+1]<48 ||!isalpha(bin[ext_cursor+2])){
4498 	//		SUBREADprintf("TAGERR: %c%c%c\n", bin[ext_cursor], bin[ext_cursor+1], bin[ext_cursor+2]);
4499 			return -16;
4500 		}
4501 	}
4502 	return 1;
4503 }
4504 
// Maximum number of consecutive records examined by is_read_bin().
#define TESTING_READS_FOR_START 3
// Counter used only by the disabled debug print in is_read_bin().
int tchecks=0;

/*
 * Check that `bin` begins with a run of well-formed BAM records.
 *
 * Up to TESTING_READS_FOR_START back-to-back records are validated with
 * is_read_bin_ONE(). The test also succeeds early when the validated records
 * end exactly at bin_len.
 *
 * Returns 1 on success, or the negative code of the first record that fails.
 */
int is_read_bin(char * bin, int bin_len, int max_refID){
	int offset = 0;
	int n_checked = 0;

	while(n_checked < TESTING_READS_FOR_START){
		int record_len = 0;
		int verdict = is_read_bin_ONE(bin + offset, bin_len - offset, max_refID, &record_len);

		if(0) SUBREADprintf("CHECK_START # %d: RET=%d\n", ++tchecks, verdict);

		if(verdict != 1)
			return verdict;

		// Step over the 4-byte length field plus the record body.
		offset += record_len + 4;
		if(offset == bin_len)
			break;
		n_checked++;
	}
	return 1;
}
4524 
4525 int tfinds = 0;
4526 
SAM_pairer_find_start(SAM_pairer_context_t * pairer,SAM_pairer_thread_t * thread_context)4527 int SAM_pairer_find_start(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thread_context ){
4528 	thread_context -> need_find_start = 0;
4529 	int start_pos = 0;
4530 	for(start_pos = 0; start_pos < min(MAX_BIN_RECORD_LENGTH, thread_context -> input_buff_BIN_used); start_pos++){
4531 		if(1==is_read_bin((char *)thread_context -> input_buff_BIN + start_pos, thread_context -> input_buff_BIN_used - start_pos , pairer -> BAM_n_ref)){
4532 			//if(1) SUBREADprintf("STFIND # %d : start = %d\n", ++tfinds, start_pos);
4533 			if(start_pos>0){
4534 				char * margin_key = malloc(22);
4535 				char * margin_data = malloc(start_pos+4);
4536 				memcpy(margin_data, &start_pos, 4);
4537 				memcpy(margin_data+4,  thread_context -> input_buff_BIN, start_pos);
4538 				#ifdef __MINGW32__
4539 				sprintf(margin_key,"S%lu", (unsigned long) thread_context -> input_buff_SBAM_file_start);
4540 				#else
4541 				sprintf(margin_key,"S%llu", thread_context -> input_buff_SBAM_file_start);
4542 				#endif
4543 				subread_lock_occupy(&pairer -> SAM_BAM_table_lock);
4544 				HashTablePut(pairer -> bam_margin_table, margin_key, margin_data);
4545 				subread_lock_release(&pairer -> SAM_BAM_table_lock);
4546 			}
4547 			break;
4548 		}
4549 	}
4550 	thread_context -> input_buff_BIN_ptr = start_pos;
4551 //	SUBREADprintf("ABBO TH %d : FOUND START AT %d in %d\n", thread_context -> thread_id , start_pos, thread_context -> input_buff_BIN_used);
4552 	return start_pos < min(MAX_BIN_RECORD_LENGTH, thread_context -> input_buff_BIN_used);
4553 }
4554 
4555 
SAM_pairer_thread_run(void * params)4556 void * SAM_pairer_thread_run( void * params ){
4557 	void ** param_ptr = (void **) params;
4558 	SAM_pairer_context_t * pairer = param_ptr[0];
4559 	int thread_no = (int)(param_ptr[1]-NULL), is_disk_full = 0;
4560 	free(params);
4561 
4562 	SAM_pairer_thread_t * thread_context = pairer -> threads + thread_no;
4563 	int is_finished = 0;
4564 	while(1){
4565 		subread_lock_occupy(&pairer -> input_fp_lock);
4566 		if(pairer -> BAM_header_parsed || thread_no == 0){
4567 			thread_context -> need_find_start = pairer -> BAM_header_parsed;
4568 			//SUBREADprintf("ABBO TH %d : FILL_BIN AT FILE %lld\n", thread_context -> thread_id, ftello(pairer -> input_fp ));
4569 			SAM_pairer_fill_BIN_buff(pairer, thread_context, &is_finished);
4570 			thread_context -> chunk_number = pairer -> input_chunk_no;
4571 			pairer -> input_chunk_no ++;
4572 		}
4573 		subread_lock_release(&pairer -> input_fp_lock);
4574 
4575 		if(!pairer -> BAM_header_parsed && thread_no > 0) {
4576 			usleep(PAIRER_WAIT_TICK_TIME);
4577 		} else if(thread_context -> input_buff_SBAM_used>0) {
4578 			unsigned int processed_reads = 0;
4579 			while(1){
4580 				int has_no_more = SAM_pairer_do_next_read(pairer, thread_context);
4581 				if(has_no_more)break;
4582 				processed_reads++;
4583 			}
4584 
4585 			pairer -> total_input_reads += processed_reads;
4586 		}
4587 		if(pairer -> is_bad_format) break;
4588 
4589 		if(thread_context -> immediate_last_read_full_name[0]){
4590 			SAM_pairer_register_matcher(pairer, thread_context -> chunk_number, thread_context -> readno_in_chunk - 1, thread_context -> immediate_last_read_full_name, thread_context -> immediate_last_read_bin, thread_context -> immediate_last_read_bin_len ,  thread_context -> immediate_last_read_flags);
4591 			SAM_pairer_do_read_test(pairer , thread_context , thread_context -> immediate_last_read_name_len , thread_context -> immediate_last_read_full_name , thread_context -> immediate_last_read_bin_len , thread_context -> immediate_last_read_bin, thread_context -> immediate_last_read_flags);
4592 			thread_context -> immediate_last_read_full_name[0] = 0;
4593 		}
4594 
4595 		if(thread_context -> orphant_space > pairer -> input_buff_SBAM_size)
4596 			if(!is_disk_full)is_disk_full |= SAM_pairer_update_orphant_table(pairer, thread_context);
4597 
4598 		if(is_finished){
4599 			pairer -> BAM_header_parsed = 1;
4600 			break;
4601 		}
4602 	}
4603 
4604 	if(thread_context -> orphant_table -> numOfElements > 0)
4605 		if(!is_disk_full)is_disk_full |= SAM_pairer_update_orphant_table(pairer, thread_context);
4606 
4607 	pairer -> is_internal_error |= is_disk_full;
4608 
4609 	return NULL;
4610 }
4611 
4612 
4613 // This function returns 1 if the bin is EXACTLY a whole read.
SAM_pairer_verify_read_bin_ONE(SAM_pairer_context_t * pairer,SAM_pairer_thread_t * thread_context,char * bin,int binlen)4614 int SAM_pairer_verify_read_bin_ONE(SAM_pairer_context_t * pairer, SAM_pairer_thread_t * thread_context , char * bin, int binlen){
4615 	int block_len = 9;
4616 	int ret = is_read_bin_ONE(bin, binlen, pairer -> BAM_n_ref, &block_len);
4617 
4618 	if(ret != 1 || block_len+4 != binlen){
4619 		SUBREADprintf("ERROR: cannot retrieve a read from the BAM file: %d, %d\n", block_len+4, ret);
4620 		ret = -1;
4621 	}
4622 	//SUBREADprintf("FINAL_BIN_MATCH VERIFY : %d\n", ret);
4623 	return ret;
4624 }
4625 
/*
 * Callback applied to one entry of the BAM margin table.
 *
 * kv  : entry key; the first character is 'E' or something else.
 * val : entry value; a 4-byte length followed by that many bytes.
 * tab : the margin table; appendix1 holds the pairer and appendix2 is used
 *       as a counter (+1 for each 'E' entry, -1 for every other entry).
 *
 * For an 'E' entry, the entry with the same key but a leading 'S' is looked
 * up. The 'E' bytes followed by the 'S' bytes are joined, verified to be
 * exactly one read and processed with thread 0's context. If verification
 * fails, pairer->is_bad_format is set.
 */
void SAM_pairer_finish_margins(void * kv, void * val , HashTable * tab){
	char * key = kv;

	if(key[0] != 'E'){
		tab -> appendix2 --;
		return;
	}

	// Build the partner key: identical except for the leading letter.
	char partner_key [40];
	strcpy(partner_key, key);
	partner_key[0]='S';
	char * head_part = HashTableGet(tab, partner_key);
	assert(head_part);
	char * tail_part = val;
	tab -> appendix2 ++;

	SAM_pairer_context_t * pairer = tab -> appendix1;
	SAM_pairer_thread_t * thread_context = pairer -> threads+0;

	thread_context -> readno_in_chunk = 0;

	int tail_len = 0, head_len = 0;
	memcpy(&tail_len, tail_part, 4);
	memcpy(&head_len, head_part, 4);

	// Join the two fragments: the 'E' bytes come first, then the 'S' bytes.
	int joined_len = tail_len + head_len;
	char * joined = malloc(joined_len);
	memcpy(joined, tail_part+4, tail_len);
	memcpy(joined + tail_len, head_part+4, head_len);

	if(SAM_pairer_verify_read_bin_ONE(pairer , thread_context, joined, joined_len)==1){
		SAM_pairer_do_one_BIN( pairer, thread_context, joined, joined_len);
	}else{
		pairer -> is_bad_format = 1;
	}
	free(joined);
}
4659 
/*
 * Process every entry of pairer->bam_margin_table with
 * SAM_pairer_finish_margins(), using thread 0's context.
 *
 * Afterwards the read remembered in thread 0's immediate_last_read_* fields
 * (if any) is registered and tested, and thread 0's orphan table is written
 * to disk. A write failure is OR-ed into pairer->is_internal_error.
 * The final assert checks that appendix2, the counter maintained by the
 * callback, is back at NULL.
 */
void  SAM_pairer_finish_margin_table( SAM_pairer_context_t * pairer){
	SAM_pairer_thread_t * first_thread = pairer -> threads+0;

	// The callback reads the pairer from appendix1 and counts in appendix2.
	pairer -> bam_margin_table -> appendix1 = pairer;
	pairer -> bam_margin_table -> appendix2 = NULL;

	first_thread -> immediate_last_read_full_name[0] = 0;
	HashTableIteration(pairer -> bam_margin_table, SAM_pairer_finish_margins);

	if(first_thread -> immediate_last_read_full_name[0] != 0){
		SAM_pairer_register_matcher(pairer,
			first_thread -> chunk_number,
			first_thread -> readno_in_chunk - 1,
			first_thread -> immediate_last_read_full_name,
			first_thread -> immediate_last_read_bin,
			first_thread -> immediate_last_read_bin_len,
			first_thread -> immediate_last_read_flags);
		SAM_pairer_do_read_test(pairer, first_thread,
			first_thread -> immediate_last_read_name_len,
			first_thread -> immediate_last_read_full_name,
			first_thread -> immediate_last_read_bin_len,
			first_thread -> immediate_last_read_bin,
			first_thread -> immediate_last_read_flags);
		first_thread -> immediate_last_read_full_name[0] = 0;
	}

	pairer -> is_internal_error |= SAM_pairer_update_orphant_table(pairer, pairer -> threads+0);
	assert(NULL == pairer -> bam_margin_table -> appendix2);
}
4677 
4678 // not only run, but also finalise.
4679 // It returns 0 if no error.
SAM_pairer_run_once(SAM_pairer_context_t * pairer)4680 int SAM_pairer_run_once( SAM_pairer_context_t * pairer){
4681 	int x1;
4682 	for(x1 = 0; x1 < pairer -> total_threads ; x1++){
4683 		// this 16-byte memory block is freed in the thread worker.
4684 		void ** init_params = malloc(sizeof(void *) * 2);
4685 
4686 		init_params[0] = pairer;
4687 		init_params[1] = (void *)(NULL+x1);
4688 		pthread_create(&(pairer -> threads[x1].thread_stab), NULL, SAM_pairer_thread_run, init_params);
4689 	}
4690 
4691 	for(x1 = 0; x1 < pairer -> total_threads ; x1++){
4692 		pthread_join(pairer -> threads[x1].thread_stab, NULL);
4693 	}
4694 
4695 	if(0 == pairer -> is_bad_format){
4696 		if(pairer -> input_is_BAM) SAM_pairer_finish_margin_table(pairer);
4697 		int is_disk_full = SAM_pairer_probe_maxfp( pairer );
4698 		if(is_disk_full){
4699 			SUBREADprintf("ERROR: cannot write into the temporary file. Please check the disk space in the output directory.\n");
4700 			pairer -> is_internal_error = 1;
4701 		}else{
4702 			for(x1 = 0; x1 < pairer -> total_threads ; x1++){
4703 				// this 16-byte memory block is freed in the thread worker.
4704 
4705 				void ** init_params = malloc(sizeof(void *) * 2);
4706 
4707 				init_params[0] = pairer;
4708 				init_params[1] = (void *)(NULL+x1);
4709 				pthread_create(&(pairer -> threads[x1].thread_stab), NULL, SAM_pairer_rescure_orphants_max_FP, init_params);
4710 			}
4711 
4712 			for(x1 = 0; x1 < pairer -> total_threads ; x1++){
4713 				pthread_join(pairer -> threads[x1].thread_stab, NULL);
4714 			}
4715 		}
4716 	}
4717 
4718 	return 0;
4719 }
4720 
/*
 * Read the next BGZF block from `in` and inflate it into `binbuf`.
 *
 * in     : input stream positioned at the start of a block.
 * binbuf : output buffer; up to 70000 bytes are written to it.
 * strm   : initialised inflate stream; it is reset before returning.
 *
 * Returns the number of inflated bytes, -1 when the gzip magic bytes are
 * missing (including end of file) or the payload is truncated, and -2 when
 * the block header is inconsistent or the payload cannot be inflated.
 */
int fix_load_next_block(FILE * in, char * binbuf, z_stream * strm){
	int x1, ret = 0;

	// Fixed gzip header: ID1=31, ID2=139, CM=8, FLG=4 (extra field present).
	x1 = fgetc(in);
	if(x1 != 31) ret = -1;
	x1 = fgetc(in);
	if(x1 != 139) ret = -1;
	x1 = fgetc(in);
	if(x1 != 8) ret = -1;
	x1 = fgetc(in);
	if(x1 != 4) ret = -1;
	if(ret != 0) return ret;

	// Skip the 4-byte MTIME, then XFL and OS.
	for(x1 = 0; x1 < 6; x1++) fgetc(in);

	int xlen;
	xlen = fgetc(in);
	xlen += fgetc(in) * 256;
	int bsize = -1, xlen_ptr = 0;

	// Walk the extra subfields looking for "BC", which carries BSIZE.
	while(xlen_ptr < xlen){
		int si1 = fgetc(in);
		int si2 = fgetc(in);
		int slen = fgetc(in);
		slen += fgetc(in) * 256;
		if(si1 == 66 && si2==67){
			bsize = fgetc(in);
			bsize += 256*fgetc(in);
		}else{
			fseeko(in , slen, SEEK_CUR);
		}
		xlen_ptr += 4 + slen;
	}

	// A missing BSIZE or a non-positive payload length means a broken block.
	// Rejecting it here keeps a negative length from reaching fread/inflate.
	int cdata_len = bsize - xlen - 19;
	if(bsize <= 0 || cdata_len < 1 || cdata_len > 70000) return -2;

	char * bam_buf = malloc(70000);
	if(NULL == bam_buf) return -2;

	int rlenv = fread(bam_buf, 1, cdata_len, in);
	if(rlenv < cdata_len){
		free(bam_buf);
		return -1;
	}
	// Skip the CRC32 and ISIZE trailer.
	fseeko(in, 8, SEEK_CUR);

	strm -> avail_in = cdata_len;
	strm -> next_in = (unsigned char*)bam_buf;
	strm -> avail_out = 70000;
	strm -> next_out = (unsigned char*)binbuf;
	int ret_inf = inflate(strm, Z_FINISH);
	if(ret_inf == Z_STREAM_END)
		ret = 70000 - strm -> avail_out;
	else
		ret = -2;
	inflateReset(strm);

	free(bam_buf);
	return ret;
}
4779 
/*
 * Write `binlen` bytes of `bin` to `out` as one or more BGZF blocks.
 *
 * out    : output stream.
 * bin    : uncompressed data.
 * binlen : number of bytes; 0 writes a single empty block, compressed with
 *          a private deflate stream instead of `strm`.
 * strm   : initialised deflate stream; it is reset after every block.
 *
 * The input is cut into sections of at most 60000 bytes, so that each
 * deflate() call can finish within the 70000-byte output buffer and the
 * block size fits the 16-bit BSIZE field. Each block's trailer holds the
 * CRC32 and the uncompressed size of that section only.
 *
 * Returns 0 on success and 1 when compression or writing fails.
 */
int  fix_write_block(FILE * out, char * bin, int binlen, z_stream * strm){
	enum { FIX_SECTION_MAX_INPUT = 60000 };
	int is_end_mode = binlen == 0, written=0;

	while(1){
		if(binlen - written<1 && !is_end_mode) return 0;

		char * bam_buf = malloc(70000);
		if(NULL == bam_buf) return 1;
		int x1, bam_len = 0, this_sec_len = 0, old_start = written;

		if(binlen - written > 0){
			int sec_in = binlen - written;
			if(sec_in > FIX_SECTION_MAX_INPUT) sec_in = FIX_SECTION_MAX_INPUT;

			strm -> avail_in = sec_in;
			strm -> next_in = (unsigned char*)bin + written;
			strm -> avail_out = 70000;
			strm -> next_out = (unsigned char*)bam_buf;
			int ret_def = deflate(strm , Z_FINISH);
			bam_len = 70000 - strm -> avail_out;
			this_sec_len = sec_in - strm -> avail_in;
			written += this_sec_len;

			deflateReset(strm);

			// An unfinished stream would give a corrupt block and, with no
			// input consumed, an endless loop.
			if(ret_def != Z_STREAM_END){
				free(bam_buf);
				return 1;
			}
		}else{
			z_stream nstrm;
			nstrm.zalloc = Z_NULL;
			nstrm.zfree = Z_NULL;
			nstrm.opaque = Z_NULL;
			nstrm.avail_in = 0;
			nstrm.next_in = Z_NULL;

			deflateInit2(&nstrm, SAMBAM_COMPRESS_LEVEL_NORMAL, Z_DEFLATED,
				PAIRER_GZIP_WINDOW_BITS, PAIRER_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);

			nstrm.avail_in = 0;
			nstrm.next_in = (unsigned char*)bin;
			nstrm.avail_out = 70000;
			nstrm.next_out = (unsigned char*)bam_buf;
			deflate(&nstrm, Z_FINISH);
			bam_len = 70000 - nstrm.avail_out;
			deflateEnd(&nstrm);
		}

		unsigned int crc0 = crc32(0, NULL, 0);
		unsigned int crc = crc32(crc0, (unsigned char *) bin + old_start, this_sec_len);

		// gzip header: magic, CM=8, FLG=4, MTIME=0, XFL=0, OS=0xff.
		fputc(31, out);
		fputc(139, out);
		fputc(8, out);
		fputc(4, out);
		fputc(0, out);
		fputc(0, out);
		fputc(0, out);
		fputc(0, out);

		fputc(0, out);//XFL
		fputc(0xff, out);//OS

		// Extra field: XLEN=6, subfield "BC" of length 2 holding BSIZE,
		// which is the total block size minus one.
		x1 = 6;
		fwrite( &x1, 2, 1 , out );
		fputc( 66, out );
		fputc( 67, out );
		x1 = 2;
		fwrite( &x1, 2, 1 , out );
		x1 = bam_len + 19 + 6;
		fwrite( &x1, 2, 1 , out );
		int write_len = fwrite( bam_buf , 1,bam_len, out );

		// Trailer: CRC32 and uncompressed size of this section.
		fwrite( &crc, 4, 1, out );
		fwrite( &this_sec_len, 4, 1, out );

		free(bam_buf);

		if(write_len < bam_len)return 1;
		if(binlen<1) return 0;
	}
	return 0;
}
4858 
4859 #define FIX_GET_NEXT_NCH { while(in_bin_ptr == in_bin_size){ \
4860   in_bin_ptr = 0; in_bin_size = 0;\
4861   int newsize = fix_load_next_block(old_fp, in_bin, &in_strm);\
4862   if(newsize < 0){ in_bin_size = -1; if(newsize<-1)SUBREADprintf("ERROR: failed to decompress the BAM file %s\n", pairer -> in_file_name) ;break;}else{in_bin_size = newsize;}\
4863 } if(in_bin_size>0){nch = in_bin[in_bin_ptr++];  if(nch < 0)nch += 256; } else nch = -1; }
4864 
4865 #define FIX_FLASH_OUT { if(out_bin_ptr > 0)disk_is_full |= fix_write_block(new_fp, out_bin, out_bin_ptr, &out_strm); out_bin_ptr = 0; }
4866 
4867 #define FIX_APPEND_OUT(p, c) { if(out_bin_ptr > 60002){FIX_FLASH_OUT} ;  memcpy(out_bin + out_bin_ptr, p, c); out_bin_ptr +=c ; }
4868 #define FIX_APPEND_READ(p, c){ memcpy(out_bin + out_bin_ptr, p, c); out_bin_ptr +=c ;  }
4869 
SAM_pairer_fix_format(SAM_pairer_context_t * pairer)4870 int SAM_pairer_fix_format(SAM_pairer_context_t * pairer){
4871 	FILE * old_fp = pairer -> input_fp;
4872 	fseeko(old_fp, 0, SEEK_SET);
4873 	char tmpfname [MAX_FILE_NAME_LENGTH+14], readname[256];
4874 
4875 	sprintf(tmpfname, "%s.fixbam", pairer -> tmp_file_prefix);
4876 
4877 	FILE * new_fp = f_subr_open(tmpfname, "wb");
4878 	char * in_bin = malloc(1024*70);
4879 	char * out_bin = malloc(20*1024*1024);
4880 
4881 	z_stream in_strm;
4882 	z_stream out_strm;
4883 	in_strm.zalloc = Z_NULL;
4884 	in_strm.zfree = Z_NULL;
4885 	in_strm.opaque = Z_NULL;
4886 	in_strm.avail_in = 0;
4887 	in_strm.next_in = Z_NULL;
4888 
4889 	inflateInit2(&in_strm, PAIRER_GZIP_WINDOW_BITS);
4890 
4891 	out_strm.zalloc = Z_NULL;
4892 	out_strm.zfree = Z_NULL;
4893 	out_strm.opaque = Z_NULL;
4894 	out_strm.avail_in = 0;
4895 	out_strm.next_in = Z_NULL;
4896 
4897 	deflateInit2(&out_strm, Z_NO_COMPRESSION, Z_DEFLATED,
4898 		PAIRER_GZIP_WINDOW_BITS, PAIRER_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
4899 
4900 	int disk_is_full = 0;
4901 	int in_bin_ptr = 0;
4902 	int out_bin_ptr = 0;
4903 	int in_bin_size = 0;
4904 	int content_count = 0;
4905 	int content_size = 0;
4906 	int x1, nch = 0, is_longcigar = 0;
4907 
4908 	for(x1 = 0; x1 < 4; x1++){
4909 		FIX_GET_NEXT_NCH; // BAM1
4910 		if(nch < 0) return -1;
4911 		FIX_APPEND_OUT(&nch, 1);
4912 	}
4913 
4914 
4915 	// ====== The header texts
4916 	content_size = 0;
4917 	for(x1 = 0; x1 < 4; x1++){
4918 		FIX_GET_NEXT_NCH;
4919 		if(nch < 0) return -1;
4920 	//	SUBREADprintf("FIX: TLEN: %d\n", nch);
4921 		content_size += (nch << (8 * x1));
4922 	}
4923 	FIX_APPEND_OUT(&content_size, 4);
4924 	//SUBREADprintf("FIX: TXTLEN=%d\n", content_size);
4925 	for(content_count = 0; content_count < content_size; content_count++){
4926 		FIX_GET_NEXT_NCH;
4927 		if(nch < 0) return -1;
4928 		FIX_APPEND_OUT(&nch, 1);
4929 	//	fputc(nch, stderr);
4930 	}
4931 	FIX_FLASH_OUT;
4932 
4933 	// ====== The chromosome table
4934 	content_size = 0;
4935 	for(x1 = 0; x1 < 4; x1++){
4936 		FIX_GET_NEXT_NCH;
4937 		if(nch < 0) return -1;
4938 		content_size += (nch << (8 * x1));
4939 	}
4940 	FIX_APPEND_OUT(&content_size, 4);
4941 	//SUBREADprintf("LONGFIX: CHROLEN=%d\n", content_size);
4942 	for(content_count = 0; content_count < content_size; content_count++){
4943 		int namelen = 0;
4944 		for(x1 = 0; x1 < 4; x1++){
4945 			FIX_GET_NEXT_NCH;
4946 			if(nch < 0) return -1;
4947 			namelen+= (nch << (8 * x1));
4948 		}
4949 		FIX_APPEND_READ(&namelen, 4);
4950 		for(x1 = 0; x1 <  namelen + 4; x1++){ // inc. length
4951 			FIX_GET_NEXT_NCH;
4952 			if(nch < 0) return -1;
4953 			FIX_APPEND_READ(&nch, 1);
4954 		}
4955 
4956 		if(out_bin_ptr > 60003){
4957 			FIX_FLASH_OUT;
4958 		}
4959 	}
4960 	FIX_FLASH_OUT;
4961 
4962 	// ===== The reads
4963 	int seq_len = 0, name_len = 0, cigar_opts = 0;
4964 	srInt_64 reads =0;
4965 	pairer -> is_bad_format = 0;
4966 
4967 	while(! is_longcigar){
4968 		int block_size = 0, new_block_size;
4969 		char * block_size_ptr = out_bin + out_bin_ptr;
4970 		char * sqlen_ptr = NULL;
4971 		seq_len = 0, name_len = 0, cigar_opts = 0;
4972 
4973 		// block_length
4974 		FIX_GET_NEXT_NCH;
4975 		if(nch<0) break;
4976 		block_size = nch;
4977 		for(x1 = 1; x1 < 4; x1++){
4978 			FIX_GET_NEXT_NCH;
4979 			if(nch < 0) return -1;
4980 			block_size += (nch << (8 * x1));
4981 		}
4982 
4983 		FIX_APPEND_READ(&block_size, 4);
4984 
4985 		if(pairer -> tiny_mode){
4986 			// block_remainder
4987 			int extag_new_len = 0;
4988 			for(x1 = 0; x1 < block_size; x1++){
4989 				FIX_GET_NEXT_NCH;
4990 				if(nch < 0) return -1;
4991 				if(x1 == 8) name_len = nch;
4992 				else if(x1 >= 16 && x1 < 20){
4993 					seq_len += ( nch << (8 * (x1 - 16)));
4994 					if(x1 == 16)  sqlen_ptr = out_bin + out_bin_ptr;
4995 				}else if(x1 == 12 || x1 == 13){
4996 					cigar_opts += ( nch << (8 * (x1 - 12)));
4997 				}else if(seq_len > 1){
4998 					if(x1 == 32 + name_len + 4 * cigar_opts || x1 == 32 + name_len + 4 * cigar_opts + 1){
4999 						nch = 0xff;
5000 					}else if(x1 > 32 + name_len + 4 * cigar_opts + 1 && x1 < 32 + name_len + 4 * cigar_opts + seq_len + (seq_len+1)/2){
5001 						continue;
5002 					}
5003 				}
5004 
5005 				//#warning "+===================== REMOVE -59999 IN NEXT LINE ================"
5006 				//if(x1==32)SUBREADprintf("SEQ_LEN=%d, REC_LEN=%d\n", seq_len, block_size);
5007 				if( x1 == 32 && seq_len >= pairer -> long_read_minimum_length){
5008 					is_longcigar = 1;
5009 					int x2;
5010 					for(x2 = 0; x2 < name_len; x2++){
5011 						FIX_GET_NEXT_NCH;
5012 						readname[x2] = nch;
5013 					}
5014 					break;
5015 				}
5016 
5017 	//			#warning "================ THIS BLOCK WAS DISABLED ON 03OCT2019; MAKE SURE IT WORKS ON LONG READS/LONG READ RECORDS =============="
5018 				if(0 && x1 == 32 && block_size > 60000 ){
5019 					print_in_box(80,0,0,"");
5020 					print_in_box(80,0,0,"   ERROR: Alignment record is too long.");
5021 					print_in_box(80,0,0,"	  Please use the long read mode.");
5022 					return -1;
5023 				}
5024 
5025 				char etag_name0 = -1, etag_name1, etag_type;
5026 				if(x1 == 32 + name_len + 4 * cigar_opts + seq_len + (seq_len+1)/2){
5027 					while(x1 < block_size){
5028 						int this_tag_output = 0;
5029 						if(etag_name0 > 0){
5030 							FIX_GET_NEXT_NCH;
5031 							if(nch < 0) return -1;
5032 						}
5033 						etag_name0 = nch;
5034 						FIX_GET_NEXT_NCH;
5035 						if(nch < 0) return -1;
5036 						etag_name1 = nch;
5037 						FIX_GET_NEXT_NCH;
5038 						if(nch < 0) return -1;
5039 						etag_type = nch;
5040 						x1 += 3;
5041 
5042 						//SUBREADprintf("ETAG_NAME: %c%c (%c), x1 = %d < %d\n", etag_name0,etag_name1,etag_type, x1, block_size);
5043 
5044 						if((( etag_name0 == 'H' && etag_name1 == 'I' ) ||
5045 						    ( etag_name0 == 'N' && etag_name1 == 'H' ) ||
5046 						    ( etag_name0 == 'R' && etag_name1 == 'G' ) ||
5047 						    ( etag_name0 == 'N' && etag_name1 == 'M' )
5048 						    ) && ( etag_type == 'c' || etag_type=='Z' || etag_type == 'C'||etag_type == 's'||etag_type == 'S'||etag_type == 'i'||etag_type == 'I')
5049 						  ){
5050 							FIX_APPEND_READ(&etag_name0,1);
5051 							FIX_APPEND_READ(&etag_name1,1);
5052 							FIX_APPEND_READ(&etag_type,1);
5053 							this_tag_output = 1;
5054 						//	SUBREADprintf("ADDED INTO BAM\n");
5055 						}
5056 						if(etag_type == 'Z'||etag_type =='H'){
5057 							if(this_tag_output) extag_new_len +=3;
5058 							while(1){
5059 								FIX_GET_NEXT_NCH;
5060 								if(nch < 0) return -1;
5061 								if(this_tag_output){
5062 									assert(x1 < 20000);
5063 									FIX_APPEND_READ(&nch, 1);
5064 									extag_new_len++;
5065 								}
5066 								x1++;
5067 								if(nch == 0)break;
5068 							}
5069 						}else if(etag_type == 'A'){
5070 							FIX_GET_NEXT_NCH;
5071 							if(nch < 0) return -1;
5072 							x1++;
5073 						}else if(etag_type =='B'){
5074 							FIX_GET_NEXT_NCH;
5075 							if(nch < 0) return -1;
5076 							char array_type = nch;
5077 							int x2, adlen = 1, aditems = 0;
5078 							if(array_type == 's'||array_type == 'S')adlen = 2;
5079 							if(array_type == 'i'||array_type == 'I'||array_type == 'f')adlen = 4;
5080 							for(x2=0;x2<4; x2++) {
5081 								FIX_GET_NEXT_NCH;
5082 								if(nch < 0) return -1;
5083 								aditems += nch << (8*x2);
5084 							}
5085 							x1 += 5 + aditems * adlen;
5086 							for(x2 = 0; x2 < aditems * adlen; x2++){
5087 								FIX_GET_NEXT_NCH;
5088 								if(nch < 0) return -1;
5089 							}
5090 						}else{
5091 							int dlen = 1;
5092 							if(etag_type == 's'||etag_type == 'S') dlen = 2;
5093 							if(etag_type == 'i'||etag_type == 'I' || etag_type == 'f') dlen = 4;
5094 							if(this_tag_output) extag_new_len += dlen + 3;
5095 							x1 += dlen;
5096 							while(dlen > 0){
5097 								FIX_GET_NEXT_NCH;
5098 								if(nch < 0) return -1;
5099 								if(this_tag_output)
5100 									FIX_APPEND_READ(&nch, 1);
5101 								dlen--;
5102 							}
5103 						}
5104 					}
5105 					break;
5106 				}
5107 				FIX_APPEND_READ(&nch, 1);
5108 				//SUBREADprintf("WR[%d]: %d = %c, SL=%d, RNL=%d, COP=%d\n", out_bin_ptr, nch, nch, seq_len, name_len, cigar_opts);
5109 			}
5110 
5111 			if(!is_longcigar){
5112 				seq_len = min(1, seq_len);
5113 				sqlen_ptr[0]=seq_len; sqlen_ptr[1]=0, sqlen_ptr[2]=0; sqlen_ptr[3]=0;
5114 				new_block_size = 32 + name_len + 4 * cigar_opts + seq_len + (seq_len+1)/2 + extag_new_len;
5115 				//SUBREADprintf("ETAG_NLEN=%d, ETAGS=%d\n", new_block_size, extag_new_len);
5116 				memcpy(block_size_ptr, &new_block_size, 4);
5117 			}
5118 		}else{
5119 			for(x1 = 0; x1 < block_size; x1++){
5120 				FIX_GET_NEXT_NCH;
5121 				if(nch < 0) return -1;
5122 
5123 				if(x1 == 8) name_len = nch;
5124 				else if(x1 >= 16 && x1 < 20){
5125 					seq_len += ( nch << (8 * (x1 - 16)));
5126 					if(x1 == 16)  sqlen_ptr = out_bin + out_bin_ptr;
5127 				}else if(x1 == 12 || x1 == 13){
5128 					cigar_opts += ( nch << (8 * (x1 - 12)));
5129 				}
5130 
5131 				if(x1 == 32 && seq_len >= pairer -> long_read_minimum_length){
5132 					is_longcigar = 1;
5133 					int x2;
5134 					for(x2 = 0; x2 < name_len; x2++){
5135 						FIX_GET_NEXT_NCH;
5136 						readname[x2] = nch;
5137 					}
5138 					break;
5139 				}
5140 
5141 				FIX_APPEND_READ(&nch, 1);
5142 			}
5143 		}
5144 
5145 		reads ++;
5146 		if(out_bin_ptr > 60000){
5147 	//		SUBREADprintf("WRIR3: TINY=%d\n", pairer -> tiny_mode);
5148 			FIX_FLASH_OUT;
5149 		}
5150 	}
5151 	FIX_FLASH_OUT;
5152 	//SUBREADprintf("FIX READS=%llu\n", reads);
5153 	disk_is_full |= fix_write_block(new_fp, out_bin, 0, &out_strm);
5154 	deflateEnd(&out_strm);
5155 	inflateEnd(&in_strm);
5156 
5157 	fclose(new_fp);
5158 
5159 	free(in_bin);
5160 	free(out_bin);
5161 
5162 	if(is_longcigar){
5163 		unlink(tmpfname);
5164 		pairer -> long_cigar_mode = 1;
5165 		pairer -> tiny_mode = 1;
5166 		if(0 && ! pairer -> is_single_end_mode){
5167 			print_in_box(80,0,0,"   Switch to long-read mode; reads, not read-pairs, will be counted.");
5168 			print_in_box(80,0,0,"   Read name: %s", readname);
5169 			print_in_box(80,0,0,"   It had %d cigar opts and %d bases, more than %d.", cigar_opts, seq_len, pairer -> long_read_minimum_length);
5170 		}
5171 	}else{
5172 		fclose(old_fp);
5173 		pairer -> input_fp = f_subr_open(tmpfname, "rb");
5174 	}
5175 
5176 	if(disk_is_full)SUBREADprintf("ERROR: cannot write into the temporary file. Please check the empty space in the output directory.\n");
5177 	return disk_is_full;
5178 }
5179 
5180 
5181 
// Interval handed to usleep() (i.e. microseconds) by the no-sort producer and
// worker loops whenever an iteration found nothing to do.
unsigned int nosort_tick_time = 100;
// Byte size of the scratch buffer that receives one compressed BAM chunk or
// one SAM text line in no-sort mode; the workers also assert that every BAM
// record length is below this value.
#define NOSORT_SBAM_BUFF_SIZE 5000000
// Byte size of the decompressed-BAM staging buffer allocated by the producer.
// Workers place the second read of a pair at offset NOSORT_BIN_BUFF_SIZE / 2
// of their own BIN buffer.
#define NOSORT_BIN_BUFF_SIZE (2*5010000)
5185 
5186 
SAM_nosort_thread_run(void * params)5187 void * SAM_nosort_thread_run( void * params ){
5188 	void ** param_ptr = (void **) params;
5189 	SAM_pairer_context_t * pairer = param_ptr[0];
5190 	int thread_no = (int)(param_ptr[1]-NULL);
5191 	free(params);
5192 
5193 	SAM_pairer_thread_t * thread_context = pairer -> threads + thread_no;
5194 
5195 	char * read_ptr_1 = (char *)thread_context -> input_buff_BIN;
5196 	char * read_ptr_2 = (char *)thread_context -> input_buff_BIN + NOSORT_BIN_BUFF_SIZE / 2;
5197 
5198 	while(1){
5199 		int has_found = 0, to_quit = 0;
5200 		subread_lock_occupy(&thread_context -> SBAM_lock);
5201 
5202 	//	SUBREADprintf("CONSUME:RINS=%d, PTR=%d\n", thread_context -> reads_in_SBAM, thread_context -> input_buff_BIN_ptr );
5203 
5204 		if(thread_context -> reads_in_SBAM > 1){
5205 			if(pairer -> input_is_BAM){
5206 				int record_len, seq_len1 = 0, seq_len2 = 0;
5207 		//		SUBREADprintf("LOAD BY THREAD %d:", thread_no);
5208 				memcpy(&record_len, thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr, 4);
5209 	//			SUBREADprintf("RLEN=%d\n", record_len);
5210 				assert(record_len > 32 &&record_len < NOSORT_SBAM_BUFF_SIZE);
5211 				memcpy(read_ptr_1 , thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr, 4 + record_len);
5212 				memcpy(&seq_len1, thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr + 20, 4);
5213 				thread_context -> input_buff_SBAM_ptr += record_len + 4;
5214 
5215 				memcpy(&record_len, thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr, 4);
5216 				assert(record_len > 32 &&record_len < NOSORT_SBAM_BUFF_SIZE);
5217 				memcpy(read_ptr_2 , thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr, 4 + record_len);
5218 				memcpy(&seq_len2, thread_context -> input_buff_SBAM + thread_context -> input_buff_SBAM_ptr + 20, 4);
5219 				thread_context -> input_buff_SBAM_ptr += record_len + 4;
5220 				has_found = 1;
5221 				thread_context -> reads_in_SBAM -= 2;
5222 
5223 				if(seq_len1 >= pairer -> long_read_minimum_length || seq_len2 >= pairer -> long_read_minimum_length)
5224 					pairer -> long_cigar_mode = 1;
5225 
5226 			}else{
5227 				thread_context -> input_buff_BIN_ptr = 0;
5228 				int rret = reduce_SAM_to_BAM(pairer, thread_context , 0);
5229 				thread_context -> reads_in_SBAM -- ;
5230 				if(rret > 0){
5231 					thread_context -> input_buff_BIN_ptr = NOSORT_BIN_BUFF_SIZE/2;
5232 					rret = reduce_SAM_to_BAM(pairer, thread_context, 0);
5233 					thread_context -> reads_in_SBAM -- ;
5234 					if(rret > 0){
5235 						has_found = 1;
5236 					}
5237 				}
5238 			}
5239 		}
5240 		if(pairer -> is_finished) to_quit = 1;
5241 		subread_lock_release(&thread_context -> SBAM_lock);
5242 
5243 		if(has_found)
5244 			pairer -> output_function(pairer, thread_no, (char*) read_ptr_1,(char*) read_ptr_2);
5245 		else{
5246 			if(to_quit) break;
5247 			usleep(nosort_tick_time);
5248 		}
5249 	}
5250 
5251 	return NULL;
5252 }
5253 
/*
 * Fetches the next compressed chunk from pairer->input_fp and inflates it
 * behind whatever is still unread in the staging buffer.
 *
 * The buffers and cursors are taken from pairer->appendix2..appendix5
 * (compressed scratch buffer, plain buffer, pointer to the "bytes used"
 * counter, pointer to the read cursor). Unread bytes are moved to the front
 * of the plain buffer and the cursor is reset to 0 before inflating.
 *
 * Returns the number of bytes added, or -1 when no chunk could be read or
 * inflated; a -2 from PBam_get_next_zchunk additionally reports a broken BAM
 * and sets pairer->is_internal_error.
 */
int SAM_nosort_decompress_next_block(SAM_pairer_context_t * pairer){
	char * zipped_chunk = pairer -> appendix2;
	char * plain_buff = pairer -> appendix3;
	int * plain_used = pairer -> appendix4;
	int * plain_cursor = pairer -> appendix5;
	unsigned int decompressed_len;

	int zipped_len = PBam_get_next_zchunk(pairer -> input_fp, zipped_chunk, NOSORT_SBAM_BUFF_SIZE, &decompressed_len);
	if(zipped_len < 0){
		if(zipped_len == -2){
			SUBREADputs("ERROR: the BAM format is broken.");
			pairer->is_internal_error = 1;
		}
		return -1;
	}

	// Slide the unread tail to the start of the buffer (regions may overlap).
	int unread = (*plain_used) - (*plain_cursor);
	if(unread > 0){
		memmove(plain_buff, plain_buff + (*plain_cursor), unread);
		*plain_used = unread;
	}else{
		*plain_used = 0;
	}
	*plain_cursor = 0;

	int inflated = SamBam_unzip(plain_buff + (*plain_used), 65536, zipped_chunk , zipped_len, 0);
	if(inflated < 0) return -1;

	*plain_used += inflated;
	return inflated;
}
5289 
// Reads the next decompressed BAM byte into `nch`.
// Expands in a scope that provides `pairer`, `BIN_buff`, `BIN_buff_used`,
// `BIN_buff_ptr` and `nch`. While the staging buffer is exhausted it calls
// SAM_nosort_decompress_next_block(); if that fails, BIN_buff_used is set to -1
// and nch becomes -1 (and stays -1 on later expansions).
#define NOSORT_BAM_next_nch { while( BIN_buff_used == BIN_buff_ptr ){int rlen = SAM_nosort_decompress_next_block(pairer); if(rlen < 0) { BIN_buff_used = -1 ; break;}} if(BIN_buff_used < 0) nch = -1; else nch = BIN_buff[BIN_buff_ptr++]; }
// Reads four bytes via NOSORT_BAM_next_nch and assembles them, least
// significant byte first, into `v`. Only the first byte is tested for failure
// (v = -1 in that case); a failure on bytes 2-4 is not detected here.
#define NOSORT_BAM_next_u32(v){ NOSORT_BAM_next_nch; if(nch < 0)v=-1;else{; v= nch; NOSORT_BAM_next_nch; v+=nch*256; NOSORT_BAM_next_nch; v+=nch*65536; NOSORT_BAM_next_nch; v+=nch*16777216;} }

// Reads one text line (at most NOSORT_SBAM_BUFF_SIZE - 1 characters) from
// pairer->input_fp into `line_ptr`; `NOSORT_SAM_eof` receives the fgets()
// result, i.e. NULL at end of file or on error.
#define NOSORT_SAM_next_line {NOSORT_SAM_eof  = fgets(line_ptr, NOSORT_SBAM_BUFF_SIZE, pairer -> input_fp);}

// Compile-time guard on FEATURECOUNTS_BUFFER_SIZE (defined elsewhere).
// NOTE(review): presumably this is the size of the per-thread SBAM buffers
// that the refill thresholds below must fit into -- confirm.
#if FEATURECOUNTS_BUFFER_SIZE < ( 12*1024*1024 )
#error "FEATURECOUNTS_BUFFER_SIZE MUST BE GREATER THAN 12MB!."
#endif

// A thread's SBAM buffer is refilled once its unread bytes drop below LOWBAR;
// HIGHBAR minus the unread bytes is the refill amount the producer aims for.
#define NOSORT_REFILL_LOWBAR ( 3 * 1024 * 1024 )
#define NOSORT_REFILL_HIGHBAR ( 6 * 1024 * 1024  )
5301 
/*
 * Producer side of the no-sort mode.
 *
 * Starts pairer->total_threads workers (SAM_nosort_thread_run), reports the
 * header through pairer->output_header, then keeps topping up each worker's
 * SBAM buffer with whole records (BAM) or whole text lines (SAM) until the
 * input is exhausted, at which point pairer->is_finished is set. Finally the
 * scratch buffers are freed and the workers are joined.
 *
 * NOTE(review): both header-failure paths (`if(is_OK){ ... return; }`) leave
 * the function without freeing SBAM_buff / BIN_buff and without joining the
 * workers, and pairer->appendix4/5 keep pointing at locals of this frame.
 */
void SAM_nosort_run_once(SAM_pairer_context_t * pairer){
	int x1;
	for(x1 = 0; x1 < pairer -> total_threads ; x1++){
		// this two-pointer memory block is freed in the thread worker.
		void ** init_params = malloc(sizeof(void *) * 2);

		init_params[0] = pairer;
		// the thread number is passed as a pointer value
		init_params[1] = (void *)(NULL+x1);
		pthread_create(&(pairer -> threads[x1].thread_stab), NULL, SAM_nosort_thread_run, init_params);
	}

	// Scratch state used by the NOSORT_BAM_* / NOSORT_SAM_* macros below.
	char * SBAM_buff = malloc(NOSORT_SBAM_BUFF_SIZE);
	int nch;
	unsigned char * BIN_buff = malloc(NOSORT_BIN_BUFF_SIZE);
	char *NOSORT_SAM_eof=NULL;
	int BIN_buff_used = 0;
	int BIN_buff_ptr = 0;

	// Published so that SAM_nosort_decompress_next_block() can reach them.
	pairer -> appendix2 = SBAM_buff;
	pairer -> appendix3 = BIN_buff;
	pairer -> appendix4 = &BIN_buff_used;
	pairer -> appendix5 = &BIN_buff_ptr;

	if(pairer -> input_is_BAM){
		int x1;
		// The first four bytes are read and not examined any further.
		unsigned int bam_signature;
		NOSORT_BAM_next_u32(bam_signature);
		NOSORT_BAM_next_u32(pairer -> BAM_l_text);
		// header_txt first holds the header text and is then reused for the
		// binary reference table.
		char * header_txt = malloc(max(1000000,pairer->BAM_l_text));

		for(x1 = 0 ; x1 < pairer -> BAM_l_text; x1++){
			NOSORT_BAM_next_nch;
			header_txt [x1] = nch;
		}

		int is_OK = pairer -> output_header(pairer, 0, 1, pairer -> BAM_l_text , header_txt , pairer -> BAM_l_text );
		NOSORT_BAM_next_u32(pairer -> BAM_n_ref);
		unsigned int ref_bin_len = 0;
		// Copy every reference entry (l_name, name bytes, l_ref) verbatim.
		for(x1 = 0; x1 < pairer -> BAM_n_ref; x1++) {
			unsigned int l_name, l_ref, x2;
			NOSORT_BAM_next_u32(l_name);
			assert(l_name < 256);
			memcpy(header_txt + ref_bin_len, &l_name, 4);
			ref_bin_len += 4;
			for(x2 = 0; x2 < l_name; x2++){
				NOSORT_BAM_next_nch;
				header_txt[ref_bin_len++] = nch;
			}
			NOSORT_BAM_next_u32(l_ref);
			memcpy(header_txt + ref_bin_len, &l_ref, 4);
			ref_bin_len += 4;

			// NOTE(review): the bound is the header text length, not the
			// allocation size max(1000000, BAM_l_text); a file with a short
			// header text and a larger reference table trips this assert.
			assert(ref_bin_len < pairer -> BAM_l_text);
		}

		// Because of the short-circuit, the reference table is only reported
		// when the first output_header call returned 0.
		is_OK = is_OK || pairer -> output_header(pairer, 0, 0, pairer -> BAM_n_ref , header_txt , ref_bin_len );
		free(header_txt);

		// Despite its name, a non-zero is_OK means a header callback failed.
		if(is_OK){
			pairer -> is_incomplete_BAM = 1;
			return;
		}

		// Dispatch loop: visit the workers round-robin and refill any whose
		// buffer is running low.
		while(1){
			if(pairer -> is_finished) break;
			int need_sleep = 1;
			for(x1 = 0; x1 < pairer -> total_threads ; x1++){
				if(pairer -> is_finished) break;
				SAM_pairer_thread_t * this_thread = pairer -> threads + x1;
				// Refill when fewer than LOWBAR unread bytes remain and the buffer
				// is either empty or has been partly consumed. This test is made
				// before the lock is taken.
				if(this_thread -> input_buff_SBAM_used - this_thread -> input_buff_SBAM_ptr < NOSORT_REFILL_LOWBAR && (this_thread -> input_buff_SBAM_used == 0 || this_thread -> input_buff_SBAM_ptr > 0)){
					subread_lock_occupy(&this_thread -> SBAM_lock);
					int to_be_add = NOSORT_REFILL_HIGHBAR - (this_thread -> input_buff_SBAM_used - this_thread -> input_buff_SBAM_ptr);

					int x2, x3;
					// Move the unread tail to the front of the buffer.
					if(this_thread -> input_buff_SBAM_ptr < this_thread -> input_buff_SBAM_used){
						for(x2 = 0; x2 < this_thread -> input_buff_SBAM_used - this_thread -> input_buff_SBAM_ptr; x2++)
							this_thread -> input_buff_SBAM[x2] = this_thread -> input_buff_SBAM[x2 + this_thread -> input_buff_SBAM_ptr];
						this_thread -> input_buff_SBAM_used -= this_thread -> input_buff_SBAM_ptr;
					}else this_thread -> input_buff_SBAM_used =0;

					this_thread -> input_buff_SBAM_ptr = 0;
					for(x2 = 0 ;  ; x2++){
						int record_len;
						NOSORT_BAM_next_u32(record_len);
						// A length outside [32, 500000] ends the run; -1 is the
						// end-of-input marker and is not reported.
						if(record_len < 32 || record_len > 500000){
							if(record_len!=-1)
								SUBREADprintf("Unexpected record length: %d.\n", record_len);
							pairer -> is_finished = 1;
							break;
						}

						// Store the record as <4-byte length><record bytes>.
						memcpy(this_thread -> input_buff_SBAM + this_thread -> input_buff_SBAM_used , &record_len, 4);
						this_thread -> input_buff_SBAM_used += 4;
						for(x3 =0; x3 < record_len; x3++){
							NOSORT_BAM_next_nch;
							this_thread -> input_buff_SBAM[this_thread -> input_buff_SBAM_used++] = nch;
						}
						this_thread -> reads_in_SBAM ++;
						// Stop only after an even number of records (the worker
						// consumes them two at a time) once the buffer holds at
						// least to_be_add - 20000 bytes.
						if(x2 % 2 == 1 && to_be_add <= this_thread -> input_buff_SBAM_used + 20000 )break;
					}
					need_sleep = 0;
					subread_lock_release(&this_thread -> SBAM_lock);
				}
			}
			if(need_sleep) usleep(nosort_tick_time);
		}
	}else{ // if input is SAM
		char * line_ptr = SBAM_buff;
		char * header_start = NULL;
		// File offset of the most recently read line; after the loop below it
		// is the offset of the first non-header line (or of EOF).
		// NOTE(review): ftello() returns off_t, which is narrowed to int here.
		int passed_read_SBAM_ptr = -1;
		unsigned int header_buffer_safe_size = 0;
		// Pass 1: add up the length of every line up to and including the
		// first non-header line; this sum later sizes header_bin.
		while(1){
			passed_read_SBAM_ptr = ftello(pairer -> input_fp);
			NOSORT_SAM_next_line;
			if(NOSORT_SAM_eof == NULL)break;

			header_buffer_safe_size += strlen(line_ptr);
			if(NULL== header_start && line_ptr[0] == '@') header_start = line_ptr;

			// NOTE(review): line_ptr always equals SBAM_buff and was already
			// dereferenced above, so this branch can only fire if the
			// SBAM_buff allocation failed -- and then strlen() ran first.
			if(NULL == line_ptr){
				SUBREADprintf("FATAL: the header is too large to the buffer.\n");
				break;
			}else{
				//SUBREADprintf("LINELEN=%d, PTR=%d, FIRST=%c\n", line_len, thread_context -> input_buff_SBAM_ptr , line_ptr[0]);
			}
			if(line_ptr[0]!='@'){
				break;
			}
		}

		// Pass 2: rewind and turn every @SQ line into a binary reference entry.
		fseeko(pairer -> input_fp, 0 , SEEK_SET);
		int header_bin_ptr = 0, header_contigs = 0;
		char * header_bin = malloc(header_buffer_safe_size);


		while(1){
			NOSORT_SAM_next_line;
			if(NOSORT_SAM_eof == NULL)break;
			if(line_ptr[0]!='@') break;
			if(memcmp(line_ptr, "@SQ\t",4)==0){
				unsigned int ct_len = 0, ctptr = 4, status = 0, sqname_len = 0;
				char * sqname = NULL;
				// Field scanner. status: 0 = at the start of a field,
				// 10 = inside the SN value, 20 = inside the LN value,
				// 30 = inside any other field. The scan stops at '\n'.
				while(1){
					char ctnch = line_ptr[ctptr++];
					if( status == 0){
						if(ctnch=='S' && line_ptr[ctptr] == 'N' && line_ptr[ctptr+1] == ':'){
							ctptr += 2;
							status = 10;
							sqname = line_ptr + ctptr;
						}else if(ctnch=='L' && line_ptr[ctptr] == 'N' && line_ptr[ctptr+1] == ':'){
							ctptr += 2;
							status = 20;
						}else	status = 30;
					}else if(status == 10 || status == 20 || status == 30){
						if(ctnch == '\t' || ctnch == '\n'){
							status = 0;
							if(ctnch == '\n') break;
							//break;
						}
						// status was reset to 0 on a separator, so separators
						// are neither counted nor parsed as digits.
						if(status == 10) sqname_len ++;
						else if(status == 20) ct_len = ct_len * 10 + ctnch - '0';
					}
				}


				// Entry layout: l_name (including the NUL), name, NUL, l_ref.
				sqname_len += 1;
				memcpy(header_bin + header_bin_ptr, &sqname_len, 4);
				header_bin_ptr += 4;
				memcpy(header_bin + header_bin_ptr, sqname, sqname_len-1);
				*(header_bin + header_bin_ptr + sqname_len - 1) = 0;
				char * mem_contig_name = malloc(sqname_len);
				strcpy(mem_contig_name , header_bin + header_bin_ptr);
		//		SUBREADprintf("CONTIG %d : %s (len=%d = %d)\n", header_contigs, header_bin + header_bin_ptr , sqname_len, strlen(mem_contig_name));
				// The table maps the contig name to its index + 1, stored as a
				// pointer value.
				HashTablePut(pairer -> sam_contig_number_table , mem_contig_name, NULL + 1 + header_contigs);
				header_bin_ptr += sqname_len;

				memcpy(header_bin + header_bin_ptr, &ct_len, 4);
				header_bin_ptr += 4;
				header_contigs++;
			}
		}

		pairer -> BAM_header_parsed = 1;
		int is_OK = pairer -> output_header(pairer, 0, 0, header_contigs , header_bin , header_bin_ptr);
		free(header_bin);

		// Despite its name, a non-zero is_OK means the header callback failed.
		if(is_OK){
			pairer -> is_incomplete_BAM = 1;
			return;
		}



		// Jump back to the first alignment line found in pass 1.
		fseeko(pairer -> input_fp, passed_read_SBAM_ptr, SEEK_SET);

		line_ptr = SBAM_buff;

		// Dispatch loop, same scheme as the BAM branch but with text lines
		// appended back to back (no length prefix).
		while(1){
			if(pairer -> is_finished) break;
			int need_sleep = 1;
			for(x1 = 0; x1 < pairer -> total_threads ; x1++){
				if(pairer -> is_finished) break;
				SAM_pairer_thread_t * this_thread = pairer -> threads + x1;
				if(this_thread -> input_buff_SBAM_used - this_thread -> input_buff_SBAM_ptr < NOSORT_REFILL_LOWBAR && (this_thread -> input_buff_SBAM_used == 0 || this_thread -> input_buff_SBAM_ptr > 0)){
					subread_lock_occupy(&this_thread -> SBAM_lock);
					int to_be_add = NOSORT_REFILL_HIGHBAR - (this_thread -> input_buff_SBAM_used - this_thread -> input_buff_SBAM_ptr);

					int x2;
					// Move the unread tail to the front of the buffer.
					if(this_thread -> input_buff_SBAM_ptr < this_thread -> input_buff_SBAM_used){
						for(x2 = 0; x2 < this_thread -> input_buff_SBAM_used - this_thread -> input_buff_SBAM_ptr; x2++)
							this_thread -> input_buff_SBAM[x2] = this_thread -> input_buff_SBAM[x2 + this_thread -> input_buff_SBAM_ptr];
						this_thread -> input_buff_SBAM_used -= this_thread -> input_buff_SBAM_ptr;
					}else this_thread -> input_buff_SBAM_used =0;

					this_thread -> input_buff_SBAM_ptr = 0;
					for(x2 = 0 ; ; x2++){
						int record_len;
						NOSORT_SAM_next_line;

						// End of file (or an empty string) ends the run.
						if(NULL==NOSORT_SAM_eof || line_ptr[0]==0){
							pairer -> is_finished = 1;
							break;
						}

						record_len = strlen(line_ptr);
					//	SUBREADprintf("1CHR=%c, ECHR=%d , RL=%d, RINS=%d, USED=%d, SIZE=%d\n", line_ptr[0], line_ptr[record_len - 1], record_len, this_thread -> reads_in_SBAM, this_thread -> input_buff_SBAM_used, pairer -> input_buff_SBAM_size);
						memcpy(this_thread -> input_buff_SBAM + this_thread -> input_buff_SBAM_used , line_ptr, record_len);
						this_thread -> input_buff_SBAM_used += record_len;
						this_thread -> reads_in_SBAM ++;
						// Stop only after an even number of lines once the buffer
						// holds at least to_be_add - 20000 bytes.
						if(x2 % 2 == 1 && to_be_add <= this_thread -> input_buff_SBAM_used + 20000 )break;
					}
					need_sleep = 0;
					subread_lock_release(&this_thread -> SBAM_lock);
				}
			}
			if(need_sleep) usleep(nosort_tick_time);
		}
	}

	free(SBAM_buff);
	free(BIN_buff);


	// Workers leave their loop once is_finished is set and they run dry.
	for(x1 = 0; x1 < pairer -> total_threads ; x1++){
		pthread_join(pairer -> threads[x1].thread_stab, NULL);
	}
}
5549 
// Appends the byte held in `nch` to `bin_buffer` at index `binptr`.
// Expands in a scope that provides `bin_buffer`, `binptr`, `bin_buff_capacity`
// and `nch`. When fewer than 10 bytes of headroom remain, the capacity grows
// by a factor of 1.4 and the buffer is reallocated first.
// NOTE(review): the realloc() result is assigned straight back to bin_buffer
// without a NULL check.
#define BINADD_NCHAR {			if(binptr >= bin_buff_capacity - 10){\
					bin_buff_capacity = bin_buff_capacity * 14 / 10;\
					bin_buffer = realloc(bin_buffer, bin_buff_capacity);\
				} bin_buffer[binptr++] = nch;}
5554 
5555 
5556 
5557 // only one thread; very large buffer size.
/*
 * Long-read pass over a BAM input: rewinds pairer->input_fp, reports the
 * header text and the reference table through pairer->output_header, then
 * feeds every alignment record, one at a time and with a NULL mate, to
 * pairer->output_function(pairer, 0, record, NULL).
 *
 * Bytes are pulled with FIX_GET_NEXT_NCH and collected with BINADD_NCHAR.
 * FIX_GET_NEXT_NCH is defined elsewhere in this file; it presumably works on
 * the locals old_fp, in_bin, in_bin_ptr, in_bin_size, in_strm and nch, so
 * those names must stay as they are (confirm against the macro).
 *
 * Returns 0 after the last record, or -1 when the input ends in the middle of
 * a structure or a resource cannot be set up. The inflate stream and the
 * heap buffers are released on every path.
 */
int SAM_pairer_long_cigar_run(SAM_pairer_context_t * pairer){
	char *bin_buffer, *bam_buffer;
	FILE * old_fp = pairer -> input_fp;
	int bin_buff_capacity = 1000000, block_size = 0;
	int run_status = 0;
	int disk_is_full = 0;
	int in_bin_ptr = 0;
	int out_bin_ptr = 0;
	int in_bin_size = 0;
	int content_count = 0;
	int content_size = 0;
	int is_finished = 0;
	int x1, nch = 0, binptr = 0;
	int reads = 0;
	char * in_bin = malloc(140000);
	bin_buffer = malloc(bin_buff_capacity);
	bam_buffer = malloc(70000);

	z_stream in_strm;
	in_strm.zalloc = Z_NULL;
	in_strm.zfree = Z_NULL;
	in_strm.opaque = Z_NULL;
	in_strm.avail_in = 0;
	in_strm.next_in = Z_NULL;

	if(inflateInit2(&in_strm, PAIRER_GZIP_WINDOW_BITS) != Z_OK){
		// Nothing to pass to inflateEnd(); only the buffers need releasing.
		free(bam_buffer);
		free(bin_buffer);
		free(in_bin);
		return -1;
	}

	if(!in_bin || !bin_buffer || !bam_buffer){
		run_status = -1;
		goto release_resources;
	}

	fseeko(old_fp, 0, SEEK_SET);

	// Kept from the original: silences "set but unused" warnings from GCC 7.
	disk_is_full = disk_is_full?out_bin_ptr:out_bin_ptr;

	// ====== The four signature bytes; read and discarded.
	for(x1 = 0; x1 < 4; x1++){
		FIX_GET_NEXT_NCH;
		if(nch < 0){ run_status = -1; goto release_resources; }
	}

	// ====== The header texts: 4-byte little-endian length, then the text.
	content_size = 0;
	binptr = 0;
	for(x1 = 0; x1 < 4; x1++){
		FIX_GET_NEXT_NCH;
		if(nch < 0){ run_status = -1; goto release_resources; }
		content_size += (nch << (8 * x1));
	}
	for(content_count = 0; content_count < content_size; content_count++){
		FIX_GET_NEXT_NCH;
		// Test for end of input before storing, as every other loop here does.
		if(nch < 0){ run_status = -1; goto release_resources; }
		BINADD_NCHAR;
	}

	pairer -> output_header (pairer , 0, 1, binptr, bin_buffer, binptr);

	// ====== The chromosome table: a count, then per entry the name length,
	// the name bytes and four more bytes (block_size + 4 in total).
	binptr = 0;
	content_size = 0;
	for(x1 = 0; x1 < 4; x1++){
		FIX_GET_NEXT_NCH;
		if(nch < 0){ run_status = -1; goto release_resources; }
		content_size += (nch << (8 * x1));
	}

	for(content_count = 0; content_count < content_size; content_count++){
		block_size = 0;
		for(x1 = 0; x1 < 4; x1++){
			FIX_GET_NEXT_NCH;
			if(nch < 0){ run_status = -1; goto release_resources; }
			BINADD_NCHAR;
			block_size += (nch << (8 * x1));
		}

		for(x1 = 0; x1 < block_size + 4; x1++){
			FIX_GET_NEXT_NCH;
			if(nch < 0){ run_status = -1; goto release_resources; }
			BINADD_NCHAR;
		}
	}
	pairer -> output_header (pairer , 0, 0, content_size, bin_buffer, binptr);

	// ====== The reads: each is a 4-byte length followed by that many bytes.
	while(1){
		binptr = 0;
		block_size = 0;
		for(x1 = 0; x1 < 4; x1++){
			FIX_GET_NEXT_NCH;
			// End of input exactly on a record boundary is the normal end.
			if(x1 == 0 && nch < 0){
				is_finished=1;
				break;
			}
			if(nch < 0){ run_status = -1; goto release_resources; }

			BINADD_NCHAR;
			block_size += (nch << (8 * x1));
		}
		if(is_finished)break;

		for(x1 = 0; x1 < block_size; x1 ++){
			FIX_GET_NEXT_NCH;
			if(nch < 0){ run_status = -1; goto release_resources; }
			BINADD_NCHAR;
		}

		pairer -> output_function(pairer, 0, bin_buffer, NULL);
		reads++;
	}

release_resources:
	// Single exit point: pair inflateInit2() with inflateEnd() and free the
	// buffers (bin_buffer may have been moved by BINADD_NCHAR's realloc).
	inflateEnd(&in_strm);
	free(bam_buffer);
	free(bin_buffer);
	free(in_bin);

	return run_status;
}
5671 
/*
 * Enlarges the per-thread input buffers: the SBAM buffer size is multiplied
 * by five, the BIN buffer size becomes the larger of 1 MiB and the new SBAM
 * size, and both buffers of every thread are reallocated to the new sizes.
 */
void pairer_increase_SAMBAM_buffer(SAM_pairer_context_t * pairer){
	int thread_i = 0;

	pairer -> input_buff_SBAM_size *= 5;
	pairer -> input_buff_BIN_size = max(1024*1024, pairer -> input_buff_SBAM_size );

	while(thread_i < pairer -> total_threads){
		pairer -> threads[thread_i].input_buff_SBAM = realloc( pairer -> threads[thread_i].input_buff_SBAM, pairer -> input_buff_SBAM_size);
		pairer -> threads[thread_i].input_buff_BIN = realloc(  pairer -> threads[thread_i].input_buff_BIN, pairer -> input_buff_BIN_size);
		thread_i++;
	}
}
5682 
/*
 * Top-level driver of the pairer.
 *
 * With force_do_not_sort set, a single no-sort run is performed. Otherwise
 * SAM_pairer_run_once() is tried; if it flags a bad format on a BAM input
 * (and no internal error or incomplete BAM), the temporary files are removed,
 * SAM_pairer_fix_format() is applied, the pairer is reset with larger buffers
 * and one more run is made -- or, when long-CIGAR mode was switched on, the
 * result of SAM_pairer_long_cigar_run() is returned directly.
 *
 * Returns -1 when the format repair fails; otherwise non-zero iff any of
 * is_bad_format, is_internal_error or is_incomplete_BAM is set.
 */
int SAM_pairer_run( SAM_pairer_context_t * pairer){
	if(pairer -> force_do_not_sort){
		SAM_nosort_run_once(pairer);
	}else{
		int pass_no = 0;
		while(pass_no < 2){
			pairer -> is_final_run = pass_no;
			SAM_pairer_run_once(pairer);

			int format_needs_fixing = pairer -> is_bad_format && pairer->input_is_BAM && ( ! pairer -> is_internal_error )  && ( ! pairer -> is_incomplete_BAM );
			if(!format_needs_fixing)
				break;

			// A repair is only ever expected after the first pass.
			assert(1 != pass_no);
			delete_with_prefix(pairer -> tmp_file_prefix);
			pairer -> is_internal_error |= SAM_pairer_fix_format(pairer);

			if(pairer -> is_bad_format || pairer -> is_internal_error)
				return -1;

			SAM_pairer_reset(pairer);
			if(pairer -> reset_output_function)
				pairer -> reset_output_function(pairer);
			pairer_increase_SAMBAM_buffer(pairer);

			if(pairer -> long_cigar_mode)
				return SAM_pairer_long_cigar_run(pairer);

			pass_no++;
		}
	}

	return pairer -> is_bad_format || pairer -> is_internal_error || pairer -> is_incomplete_BAM;
}
5710 
/*
 * Initialises a SAM sort writer.
 *
 * Zeroes *writer, installs SAM_SORT_SIGINT_hook for SIGTERM and SIGINT
 * (saving the previous handlers), builds the temporary-file prefix
 * "<dir>/temp-sort-<pid>-<random>-" and opens output_file for writing.
 * <dir> is tmp_path when given, otherwise the directory part of output_file,
 * otherwise "./". A "headers.txt" probe file is created and removed again to
 * verify that the temporary location is writable.
 *
 * Returns 0 on success; -1 when a path does not fit its buffer, when the
 * probe file cannot be created, or when output_file cannot be opened.
 */
int sort_SAM_create(SAM_sort_writer * writer, char * output_file, char * tmp_path)
{
	char tmp_fname[MAX_FILE_NAME_LENGTH+40], mac_rand[13];
	int written;
	memset(writer, 0, sizeof(SAM_sort_writer));

	old_sig_TERM = signal (SIGTERM, SAM_SORT_SIGINT_hook);
	old_sig_INT = signal (SIGINT, SAM_SORT_SIGINT_hook);

	mac_or_rand_str(mac_rand);

	// writer->tmp_path is written to right after the struct was zeroed, so it
	// is an array member and sizeof yields its capacity. All path formatting
	// is bounded so that an over-long path is rejected instead of overflowing.
	if(tmp_path == NULL){
		char * last_slash = strrchr(output_file, '/');
		if(last_slash){
			// keep the directory part including the trailing '/'
			int dir_len = (int)(last_slash - output_file) + 1;
			written = snprintf(writer -> tmp_path, sizeof(writer -> tmp_path), "%.*stemp-sort-%06u-%s-", dir_len, output_file, (unsigned int)getpid(), mac_rand);
		}else written = snprintf(writer -> tmp_path, sizeof(writer -> tmp_path), "./temp-sort-%06u-%s-", (unsigned int)getpid(), mac_rand);

	}else written = snprintf(writer -> tmp_path, sizeof(writer -> tmp_path), "%s/temp-sort-%06u-%s-", tmp_path, (unsigned int)getpid(), mac_rand);

	if(written < 0 || (size_t)written >= sizeof(writer -> tmp_path)) return -1;

	_SAMSORT_SNP_delete_temp_prefix = writer -> tmp_path;

	written = snprintf(tmp_fname, sizeof(tmp_fname), "%s%s", writer -> tmp_path, "headers.txt");
	if(written < 0 || (size_t)written >= sizeof(tmp_fname)) return -1;

	// Probe: make sure a file can be created under the temporary prefix.
	writer -> all_chunks_header_fp = f_subr_open(tmp_fname,"w");
	if(!writer -> all_chunks_header_fp) return -1;
	fclose(writer -> all_chunks_header_fp);
	unlink(tmp_fname);

	writer -> out_fp = f_subr_open(output_file,"w");
	if(!writer -> out_fp) return -1;

	return 0;
}
5748 
/*
 * Copies an integer SAM tag from a text line into a ready-to-append string.
 *
 * read_line_buf : NUL-terminated SAM line (or tail of one) to search.
 * tag           : tag name without decoration, e.g. "HI" or "NH".
 * hi_tag_out    : receives "\t<tag>:i:<value>" when "\t<tag>:i:" occurs in
 *                 the line, where <value> is the run of decimal digits that
 *                 follows (0 if there is none); receives "" when the tag is
 *                 absent. The caller must provide room for the tag, the
 *                 five decoration characters, up to ten digits and the NUL.
 */
void find_tag_out(char * read_line_buf, char * tag, char * hi_tag_out)
{
	char tag_str[16];
	int hi_tag = -1;
	// Bounded formatting: a tag too long for tag_str is treated as absent
	// instead of overflowing the buffer.
	int prefix_len = snprintf(tag_str, sizeof(tag_str), "\t%s:i:", tag);

	if(prefix_len > 0 && (size_t)prefix_len < sizeof(tag_str))
	{
		char * hi_tag_str = strstr(read_line_buf, tag_str);
		if(hi_tag_str)
		{
			int line_cursor;
			hi_tag = 0;
			// Digits start right after the prefix, whatever the tag length.
			for(line_cursor = prefix_len; ; line_cursor++)
			{
				char nch = hi_tag_str[line_cursor];
				// <ctype.h> functions need an unsigned char value.
				if(!isdigit((unsigned char)nch)) break;
				hi_tag = hi_tag*10 + (nch-'0');
			}
		}
	}

	if(hi_tag >=0)
	{
		sprintf(hi_tag_out,"\t%s:i:%d", tag, hi_tag);
	}else hi_tag_out[0] = 0;
}
5777 
sort_SAM_finalise(SAM_sort_writer * writer)5778 int sort_SAM_finalise(SAM_sort_writer * writer)
5779 {
5780 	int x1_chunk, x1_block, is_disk_full = 0;
5781 	int xk1;
5782 	for(xk1=0;xk1<SAM_SORT_BLOCKS;xk1++)
5783 	{
5784 		if(writer -> current_block_fp_array[xk1])
5785 			fclose(writer -> current_block_fp_array[xk1]);
5786 	}
5787 	memset(writer -> current_block_fp_array, 0, sizeof(FILE *)*SAM_SORT_BLOCKS);
5788 	writer -> current_chunk_size = 0;
5789 	writer -> current_chunk++;
5790 
5791 	for(x1_block = 0; x1_block <SAM_SORT_BLOCKS; x1_block++){
5792 		HashTable * first_read_name_table;
5793 		first_read_name_table = HashTableCreate(SAM_SORT_BLOCK_SIZE / 100 );
5794 		HashTableSetKeyComparisonFunction(first_read_name_table , fc_strcmp_chro);
5795 		HashTableSetDeallocationFunctions(first_read_name_table , free, free);
5796 		HashTableSetHashFunction(first_read_name_table, HashTableStringHashFunction);
5797 
5798 		for(x1_chunk = 0; x1_chunk < writer -> current_chunk; x1_chunk++)
5799 		{
5800 			char tmpfname[MAX_FILE_NAME_LENGTH+40];
5801 			sprintf(tmpfname, "%sCHK%08d-BLK%03d.bin", writer -> tmp_path, x1_chunk , x1_block);
5802 
5803 			FILE * bbfp = f_subr_open(tmpfname,"rb");
5804 			if(!bbfp) continue;
5805 
5806 			while(!feof(bbfp))
5807 			{
5808 				char * read_name = NULL;
5809 				short flags;
5810 				short read_name_len;
5811 				short read_len;
5812 				int ret = fread(&flags, 2,1 , bbfp);
5813 				if(ret<1) break;
5814 				ret = fread(&read_name_len, 2,1 , bbfp);
5815 				if(ret<1) break;
5816 
5817 				if(flags & SAM_FLAG_SECOND_READ_IN_PAIR)
5818 					fseeko(bbfp, read_name_len, SEEK_CUR);
5819 				else
5820 				{
5821 					read_name = malloc(read_name_len+1);
5822 					ret = fread(read_name, 1, read_name_len, bbfp);
5823 					if(ret< read_name_len) break;
5824 					read_name[read_name_len] = 0;
5825 				}
5826 				ret =fread(&read_len,2,1,bbfp);
5827 				if(ret<1) break;
5828 
5829 				if(flags & SAM_FLAG_SECOND_READ_IN_PAIR)
5830 					fseeko(bbfp, read_len, SEEK_CUR);
5831 				else
5832 				{
5833 					char * new_line_mem = malloc(read_len+1);
5834 					ret = fread(new_line_mem, 1, read_len, bbfp);
5835 					if(ret<read_len) break;
5836 
5837 					new_line_mem[read_len] = 0;
5838 
5839 					if(read_len<2)
5840 					{
5841 						SUBREADprintf("Cannot determain read length from the tmp file.\n");
5842 						assert(0);
5843 					}
5844 
5845 
5846 					if( new_line_mem[0]==0 || new_line_mem[1]==0)
5847 					{
5848 						SUBREADprintf("Cannot load read part from the tmp file.\n");
5849 						assert(0);
5850 					}
5851 
5852 
5853 					char * old_line_mem = HashTableGet(first_read_name_table, read_name);
5854 					if(old_line_mem)
5855 						old_line_mem[0]=0xff;
5856 					else
5857 						HashTablePut(first_read_name_table, read_name, new_line_mem);
5858 					//if( first_read_name_table -> numOfElements<4)printf("RV=%s\n", read_name);
5859 				}
5860 			}
5861 
5862 			fclose(bbfp);
5863 		}
5864 
5865 		//printf("BLK=%d; CKS=%d; READS=%llu\n", x1_block, x1_chunk, first_read_name_table -> numOfElements);
5866 		srInt_64 finished_second_reads = 0;
5867 
5868 		for(x1_chunk = 0; x1_chunk < writer -> current_chunk; x1_chunk++)
5869 		{
5870 			char tmpfname[MAX_FILE_NAME_LENGTH+40];
5871 			sprintf(tmpfname, "%sCHK%08d-BLK%03d.bin", writer -> tmp_path, x1_chunk , x1_block);
5872 
5873 	//		printf("START_BLOCK: %s\n", tmpfname);
5874 
5875 			FILE * bbfp = f_subr_open(tmpfname,"rb");
5876 			if(!bbfp) continue;
5877 
5878 			char * read_line_buf = malloc(3000);
5879 			char * read_name_buf = malloc(MAX_READ_NAME_LEN + MAX_CHROMOSOME_NAME_LEN * 2 + 26);
5880 
5881 			while(!feof(bbfp))
5882 			{
5883 				short flags;
5884 				short read_name_len;
5885 				short read_len;
5886 				int ret = fread(&flags, 2,1 , bbfp);
5887 				if(ret<1) break;
5888 
5889 				ret = fread(&read_name_len, 2,1 , bbfp);
5890 				if(ret < 1) break;
5891 
5892 				if(read_name_len>=MAX_READ_NAME_LEN + MAX_CHROMOSOME_NAME_LEN * 2 + 26)
5893 					SUBREADprintf("VERY_LONG_NAME(%d)\n", read_name_len);
5894 				if(flags & SAM_FLAG_SECOND_READ_IN_PAIR)
5895 				{
5896 					ret = fread(read_name_buf, 1, read_name_len, bbfp);
5897 					if(ret < read_name_len) break;
5898 
5899 					read_name_buf[read_name_len] = 0;
5900 				} else fseeko(bbfp, read_name_len, SEEK_CUR);
5901 				ret = fread(&read_len, 2,1 , bbfp);
5902 				if(ret < 1) break;
5903 
5904 				if(flags & SAM_FLAG_SECOND_READ_IN_PAIR)
5905 				{
5906 					ret = fread(read_line_buf, 1, read_len, bbfp);
5907 					if(ret < 1) break;
5908 					read_line_buf[read_len] = 0;
5909 				}
5910 				else	fseeko(bbfp, read_len, SEEK_CUR);
5911 
5912 
5913 				if(flags & SAM_FLAG_SECOND_READ_IN_PAIR)
5914 				{
5915 //					printf("RRNAME:%s\n", read_name_buf);
5916 
5917 					char * first_read_text = HashTableGet(first_read_name_table, read_name_buf);
5918 					strtok(read_name_buf,"\t");
5919 					if(first_read_text && first_read_text[0]!=(char)0xff)
5920 					{
5921 						fputs(read_name_buf, writer->out_fp);
5922 						putc('\t',  writer->out_fp);
5923 						fputs(first_read_text, writer->out_fp);
5924 
5925 						fputs(read_name_buf, writer->out_fp);
5926 						putc('\t',  writer->out_fp);
5927 						int write_len = fputs(read_line_buf, writer->out_fp);
5928 						if(write_len < 0) is_disk_full = 1;
5929 
5930 						read_name_buf[strlen(read_name_buf)]='\t';
5931 						HashTableRemove(first_read_name_table, read_name_buf);
5932 						finished_second_reads ++;
5933 					}
5934 					else{
5935 
5936 						int dummy_flags = 4 | 1, mate_flags = 0;
5937 						char * dummy_mate_chr = NULL;
5938 						char dummy_mate_chr_buf[120];
5939 						unsigned int dummy_old_read_pos = 0, tmpi=0,dummy_char_strpos = 0;
5940 						int tabs = 0;
5941 						int read_cursor = 0;
5942 
5943 						for(read_cursor = 0;; read_cursor++)
5944 						{
5945 							char nch = read_line_buf[read_cursor];
5946 							if(!nch) break;
5947 							if(nch == '\t')
5948 							{
5949 								if(tabs == 0){
5950 									mate_flags = tmpi;
5951 									dummy_mate_chr = read_line_buf+read_cursor+1;
5952 								}
5953 								else if(tabs == 1)
5954 									dummy_char_strpos = read_cursor;
5955 								else if(tabs == 2)
5956 								{
5957 									dummy_old_read_pos = tmpi;
5958 									break;
5959 								}
5960 								tmpi=0;
5961 								tabs++;
5962 							}else{
5963 								if(tabs==0 || tabs == 2) tmpi = tmpi * 10 + (nch - '0');
5964 							}
5965 						}
5966 
5967 
5968 						dummy_flags |= SAM_FLAG_FIRST_READ_IN_PAIR;
5969 						if(mate_flags & SAM_FLAG_UNMAPPED)  dummy_flags |= SAM_FLAG_MATE_UNMATCHED;
5970 						if(mate_flags & SAM_FLAG_REVERSE_STRAND_MATCHED)  dummy_flags |= SAM_FLAG_MATE_REVERSE_STRAND_MATCHED;
5971 						if(mate_flags & SAM_FLAG_MATE_REVERSE_STRAND_MATCHED)  dummy_flags |= SAM_FLAG_REVERSE_STRAND_MATCHED;
5972 
5973 						memcpy(dummy_mate_chr_buf, dummy_mate_chr, read_line_buf +dummy_char_strpos - dummy_mate_chr);
5974 						dummy_mate_chr_buf[read_line_buf +dummy_char_strpos - dummy_mate_chr]=0;
5975 
5976 						char hi_tag_out[18];
5977 						char nh_tag_out[18];
5978 
5979 						find_tag_out(read_line_buf, "HI", hi_tag_out);
5980 						find_tag_out(read_line_buf, "NH", nh_tag_out);
5981 
5982 						// build a fake FIRST read for the mapped SECOND read.
5983 						// note that the TLEN, MATE_POS and MATE_CHAR are incorrect for general use.
5984 						fprintf(writer->out_fp, "%s\t%d\t*\t0\t0\t*\t%s\t%d\t0\tN\tI%s%s\n", read_name_buf, dummy_flags, dummy_mate_chr_buf, dummy_old_read_pos, nh_tag_out, hi_tag_out);
5985 						fputs(read_name_buf, writer->out_fp);
5986 						putc('\t',  writer->out_fp);
5987 						int write_len = fputs(read_line_buf, writer->out_fp);
5988 						if(write_len < 0) is_disk_full = 1;
5989 						writer -> unpaired_reads +=1;
5990 					}
5991 
5992 					//else SUBREADprintf("WARNING: Unpaired read found in file:%s\n", read_name_buf);
5993 				}
5994 			}
5995 
5996 			fclose(bbfp);
5997 			unlink(tmpfname);
5998 			free(read_name_buf);
5999 			free(read_line_buf);
6000 		}
6001 
6002 
6003 
6004 		if(1)
6005 		{
6006 			writer -> unpaired_reads += first_read_name_table -> numOfElements;
6007 
6008 			KeyValuePair * cursor;
6009 			int bucket;
6010 
6011 			// go through the hash table and write correct FIRST lines and dummy SECOND lines.
6012 			for(bucket=0; bucket< first_read_name_table -> numOfBuckets; bucket++)
6013 			{
6014 				cursor = first_read_name_table -> bucketArray[bucket];
6015 				while(1)
6016 				{
6017 					if (!cursor) break;
6018 					char * first_read_text = (char *)cursor -> value;
6019 					char * first_read_name = (char *)cursor -> key;
6020 
6021 					if(first_read_text[0]!=(char)0xff)
6022 					{
6023 						int dummy_flags = 4 | 1, mate_flags = 0;
6024 						char * dummy_mate_chr = NULL;
6025 						unsigned int dummy_old_read_pos = 0, tmpi=0, dummy_char_strpos = 0;
6026 						int tabs = 0;
6027 						int read_cursor = 0;
6028 
6029 						for(read_cursor = 0;; read_cursor++)
6030 						{
6031 							char nch = first_read_text[read_cursor];
6032 							if(!nch) break;
6033 							if(nch == '\t')
6034 							{
6035 								if(tabs == 0){
6036 									mate_flags = tmpi;
6037 									dummy_mate_chr = first_read_text+read_cursor+1;
6038 								}
6039 								else if(tabs == 1)
6040 									dummy_char_strpos = read_cursor;
6041 								else if(tabs == 2)
6042 								{
6043 									dummy_old_read_pos = tmpi;
6044 									break;
6045 								}
6046 								tmpi=0;
6047 								tabs++;
6048 							}else{
6049 								if(tabs==0 || tabs == 2) tmpi = tmpi * 10 + (nch - '0');
6050 							}
6051 						}
6052 
6053 						dummy_flags |= SAM_FLAG_SECOND_READ_IN_PAIR;
6054 						if(mate_flags & SAM_FLAG_UNMAPPED)  dummy_flags |= SAM_FLAG_MATE_UNMATCHED;
6055 						if(mate_flags & SAM_FLAG_REVERSE_STRAND_MATCHED)  dummy_flags |= SAM_FLAG_MATE_REVERSE_STRAND_MATCHED;
6056 						if(mate_flags & SAM_FLAG_MATE_REVERSE_STRAND_MATCHED)  dummy_flags |= SAM_FLAG_REVERSE_STRAND_MATCHED;
6057 
6058 						if((!first_read_text[0])||(!first_read_text[1]))
6059 						{
6060 							SUBREADprintf("unable to recover the first read : '%s' , flags = %d\n", first_read_name, mate_flags);
6061 							assert(0);
6062 						}
6063 
6064 						char nh_tag_out[18];
6065 						char hi_tag_out[18];
6066 						find_tag_out(first_read_text, "NH", nh_tag_out);
6067 						find_tag_out(first_read_text, "HI", hi_tag_out);
6068 
6069 						strtok(first_read_name, "\t");
6070 						fputs(first_read_name, writer->out_fp);
6071 						putc('\t',  writer->out_fp);
6072 						fputs(first_read_text, writer->out_fp);
6073 						first_read_text[dummy_char_strpos] = 0;
6074 						fprintf(writer->out_fp, "%s\t%d\t*\t0\t0\t*\t%s\t%d\t0\tN\tI%s%s\n", first_read_name, dummy_flags, dummy_mate_chr, dummy_old_read_pos, nh_tag_out,hi_tag_out);
6075 					}
6076 					cursor = cursor->next;
6077 				}
6078 			}
6079 
6080 
6081 		}
6082 
6083 		HashTableDestroy(first_read_name_table);
6084 	}
6085 	fclose(writer -> out_fp);
6086 	signal (SIGTERM, old_sig_TERM);
6087 	signal (SIGINT, old_sig_INT);
6088 	return is_disk_full;
6089 }
6090 
// Starts a new chunk once the current one has grown beyond
// SAM_SORT_BLOCK_SIZE * SAM_SORT_BLOCKS: every open block file of the
// current chunk is closed, the table of block file handles is cleared, the
// chunk size counter is reset and the chunk number is advanced.
// Nothing happens while the chunk is still within that limit.
void sort_SAM_check_chunk(SAM_sort_writer * writer)
{
	int block_no = 0;

	if(!(writer -> current_chunk_size > SAM_SORT_BLOCK_SIZE * SAM_SORT_BLOCKS))
		return;

	while(block_no < SAM_SORT_BLOCKS)
	{
		FILE * block_fp = writer -> current_block_fp_array[block_no];
		if(block_fp)
			fclose(block_fp);
		block_no++;
	}

	memset(writer -> current_block_fp_array, 0, sizeof(FILE *)*SAM_SORT_BLOCKS);
	writer -> current_chunk_size = 0;
	writer -> current_chunk++;
}
6106 
6107 // the SAM_line includes "\n" at the tail!
6108 // line_len = strlen(SAM_line)
// Adds one line of SAM text to the sort writer.
//
// A header line (first character '@') is copied directly to writer->out_fp.
// For an alignment line, columns 1 (read name), 2 (flags), 3 and 4
// (chromosome and position of the read), 7 and 8 (chromosome and position
// of the mate) and the optional "HI:i:" tag are parsed, and a key is built:
//     NAME \t CHR_R1:POS_R1:CHR_R2:POS_R2[:HI]
// The read's own location comes first unless SAM_FLAG_SECOND_READ_IN_PAIR is
// set, in which case the mate's location comes first. Everything from the
// first '/' in the read name is dropped; an unmapped read or mate
// contributes "*" and position 0.
// The record is appended to the temporary file of block
// sort_SAM_hash(key) % SAM_SORT_BLOCKS of the current chunk as:
//     2 bytes of flags, 2 bytes of key length, the key,
//     2 bytes of text length, the text from the flags column to the end.
//
// Returns 0 on success (a line with line_len < 3 is ignored and also gives
// 0), -1 when the line cannot be parsed (read name or chromosome name too
// long, or fewer than eight tabs), and -2 when the header line or the record
// cannot be written or the block file cannot be opened.
int sort_SAM_add_line(SAM_sort_writer * writer, char * SAM_line, int line_len)
{
	int is_disk_full = 0;
	assert(writer -> all_chunks_header_fp);
	if(line_len<3) return 0;
	if(SAM_line[0]=='@'){
		int wlen = fputs(SAM_line, writer -> out_fp);
		if(wlen < 0){
			return -2;
		}
	}
	else
	{
		char read_name[MAX_READ_NAME_LEN + MAX_CHROMOSOME_NAME_LEN * 2 + 26];
		char chromosome_1_name[MAX_CHROMOSOME_NAME_LEN];
		char chromosome_2_name[MAX_CHROMOSOME_NAME_LEN];
		unsigned int pos_1, pos_2;
		int hi_tag,flags = 0, line_cursor = 0, field_cursor = 0, tabs=0;
		char * second_col_pos = NULL;

		// The name must be a valid (empty) string even when the line starts
		// with a tab; otherwise the scans below would read indeterminate bytes.
		read_name[0]=0;
		chromosome_1_name[0]=0;
		chromosome_2_name[0]=0;
		pos_1 = 0;
		pos_2 = 0;
		hi_tag = -1;

		while(line_cursor < line_len)
		{
			char nch = SAM_line[line_cursor++];
			if(!nch)break;

			if(nch == '\t')
			{
				field_cursor = 0;
				tabs++;
				// The stored text starts right after the read name column.
				if(tabs == 1) second_col_pos = SAM_line + line_cursor;
				if(tabs>7) break;
			}
			else if(tabs == 0)
			{
				read_name[field_cursor++] = nch;
				if(MAX_READ_NAME_LEN<=field_cursor){
					return -1;
				}
				read_name[field_cursor] = 0;
			}
			else if(tabs == 1)
				flags = flags*10+(nch-'0');
			else if(tabs == 2)
			{
				chromosome_1_name[field_cursor++] = nch;
				chromosome_1_name[field_cursor]=0;
				if(MAX_CHROMOSOME_NAME_LEN - 1 <= field_cursor) return -1;
			}
			else if(tabs == 3)
				pos_1 = pos_1 * 10 + (nch-'0');
			else if(tabs == 6)
			{
				chromosome_2_name[field_cursor++] = nch;
				chromosome_2_name[field_cursor] = 0;
				if(MAX_CHROMOSOME_NAME_LEN - 1 <= field_cursor) return -1;
			}
			else if(tabs == 7)
				pos_2 = pos_2 * 10 + (nch-'0');

		}
		if(tabs <= 7) return -1;

		char * hi_tag_str = strstr(SAM_line,"\tHI:i:");
		if(hi_tag_str)
		{
			hi_tag = 0;
			// Digits start right after the 6-character "\tHI:i:" prefix.
			for(line_cursor=6; ; line_cursor++)
			{
				char nch = hi_tag_str[line_cursor];
				if(!isdigit((unsigned char)nch)) break;
				hi_tag = hi_tag*10 + (nch-'0');
			}
		}

		line_len = strlen(second_col_pos);
		sort_SAM_check_chunk(writer);

		// Cut the read name at the first '/'.
		for(field_cursor = 0; read_name[field_cursor] ; field_cursor++)
			if(read_name[field_cursor] == '/') read_name[field_cursor] = 0;

		if(chromosome_2_name[0]=='=')
			strcpy(chromosome_2_name, chromosome_1_name);


		// new read name format: OLD_READ_NAME\tCHR_R1:POS_R1:CHR_R2:POS_R2


		if(flags & SAM_FLAG_MATE_UNMATCHED)
		{
			if(chromosome_2_name[0] != '*')
				strcpy(chromosome_2_name , "*");
			pos_2 = 0;
		}


		if(flags & SAM_FLAG_UNMAPPED)
		{
			if(chromosome_1_name[0] != '*')
				strcpy(chromosome_1_name , "*");
			pos_1 = 0;
		}

		char hi_key [13];
		if(hi_tag >=0)
			snprintf(hi_key, sizeof(hi_key), ":%d", hi_tag);
		else
			hi_key[0]=0;

		// The suffix is bounded by the space left in read_name: with the
		// longest accepted names and ten-digit positions it would otherwise
		// run a few bytes past the end of the buffer.
		size_t plain_name_len = strlen(read_name);
		if(flags & SAM_FLAG_SECOND_READ_IN_PAIR)
			snprintf(read_name+plain_name_len, sizeof(read_name)-plain_name_len, "\t%s:%u:%s:%u%s",chromosome_2_name, pos_2, chromosome_1_name, pos_1, hi_key);
		else
			snprintf(read_name+plain_name_len, sizeof(read_name)-plain_name_len, "\t%s:%u:%s:%u%s",chromosome_1_name, pos_1, chromosome_2_name, pos_2, hi_key);

		int read_name_len = strlen(read_name);
		srUInt_64 read_line_hash = sort_SAM_hash(read_name);

		int block_id = read_line_hash % SAM_SORT_BLOCKS;
		if(!writer -> current_block_fp_array[block_id])
		{
			char tmpfname[MAX_FILE_NAME_LENGTH+40];
			snprintf(tmpfname, sizeof(tmpfname), "%sCHK%08d-BLK%03d.bin", writer -> tmp_path , writer -> current_chunk , block_id);
			writer -> current_block_fp_array[block_id] = f_subr_open(tmpfname, "wb");
			if(!writer -> current_block_fp_array[block_id])
			{
				// Without this check the fwrite() calls below would be
				// handed a NULL stream.
				SUBREADprintf("ERROR: unable to create the temporary file '%s'.\n", tmpfname);
				return -2;
			}
		}

		if(line_len < 2)
		{
			SUBREADprintf("unable to put the first read.\n");
			assert(0);
		}

		if(second_col_pos[0]==0 || second_col_pos[1]==0)
		{
			SUBREADprintf("unable to put the first read TEXT.\n");
			assert(0);
		}

		FILE * block_fp = writer -> current_block_fp_array[block_id];

		// Each of the three numbers is stored as the first two bytes of its
		// int object.
		// NOTE(review): that is the low 16 bits only on little-endian hosts,
		// and a text longer than 65535 bytes does not fit -- confirm against
		// the reader of these block files.
		fwrite(&flags, 2, 1, block_fp);
		fwrite(&read_name_len, 2, 1, block_fp);
		fwrite(read_name, 1, read_name_len, block_fp);
		fwrite(&line_len, 2, 1, block_fp);
		int write_len = fwrite(second_col_pos, 1, line_len, block_fp);
		if(write_len < line_len)is_disk_full = -2;

		writer -> output_file_size += line_len;
		writer -> current_chunk_size += line_len;
		writer -> written_reads ++;
	}

	return is_disk_full;
}
6273 
// Checks one SAM alignment line against the expectation that paired-end
// reads come as adjacent FIRST/SECOND lines with the same read name.
//
// SAM_line      : the line to check; only its first two columns are read.
// tmp_read_name : caller-owned buffer; on an even read_no it receives the
//                 read name (cut at the first '/'), on an odd read_no it is
//                 compared with the name of this line.
// tmp_flag      : receives the flags parsed from the second column.
// read_no       : zero-based number of this line among the alignment lines.
//
// Returns 1 when the line breaks the expected order, 0 otherwise. A line
// whose flags do not have bit 1 set always gives 0.
int is_SAM_unsorted(char * SAM_line, char * tmp_read_name, short * tmp_flag, srInt_64 read_no)
{
	char read_name[MAX_READ_NAME_LEN];
	int flags = 0, name_len = 0, tab_count = 0;
	const char * scan;
	char * slash;

	read_name[0] = 0;

	// Collect the read name (column 1) and the decimal flags (column 2).
	for(scan = SAM_line; *scan; scan++)
	{
		char nch = *scan;
		if(nch == '\t')
		{
			tab_count++;
			if(tab_count > 1) break;
		}
		else if(tab_count == 0)
		{
			read_name[name_len++] = nch;
			assert(MAX_READ_NAME_LEN>name_len);
			read_name[name_len] = 0;
		}
		else if(tab_count == 1)
			flags = flags*10+(nch-'0');
	}

	// Drop everything from the first '/' in the read name.
	slash = strchr(read_name, '/');
	if(slash) *slash = 0;

	*tmp_flag = flags;
	if((flags & 1) == 0) return 0;

	if(read_no % 2 != 0)
	{
		// Odd lines must not be FIRST reads and must repeat the stored name.
		if(flags & SAM_FLAG_FIRST_READ_IN_PAIR) return 1;
		if(strcmp(tmp_read_name, read_name)) return 1;
		return 0;
	}

	// Even lines must not be SECOND reads; remember the name for the next line.
	if(flags & SAM_FLAG_SECOND_READ_IN_PAIR) return 1;
	strcpy(tmp_read_name , read_name);
	return 0;
}
6320 
// Probes fname with probe_file_type_EX() and condenses the result:
//   -1 : the type is FILE_TYPE_NONEXIST, FILE_TYPE_EMPTY or FILE_TYPE_UNKNOWN
//    1 : the type is FILE_TYPE_BAM
//    0 : any other type
// is_first_read_PE and SAMBAM_header_size are handed to probe_file_type_EX()
// unchanged.
int is_certainly_bam_file(char * fname, int * is_first_read_PE, srInt_64 * SAMBAM_header_size)
{
	int detected_type = probe_file_type_EX(fname, is_first_read_PE, SAMBAM_header_size);
	int is_unusable = (detected_type == FILE_TYPE_NONEXIST)
			|| (detected_type == FILE_TYPE_EMPTY)
			|| (detected_type == FILE_TYPE_UNKNOWN);

	if(is_unusable)
		return -1;
	return (detected_type == FILE_TYPE_BAM) ? 1 : 0;
}
6331 
6332 
// Tells whether fname can be opened for reading but not repositioned:
// returns 1 when fseeko() to the start of the file fails, 0 when it succeeds
// or when the file cannot be opened at all.
int is_pipe_file(char * fname)
{
	int seek_failed = 0;
	FILE * probe_fp = fopen(fname,"r");

	if(probe_fp)
	{
		seek_failed = (fseeko(probe_fp, 0, SEEK_SET) != 0);
		fclose(probe_fp);
	}

	return seek_failed;
}
6343 
warning_file_type(char * fname,int expected_type)6344 int warning_file_type(char * fname, int expected_type)
6345 {
6346 	int ret_pipe_file = is_pipe_file(fname);
6347 	if(ret_pipe_file)
6348 	{
6349 		print_in_box(80,0,0,"WARNING file '%s' is not a regular file.", fname);
6350 		print_in_box(80,0,0,"	No alignment can be done on a pipe file.");
6351 		print_in_box(80,0,0,"	If the FASTQ file is gzipped, please use gzFASTQinput option.");
6352 		print_in_box(80,0,0,"");
6353 		return 1;
6354 	}
6355 
6356 	int read_type = probe_file_type(fname, NULL);
6357 
6358 	if(read_type == FILE_TYPE_NONEXIST)
6359 	{
6360 		SUBREADprintf("ERROR: unable to open file '%s'. File name might be incorrect, or you do not have the permission to read the file.\n", fname);
6361 		return -1;
6362 	}
6363 	else if(read_type == FILE_TYPE_EMPTY)
6364 	{
6365 		SUBREADprintf("\nERROR: file '%s' is empty.\n\n", fname);
6366 		return -1;
6367 	}
6368 	else if((expected_type == FILE_TYPE_FAST_ && (read_type!= FILE_TYPE_FASTQ && read_type!= FILE_TYPE_FASTA && read_type!= FILE_TYPE_GZIP_FASTQ && read_type!= FILE_TYPE_GZIP_FASTA))||
6369 		(expected_type == FILE_TYPE_GZIP_FAST_ && read_type!= FILE_TYPE_GZIP_FASTA) ||
6370 		((  expected_type != FILE_TYPE_GZIP_FAST_ && expected_type != FILE_TYPE_FAST_) && expected_type != read_type))
6371 	{
6372 		char * req_fmt = "SAM";
6373 		if(expected_type==FILE_TYPE_BAM) req_fmt = "BAM";
6374 		else if(expected_type==FILE_TYPE_FAST_) req_fmt = "FASTQ or FASTA";
6375 		else if(expected_type==FILE_TYPE_GZIP_FAST_) req_fmt = "gzip FASTQ or FASTA";
6376 
6377 		char * real_fmt = "SAM";
6378 		if(read_type==FILE_TYPE_BAM) real_fmt = "BAM";
6379 		else if(read_type==FILE_TYPE_FASTA) real_fmt = "FASTA";
6380 		else if(read_type==FILE_TYPE_FASTQ) real_fmt = "FASTQ";
6381 		else if(read_type==FILE_TYPE_GZIP_FASTQ) real_fmt = "gzip FASTQ";
6382 		else if(read_type==FILE_TYPE_GZIP_FASTA) real_fmt = "gzip FASTA";
6383 
6384 		print_in_box(80,0,0,"WARNING format issue in file '%s':", fname);
6385 		print_in_box(80,0,0,"	The required format is : %s", req_fmt);
6386 		if(read_type == FILE_TYPE_UNKNOWN)
6387 			print_in_box(80,0,0,"	The file format is unknown.");
6388 		else
6389 			print_in_box(80,0,0,"	The real format seems to be : %s", real_fmt);
6390 		print_in_box(80,0,0,"A wrong format may result in wrong results or crash the program.");
6391 		print_in_box(80,0,0,"Please refer to the manual for file format options.");
6392 		print_in_box(80,0,0,"If the file is in the correct format, please ignore this message.");
6393 		print_in_box(80,0,0,"");
6394 
6395 		return 1;
6396 	}
6397 	return 0;
6398 }
6399 
gzgets_noempty(void * fp,char * buf,int maxlen)6400 char * gzgets_noempty(void * fp, char * buf, int maxlen)
6401 {
6402 	char * ret;
6403 	while(1)
6404 	{
6405 		ret = gzgets(fp,buf, maxlen);
6406 		if(!ret)return NULL;
6407 		if(ret[0]!='\n') return ret;
6408 	}
6409 }
6410 
6411 
// Reads lines from fp with fgets() into buf (at most maxlen bytes per call)
// and skips those that start with '\n'. Returns the first other line as
// returned by fgets(), or NULL as soon as fgets() returns NULL.
char * fgets_noempty(char * buf, int maxlen, FILE * fp)
{
	char * line = fgets(buf, maxlen, fp);

	while(line != NULL && line[0] == '\n')
		line = fgets(buf, maxlen, fp);

	return line;
}
6422 
is_comment_line(const char * l,int file_type,unsigned int lineno)6423 int is_comment_line(const char * l, int file_type, unsigned int lineno)
6424 {
6425 	int tabs = 0, xk1 = 0;
6426 	if(l[0]=='#') return 1;
6427 
6428 	if(isalpha(l[0]) && file_type == FILE_TYPE_RSUBREAD)
6429 	{
6430 		char target_chr[16];
6431 		memcpy(target_chr, l, 16);
6432 		for(xk1=0; xk1<16; xk1++)
6433 			target_chr[xk1] = tolower(target_chr[xk1]);
6434 
6435 		if(memcmp(target_chr, "geneid\tchr\tstart",16)==0) return 1;
6436 	}
6437 
6438 	xk1=0;
6439 	while(l[xk1]) tabs += (l[xk1++] == '\t');
6440 
6441 	return tabs < ((file_type == FILE_TYPE_GTF)?8:4);
6442 }
6443 
6444 
6445 
probe_file_type_fast(char * fname)6446 int probe_file_type_fast(char * fname){
6447 	FILE * fp = f_subr_open(fname, "rb");
6448 	if(!fp) return FILE_TYPE_NONEXIST;
6449 
6450 	int ret = FILE_TYPE_UNKNOWN;
6451 	int nch;
6452 	char *test_buf=malloc(5000);
6453 
6454 	nch = fgetc(fp);
6455 
6456 	if(feof(fp))
6457 		ret = FILE_TYPE_EMPTY;
6458 	else
6459 	{
6460 		if(nch == '@')	// FASTQ OR SAM
6461 		{
6462 			char * rptr = fgets_noempty(test_buf, 4999, fp);
6463 			int second_line_len = 0;
6464 			if(rptr)
6465 			{
6466 				rptr = fgets_noempty(test_buf, 4999, fp);
6467 				if(rptr)
6468 				{
6469 					second_line_len = strlen(test_buf);
6470 					int tabs = 0, x1;
6471 					for(x1=0;x1<4999;x1++)
6472 					{
6473 						if(test_buf[x1]=='\n' || !test_buf[x1]) break;
6474 						if(test_buf[x1]=='\t'){
6475 							tabs++;
6476 							continue;
6477 						}
6478 
6479 						if(tabs == 1)
6480 							if(!isdigit(test_buf[x1]))break;
6481 					}
6482 					if(rptr[0]=='@' || tabs>7)
6483 						ret = FILE_TYPE_SAM;
6484 				}
6485 			}
6486 			if(ret == FILE_TYPE_UNKNOWN)
6487 			{
6488 				rptr = fgets_noempty(test_buf, 4999, fp);
6489 				if(rptr[0] == '+')
6490 				{
6491 					rptr = fgets_noempty(test_buf, 4999, fp);
6492 					if(rptr && second_line_len == strlen(test_buf))
6493 						ret = FILE_TYPE_FASTQ;
6494 				}
6495 			}
6496 		}
6497 		else if(nch == '>') // FASTA
6498 		{
6499 			ret = FILE_TYPE_FASTA;
6500 		}
6501 		else if(nch == 31) // BAM OR GZ_FASTQ
6502 		{
6503 			nch = fgetc(fp);
6504 			if(nch == 139)
6505 			{
6506 				fclose(fp);
6507 				fp=NULL;
6508 				gzFile zfp = gzopen(fname, "rb");
6509 				if(zfp)
6510 				{
6511 					int rlen = gzread(zfp, test_buf,4);
6512 					if(rlen == 4 && memcmp(test_buf,"BAM\1",4)==0)
6513 						ret = FILE_TYPE_BAM;
6514 					if(rlen == 4 && test_buf[0]=='@')
6515 						ret = FILE_TYPE_GZIP_FASTQ;
6516 					if(rlen == 4 && test_buf[0]=='>')
6517 						ret = FILE_TYPE_GZIP_FASTA;
6518 					gzclose(zfp);
6519 				}
6520 			}
6521 		}
6522 		else if(nch >= 0x20 && nch <= 0x7f) // SAM without headers
6523 		{
6524 			int tabs = 0, x1;
6525 			char * rptr = fgets(test_buf, 4999, fp);
6526 			if(rptr)
6527 				for(x1=0;x1<4999;x1++)
6528 				{
6529 					if(test_buf[x1]=='\n' || !test_buf[x1]) break;
6530 					if(test_buf[x1]=='\t'){
6531 						tabs++;
6532 						continue;
6533 					}
6534 					if(tabs == 1)
6535 						if(!isdigit(test_buf[x1]))break;
6536 				}
6537 			if(tabs>7)
6538 				ret = FILE_TYPE_SAM;
6539 
6540 		}
6541 	}
6542 
6543 	if(fp)fclose(fp);
6544 
6545 	free(test_buf);
6546 	return ret;
6547 
6548 }
probe_file_type(char * fname,int * is_first_read_PE)6549 int probe_file_type(char * fname, int * is_first_read_PE)
6550 {
6551 	return probe_file_type_EX(fname, is_first_read_PE, NULL);
6552 }
// Guesses the type of the file fname from its first bytes and lines and,
// for SAM and BAM files, optionally reports two more facts about the file.
//
// fname                : file to examine.
// is_first_read_PE     : may be NULL. Otherwise, for a SAM/BAM file it is set
//                        to is_paired_end - 10 of the opened SamBam_FILE
//                        whenever that field is at least 10 while the header
//                        and the first alignment line are being read.
// SAMBAM_header_length : may be NULL. Otherwise, for a SAM/BAM file it
//                        receives the header_length of the opened SamBam_FILE.
//
// Returns FILE_TYPE_NONEXIST when the file cannot be opened,
// FILE_TYPE_EMPTY when nothing can be read from it, otherwise one of
// FILE_TYPE_SAM, FILE_TYPE_FASTQ, FILE_TYPE_FASTA, FILE_TYPE_BAM,
// FILE_TYPE_GZIP_FASTQ, FILE_TYPE_GZIP_FASTA, or FILE_TYPE_UNKNOWN when no
// rule matches (also when the scratch buffer cannot be allocated).
int probe_file_type_EX(char * fname, int * is_first_read_PE, srInt_64 * SAMBAM_header_length)
{
	FILE * fp = f_subr_open(fname, "rb");
	if(!fp) return FILE_TYPE_NONEXIST;

	int ret = FILE_TYPE_UNKNOWN;
	int nch;
	char *test_buf=malloc(5000);
	if(!test_buf)
	{
		fclose(fp);
		return FILE_TYPE_UNKNOWN;
	}

	nch = fgetc(fp);

	if(feof(fp))
		ret = FILE_TYPE_EMPTY;

	else
	{
		if(nch == '@')	// FASTQ OR SAM
		{
			// The rest of the first line is read and discarded.
			char * rptr = fgets_noempty(test_buf, 4999, fp);
			int second_line_len = 0;
			if(rptr)
			{
				rptr = fgets_noempty(test_buf, 4999, fp);
				if(rptr)
				{
					second_line_len = strlen(test_buf);
					int tabs = 0, x1;
					// Count tabs; stop early when the second column is not a number.
					for(x1=0;x1<4999;x1++)
					{
						if(test_buf[x1]=='\n' || !test_buf[x1]) break;
						if(test_buf[x1]=='\t'){
							tabs++;
							continue;
						}

						if(tabs == 1)
							if(!isdigit((unsigned char)test_buf[x1]))break;
					}
					// Another '@' line, or a line with more than seven tabs, means SAM.
					if(rptr[0]=='@' || tabs>7)
						ret = FILE_TYPE_SAM;
				}
			}
			if(ret == FILE_TYPE_UNKNOWN)
			{
				rptr = fgets_noempty(test_buf, 4999, fp);
				// The file may end before this line: check for NULL before looking at it.
				if(rptr && rptr[0] == '+')
				{
					rptr = fgets_noempty(test_buf, 4999, fp);
					// FASTQ needs the line after '+' to be as long as the second line.
					if(rptr && second_line_len == strlen(test_buf))
						ret = FILE_TYPE_FASTQ;
				}
			}
		}
		else if(nch == '>') // FASTA
		{
			char * rptr = fgets(test_buf, 4999, fp);
			int x1;
			if(rptr)
			{
				ret = FILE_TYPE_FASTA;
				// The header line may only hold characters from ' ' to 127.
				for(x1=0;x1<4999;x1++)
				{
					if(test_buf[x1]=='\n' || !test_buf[x1]) break;
					nch = toupper((unsigned char)test_buf[x1]);
					if(nch < ' ' || nch>127)
					{
						ret = FILE_TYPE_UNKNOWN;
						break;
					}
				}
				rptr = fgets(test_buf, 4999, fp);
				if(rptr && ret == FILE_TYPE_FASTA)
				{
					// The first sequence line may only hold A, C, G, T, N, '.'
					// or the digits 0 to 3, and must not be empty.
					for(x1=0;x1<4999;x1++)
					{
						if(test_buf[x1]=='\n' || !test_buf[x1]) break;
						nch = toupper((unsigned char)test_buf[x1]);
						if(nch == 'A' || nch == 'T' || nch == 'G' || nch == 'C' || nch == 'N' || nch == '.' || (nch >='0' && nch <= '3'))
							;
						else
						{
							ret = FILE_TYPE_UNKNOWN;
							break;
						}
					}

					if(x1==0) ret = FILE_TYPE_UNKNOWN;
				}
			}
		}
		else if(nch == 31) // BAM OR GZ_FASTQ
		{
			nch = fgetc(fp);
			if(nch == 139)
			{
				// Bytes 31, 139 are the gzip magic number: reopen through zlib
				// and look at the first four uncompressed bytes.
				fclose(fp);
				fp=NULL;
				gzFile zfp = gzopen(fname, "rb");
				if(zfp)
				{
					int rlen = gzread(zfp, test_buf,4);
					if(rlen == 4 && memcmp(test_buf,"BAM\1",4)==0)
						ret = FILE_TYPE_BAM;
					if(rlen == 4 && test_buf[0]=='@')
						ret = FILE_TYPE_GZIP_FASTQ;
					if(rlen == 4 && test_buf[0]=='>')
						ret = FILE_TYPE_GZIP_FASTA;
					gzclose(zfp);
				}
			}
		}
		else if(nch >= 0x20 && nch <= 0x7f) // SAM without headers
		{
			int tabs = 0, x1;
			char * rptr = fgets(test_buf, 4999, fp);
			if(rptr)
				for(x1=0;x1<4999;x1++)
				{
					if(test_buf[x1]=='\n' || !test_buf[x1]) break;
					if(test_buf[x1]=='\t'){
						tabs++;
						continue;
					}
					if(tabs == 1)
						if(!isdigit((unsigned char)test_buf[x1]))break;
				}
			if(tabs>7)
				ret = FILE_TYPE_SAM;

		}
	}

	if(fp)fclose(fp);

	if(FILE_TYPE_BAM == ret || FILE_TYPE_SAM == ret)
		if(is_first_read_PE || SAMBAM_header_length)
		{
			SamBam_FILE * tpfp = SamBam_fopen(fname, (FILE_TYPE_BAM  == ret)?SAMBAM_FILE_BAM:SAMBAM_FILE_SAM);
			// The two outputs are left untouched when the file cannot be reopened.
			if(tpfp)
			{
				// Read through the '@' header lines up to the first alignment line.
				while(1)
				{
					char * tbr = SamBam_fgets(tpfp, test_buf, 4999, 0);
					if( is_first_read_PE &&  tpfp -> is_paired_end >= 10)
						(*is_first_read_PE) = tpfp -> is_paired_end - 10;
					if(tbr == NULL)break;
					if(tbr[0]=='@') continue;
					break;
				}

				if( SAMBAM_header_length) (*SAMBAM_header_length) = tpfp -> header_length;
				SamBam_fclose(tpfp);
			}
		}

	free(test_buf);
	return ret;
}
6710 
// Prints every key of t1 that has no counterpart in t2.
//
// A key of t1 counts as present in t2 when t2 holds the key itself, the key
// without its leading "chr", or the key with "chr" put in front of it.
// msg is printed once, as plain text, before the first missing key; nothing
// at all is printed when no key is missing.
void warning_hash_hash(HashTable * t1, HashTable * t2, char * msg){
	int buck_i, shown = 0;
	for(buck_i = 0; buck_i < t1 -> numOfBuckets; buck_i++){
		KeyValuePair * cursor = t1 -> bucketArray[buck_i];
		while(cursor){
			char * t1chro = (char *) cursor -> key;
			int found = HashTableGet(t2, t1chro) != NULL;
			if(!found && strlen(t1chro)>3 && strncmp(t1chro, "chr", 3)==0)
				found = HashTableGet(t2, t1chro+3) != NULL;
			if(!found) {
				// Room for "chr" plus a name of MAX_CHROMOSOME_NAME_LEN
				// characters; a longer name is not looked up in its truncated
				// form because that could match a different chromosome.
				char tmp_t1chro [MAX_CHROMOSOME_NAME_LEN+4];
				int tmp_len = snprintf(tmp_t1chro, sizeof(tmp_t1chro), "chr%s", t1chro);
				if(tmp_len > 0 && (size_t)tmp_len < sizeof(tmp_t1chro))
					found = HashTableGet(t2, tmp_t1chro) != NULL;
			}

			if(!found){
				if(!shown){
					print_in_box(80,0,0,"");
					// msg is data, not a format: a '%' in it must not be
					// interpreted as a conversion.
					print_in_box(80,0,0,"%s", msg);
					shown = 1;
				}
				print_in_box(80,0,0,"   %s", t1chro);
			}
			cursor = cursor -> next;
		}
	}
	if(shown) print_in_box(80,0,0,"");
}
6738 
6739 
6740 #ifdef MAKE_INPUTTEST
main(int argc,char ** argv)6741 int main(int argc, char ** argv)
6742 {
6743 	FILE * ifp;
6744 	srInt_64 rno=0;
6745 	short tmp_flags, is_sorted = 1;
6746 	char buff[3000], tmp_rname[MAX_FILE_NAME_LENGTH];
6747 
6748 	ifp = f_subr_open(argv[1],"r");
6749 	while(1)
6750 	{
6751 		char * rr = fgets(buff,2999, ifp);
6752 		if(!rr) break;
6753 		if(buff[0]=='@')continue;
6754 		if(is_SAM_unsorted(buff, tmp_rname, &tmp_flags, rno))
6755 		{
6756 			printf("The input file is unsorted.\n");
6757 			is_sorted = 0;
6758 			break;
6759 		}
6760 		rno++;
6761 	}
6762 
6763 	fclose(ifp);
6764 
6765 	//if(is_sorted) return 0;
6766 
6767 	ifp = f_subr_open(argv[1],"r");
6768 	SAM_sort_writer writer;
6769 	if(sort_SAM_create(&writer, argv[2], ".")){
6770 		printf("ERROR: unable to create the writer.\n");
6771 		return -1;
6772 	}
6773 
6774 	while(1)
6775 	{
6776 		char * rr = fgets(buff,2999, ifp);
6777 		if(!rr) break;
6778 		int line_len = strlen(buff);
6779 		sort_SAM_add_line(&writer, buff, line_len);
6780 	}
6781 	fclose(ifp);
6782 	sort_SAM_finalise(&writer);
6783 	printf("WRITTEN=%llu\nUNPAIR=%llu\n", writer.written_reads, writer.unpaired_reads);
6784 }
6785 #endif
6786 #ifdef MAKE_TYPETEST
6787 
6788 
main(int argc,char ** argv)6789 int main(int argc, char ** argv)
6790 {
6791 	char * fn = argv[1];
6792 	int type = probe_file_type(fn, NULL);
6793 	switch(type)
6794 	{
6795 		case FILE_TYPE_FASTQ: printf("Type: FASTQ\n"); break;
6796 		case FILE_TYPE_FASTA: printf("Type: FASTA\n"); break;
6797 		case FILE_TYPE_SAM  : printf("Type: SAM\n"); break;
6798 		case FILE_TYPE_BAM  : printf("Type: BAM\n"); break;
6799 		default: printf("Unknown type.\n");
6800 	}
6801 }
6802 
6803 #endif
6804