1 /***************************************************************
2 
3    The Subread software package is free software package:
4    you can redistribute it and/or modify it under the terms
5    of the GNU General Public License as published by the
6    Free Software Foundation, either version 3 of the License,
7    or (at your option) any later version.
8 
9    Subread is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty
11    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 
13    See the GNU General Public License for more details.
14 
15    Authors: Drs Yang Liao and Wei Shi
16 
17   ***************************************************************/
18 
19 
20 #ifndef _SAMBAM_FILE_H_
21 #define _SAMBAM_FILE_H_
22 
23 #include <zlib.h>
24 #include "HelperFunctions.h"
25 
// Short aliases for unsigned integer types.
// NOTE(review): the names suggest 8/16/32-bit widths, which holds on common ABIs but is not guaranteed by these definitions.
typedef unsigned char BS_uint_8;
typedef unsigned short BS_uint_16;
typedef unsigned int BS_uint_32;

// Capacity limits. BAM_MAX_CIGAR_LEN, BAM_MAX_READ_NAME_LEN and BAM_MAX_READ_LEN size the fixed arrays inside SamBam_Alignment.
#define BAM_MAX_CHROMOSOME_NAME_LEN 200
#define BAM_MAX_CIGAR_LEN (30000)
#define BAM_MAX_READ_NAME_LEN 256
#define BAM_MAX_READ_LEN 3000

// File types accepted by SamBam_fopen() through its "file_type" parameter.
#define SAMBAM_FILE_SAM	10
#define SAMBAM_FILE_BAM 20

// Parsing stages of a BAM file.
// NOTE(review): presumably the values stored in SamBam_FILE.bam_file_stage - confirm against the implementation.
#define BAM_FILE_STAGE_HEADER 10
#define BAM_FILE_STAGE_ALIGNMENT 20


// zlib compression levels used by the writer: "fastest" stores the data uncompressed, "normal" uses zlib's quickest real compression.
#define SAMBAM_COMPRESS_LEVEL_FASTEST Z_NO_COMPRESSION
#define SAMBAM_COMPRESS_LEVEL_NORMAL Z_BEST_SPEED

// A negative windowBits value asks zlib for a raw deflate stream (no zlib/gzip wrapper).
// Parenthesized so that the macro stays a single operand wherever it is expanded.
#define SAMBAM_GZIP_WINDOW_BITS (-15)
// NOTE(review): presumably the size in bytes of SamBam_FILE.input_binary_stream_buffer - confirm.
#define SAMBAM_INPUT_STREAM_SIZE 140000

// NOTE(review): this constant is not used anywhere in this header; see the implementation for its meaning.
#define TEST_BAD_BAM_CHUNKS 9999925
49 
50 typedef struct
51 {
52 	char read_name[BAM_MAX_READ_NAME_LEN];
53 	char * chro_name;
54 	unsigned int chro_offset;
55 	unsigned short flags;
56 	char * mate_chro_name;
57 	unsigned int mate_chro_offset;
58 	int templete_length;
59 	unsigned char mapping_quality;
60 	int NH_number;
61 
62 	char cigar[BAM_MAX_CIGAR_LEN];
63 	char sequence[BAM_MAX_READ_LEN];
64 	char seq_quality[BAM_MAX_READ_LEN];
65 
66 	char buff_for_seq[BAM_MAX_READ_LEN*2];
67 
68 } SamBam_Alignment;
69 
70 
// Cursor helpers for a SamBam_FILE pointer "a". Every macro evaluates "a" more than once, so pass an expression without side effects.

// SB_FETCH(a): when fewer than 3000 bytes lie between the read and the write positions, calls SamBam_fetch_next_chunk(a).
// A return value of -2 marks the stream as broken (is_bam_broken = 1); any other return value is ignored here.
// Wrapped in do { } while (0) so that it behaves as a single statement, e.g. as the body of an if ... else. Use it with a trailing semicolon.
#define SB_FETCH(a) \
	do { \
		if ((a)->input_binary_stream_write_ptr - (a)->input_binary_stream_read_ptr < 3000) { \
			int test_rlen_2 = SamBam_fetch_next_chunk(a); \
			if (test_rlen_2 == -2) { \
				(a)->is_bam_broken = 1; \
			} \
		} \
	} while (0)

// SB_EOF(a): non-zero when is_eof is set and the read position has reached (or passed) the write position.
#define SB_EOF(a)  ((a)->is_eof && ((a)->input_binary_stream_write_ptr <= (a)->input_binary_stream_read_ptr))

// SB_READ(a): pointer to the byte at the read position; the buffer's first byte corresponds to position input_binary_stream_buffer_start_ptr.
#define SB_READ(a)  ((a)->input_binary_stream_buffer + (a)->input_binary_stream_read_ptr - (a)->input_binary_stream_buffer_start_ptr)

// SB_RINC(a, len): advances the read position by len. Fully parenthesized so that any expression may be passed as len.
#define SB_RINC(a, len)   ((a)->input_binary_stream_read_ptr += (len))
75 
76 typedef struct
77 {
78 	FILE * os_file;
79 
80 	int file_type;
81 	int bam_file_stage;
82 
83 	unsigned long long bam_file_next_section_start;
84 	unsigned long long input_binary_stream_read_ptr;
85 	unsigned long long input_binary_stream_write_ptr;
86 	unsigned long long input_binary_stream_buffer_start_ptr;
87 	unsigned long long header_length;
88 
89 	SamBam_Reference_Info * bam_chro_table;
90 	int bam_chro_table_size;
91 	SamBam_Alignment aln_buff;
92 
93 	char * input_binary_stream_buffer;
94 	int is_eof;
95 	int is_paired_end;
96 	int is_bam_broken;
97 } SamBam_FILE;
98 
99 struct SamBam_sorted_compressor_st{
100 	char plain_text[66000];
101 	char zipped_bin[70000];
102 	int text_size, bin_size;
103 	unsigned int CRC32_plain;
104 	z_stream strm;
105 	pthread_t thread_stub;
106 	srInt_64 bam_block_no;
107 	int last_job_done;
108 };
109 
110 typedef struct
111 {
112 	FILE * bam_fp;
113 	FILE * BAI_fp;
114 	long long current_BAM_pos;
115 	char tmpf_prefix[MAX_FILE_NAME_LENGTH];
116 	z_stream output_stream;
117 	char * chunk_buffer;
118 	char * compressed_chunk_buffer;
119 	char * header_plain_text_buffer;
120 	int header_plain_text_buffer_used;
121 	int header_plain_text_buffer_max;
122 	long long chunk_buffer_used;
123 	long long chunk_buffer_max_size;
124 	int writer_state;
125 	int is_internal_error;
126 	int sort_reads_by_coord;
127 	int fastest_compression;
128 	int sorted_batch_id;
129 	unsigned int crc0;
130 
131 	int threads;
132 	z_stream * threads_output_stream;
133 	char ** threads_chunk_buffer;
134 	char ** threads_chunk_buffer_compressed;
135 	long long * threads_chunk_buffer_used;
136 	long long * threads_chunk_buffer_max_size;
137 
138 	HashTable * chromosome_name_table;
139 	HashTable * chromosome_id_table;
140 	HashTable * chromosome_len_table;
141 	subread_lock_t thread_bam_lock;
142 
143 	worker_master_mutex_t sorted_notifier;
144 	HashTable * block_no_p1_to_vpos_tab;
145 	//int sorted_compress_plain_text_used;
146 	int sorted_compress_this_thread_no;
147 	srInt_64 this_bam_block_no;
148 	struct SamBam_sorted_compressor_st * writer_threads;
149 } SamBam_Writer;
150 
151 // This function reads the next BAM section from the bam_fp. The buffer has a variable length but should be at least 64K bytes.
152 // I recommend you to allocate 80KB of memory.
153 // This function returns the size of the compressed data ( CDATA ). It returns < 0 if EOF.
154 int PBam_get_next_zchunk(FILE * bam_fp, char * buffer, int buffer_length, unsigned int * real_len);
155 
156 // load the header of a BAM file (the header is important to load BAM reads)
157 // this function puts the File Pointer to the first read chunk in the BAM.
158 // It returns 0 if finished loading, or non-zero if wrong.
159 // If the chunk contains read data after the chromosome table, the read data is copied into remainder_read_data, and its lengtb is returned in remainder_read_data_len.
160 int PBum_load_header(FILE * bam_fp, SamBam_Reference_Info** chro_tab, char * remainder_read_data, int * remainder_read_data_len);
161 
162 
163 // load a new line from the BAM buffer (chunk) at chunk_ptr.
164 // if seq_needed==0, then no sequence nor quality str will be loaded.
165 // it returns the length (without "\0" after the tail) of the SAM string.
166 int PBam_chunk_gets(char * chunk, int *chunk_ptr, int chunk_limit, SamBam_Reference_Info * bam_chro_table, char * buff , int buff_len, SamBam_Alignment*aln, int seq_needed);
167 
168 // This function opens a file, either SAM or BAM, in read-only mode.
169 // The "file_type" parameter specifies which type of file it is: SAMBAM_FILE_BAM or SAMBAM_FILE_SAM.
170 SamBam_FILE * SamBam_fopen(char * fname , int file_type);
171 
172 // This function closes any opened file and releases memory footprint. It works just like "fclose()".
173 void SamBam_fclose(SamBam_FILE * fp);
174 
175 // This function tells if a file is exhausted.
176 // Note that a non-exhausted file can still contain no more alignment results.
177 // Hence, it is recommended to check the return value of SamBam_fgets() to tell if the file has reached its end.
178 int SamBam_feof(SamBam_FILE * fp);
179 
180 // This function works like fgets except it decode the BAM file.
181 // If the buffer is not long enough to store the line, the remainder of this line is omitted and the next call will read the next alignment.
182 // A very important difference between fgets and SamBam_fgets is that this function returns NULL when there are no more lines.
183 // It is recommended to use the return value as the indicator of EOF like:
184 /**
185  * SamBam_FILE * fp = SamBam_fopen("my.bam", SAMBAM_FILE_BAM);
186  * while(1)
187  * {
188  *   char buf[3000];
189  *   char * ret = SamBam_fgets(fp, buf, 3000);
190  *   if(ret) puts(buf);
191  *   else break;
192  * }
193  * SamBam_fclose(fp);
194  */
195 char * SamBam_fgets(SamBam_FILE * fp , char * buff , int buff_len , int seq_needed);
196 
197 int SamBam_writer_create(SamBam_Writer * writer, char * BAM_fname, int threads, int sort_reads_by_coord, int is_tmp_BAM, char * tmpfname);
198 
199 int SamBam_writer_close(SamBam_Writer * writer);
200 
201 int SamBam_writer_add_header(SamBam_Writer * writer, char * header_text, int add_chro);
202 
203 int SamBam_writer_add_chromosome(SamBam_Writer * writer, char * chro_name, unsigned int chro_length, int add_header_too);
204 
205 int SamBam_writer_add_read_bin(SamBam_Writer * writer, int thread_no, char * rbin, int committable);
206 
207 int SamBam_writer_calc_cigar_span(char * bin);
208 
209 int SamBam_writer_add_read_fqs_scRNA(gzFile * outfp, char * bambin);
210 
211 int SamBam_writer_add_read(SamBam_Writer * writer, int threadno, char * read_name, unsigned int flags, char * chro_name, unsigned int chro_position, int mapping_quality, char * cigar, char * next_chro_name, unsigned int next_chro_pos, int temp_len, int read_len, char * read_text, char * qual_text, char * additional_columns, int can_submit);
212 
213 void SamBam_writer_optimize_bins(HashTable *bin_tab, ArrayList *bin_arr, HashTable ** new_tab, ArrayList ** new_arrs);
214 
215 int is_badBAM(char * fn);
216 
217 int SamBam_unzip(char * out, int out_max_len , char * in , int inlen, int sync_only);
218 
219 int SamBam_fetch_next_chunk(SamBam_FILE *fp);
220 
221 int SamBam_compress_cigar(char * cigar, int * cigar_int, int * ret_coverage, int max_secs);
222 char cigar_op_char(int ch);
223 void SamBam_read2bin(char * read_txt, char * read_bin);
224 
225 int convert_BAM_binary_to_SAM(SamBam_Reference_Info * chro_table, char * bam_bin, char * sam_txt);
226 int is_paired_end_BAM(char * fn);
227 void SamBam_writer_finalise_thread(SamBam_Writer * writer, int thread_id);
228 void SamBam_writer_finish_header( SamBam_Writer * writer );
229 void SamBam_writer_finalise_one_thread(SamBam_Writer * writer);
230 int SamBam_writer_add_read_line(SamBam_Writer * writer, int thread_no, char * rline, int commitable);
231 char *duplicate_TAB_record_field(char * rline, int fld_no, int toend);
232 #endif
233