1 /*************************************************************** 2 3 The Subread software package is free software package: 4 you can redistribute it and/or modify it under the terms 5 of the GNU General Public License as published by the 6 Free Software Foundation, either version 3 of the License, 7 or (at your option) any later version. 8 9 Subread is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty 11 of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 12 13 See the GNU General Public License for more details. 14 15 Authors: Drs Yang Liao and Wei Shi 16 17 ***************************************************************/ 18 19 20 #ifndef _SAMBAM_FILE_H_ 21 #define _SAMBAM_FILE_H_ 22 23 #include <zlib.h> 24 #include "HelperFunctions.h" 25 26 typedef unsigned char BS_uint_8; 27 typedef unsigned short BS_uint_16; 28 typedef unsigned int BS_uint_32; 29 30 #define BAM_MAX_CHROMOSOME_NAME_LEN 200 31 #define BAM_MAX_CIGAR_LEN (30000) 32 #define BAM_MAX_READ_NAME_LEN 256 33 #define BAM_MAX_READ_LEN 3000 34 35 #define SAMBAM_FILE_SAM 10 36 #define SAMBAM_FILE_BAM 20 37 38 #define BAM_FILE_STAGE_HEADER 10 39 #define BAM_FILE_STAGE_ALIGNMENT 20 40 41 42 #define SAMBAM_COMPRESS_LEVEL_FASTEST Z_NO_COMPRESSION 43 #define SAMBAM_COMPRESS_LEVEL_NORMAL Z_BEST_SPEED 44 45 #define SAMBAM_GZIP_WINDOW_BITS -15 46 #define SAMBAM_INPUT_STREAM_SIZE 140000 47 48 #define TEST_BAD_BAM_CHUNKS 9999925 49 50 typedef struct 51 { 52 char read_name[BAM_MAX_READ_NAME_LEN]; 53 char * chro_name; 54 unsigned int chro_offset; 55 unsigned short flags; 56 char * mate_chro_name; 57 unsigned int mate_chro_offset; 58 int templete_length; 59 unsigned char mapping_quality; 60 int NH_number; 61 62 char cigar[BAM_MAX_CIGAR_LEN]; 63 char sequence[BAM_MAX_READ_LEN]; 64 char seq_quality[BAM_MAX_READ_LEN]; 65 66 char buff_for_seq[BAM_MAX_READ_LEN*2]; 67 68 } SamBam_Alignment; 69 70 71 #define SB_FETCH(a) if((a) -> input_binary_stream_write_ptr - (a) -> input_binary_stream_read_ptr < 3000){int test_rlen_2 = SamBam_fetch_next_chunk(a); if(test_rlen_2 == -2){(a)->is_bam_broken = 1;}} 72 #define SB_EOF(a) ((a)-> is_eof && ( (a) -> input_binary_stream_write_ptr <= (a) -> input_binary_stream_read_ptr )) 73 #define SB_READ(a) ((a) -> input_binary_stream_buffer + (a) -> input_binary_stream_read_ptr - (a) -> input_binary_stream_buffer_start_ptr) 74 #define SB_RINC(a, len) ((a) -> input_binary_stream_read_ptr) += len 75 76 typedef struct 77 { 78 FILE * os_file; 79 80 int file_type; 81 int bam_file_stage; 82 83 unsigned long long bam_file_next_section_start; 84 unsigned long long input_binary_stream_read_ptr; 85 unsigned long long input_binary_stream_write_ptr; 86 unsigned long long input_binary_stream_buffer_start_ptr; 87 unsigned long long header_length; 88 89 SamBam_Reference_Info * bam_chro_table; 90 int bam_chro_table_size; 91 SamBam_Alignment aln_buff; 92 93 char * input_binary_stream_buffer; 94 int is_eof; 95 int is_paired_end; 96 int is_bam_broken; 97 } SamBam_FILE; 98 99 struct SamBam_sorted_compressor_st{ 100 char plain_text[66000]; 101 char zipped_bin[70000]; 102 int text_size, bin_size; 103 unsigned int CRC32_plain; 104 z_stream strm; 105 pthread_t thread_stub; 106 srInt_64 bam_block_no; 107 int last_job_done; 108 }; 109 110 typedef struct 111 { 112 FILE * bam_fp; 113 FILE * BAI_fp; 114 long long current_BAM_pos; 115 char tmpf_prefix[MAX_FILE_NAME_LENGTH]; 116 z_stream output_stream; 117 char * chunk_buffer; 118 char * compressed_chunk_buffer; 119 char * header_plain_text_buffer; 120 int header_plain_text_buffer_used; 121 int header_plain_text_buffer_max; 122 long long chunk_buffer_used; 123 long long chunk_buffer_max_size; 124 int writer_state; 125 int is_internal_error; 126 int sort_reads_by_coord; 127 int fastest_compression; 128 int sorted_batch_id; 129 unsigned int crc0; 130 131 int threads; 132 z_stream * threads_output_stream; 133 char ** threads_chunk_buffer; 134 char ** threads_chunk_buffer_compressed; 135 long long * threads_chunk_buffer_used; 136 long long * threads_chunk_buffer_max_size; 137 138 HashTable * chromosome_name_table; 139 HashTable * chromosome_id_table; 140 HashTable * chromosome_len_table; 141 subread_lock_t thread_bam_lock; 142 143 worker_master_mutex_t sorted_notifier; 144 HashTable * block_no_p1_to_vpos_tab; 145 //int sorted_compress_plain_text_used; 146 int sorted_compress_this_thread_no; 147 srInt_64 this_bam_block_no; 148 struct SamBam_sorted_compressor_st * writer_threads; 149 } SamBam_Writer; 150 151 // This function reads the next BAM section from the bam_fp. The buffer has a variable length but should be at least 64K bytes. 152 // I recommend you to allocate 80KB of memory. 153 // This function returns the size of the compressed data ( CDATA ). It returns < 0 if EOF. 154 int PBam_get_next_zchunk(FILE * bam_fp, char * buffer, int buffer_length, unsigned int * real_len); 155 156 // load the header of a BAM file (the header is important to load BAM reads) 157 // this function puts the File Pointer to the first read chunk in the BAM. 158 // It returns 0 if finished loading, or non-zero if wrong. 159 // If the chunk contains read data after the chromosome table, the read data is copied into remainder_read_data, and its lengtb is returned in remainder_read_data_len. 160 int PBum_load_header(FILE * bam_fp, SamBam_Reference_Info** chro_tab, char * remainder_read_data, int * remainder_read_data_len); 161 162 163 // load a new line from the BAM buffer (chunk) at chunk_ptr. 164 // if seq_needed==0, then no sequence nor quality str will be loaded. 165 // it returns the length (without "\0" after the tail) of the SAM string. 166 int PBam_chunk_gets(char * chunk, int *chunk_ptr, int chunk_limit, SamBam_Reference_Info * bam_chro_table, char * buff , int buff_len, SamBam_Alignment*aln, int seq_needed); 167 168 // This function opens a file, either SAM or BAM, in read-only mode. 169 // The "file_type" parameter specifies which type of file it is: SAMBAM_FILE_BAM or SAMBAM_FILE_SAM. 170 SamBam_FILE * SamBam_fopen(char * fname , int file_type); 171 172 // This function closes any opened file and releases memory footprint. It works just like "fclose()". 173 void SamBam_fclose(SamBam_FILE * fp); 174 175 // This function tells if a file is exhausted. 176 // Note that a non-exhausted file can still contain no more alignment results. 177 // Hence, it is recommended to check the return value of SamBam_fgets() to tell if the file has reached its end. 178 int SamBam_feof(SamBam_FILE * fp); 179 180 // This function works like fgets except it decode the BAM file. 181 // If the buffer is not long enough to store the line, the remainder of this line is omitted and the next call will read the next alignment. 182 // A very important difference between fgets and SamBam_fgets is that this function returns NULL when there are no more lines. 183 // It is recommended to use the return value as the indicator of EOF like: 184 /** 185 * SamBam_FILE * fp = SamBam_fopen("my.bam", SAMBAM_FILE_BAM); 186 * while(1) 187 * { 188 * char buf[3000]; 189 * char * ret = SamBam_fgets(fp, buf, 3000); 190 * if(ret) puts(buf); 191 * else break; 192 * } 193 * SamBam_fclose(fp); 194 */ 195 char * SamBam_fgets(SamBam_FILE * fp , char * buff , int buff_len , int seq_needed); 196 197 int SamBam_writer_create(SamBam_Writer * writer, char * BAM_fname, int threads, int sort_reads_by_coord, int is_tmp_BAM, char * tmpfname); 198 199 int SamBam_writer_close(SamBam_Writer * writer); 200 201 int SamBam_writer_add_header(SamBam_Writer * writer, char * header_text, int add_chro); 202 203 int SamBam_writer_add_chromosome(SamBam_Writer * writer, char * chro_name, unsigned int chro_length, int add_header_too); 204 205 int SamBam_writer_add_read_bin(SamBam_Writer * writer, int thread_no, char * rbin, int committable); 206 207 int SamBam_writer_calc_cigar_span(char * bin); 208 209 int SamBam_writer_add_read_fqs_scRNA(gzFile * outfp, char * bambin); 210 211 int SamBam_writer_add_read(SamBam_Writer * writer, int threadno, char * read_name, unsigned int flags, char * chro_name, unsigned int chro_position, int mapping_quality, char * cigar, char * next_chro_name, unsigned int next_chro_pos, int temp_len, int read_len, char * read_text, char * qual_text, char * additional_columns, int can_submit); 212 213 void SamBam_writer_optimize_bins(HashTable *bin_tab, ArrayList *bin_arr, HashTable ** new_tab, ArrayList ** new_arrs); 214 215 int is_badBAM(char * fn); 216 217 int SamBam_unzip(char * out, int out_max_len , char * in , int inlen, int sync_only); 218 219 int SamBam_fetch_next_chunk(SamBam_FILE *fp); 220 221 int SamBam_compress_cigar(char * cigar, int * cigar_int, int * ret_coverage, int max_secs); 222 char cigar_op_char(int ch); 223 void SamBam_read2bin(char * read_txt, char * read_bin); 224 225 int convert_BAM_binary_to_SAM(SamBam_Reference_Info * chro_table, char * bam_bin, char * sam_txt); 226 int is_paired_end_BAM(char * fn); 227 void SamBam_writer_finalise_thread(SamBam_Writer * writer, int thread_id); 228 void SamBam_writer_finish_header( SamBam_Writer * writer ); 229 void SamBam_writer_finalise_one_thread(SamBam_Writer * writer); 230 int SamBam_writer_add_read_line(SamBam_Writer * writer, int thread_no, char * rline, int commitable); 231 char *duplicate_TAB_record_field(char * rline, int fld_no, int toend); 232 #endif 233