1 /*************************************************************** 2 3 The Subread software package is free software package: 4 you can redistribute it and/or modify it under the terms 5 of the GNU General Public License as published by the 6 Free Software Foundation, either version 3 of the License, 7 or (at your option) any later version. 8 9 Subread is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty 11 of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 12 13 See the GNU General Public License for more details. 14 15 Authors: Drs Yang Liao and Wei Shi 16 17 ***************************************************************/ 18 19 20 #ifndef __INPUT_FILES_H_ 21 #define __INPUT_FILES_H_ 22 23 #include "subread.h" 24 #include "hashtable.h" 25 26 #define GENE_SPACE_BASE 1 27 #define GENE_SPACE_COLOR 2 28 29 #define GENE_INPUT_PLAIN 0 30 #define GENE_INPUT_FASTQ 1 31 #define GENE_INPUT_FASTA 2 32 #define GENE_INPUT_BCL 3 33 #define GENE_INPUT_SCRNA_FASTQ 4 34 #define GENE_INPUT_SCRNA_BAM 5 35 #define GENE_INPUT_GZIP_FASTQ 51 36 #define GENE_INPUT_GZIP_FASTA 52 37 38 #define GENE_INPUT_SAM_SINGLE 93 39 #define GENE_INPUT_SAM_PAIR_1 94 40 #define GENE_INPUT_SAM_PAIR_2 95 41 42 43 #define MAX_LINE_LENGTH 3000 44 #define MIN_FILE_POINTERS_ALLOWED 50 45 46 #define FILE_TYPE_SAM 50 47 #define FILE_TYPE_BAM 500 48 #define FILE_TYPE_FAST_ 100 49 #define FILE_TYPE_FASTQ 105 50 #define FILE_TYPE_FASTA 110 51 #define FILE_TYPE_GZIP_FAST_ 1000 52 #define FILE_TYPE_GZIP_FASTQ 1105 53 #define FILE_TYPE_GZIP_FASTA 1110 54 #define FILE_TYPE_UNKNOWN 999 55 #define FILE_TYPE_EMPTY 999990 56 #define FILE_TYPE_NONEXIST 999999 57 #define FILE_TYPE_RSUBREAD 10 58 #define FILE_TYPE_GTF 100 59 60 #define FEATURECOUNTS_BUFFER_SIZE (1024*1024*12) 61 62 63 64 #include <stdlib.h> 65 #include <stdio.h> 66 #include "core-indel.h" 67 #include "hashtable.h" 68 69 70 #define SAM_SORT_BLOCKS 229 71 #define SAM_SORT_BLOCK_SIZE 512333303LLU 72 //#define SAM_SORT_BLOCK_SIZE 11123333LLU 73 // 74 typedef struct { 75 srInt_64 output_file_size; 76 srInt_64 current_chunk_size; 77 unsigned int current_chunk; 78 srInt_64 written_reads; 79 srInt_64 unpaired_reads; 80 81 FILE * current_block_fp_array [SAM_SORT_BLOCKS]; 82 FILE * all_chunks_header_fp; 83 84 FILE * out_fp; 85 char tmp_path[MAX_FILE_NAME_LENGTH]; 86 } SAM_sort_writer; 87 88 89 typedef struct { 90 int thread_id; 91 92 char * input_buff_SBAM; 93 int input_buff_SBAM_used; 94 int input_buff_SBAM_ptr; 95 int reads_in_SBAM; 96 subread_lock_t SBAM_lock; 97 98 srInt_64 input_buff_SBAM_file_start; 99 srInt_64 input_buff_SBAM_file_end; 100 101 unsigned int chunk_number; 102 unsigned int readno_in_chunk; 103 104 unsigned char * input_buff_BIN; 105 int input_buff_BIN_used; 106 int input_buff_BIN_ptr; 107 int input_buff_BIN_capacity; 108 int orphant_block_no; 109 int need_find_start; 110 srInt_64 orphant_space; 111 z_stream strm; 112 113 char immediate_last_read_bin[FC_LONG_READ_RECORD_HARDLIMIT]; 114 char immediate_last_read_full_name[MAX_READ_NAME_LEN*2 +80 ]; 115 int immediate_last_read_flags; 116 int immediate_last_read_bin_len; 117 int immediate_last_read_name_len; 118 119 HashTable * orphant_table; 120 pthread_t thread_stab; 121 } SAM_pairer_thread_t; 122 123 typedef struct { 124 FILE * input_fp; 125 int input_is_BAM; 126 int tiny_mode; 127 int display_progress; 128 int is_bad_format; 129 int is_single_end_mode; 130 int force_do_not_sort; 131 int long_cigar_mode; 132 int long_read_minimum_length; 133 int is_finished; 134 int merge_level_finished; 135 int max_file_open_number; 136 subread_lock_t input_fp_lock; 137 subread_lock_t SAM_BAM_table_lock; 138 subread_lock_t unsorted_notification_lock; 139 140 srInt_64 total_input_reads; 141 srInt_64 total_orphan_reads; 142 143 HashTable * unsorted_notification_table; 144 HashTable * sam_contig_number_table; 145 HashTable * bam_margin_table; 146 147 int total_threads; 148 int input_chunk_no; 149 int input_buff_SBAM_size; 150 int input_buff_BIN_size; 151 char tmp_file_prefix[MAX_FILE_NAME_LENGTH+1]; 152 char in_file_name[MAX_FILE_NAME_LENGTH+1]; 153 154 SAM_pairer_thread_t * threads; 155 int BAM_header_parsed; 156 unsigned int BAM_l_text; 157 unsigned int BAM_n_ref; 158 int is_unsorted_notified; 159 int is_incomplete_BAM; 160 int need_read_group_tag; 161 int is_internal_error; 162 int is_final_run; 163 164 void (* reset_output_function) (void * pairer); 165 int (* output_function) (void * pairer, int thread_no, char * bin1, char * bin2); 166 int (* output_header) (void * pairer, int thread_no, int is_text, unsigned int items, char * bin, unsigned int bin_len); 167 void (* unsorted_notification) (void * pairer, char * bin1, char * bin2); // it is called only once 168 // reserved for the application passing its own data to the output function. 169 void * appendix1; 170 void * appendix2; 171 void * appendix3; 172 void * appendix4; 173 void * appendix5; 174 175 } SAM_pairer_context_t; 176 177 178 #define SAM_PAIRER_WRITE_BUFFER ( 64000 ) 179 typedef struct { 180 unsigned char BIN_buffer[SAM_PAIRER_WRITE_BUFFER]; 181 int BIN_buffer_ptr; 182 z_stream strm; 183 184 } SAM_pairer_writer_thread_t; 185 186 typedef struct { 187 SAM_pairer_writer_thread_t * threads; 188 int all_threads; 189 int compression_level; 190 int has_dummy; 191 FILE * bam_fp; 192 char bam_name[MAX_FILE_NAME_LENGTH]; 193 subread_lock_t output_fp_lock; 194 } SAM_pairer_writer_main_t; 195 196 197 void fastq_64_to_33(char * qs); 198 199 // the caller is in charge of deallocation 200 char * memstrcpy(char * in); 201 202 int chars2color(char c1, char c2); 203 204 int genekey2color(char last_base,char * key); 205 206 // Convert a string key into an integer key 207 int genekey2int(char * key, int space_type); 208 209 // Open a read file. This function automatically decides its type. 210 // Return 0 if successfully opened, or 1 if error occurred 211 int geinput_open(const char * filename, gene_input_t * input); 212 213 // Open a sam file. Parameter half_no indicates which read of the pair is concerned. 214 // half_no = 1: the first read; half_no = 2: the last read; half_no = 0: single-end 215 // Return 0 if successfully opened, or 1 if error occurred 216 int geinput_open_sam(const char * filename, gene_input_t * input, int half_no); 217 218 // Read a line from the input and reward the pointer to the last position. 219 int geinput_readline_back(gene_input_t * input, char * linebuffer) ; 220 221 // Get the next read from the input file 222 // Return the length of this read or -1 if EOF. 223 // The memory space for read_string must be at least 512 bytes. 224 int geinput_next_read(gene_input_t * input, char * read_name, char * read_string, char * quality_string); 225 int geinput_next_read_trim(gene_input_t * input, char * read_name, char * read_string, char * quality_string, short trim_5, short trim_3, int * is_secondary); 226 227 void geinput_jump_read(gene_input_t * input); 228 229 // Close the input file 230 void geinput_close(gene_input_t * input); 231 232 // return the next ATGC char from a input file. 233 // return 0 if this read segment reaches the end; return -1 if EOF; return -2 if error 234 int geinput_next_char(gene_input_t * input); 235 236 // line buffer has to be at least 300 bytes 237 // it returns the length of reading 238 int geinput_readline(gene_input_t * input, char * linebuffer, int conv_to_upper) ; 239 240 241 242 // read a line into the buff, 243 // the line should not be longer than 300 chars or the remaining part will be discarded. 244 // therefore the buff has to be at least 300 chars. 245 int read_line(int max_len, FILE * fp, char * buff, int conv_to_upper); 246 247 // count the number of reads in a flie 248 double guess_reads_density(char * fname, int is_sam) ; 249 250 // guess the size of the chromosome lib 251 // return the number of bases, or (-index-1) if the file at the index is not found. 252 srInt_64 guess_gene_bases(char ** files, int file_number); 253 254 255 void reverse_read(char * ReadString, int Length, int space_type); 256 257 void reverse_quality(char * QualtyString, int Length); 258 259 unsigned int read_numbers(gene_input_t * input); 260 261 //This function returns 0 if the line is a mapped read; -1 if the line is in a wrong format and 1 if the read is unmapped. 262 int parse_SAM_line(char * sam_line, char * read_name, int * flags, char * chro, unsigned int * pos, char * cigar, int * mapping_quality, unsigned int * pair_dist, char * sequence , char * quality_string, int * rl, int * repeated); 263 264 #define reverse_char(c) ((c)=='A'?'T':((c)=='G'?'C':((c)=='C'?'G':'A'))) 265 266 int find_subread_end(int len, int TOTAL_SUBREADS,int subread) ; 267 268 int break_SAM_file(char * in_SAM_file, int is_BAM, char * temp_file_prefix, unsigned int * real_read_count, int * block_no, chromosome_t * known_chromosomes, int is_sequence_needed, int base_ignored_head_tail, gene_value_index_t *array_index, gene_offset_t * offsets, srInt_64 * all_Mapped_bases , HashTable * event_table_ptr, char * VCF_file, srInt_64 * all_mapped_reads, int do_fragment_filtering, int push_to_read_head, int use_soft_clipped_bases); 269 270 int get_known_chromosomes(char * in_SAM_file, chromosome_t * known_chromosomes); 271 272 273 int load_exon_annotation(char * annotation_file_name, gene_t ** output_genes, gene_offset_t* offsets ); 274 275 int is_in_exon_annotations(gene_t *output_genes, unsigned int offset, int is_start); 276 277 int does_file_exist (char * filename); 278 279 double guess_reads_density_format(char * fname, int is_sam, int * min_phred, int * max_phred, int * tested_reads); 280 281 FILE * get_temp_file_pointer(char *temp_file_name, HashTable* fp_table, int * close_immediately); 282 283 int write_read_block_file(FILE *temp_fp , unsigned int read_number, char *read_name, int flags, char * chro, unsigned int pos, char *cigar, int mapping_quality, char *sequence , char *quality_string, int rl , int is_sequence_needed, char strand, unsigned short read_pos, unsigned short read_len, unsigned short M_seg); 284 285 int get_read_block(char *chro, unsigned int pos, char *temp_file_suffix, chromosome_t *known_chromosomes, unsigned int * max_base_position); 286 int my_strcmp(const void * s1, const void * s2); 287 288 void destroy_cigar_event_table(HashTable * event_table); 289 290 291 int is_SAM_unsorted(char * SAM_line, char * tmp_read_name, short * tmp_flag, srInt_64 read_no); 292 int sort_SAM_add_line(SAM_sort_writer * writer, char * SAM_line, int line_len); 293 int sort_SAM_finalise(SAM_sort_writer * writer); 294 int sort_SAM_create(SAM_sort_writer * writer, char * output_file, char * tmp_path); 295 void colorread2base(char * read_buffer, int read_len); 296 297 int warning_file_type(char * fname, int expected_type); 298 char color2char(char clr, char c1); 299 300 int is_certainly_bam_file(char * fname, int * is_firstread_PE, srInt_64 * SAMBAM_header_length); 301 302 srUInt_64 sort_SAM_hash(char * str); 303 304 char * fgets_noempty(char * buf, int maxlen, FILE * fp); 305 306 char * gzgets_noempty(void * fp, char * buf, int maxlen); 307 int probe_file_type(char * fname, int * is_first_PE); 308 int probe_file_type_fast(char * fname); 309 void geinput_seek(gene_input_t * input, gene_inputfile_position_t * pos); 310 void geinput_tell(gene_input_t * input, gene_inputfile_position_t * pos); 311 srInt_64 geinput_file_offset( gene_input_t * input); 312 int probe_file_type_EX(char * fname, int * is_first_read_PE, srInt_64 * SAMBAM_header_length); 313 314 315 int SAM_pairer_create(SAM_pairer_context_t * pairer, int all_threads, int bin_buff_size_per_thread, int BAM_input, int is_Tiny_Mode, int is_single_end_mode, int force_do_not_sort, int need_read_group_tag, int display_progress, char * in_file, void (* reset_output_function) (void * pairer), int (* output_header_function) (void * pairer, int thread_no, int is_text, unsigned int items, char * bin, unsigned int bin_len), int (* output_function) (void * pairer, int thread_no, char * bin1, char * bin2), char * tmp_path, void * appendix1, int long_read_minimum_length) ; 316 int SAM_pairer_run( SAM_pairer_context_t * pairer); 317 void SAM_pairer_destroy(SAM_pairer_context_t * pairer); 318 void SAM_pairer_writer_reset(void * pairer); 319 void SAM_pairer_set_unsorted_notification(SAM_pairer_context_t * pairer, void (* unsorted_notification) (void * pairer, char * bin1, char * bin2)); 320 321 int SAM_pairer_multi_thread_output( void * pairer, int thread_no, char * bin1, char * bin2 ); 322 int SAM_pairer_multi_thread_header (void * pairer_vp, int thread_no, int is_text, unsigned int items, char * bin, unsigned int bin_len); 323 int SAP_pairer_skip_tag_body_len(char * tag_start); 324 int SAM_pairer_writer_create( SAM_pairer_writer_main_t * bam_main , int all_threads, int has_dummy , int BAM_output, int BAM_compression_level, char * out_file); 325 void SAM_pairer_writer_destroy( SAM_pairer_writer_main_t * bam_main ) ; 326 int SAM_pairer_iterate_int_tags(unsigned char * bin, int bin_len, char * tag_name, int * saved_value); 327 int SAM_pairer_iterate_tags(unsigned char * bin, int bin_len, char * tag_name, char * data_type, char ** saved_value); 328 int SAM_pairer_warning_file_open_limit(); 329 int SAM_pairer_get_tag_bin_start(char * bin); 330 void *delay_realloc(void * old_pntr, size_t old_size, size_t new_size); 331 int is_comment_line(const char * l, int file_type, unsigned int lineno); 332 void warning_hash_hash(HashTable * t1, HashTable * t2, char * msg); 333 int geinput_preload_buffer(gene_input_t * input, subread_lock_t* read_lock); 334 int geinput_open_scRNA_fqs(char * fnames, gene_input_t * input, int reads_per_chunk, int threads ); 335 int geinput_open_scRNA_BAM(char * fnames, gene_input_t * input, int reads_per_chunk, int threads ); 336 int geinput_open_bcl( const char * dir_name, gene_input_t * input, int reads_in_chunk, int threads ); 337 char *strtokmm(char *str, const char *delim, char ** next); 338 #endif 339