1 /***************************************************************
2 
3    The Subread software package is free software package:
4    you can redistribute it and/or modify it under the terms
5    of the GNU General Public License as published by the
6    Free Software Foundation, either version 3 of the License,
7    or (at your option) any later version.
8 
9    Subread is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty
11    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 
13    See the GNU General Public License for more details.
14 
15    Authors: Drs Yang Liao and Wei Shi
16 
17   ***************************************************************/
18 
19 
20 #ifndef __INPUT_FILES_H_
21 #define __INPUT_FILES_H_
22 
23 #include "subread.h"
24 #include "hashtable.h"
25 
26 #define GENE_SPACE_BASE 1
27 #define GENE_SPACE_COLOR 2
28 
29 #define GENE_INPUT_PLAIN 0
30 #define GENE_INPUT_FASTQ 1
31 #define GENE_INPUT_FASTA 2
32 #define GENE_INPUT_BCL   3
33 #define GENE_INPUT_SCRNA_FASTQ 4
34 #define GENE_INPUT_SCRNA_BAM 5
35 #define GENE_INPUT_GZIP_FASTQ 51
36 #define GENE_INPUT_GZIP_FASTA 52
37 
38 #define GENE_INPUT_SAM_SINGLE   93
39 #define GENE_INPUT_SAM_PAIR_1   94
40 #define GENE_INPUT_SAM_PAIR_2   95
41 
42 
43 #define MAX_LINE_LENGTH 3000
44 #define MIN_FILE_POINTERS_ALLOWED 50
45 
46 #define FILE_TYPE_SAM     50
47 #define FILE_TYPE_BAM     500
48 #define FILE_TYPE_FAST_   100
49 #define FILE_TYPE_FASTQ   105
50 #define FILE_TYPE_FASTA   110
51 #define FILE_TYPE_GZIP_FAST_   1000
52 #define FILE_TYPE_GZIP_FASTQ   1105
53 #define FILE_TYPE_GZIP_FASTA   1110
54 #define FILE_TYPE_UNKNOWN 999
55 #define FILE_TYPE_EMPTY   999990
56 #define FILE_TYPE_NONEXIST 999999
57 #define FILE_TYPE_RSUBREAD 10
58 #define FILE_TYPE_GTF 100
59 
60 #define FEATURECOUNTS_BUFFER_SIZE (1024*1024*12)
61 
62 
63 
64 #include <stdlib.h>
65 #include <stdio.h>
66 #include "core-indel.h"
67 #include "hashtable.h"
68 
69 
70 #define SAM_SORT_BLOCKS 229
71 #define SAM_SORT_BLOCK_SIZE 512333303LLU
72 //#define SAM_SORT_BLOCK_SIZE 11123333LLU
73 //
74 typedef struct {
75 	srInt_64 output_file_size;
76 	srInt_64 current_chunk_size;
77 	unsigned int current_chunk;
78 	srInt_64 written_reads;
79 	srInt_64 unpaired_reads;
80 
81 	FILE * current_block_fp_array [SAM_SORT_BLOCKS];
82 	FILE * all_chunks_header_fp;
83 
84 	FILE * out_fp;
85 	char tmp_path[MAX_FILE_NAME_LENGTH];
86 } SAM_sort_writer;
87 
88 
89 typedef struct {
90 	int thread_id;
91 
92 	char * input_buff_SBAM;
93 	int input_buff_SBAM_used;
94 	int input_buff_SBAM_ptr;
95 	int reads_in_SBAM;
96 	subread_lock_t SBAM_lock;
97 
98 	srInt_64 input_buff_SBAM_file_start;
99 	srInt_64 input_buff_SBAM_file_end;
100 
101 	unsigned int chunk_number;
102 	unsigned int readno_in_chunk;
103 
104 	unsigned char * input_buff_BIN;
105 	int input_buff_BIN_used;
106 	int input_buff_BIN_ptr;
107 	int input_buff_BIN_capacity;
108 	int orphant_block_no;
109 	int need_find_start;
110 	srInt_64 orphant_space;
111 	z_stream strm;
112 
113 	char immediate_last_read_bin[FC_LONG_READ_RECORD_HARDLIMIT];
114 	char immediate_last_read_full_name[MAX_READ_NAME_LEN*2 +80 ];
115 	int immediate_last_read_flags;
116 	int immediate_last_read_bin_len;
117 	int immediate_last_read_name_len;
118 
119 	HashTable * orphant_table;
120 	pthread_t thread_stab;
121 } SAM_pairer_thread_t;
122 
123 typedef struct {
124 	FILE * input_fp;
125 	int input_is_BAM;
126 	int tiny_mode;
127 	int display_progress;
128 	int is_bad_format;
129 	int is_single_end_mode;
130 	int force_do_not_sort;
131 	int long_cigar_mode;
132 	int long_read_minimum_length;
133 	int is_finished;
134 	int merge_level_finished;
135 	int max_file_open_number;
136 	subread_lock_t input_fp_lock;
137 	subread_lock_t SAM_BAM_table_lock;
138 	subread_lock_t unsorted_notification_lock;
139 
140 	srInt_64 total_input_reads;
141 	srInt_64 total_orphan_reads;
142 
143 	HashTable * unsorted_notification_table;
144 	HashTable * sam_contig_number_table;
145 	HashTable * bam_margin_table;
146 
147 	int total_threads;
148 	int input_chunk_no;
149 	int input_buff_SBAM_size;
150 	int input_buff_BIN_size;
151 	char tmp_file_prefix[MAX_FILE_NAME_LENGTH+1];
152 	char in_file_name[MAX_FILE_NAME_LENGTH+1];
153 
154 	SAM_pairer_thread_t * threads;
155 	int BAM_header_parsed;
156 	unsigned int BAM_l_text;
157 	unsigned int BAM_n_ref;
158 	int is_unsorted_notified;
159 	int is_incomplete_BAM;
160 	int need_read_group_tag;
161 	int is_internal_error;
162 	int is_final_run;
163 
164 	void (* reset_output_function) (void * pairer);
165 	int (* output_function) (void * pairer, int thread_no, char * bin1, char * bin2);
166 	int (* output_header) (void * pairer, int thread_no, int is_text, unsigned int items, char * bin, unsigned int bin_len);
167 	void (* unsorted_notification) (void * pairer, char * bin1, char * bin2); // it is called only once
168 	// reserved for the application passing its own data to the output function.
169 	void * appendix1;
170 	void * appendix2;
171 	void * appendix3;
172 	void * appendix4;
173 	void * appendix5;
174 
175 } SAM_pairer_context_t;
176 
177 
178 #define SAM_PAIRER_WRITE_BUFFER ( 64000 )
179 typedef struct {
180 	unsigned char BIN_buffer[SAM_PAIRER_WRITE_BUFFER];
181 	int BIN_buffer_ptr;
182 	z_stream strm;
183 
184 } SAM_pairer_writer_thread_t;
185 
186 typedef struct {
187 	SAM_pairer_writer_thread_t * threads;
188 	int all_threads;
189 	int compression_level;
190 	int has_dummy;
191 	FILE * bam_fp;
192 	char bam_name[MAX_FILE_NAME_LENGTH];
193 	subread_lock_t output_fp_lock;
194 } SAM_pairer_writer_main_t;
195 
196 
197 void fastq_64_to_33(char * qs);
198 
199 // the caller is in charge of deallocation
200 char * memstrcpy(char * in);
201 
202 int chars2color(char c1, char c2);
203 
204 int genekey2color(char last_base,char * key);
205 
206 // Convert a string key into an integer key
207 int genekey2int(char * key, int space_type);
208 
209 // Open a read file. This function automatically decides its type.
210 // Return 0 if successfully opened, or 1 if error occurred
211 int geinput_open(const char * filename, gene_input_t * input);
212 
213 // Open a sam file. Parameter half_no indicates which read of the pair is concerned.
214 // half_no = 1: the first read; half_no = 2: the last read; half_no = 0: single-end
215 // Return 0 if successfully opened, or 1 if error occurred
216 int geinput_open_sam(const char * filename, gene_input_t * input, int half_no);
217 
218 // Read a line from the input and reward the pointer to the last position.
219 int geinput_readline_back(gene_input_t * input, char * linebuffer) ;
220 
221 // Get the next read from the input file
222 // Return the length of this read or -1 if EOF.
223 // The memory space for read_string must be at least 512 bytes.
224 int geinput_next_read(gene_input_t * input, char * read_name, char * read_string, char * quality_string);
225 int geinput_next_read_trim(gene_input_t * input, char * read_name, char * read_string, char * quality_string, short trim_5, short trim_3, int * is_secondary);
226 
227 void geinput_jump_read(gene_input_t * input);
228 
229 // Close the input file
230 void geinput_close(gene_input_t * input);
231 
232 // return the next ATGC char from a input file.
233 // return 0 if this read segment reaches the end; return -1 if EOF; return -2 if error
234 int geinput_next_char(gene_input_t * input);
235 
236 // line buffer has to be at least 300 bytes
237 // it returns the length of reading
238 int geinput_readline(gene_input_t * input, char * linebuffer, int conv_to_upper) ;
239 
240 
241 
242 // read a line into the buff,
243 // the line should not be longer than 300 chars or the remaining part will be discarded.
244 // therefore the buff has to be at least 300 chars.
245 int read_line(int max_len, FILE * fp, char * buff, int conv_to_upper);
246 
247 // count the number of reads in a flie
248 double guess_reads_density(char * fname, int is_sam) ;
249 
250 // guess the size of the chromosome lib
251 // return the number of bases, or (-index-1) if the file at the index is not found.
252 srInt_64 guess_gene_bases(char ** files, int file_number);
253 
254 
255 void reverse_read(char * ReadString, int Length, int space_type);
256 
257 void reverse_quality(char * QualtyString, int Length);
258 
259 unsigned int read_numbers(gene_input_t * input);
260 
261 //This function returns 0 if the line is a mapped read; -1 if the line is in a wrong format and 1 if the read is unmapped.
262 int parse_SAM_line(char * sam_line, char * read_name, int * flags, char * chro, unsigned int * pos, char * cigar, int * mapping_quality, unsigned int * pair_dist, char * sequence , char * quality_string, int * rl, int * repeated);
263 
264 #define reverse_char(c)	((c)=='A'?'T':((c)=='G'?'C':((c)=='C'?'G':'A')))
265 
266 int find_subread_end(int len, int  TOTAL_SUBREADS,int subread) ;
267 
268 int break_SAM_file(char * in_SAM_file, int is_BAM, char * temp_file_prefix, unsigned int * real_read_count, int * block_no, chromosome_t * known_chromosomes, int is_sequence_needed, int base_ignored_head_tail, gene_value_index_t *array_index, gene_offset_t * offsets, srInt_64 * all_Mapped_bases , HashTable * event_table_ptr, char * VCF_file, srInt_64 * all_mapped_reads, int do_fragment_filtering, int push_to_read_head, int use_soft_clipped_bases);
269 
270 int get_known_chromosomes(char * in_SAM_file, chromosome_t * known_chromosomes);
271 
272 
273 int load_exon_annotation(char * annotation_file_name, gene_t ** output_genes, gene_offset_t* offsets );
274 
275 int is_in_exon_annotations(gene_t *output_genes, unsigned int offset, int is_start);
276 
277 int does_file_exist (char * filename);
278 
279 double guess_reads_density_format(char * fname, int is_sam, int * min_phred, int * max_phred, int * tested_reads);
280 
281 FILE * get_temp_file_pointer(char *temp_file_name, HashTable* fp_table, int * close_immediately);
282 
283 int write_read_block_file(FILE *temp_fp , unsigned int read_number, char *read_name, int flags, char * chro, unsigned int pos, char *cigar, int mapping_quality, char *sequence , char *quality_string, int rl , int is_sequence_needed, char strand, unsigned short read_pos, unsigned short read_len, unsigned short M_seg);
284 
285 int get_read_block(char *chro, unsigned int pos, char *temp_file_suffix, chromosome_t *known_chromosomes, unsigned int * max_base_position);
286 int my_strcmp(const void * s1, const void * s2);
287 
288 void destroy_cigar_event_table(HashTable * event_table);
289 
290 
291 int is_SAM_unsorted(char * SAM_line, char * tmp_read_name, short * tmp_flag, srInt_64 read_no);
292 int sort_SAM_add_line(SAM_sort_writer * writer, char * SAM_line, int line_len);
293 int sort_SAM_finalise(SAM_sort_writer * writer);
294 int sort_SAM_create(SAM_sort_writer * writer, char * output_file, char * tmp_path);
295 void colorread2base(char * read_buffer, int read_len);
296 
297 int warning_file_type(char * fname, int expected_type);
298 char color2char(char clr, char c1);
299 
300 int is_certainly_bam_file(char * fname, int * is_firstread_PE, srInt_64  * SAMBAM_header_length);
301 
302 srUInt_64 sort_SAM_hash(char * str);
303 
304 char * fgets_noempty(char * buf, int maxlen, FILE * fp);
305 
306 char * gzgets_noempty(void * fp, char * buf, int maxlen);
307 int probe_file_type(char * fname, int * is_first_PE);
308 int probe_file_type_fast(char * fname);
309 void geinput_seek(gene_input_t * input, gene_inputfile_position_t * pos);
310 void geinput_tell(gene_input_t * input, gene_inputfile_position_t * pos);
311 srInt_64 geinput_file_offset( gene_input_t * input);
312 int probe_file_type_EX(char * fname, int * is_first_read_PE, srInt_64 * SAMBAM_header_length);
313 
314 
315 int SAM_pairer_create(SAM_pairer_context_t * pairer, int all_threads, int bin_buff_size_per_thread, int BAM_input, int is_Tiny_Mode, int is_single_end_mode, int force_do_not_sort, int need_read_group_tag, int display_progress, char * in_file, void (* reset_output_function) (void * pairer), int (* output_header_function) (void * pairer, int thread_no, int is_text, unsigned int items, char * bin, unsigned int bin_len), int (* output_function) (void * pairer, int thread_no, char * bin1, char * bin2), char * tmp_path, void * appendix1, int long_read_minimum_length) ;
316 int SAM_pairer_run( SAM_pairer_context_t * pairer);
317 void SAM_pairer_destroy(SAM_pairer_context_t * pairer);
318 void SAM_pairer_writer_reset(void * pairer);
319 void SAM_pairer_set_unsorted_notification(SAM_pairer_context_t * pairer, void (* unsorted_notification) (void * pairer, char * bin1, char * bin2));
320 
321 int SAM_pairer_multi_thread_output( void * pairer, int thread_no, char * bin1, char * bin2 );
322 int SAM_pairer_multi_thread_header (void * pairer_vp, int thread_no, int is_text, unsigned int items, char * bin, unsigned int bin_len);
323 int SAP_pairer_skip_tag_body_len(char * tag_start);
324 int SAM_pairer_writer_create( SAM_pairer_writer_main_t * bam_main , int all_threads, int has_dummy , int BAM_output, int BAM_compression_level, char * out_file);
325 void SAM_pairer_writer_destroy( SAM_pairer_writer_main_t * bam_main ) ;
326 int SAM_pairer_iterate_int_tags(unsigned char * bin, int bin_len, char * tag_name, int * saved_value);
327 int SAM_pairer_iterate_tags(unsigned char * bin, int bin_len, char * tag_name, char * data_type, char ** saved_value);
328 int SAM_pairer_warning_file_open_limit();
329 int SAM_pairer_get_tag_bin_start(char * bin);
330 void *delay_realloc(void * old_pntr, size_t old_size, size_t new_size);
331 int is_comment_line(const char * l, int file_type, unsigned int lineno);
332 void warning_hash_hash(HashTable * t1, HashTable * t2, char * msg);
333 int geinput_preload_buffer(gene_input_t * input, subread_lock_t* read_lock);
334 int geinput_open_scRNA_fqs(char * fnames,  gene_input_t * input, int reads_per_chunk, int threads );
335 int geinput_open_scRNA_BAM(char * fnames,  gene_input_t * input, int reads_per_chunk, int threads );
336 int geinput_open_bcl( const char * dir_name,  gene_input_t * input, int reads_in_chunk, int threads );
337 char *strtokmm(char *str, const char *delim, char ** next);
338 #endif
339