1 /***************************************************************
2 
3    The Subread software package is free software package:
4    you can redistribute it and/or modify it under the terms
5    of the GNU General Public License as published by the
6    Free Software Foundation, either version 3 of the License,
7    or (at your option) any later version.
8 
9    Subread is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty
11    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 
13    See the GNU General Public License for more details.
14 
15    Authors: Drs Yang Liao and Wei Shi
16 
17   ***************************************************************/
18 
19 
20 //Core.c is the totally refactored body of the current subread, subjunc, subfusion and subindel programs.
21 //This program runs in this way:
22 // 1, it loads reads from FASTQ, FASTA, PLAIN or SAM files
23 // 2, it change the read and the quality strings according to FF/FR/RR/RF orders, such that every read is in the "F" manner.
24 //    Namely, the second read in a pair is reversed.
25 // 3, it builds voting table for each read.
26 // 4, it performs the first iteration.
27 // 5, it performs the second iteration.
28 // 6, it reports results regarding the reads.
29 // 7, if the input files contain too many reads (e.g., greater than 14M in total), step 1 ~ 6 are repeated until all reads are finalised.
30 // 8, it reports global results.
31 
32 //Core.c maintains the following context:
33 // 1, the global context.
34 // 2, the overall context for each module.
35 // 3, the local context for voting each read for each module.
36 // The structure of each context is named as
37 //     Context_ModuleName_Scope
38 
39 //Thread context is maintained for each thread.
40 
41 // There should be a function for each module to store/load its global context.
42 
43 #ifndef _SUBREAD_CORE_H_
44 #define _SUBREAD_CORE_H_
45 #include <pthread.h>
46 #include "subread.h"
47 #include "sambam-file.h"
48 
49 //#define is_target(s) (memcmp((s),"chr901_242237_242737_0:0:0_0:0:0_3761", 22)==0)
50 
51 #define is_target(s) 0
52 
53 //#define _global_retrieve_voting_context(global_context, pair_number) (global_context->chunk_vote_records[pair_number])
54 
55 //#define _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, best_read_id) ((global_context->input_reads.is_paired_end_reads?(global_context -> chunk_alignment_records + (2*pair_number+is_second_read)* global_context->config.multi_best_reads + best_read_id):(global_context -> chunk_alignment_records+pair_number*global_context->config.multi_best_reads + best_read_id)))
56 
57 //#define _global_retrieve_subjunc_ptr(global_context, pair_number, is_second_read, best_read_id) ((global_context->input_reads.is_paired_end_reads?(global_context -> chunk_subjunc_records + (2*pair_number+is_second_read)* global_context->config.multi_best_reads + best_read_id):(global_context -> chunk_subjunc_records+pair_number*global_context->config.multi_best_reads + best_read_id)))
58 
59 //#define _global_retrieve_big_margin_ptr(global_context,pair_number, is_second_read)  (is_second_read?(global_context->big_margin_record + (2*pair_number+is_second_read)* global_context->config.big_margin_record_size):(global_context->big_margin_record + pair_number * global_context->config.big_margin_record_size))
60 
61 #define mark_gapped_read(res) (res)-> result_flags|= CORE_IS_GAPPED_READ;
62 
63 #define CORE_MAX_CIGAR_STR_LEN 110
64 #define CORE_ADDITIONAL_INFO_LENGTH 400
65 #define READPAIRS_FOR_CALC_EXPT_TLEN 1000
66 
67 typedef struct{
68 	subread_read_number_t fragments;
69 	unsigned int maximum_interval_length;
70 	unsigned int expected_items;
71 	HashTable * entry_table;
72 } bucketed_table_t;
73 
74 typedef struct{
75 	int capacity;
76 	int items;
77 	unsigned int keyed_bucket;
78 	unsigned int maximum_interval_length;
79 
80 	unsigned int * positions;
81 	void ** details;
82 } bucketed_table_bucket_t;
83 
84 
85 typedef struct{
86 	unsigned long long capacity;
87 	subread_read_number_t fragments;
88 	subread_read_number_t * fragment_numbers;
89 } fragment_list_t;
90 
91 
92 
93 
94 typedef struct{
95 	int is_paired_end_reads;
96 	int is_internal_error;
97 	gene_input_t first_read_file;
98 	gene_input_t second_read_file;
99 	unsigned long long first_read_file_size;
100 	subread_lock_t input_lock;
101 	double avg_read_length;
102 } read_input_t;
103 
104 typedef struct{
105 	char read_name[MAX_READ_NAME_LEN];
106 	unsigned short flags;
107 	char chro_name[MAX_CHROMOSOME_NAME_LEN];
108 	unsigned int location;
109 	unsigned short map_quality;
110 	char cigar[CORE_MAX_CIGAR_STR_LEN];
111 	char other_chro_name[MAX_CHROMOSOME_NAME_LEN];
112 	unsigned int other_location;
113 	long long int tlen;
114 	int rlen;
115 	char read_text[MAX_READ_LENGTH];
116 	char qual_text[MAX_READ_LENGTH];
117 	char additional_columns[CORE_ADDITIONAL_INFO_LENGTH];
118 } output_read_buffer_t;
119 
120 typedef struct{
121 	int fragment_number_in_chunk;
122 	int multi_mapping_locations;
123 	int this_mapping_location;
124 	output_read_buffer_t r1;
125 	output_read_buffer_t r2;
126 } output_fragment_buffer_t;
127 
128 typedef struct{
129 	// running_scheme
130 	int all_threads;
131 	int is_first_iteration_running;
132 	int is_second_iteration_running;
133 	int is_third_iteration_running;
134 	int use_memory_buffer;
135 	float memory_use_multiplex;
136 	char temp_file_prefix[MAX_FILE_NAME_LENGTH];
137 	subread_read_number_t reads_per_chunk;
138 	int fast_run;
139 
140 	// input_scheme
141 	char first_read_file[MAX_FILE_NAME_LENGTH * MAX_SCRNA_FASTQ_FILES * 3];
142 	char second_read_file[MAX_FILE_NAME_LENGTH];
143 	char exon_annotation_file[MAX_FILE_NAME_LENGTH];
144 	char exon_annotation_file_screen_out[MAX_FILE_NAME_LENGTH];
145 	char exon_annotation_alias_file[MAX_FILE_NAME_LENGTH];
146 	int exon_annotation_file_type;
147 	char exon_annotation_gene_id_column[MAX_READ_NAME_LEN];
148 	char exon_annotation_feature_name_column[MAX_READ_NAME_LEN];
149 
150 
151 	int do_remove_neighbour_for_scRNA;
152 	int scRNA_input_mode;
153 	short read_trim_5;
154 	short read_trim_3;
155 	int is_first_read_reversed;
156 	int is_second_read_reversed;
157 	int space_type;
158 	int is_methylation_reads;
159 	int phred_score_format;
160 	int is_SAM_file_input;
161 	int is_gzip_fastq;
162 
163 
164 	// reporting scheme
165 	char read_group_id[MAX_FILE_NAME_LENGTH];
166 	char read_group_txt[MAX_FILE_NAME_LENGTH];
167 	char output_prefix[MAX_FILE_NAME_LENGTH];
168 	int report_sam_file;
169 	int report_no_unpaired_reads;
170 	int min_mapped_fraction;
171 	int max_mismatch_exonic_reads;
172 	int max_mismatch_junction_reads;
173 	int ignore_unmapped_reads;
174 	int report_unmapped_using_mate_pos;
175 	int report_multi_mapping_reads;
176 	int downscale_mapping_quality;
177 	int is_BAM_input;
178 	int is_BAM_output;
179 	int is_input_read_order_required;
180 	int sort_reads_by_coordinates;
181 	int convert_color_to_base;
182 	int SAM_extra_columns;
183 	int report_multiple_best_in_pairs;
184 	unsigned int multi_best_reads;
185 	unsigned int reported_multi_best_reads;
186 
187 	// basic voting
188 	char index_prefix[MAX_FILE_NAME_LENGTH];
189 	int top_scores;
190 	int max_vote_combinations;
191 	int max_vote_simples;
192 	int max_vote_number_cutoff;
193 	int total_subreads;
194 	int minimum_subread_for_first_read;
195 	int minimum_subread_for_second_read;
196 	float minimum_exonic_subread_fraction;
197 	int minimum_pair_distance;
198 	int maximum_pair_distance;
199 	int restrected_read_order;
200 	int show_soft_cliping;
201 	int max_indel_length;
202 	int expected_pair_distance;
203 	int ambiguous_mapping_tolerance;
204 	int use_hamming_distance_break_ties;
205 	int use_quality_score_break_ties;
206 	int big_margin_record_size;
207 	int PE_predominant_weight;
208 	int no_TLEN_preference;
209 
210 	// subjunc
211 	int entry_program_name;
212 	int experiment_type;
213 	int do_breakpoint_detection;
214 	int do_big_margin_filtering_for_junctions;
215 	int do_big_margin_filtering_for_reads;
216 	int limited_tree_scan;
217 	int use_hamming_distance_in_exon;
218 	unsigned int maximum_intron_length;
219 	int high_quality_base_threshold;
220 	int max_insertion_at_junctions;
221 	int check_donor_at_junctions;
222 
223 	// subfusion
224 	int do_fusion_detection;
225 	int do_long_del_detection;
226 	int do_structural_variance_detection;
227 	int prefer_donor_receptor_junctions;
228 	int more_accurate_fusions;
229 	int maximum_translocation_length;
230 	int maximum_colocating_distance;
231 
232 	// indel
233 	char do_superlong_indel_detection;
234 	char extending_search_indels;
235 	int k_mer_length;
236 	int realignment_minimum_variant_distance;
237 	int reassembly_start_read_number;
238 	int reassembly_key_length;
239 	int reassembly_subread_length;
240 	int reassembly_window_multiplex;
241 	int reassembly_tolerable_voting;
242 	int reassembly_window_alleles;
243 	int init_max_event_number;
244 	int use_dynamic_programming_indel;
245 	int use_bitmap_event_table;
246 	int maximise_sensitivity_indel;
247 	int flanking_subread_indel_mismatch;
248 	int DP_penalty_create_gap;
249 	int DP_penalty_extend_gap;
250 	int DP_match_score;
251 	int DP_mismatch_penalty;
252 
253 } configuration_t;
254 
255 #define CORE_IS_GT_AG_DONORS 1
256 #define CORE_NOTFOUND_DONORS 2
257 #define CORE_IS_STRAND_JUMPED 4
258 #define CORE_IS_NEGATIVE_STRAND 8
259 #define CORE_IS_FULLY_EXPLAINED 16
260 #define CORE_IS_BREAKEVEN 32
261 #define CORE_IS_GAPPED_READ 64
262 #define CORE_IS_PAIRED_END 128
263 #define CORE_TOO_MANY_MISMATCHES 256
264 
265 #define CORE_CIGAR_OPT_M 0
266 #define CORE_CIGAR_OPT_S 1
267 #define CORE_CIGAR_OPT_D 2
268 #define CORE_CIGAR_OPT_I 3
269 #define CORE_CIGAR_OPT_B 4
270 #define CORE_CIGAR_OPT_N 5
271 #define CORE_CIGAR_OPT_BB 6
272 #define CORE_CIGAR_OPT_NN 7
273 
274 #define CORE_PROGRAM_SUBREAD 100
275 #define CORE_PROGRAM_SUBJUNC 200
276 #define CORE_PROGRAM_SUBINDEL 1000
277 
278 #define CORE_EXPERIMENT_DNASEQ 1000
279 #define CORE_EXPERIMENT_RNASEQ 2000
280 
281 #define PRINT_BOX_WRAPPED 4
282 #define PRINT_BOX_NOCOLOR_FOR_COLON 2
283 #define PRINT_BOX_CENTER 1
284 
285 typedef struct{
286 	unsigned int event_small_side;	// the last base before the event
287 	unsigned int event_large_side;	// the first base after the event
288 					// for exon-exon junctions, these two numbers are the last and first base in the exons. (ZERO-BASED INDEX)
289 					// for indels, the smaller side is the base before in inserted/deleted bases; the larger side is the base after them.
290 					// for a fusion, the two sides are the base before the fusion point (the fusion point is not a base, but a vertical bar between two bases -- two sides.)
291 					//
292 					//
293 					//					The 'V' marked bases are the two sides.
294 					//                                      V              V
295 					//    Exon-exon junction:    100 .......s|gt........ag|s....... 300	chrX
296 					//					 ^            ^   These two bars are the junction point.
297 					//
298 					//
299 					//
300 					//    DNA-fusions:
301 					//
302 					//    small_side ===>  .|.  <=== large_side		normal arrangement (N/Y)
303 					//
304 					//    small_side ===>  .|
305 					//    large_side ===>  .|				strand jumpped (N/N)
306 					//
307 					//    		        |.  <=== small_side		strand jumpped (Y/Y)
308 					//    		        |.  <=== large_side
309 					//
310 					//                      ^ This is fusion point. It is not a base, but a vertical bar between two sides.
311 					//
312 					//
313 					//
314 					// note: the fusion point is larger than event_small_side if small_side_increasing_coordinate == N; it is smaller than event_small_side if small_side_increasing_coordinate == Y;
315 					//       the fusion point is smaller than event_large_side if large_side_increasing_coordinate == Y; it is larger than event_large_side if large_side_increasing_coordinate == N.
316 
317 	short indel_length;
318 	short junction_flanking_left;
319 	short junction_flanking_right;
320 
321 	char indel_at_junction;
322 	char is_negative_strand;	// this only works to junction detection, according to 'GT/AG' or 'CT/AC' donors. This only applys to junctions.
323 	char is_strand_jumped;		// "strand jumped" means that the left and right sides are on different strands. This only applys to fusions.
324 	char is_donor_found_or_annotation;		// only for junctions: GT/AG is found at the location. 1: found, 0:not found: 64: from annotation (thus unknown)
325 						// Also, if "is_strand_jumped" is true, all coordinates (e.g., splicing points, cover_start, cover_end, etc) are on "reversed read" view.
326 
327 	char small_side_increasing_coordinate;
328 	char large_side_increasing_coordinate;	// normal exon-exon junctions must have N and Y on small/large increasing coordinates.
329 
330 	//char is_ambiguous;
331 	char connected_next_event_distance;	// the distance (negative or positive) to the next event in the table. For example, if the cigar string is 10M3I1M1I10M, event "3I" will have 1 here .
332 	char connected_previous_event_distance;	// the distance (negative or positive) to the next event in the table. For example, if the cigar string is 10M3I1M1I10M, event "1I" will have 1 here.
333 
334 	//char inserted_bases[(1+MAX_INSERTION_LENGTH) / 4 + 1];
335 	char * inserted_bases;
336 	unsigned short supporting_reads;
337 	unsigned short anti_supporting_reads;
338 	unsigned short final_counted_reads;
339 	unsigned short final_reads_mismatches;
340 	unsigned char event_type;
341 	unsigned int global_event_id;
342 	float event_quality;
343 
344 	unsigned long long critical_read_id;
345 	int critical_supporting_reads;
346 } chromosome_event_t;
347 
348 
349 
350 typedef struct
351 {
352 
353 	unsigned int selected_position;
354 	short result_flags;
355 	short read_length;
356 	// 4 bytes
357 	gene_vote_number_t selected_votes;
358 	gene_vote_number_t used_subreads_in_vote;
359 	unsigned char noninformative_subreads_in_vote;
360 	// this coverage is the range on reads, in point of view of "main piece" strand (i.e., "is_negative_strand")
361 	char indels_in_confident_coverage;
362 	char is_fully_covered;
363 
364 	gene_vote_number_t selected_indel_record [MAX_INDEL_SECTIONS*3 + 1];
365 	unsigned short confident_coverage_start;
366 	unsigned short confident_coverage_end;
367 
368 	short subread_quality;
369 
370 } mapping_result_t;
371 
372 typedef struct{
373 	mapping_result_t * mapping_result;
374 	unsigned int first_base_position;
375 	char cigar_string[CORE_MAX_CIGAR_STR_LEN];
376 	chromosome_event_t * supporting_chromosome_events[MAX_EVENTS_IN_READ];
377 	short flanking_size_left[MAX_EVENTS_IN_READ];
378 	short flanking_size_right[MAX_EVENTS_IN_READ];
379 	char crirical_support[MAX_EVENTS_IN_READ];
380 
381 	char first_base_is_jumpped;
382 	short final_mismatched_bases;
383 	short final_matched_bases;
384 	short best_second_diff_bases;
385 	short realign_flags;
386 	short final_quality;
387 	short chromosomal_length;
388 	short MAPQ_adjustment;
389 	int known_junction_supp;
390 	int final_penalty;
391 } realignment_result_t;
392 
393 #define BUCKETED_TABLE_INIT_ITEMS 3
394 #define FRAGMENT_LIST_INIT_ITEMS 3
395 
396 
397 typedef struct
398 {
399 	short split_point;
400 	gene_vote_number_t minor_votes;
401 	char  double_indel_offset;
402 	char indel_at_junction;
403 	char small_side_increasing_coordinate;
404 	char large_side_increasing_coordinate;
405 	unsigned int minor_position;
406 	// this coverage is the range on reads, in point of view of "main piece" strand
407 	unsigned short minor_coverage_start;
408 	unsigned short minor_coverage_end;
409 
410 } subjunc_result_t;
411 
412 // THREE_TOP_UPDATE dictates the number of top-votes.
413 
414 typedef struct {
415 	int is_vote_t_item;
416 	int item_index_i;
417 	int item_index_j;
418 	unsigned int mapping_position;
419 	int major_half_votes;
420 	unsigned short read_start_base;
421 }simple_mapping_t;
422 
423 typedef struct{
424 	simple_mapping_t * r1_loc;
425 	simple_mapping_t * r2_loc;
426 
427 	unsigned long long score_adj;
428 } vote_combination_t;
429 
430 typedef struct{
431 	unsigned long long maxinum_read_number;
432 	int result_chunk_fd;
433 	int junction_chunk_fd;
434 	int big_margin_chunk_fd;
435 
436 	mapping_result_t * result_chunk_addr;
437 	subjunc_result_t * junction_chunk_addr;
438 	unsigned short * big_margin_chunk_addr;
439 
440 	subread_lock_t resize_lock;
441 } bigtable_t;
442 
443 
444 // this cached item is for an entire read (not read pair, not one best result)
445 typedef struct {
446 	int status;
447 	subread_read_number_t pair_number;
448 	int is_second_read;
449 
450 	unsigned short big_margin_data[9*3];
451 	mapping_result_t * alignment_res;
452 	subjunc_result_t * subjunc_res;
453 
454 } bigtable_cached_result_t;
455 
456 
457 typedef struct{
458 	void * junction_tmp_r1;
459 	void * junction_tmp_r2;
460 
461 	void * alignment_tmp_r1;
462 	void * alignment_tmp_r2;
463 
464 	void * vote_simple_1_buffer;
465 	void * vote_simple_2_buffer;
466 
467 	void * comb_buffer;
468 } topK_buffer_t;
469 
470 typedef struct{
471 	unsigned long long all_correct_PE_reads;
472 	int thread_id;
473 	pthread_t thread;
474 
475 	// module_thread_context is different to module_context, though some contents in module_thread_context could be also in module context (e.g., junction tables)
476 	// modules functions may deside to use objects in which context.
477 	void * module_thread_contexts[5];
478 	gene_value_index_t * current_value_index;
479 
480 	subread_lock_t output_lock;
481 	unsigned int all_mapped_reads;
482 	unsigned int not_properly_pairs_wrong_arrangement;
483 	unsigned int not_properly_pairs_different_chro;
484 	unsigned int not_properly_different_strands;
485 	unsigned int not_properly_pairs_TLEN_wrong;
486 	unsigned int all_unmapped_reads;
487 	unsigned int not_properly_pairs_only_one_end_mapped;
488 	unsigned int all_multimapping_reads;
489 	unsigned int all_uniquely_mapped_reads;
490 
491 	topK_buffer_t topKbuff;
492 } thread_context_t;
493 
494 
495 typedef struct{
496 	// basic running configuration
497 	configuration_t config;
498 
499 	// for the index
500 	gehash_t * current_index;
501 	gene_value_index_t * current_value_index;
502 	gene_value_index_t all_value_indexes[100];
503 	int index_block_number;
504 	int current_index_block_number;
505 	int will_remove_input_file;
506 	int is_phred_warning;
507 	int is_final_voting_run;
508 
509 	// global locks
510 	subread_lock_t thread_initial_lock;
511 
512 	// global FILE*
513 	SamBam_Writer * output_bam_writer;
514 	FILE * output_sam_fp;
515 	FILE * long_insertion_FASTA_fp;
516 	char * output_sam_inner_buffer;
517 	int output_sam_is_full;
518 
519 	// running contexts
520 	void * module_contexts[5];
521 	thread_context_t * all_thread_contexts;
522 	subread_read_number_t last_written_fragment_number;
523 	int need_merge_buffer_now;
524 	read_input_t input_reads;
525 	bigtable_t bigtable;
526 	int rebuilt_command_line_size;
527 	char * rebuilt_command_line;
528 
529 	subread_lock_t bigtable_lock;
530 	subread_lock_t output_lock;
531 	int bigtable_cache_size;
532 	FILE * bigtable_cache_file_fp;
533 	long long bigtable_cache_file_loaded_fragments_begin;
534 	long long bigtable_cache_file_fragments;
535 	bigtable_cached_result_t * bigtable_cache;
536 	char *bigtable_cache_malloc_ptr, *bigtable_cache_malloc_ptr_junc;
537 	unsigned int bigtable_chunked_fragments;
538 	int bigtable_dirty_data;
539 
540 	gene_offset_t chromosome_table;
541 	double start_time;
542 	double align_start_time;
543 	double timecost_load_index;
544 	double timecost_voting;
545 	double timecost_before_realign;
546 	double timecost_for_realign;
547 
548 	unsigned long long expected_TLEN_sum;
549 	unsigned int expected_TLEN_read_numbers;
550 	unsigned long long all_processed_reads;
551 	unsigned long long all_correct_PE_reads;
552 	unsigned int all_junctions;
553 	unsigned int all_fusions;
554 	unsigned int all_indels;
555 
556 	unsigned int all_mapped_reads;
557 	unsigned int not_properly_pairs_wrong_arrangement;
558 	unsigned int not_properly_pairs_different_chro;
559 	unsigned int not_properly_different_strands;
560 	unsigned int not_properly_pairs_TLEN_wrong;
561 	unsigned int all_unmapped_reads;
562 	unsigned int not_properly_pairs_only_one_end_mapped;
563 	unsigned int all_multimapping_reads;
564 	unsigned int all_uniquely_mapped_reads;
565 
566 	unsigned long long current_circle_start_abs_offset_file1;
567 	gene_inputfile_position_t current_circle_start_position_file1;
568 	gene_inputfile_position_t current_circle_start_position_file2;
569 	gene_inputfile_position_t current_circle_end_position_file1;
570 	gene_inputfile_position_t current_circle_end_position_file2;
571 	subread_read_number_t processed_reads_in_chunk;
572 	subread_read_number_t running_processed_reads_in_chunk;
573 
574 	// sunfusion structural variance
575 	bucketed_table_t funky_table_BC;
576 	bucketed_table_t funky_table_DE;
577 	fragment_list_t funky_list_A;
578 	fragment_list_t funky_list_DE;
579 
580 	bucketed_table_t breakpoint_table_P;
581 	bucketed_table_t breakpoint_table_QR;
582 	bucketed_table_t breakpoint_table_YZ;
583 
584 	bucketed_table_t translocation_result_table;
585 	bucketed_table_t inversion_result_table;
586 	topK_buffer_t topKbuff;
587 
588 	// per chunk parameters
589 	subread_read_number_t read_block_start;
590 	char * exonic_region_bitmap;
591 	HashTable * sam_chro_to_anno_chr_alias;
592 	HashTable * annotation_chro_table;
593 } global_context_t;
594 
595 
596 #define MODULE_INDEL_ID 0
597 #define MODULE_JUNCTION_ID 1
598 
599 #define STEP_VOTING 10
600 #define STEP_ITERATION_ONE 20
601 #define STEP_ITERATION_TWO 30
602 #define STEP_ITERATION_THREE 40
603 #define STEP_WRITE_CHUNK_RESULTS 90
604 
605 #define MEMORY_OPTIMISATION_LAPTOP 10
606 #define MEMORY_OPTIMISATION_DESKTOP 20
607 #define MEMORY_OPTIMISATION_SUPERCOMPUTER 100
608 
609 // This function initialise the module contexts that are needed according to the running configuration.
610 int init_modules(global_context_t * context);
611 
612 // This function undo any memory allocation and close opened files in init_modules.
613 int destroy_modules(global_context_t * context);
614 
615 // This function shows welcome messages and shows the current configuration to stdout.
616 // This function also checks the legality of the parameters in context->config
617 int print_configuration(global_context_t * context);
618 
619 // This function opens all related files except the index. It however counts the piece number of the index.
620 int load_global_context(global_context_t * context);
621 
622 // This function deallocates memory and closes files that were allowcated and opened in load_global_context.
623 int destroy_global_context(global_context_t * context);
624 
625 
626 // This function processes the read chunks (24M reads each), including three steps: 1, voting; 2, iteration one; 3, iteration two.
627 int read_chunk_circles(global_context_t * context);
628 
629 
630 // write the final results in each module. Note that the SAM file is writen in read_chunk_circles, after each read chunk is processed.
631 int write_final_results(global_context_t * context);
632 
633 // this function is called by an interface to start the main process with a customized opt parser.
634 int core_main(int argc , char ** argv, int (parse_opts (int , char **, global_context_t * )));
635 
636 // decompress a cigar string to cigar
637 // cigar_len is the maximum length of decompressed cigar
638 // bincigar is the maximum length of compressed cigar
639 // it returns the length of decompressed cigar, or -1 if buffer is too short.
640 int bincigar2cigar(char * cigar, int cigar_len, char * bincigar, int bincigar_max_len, int read_len);
641 
642 // compress a cigar string to bincigar
643 // bincigar_len is the maximum length of compressed cigar
644 // it returns the length of compressed cigar, or -1 if buffer is too short.
645 int cigar2bincigar(char *cigar, char *bincigar, int bincigar_len);
646 
647 // print the logo of our program
648 void print_subread_logo();
649 
650 // print a line in the box
651 void print_in_box(int line_width, int is_boundary, int options, char * pattern,...);
652 
653 // find the value index covering this read
654 // it returns NULL if no index is found.
655 gene_value_index_t * find_current_value_index(global_context_t * global_context, unsigned int pos, int len);
656 
657 // generate the time string
658 void char_strftime(char * tbuf);
659 
660 int term_strncpy(char * dst, char * src, int max_dst_memory);
661 
662 int is_result_in_PE(mapping_result_t * aln);
663 
664 void core_version_number(char * program);
665 
666 mapping_result_t * _global_retrieve_alignment_ptr(global_context_t * global_context, subread_read_number_t pair_number, int is_second_read, int best_read_id);
667 subjunc_result_t * _global_retrieve_subjunc_ptr(global_context_t * global_context, subread_read_number_t pair_number, int is_second_read, int best_read_id);
668 unsigned short * _global_retrieve_big_margin_ptr(global_context_t * global_context, subread_read_number_t pair_number, subread_read_number_t is_second_read);
669 
670 // This assumes the first part of Cigar has differet strandness to the main part of the cigar.
671 // Pos is the LAST WANTED BASE location before the first strand jump (split by 'b' or 'n').
672 // The first base in the read actually has a larger coordinate than Pos.
673 unsigned int reverse_cigar(unsigned int pos, char * cigar, char * new_cigar);
674 
675 int chimeric_cigar_parts(global_context_t * global_context , unsigned int sel_pos, int is_first_section_negative_strand, int is_first_section_reversed, char * in_cigar, unsigned int * out_poses, char ** out_cigars, char * out_strands, int read_len, short * out_read_lens, char * read_name);
676 
677 void warning_file_limit();
678 void quick_sort(void * arr,int arr_size, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r));
679 void basic_sort(void * arr,int arr_size, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r));
680 
681 // L_Minus_R should return -1, 0 or 1 when L<R, L==R or L>R.
682 // The result is from Small to Large.
683 void merge_sort(void * arr, int arr_size, int L_Minus_R (void * arr, int l, int r), void exchange(void * arr, int l, int r), void merge_SmallFirst(void * arr, int start, int items, int items2));
684 
685 void absoffset_to_posstr(global_context_t * global_context, unsigned int pos, char * res);
686 
687 void test_PE_and_same_chro(global_context_t * global_context , unsigned int pos1, unsigned int pos2, int * is_PE_distance, int * is_same_chromosome , int rlen1, int rlen2);
688 
689 int FIXLENstrcmp(char * fixed_len, char * rname);
690 
691 int is_valid_digit(char * optarg, char * optname);
692 int is_valid_digit_range(char * optarg, char * optname, int min, int max_inc);
693 int is_valid_float(char * optarg, char * optname);
694 int exec_cmd(char * cmd, char * outstr, int out_limit);
695 int is_pos_in_annotated_exon_regions(global_context_t * global_context, unsigned int pos);
696 char * get_sam_chro_name_from_alias(HashTable * tab, char * anno_chro);
697 void subread_rebuild_cmd(int argc, char ** argv, global_context_t * global_context);
698 #endif
699