1 /*************************************************************** 2 3 The Subread software package is free software package: 4 you can redistribute it and/or modify it under the terms 5 of the GNU General Public License as published by the 6 Free Software Foundation, either version 3 of the License, 7 or (at your option) any later version. 8 9 Subread is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty 11 of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 12 13 See the GNU General Public License for more details. 14 15 Authors: Drs Yang Liao and Wei Shi 16 17 ***************************************************************/ 18 19 20 #ifndef _GENE_ALGORITHMS_H_ 21 #define _GENE_ALGORITHMS_H_ 22 23 #include "subread.h" 24 #include "hashtable.h" 25 #include "sorted-hashtable.h" 26 27 // Load the ".reads" file into the memory 28 // Return 0 if succeed or -1 if errors. 29 int load_offsets(gene_offset_t* offsets , const char index_prefix []); 30 void destroy_offsets(gene_offset_t* offsets); 31 32 // Locate the position of a linear address 33 // Return 0 if the linear position is in a reasonable range or -1 if it is out of range. 34 // The pointer to the name of the chromosome is put into chro_name, and the position in this chromosome is in pos. 35 int locate_gene_position(unsigned int linear, const gene_offset_t* offsets , char ** chro_name, int * pos); 36 int locate_gene_position_max(unsigned int linear, const gene_offset_t* offsets , char ** chro_name, int * pos, int * head_cut, int * tail_cut, int rl); 37 38 unsigned int linear_gene_position(const gene_offset_t* offsets , char *chro_name, unsigned int chro_pos); 39 40 int remove_repeated_reads(gehash_t * table, gehash_t * huge_table,int index_threshold); 41 42 #define init_gene_vote(a) {memset((a)->items, 0, GENE_VOTE_TABLE_SIZE*sizeof( *((a)->items))); (a)->max_vote = 0; (a) -> max_indel_recorder[0]=0; (a)->max_tmp_indel_recorder = NULL; (a)->max_mask = 0; (a) -> noninformative_subreads = 0;} 43 // return current votes for a given position 44 // if create_new_pos == 0 then do not take this position if it does not exist in the vote array 45 //void add_gene_vote(gene_vote_t* vote, int position, int create_new_pos); 46 //void add_gene_vote_weighted(gene_vote_t* vote, int position, int create_new_pos, int w); 47 48 // return the votes; the position is put into position_result 49 int max_gene_vote(gene_vote_t* vote, int * position_result, int query_id); 50 51 // evaluate the number of matched characters in the piece_str 52 int evaluate_piece(char * piece_str, int chromosome, int offset, int is_counterpart, int start_p, int end_p); 53 54 // These two functions maintain the global vote for pieces 55 int init_allvote(gene_allvote_t* allvote, int expected_len, int allowed_indels); 56 void clear_allvote(gene_allvote_t* allvote); 57 58 59 void add_allvote_q(gene_allvote_t* allvote,int qid , int pos, gene_vote_number_t votes, gene_quality_score_t quality, int is_counterpart, short mask, char * max_indel_recorder, gene_value_index_t * array_index, char * read_txt, int read_len, int max_indel, int total_subreads, int space_type, int report_junction, int is_head_high_quality, char * qual_txt, int phred_version, char span_coverage, short **dynamic_programming_short , char** dynamic_programming_char); 60 61 unsigned char get_next_char(FILE * fp); 62 63 extern double begin_ftime; 64 65 void print_text_scrolling_bar(char * hint, float percentage, int width, int * internal_counter); 66 void print_window_scrolling_bar(char * hint, float percentage, int width, int * internal_counter); 67 68 void print_running_log(double finished_rate, double read_per_second, double expected_seconds, unsigned long long int total_reads, int is_pair) ; 69 70 71 int select_positions(gene_vote_t * vote_read1, gene_vote_t * vote_read2, gene_vote_number_t * numvote_read1, gene_vote_number_t * numvote_read2, gene_quality_score_t * sum_quality, gene_quality_score_t * qual_r1, gene_quality_score_t * qual_r2, gehash_data_t * pos_read1, gehash_data_t * pos_read2, gene_vote_number_t * read1_indel_recorder, gene_vote_number_t* read2_indel_recorder, unsigned int max_pair_dist, unsigned int min_pair_dist, int min_major, int min_minor, int is_negatve, int number_of_anchors_quality, int max_indel_len, int * is_breakeven); 72 73 74 int select_positions_array(char * read1_str, int read1_len, char * read2_str, int read2_len, gene_vote_t * vote_read1, gene_vote_t * vote_read2, gene_vote_number_t * numvote_read1, gene_vote_number_t * numvote_read2, gene_quality_score_t * sum_quality, gene_quality_score_t * q_r1, gene_quality_score_t *q_r2, gene_vote_number_t * indel_rec_r1, gene_vote_number_t * indel_rec_r2, gehash_data_t * pos_read1, gehash_data_t * pos_read2, unsigned int max_pair_dest, unsigned int min_pair_dest, int min_major, int min_minor,int is_negative_strand, gene_value_index_t * my_array_index, int color_space, int indel_tolerance, int number_of_anchors_quality, const char quality_str1 [], const char quality_str2 [], int quality_scale, int max_indel_len, int * is_breakeven); 75 76 77 #define QUALITY_SCALE_NONE 0 78 #define QUALITY_SCALE_LINEAR 1 79 #define QUALITY_SCALE_LOG 2 80 81 82 // This function returns the lowest score of phred in the given 16-byte quality string 83 // Assuming that the score = char - '@' 84 gene_quality_score_t get_subread_quality(const char * quality_str, const char * read_str, int quality_scale, int phred_version); 85 86 gene_vote_number_t calculate_penalty_score(const char * quality_str, const char * read_str, int quality_scale, int base_matchness); 87 88 int is_valid_subread(const char * read_str); 89 90 void final_matchingness_scoring(const char read_str[], const char quality_str[], int read_len, gene_vote_t * vote, gehash_data_t * max_position, gene_vote_number_t * max_vote, short *max_mask, gene_quality_score_t * max_quality, gene_value_index_t * my_array_index, int space_type, int indel_tolerance, int quality_scale, gene_vote_number_t * max_indel_recorder, int * max_coverage_start, int * max_coverage_end); 91 92 float match_read(const char read_str[], int read_len, unsigned int potential_position, gene_value_index_t * my_array_index, int space_type, int indel_tolerance, const char quality_str [], int quality_scale) ; 93 94 void show_cigar(char * info, int len, int is_reversed_map, char * buf, int max_indel, int total_subreads, char * read, int * pos_offset, int * adjust_rl); 95 96 int dynamic_align(char * read, int read_len, gene_value_index_t * index, unsigned int begin_position, int max_indel, char * movement_buffer, int expected_offset,int begin_read_offset, int end_read_offset, short **dynamic_programming_short , char** dynamic_programming_char ); 97 98 int window_indel_align(char * read, int read_len, gene_value_index_t * index, unsigned int begin_position, int max_indel, char * movement_buffer, int expected_offset,int begin_read_offset, int end_read_offset); 99 100 void explain_indel_in_middle(gene_allvote_t* allvote, int qid , int pos, char * max_indel_recorder, gene_value_index_t * array_index, char * read_txt, int read_len, int max_indel, int total_subreads, int head_start_point, int tail_end_point, int head_indel_movement, int tail_indel_movement, int find_junctions, short **dynamic_programming_short , char** dynamic_programming_char); 101 102 void find_and_explain_indel(gene_allvote_t* allvote,int qid , int pos, gene_vote_number_t votes, gene_quality_score_t quality, int is_counterpart, char mask, char * max_indel_recorder, gene_value_index_t * array_index, char * read_txt, int read_len, int max_indel, int total_subreads, int space_type,int find_junctions, int is_head_high_quality, char * qual_txt, int phred_version, short** dynamic_programming_short , char **dynamic_programming_char ); 103 104 int extend_covered_region(gene_value_index_t *array_index, unsigned int read_start_pos, char * read, int read_len, int cover_start, int cover_end, int window_size, int match_req_5end, int match_req_3end, int indel_tolerance, int space_type, int tail_indel, short * head_indel_pos, int * head_indel_movement, short * tail_indel_pos, int * tail_indel_movement, int is_head_high_quality, char * qual_str, int qual_format, float head_matching_rate, float tail_matching_rate); 105 106 float final_mapping_quality(gene_value_index_t *array_index, unsigned int pos, char * read_txt, char * qual_txt, char * cigar_txt, int phred_version, int * mismatch, int rl, char * refined_cigar, unsigned int * new_pos); 107 108 int bad_quality_base_number(char * qualityb, int rl , int format); 109 110 int min_matched_bases(char * qualityb, int rl , int format, float match_score); 111 112 float read_quality_score(char * qualityb, int rl , int format); 113 114 void compress_cigar(char * cigar, int read_len, char *read, int * position_offset, int * adjust_rl); 115 116 void print_votes(gene_vote_t * vote, char *index_prefix); 117 118 void destory_allvote(gene_allvote_t* allvote); 119 120 float get_base_error_prob33(char v); 121 float get_base_error_prob64(char v); 122 123 124 int get_base_error_prob33i(char v); 125 int get_base_error_prob64i(char v); 126 127 void add_repeated_numbers(int qid, gene_vote_t * vote, unsigned char * repeated_regions); 128 void bad_reverse_cigar(char * cigar); 129 130 void subread_lock_occupy(subread_lock_t * lock); 131 void subread_init_lock(subread_lock_t * lock); 132 void subread_destroy_lock(subread_lock_t * lock); 133 void subread_lock_release(subread_lock_t * lock); 134 void remove_indel_neighbours(HashTable * indel_table); 135 float match_base_quality(gene_value_index_t *array_index, char * read_txt, unsigned int pos, char * qual_txt, int read_len, int is_negative, int phred_version, int * high_qual_unmatch, int * all_MM, int ql_kill, int head_clipped, int tail_clipped); 136 int match_chro_indel(char * read, gene_value_index_t * index, unsigned int pos, int test_len, int is_negative_strand, int space_type, int indel_size, gene_vote_number_t * indel_recorder, int total_subreads); 137 float match_base_quality_cs(gene_value_index_t *array_index, char * read_txt, unsigned int pos, char * qual_txt, int read_len, int phred_version, int * high_qual_unmatch, int * all_MM, int ql_kill, int head_clipped, int tail_clipped); 138 void print_version_info(); 139 140 int fc_strcmp_chro(const void * s1, const void * s2); 141 srUInt_64 fc_chro_hash(const void *key) ; 142 143 #endif 144 145