1 /*************************************************************** 2 3 The Subread software package is free software package: 4 you can redistribute it and/or modify it under the terms 5 of the GNU General Public License as published by the 6 Free Software Foundation, either version 3 of the License, 7 or (at your option) any later version. 8 9 Subread is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty 11 of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 12 13 See the GNU General Public License for more details. 14 15 Authors: Drs Yang Liao and Wei Shi 16 17 ***************************************************************/ 18 19 20 #ifndef _SORTED_HASHTABLE_H_ 21 #define _SORTED_HASHTABLE_H_ 22 #include <stdlib.h> 23 #include <stdio.h> 24 25 #include "subread.h" 26 #include "gene-algorithms.h" 27 #include "gene-value-index.h" 28 29 #define GEHASH_DEFAULT_SIZE 2000000000 30 #define GEHASH_BUCKET_LENGTH 100 31 32 #define gehash_fast_t gehash_t 33 #define gehash_destory_fast gehash_destory 34 35 36 // This function creates a new hash table. The invoter may provide the expected size of the table or -1 for a default size (2 billions) 37 // This function returns 0 if success or an errno 38 // The version of the hash table created by this function must be SUBINDEX_VER0. 39 int gehash_create(gehash_t * the_table, size_t expected_size, char is_small_table); 40 41 // The EX version creates a hashtable with the given version number 42 int gehash_create_ex(gehash_t * the_table, size_t expected_size, char is_small_table, int version_number, int GENE_SLIDING_STEP, int padding); 43 44 // This function puts a data item into the table. If there is duplication, it insert another copy into the table but do not overlap on the old one. 45 int gehash_insert(gehash_t * the_table, gehash_key_t key, gehash_data_t data, unsigned int * bucket_sizes); 46 void gehash_try_insert_measure(unsigned int * bucket_sizes, int bucket_no, gehash_key_t key); 47 48 // This function does what gehash_insert does, but insert nothing if the key has occured max_key_occurance times. 49 int gehash_insert_limited(gehash_t * the_table, gehash_key_t key, gehash_data_t data, int max_key_occurance, int prob_replace); 50 51 // This function queries the table and put the matched data item into data_result. 52 // This function returns 0 if not found, or the number of matched items. 53 // The invoter is in charge of allocating memory for results. 54 size_t gehash_get(gehash_t * the_table, gehash_key_t key, gehash_data_t * data_result, size_t max_result_space); 55 56 // Test existance, disregarding numbers. 57 // Return 1 if exist, 0 if not. 58 int gehash_exist(gehash_t * the_table, gehash_key_t key); 59 60 61 size_t gehash_go_q_CtoT(gehash_t * the_table, gehash_key_t key, int offset, int read_len, int is_reversed, gene_vote_t * vote,gene_vote_number_t weight, int max_match_number, int indel_tolerance, int subread_number,int max_error_bases, unsigned int low_border, unsigned int high_border); 62 63 size_t gehash_go_q_tolerable(gehash_t * the_table, gehash_key_t key, int offset, int read_len, int is_reversed, gene_vote_t * vote, gene_vote_number_t weight, gene_quality_score_t quality, int max_match_number, int indel_tolerance, int subread_number,int max_error_bases, int subread_len, unsigned int low_border, unsigned int high_border); 64 65 size_t gehash_go_q(gehash_t * the_table, gehash_key_t key, int offset, int read_len, int is_reversed, gene_vote_t * vote,int indel_tolerance, int subread_number, unsigned int low_border, unsigned int high_border); 66 size_t gehash_go_X(gehash_t * the_table, gehash_key_t key, int offset, int read_len, int is_reversed, gene_vote_t * vote,int indel_tolerance, int subread_number, unsigned int low_border, unsigned int high_border, int run_round, unsigned int * shift_indel_locations, unsigned int * shift_indel_NO); 67 68 // This function performs the same functionality, but runs only on AMD-64 cpus, and the length of each key must be 4 bytes. 69 size_t gehash_get_hpc(gehash_t * the_table, gehash_key_t key, gehash_data_t * data_result, size_t max_result_space); 70 71 // This function removes all items under the key. It returns the number of items that has been removed in this call. 72 size_t gehash_remove(gehash_t * the_table, gehash_key_t key); 73 74 // Free all memory that is allocated for the table. Only the table structure itself is not freed. 75 void gehash_destory(gehash_t * the_table); 76 77 // This function conpletely dumps a table into a disk file. 78 // It returns 0 if success, otherwise -1. 79 int gehash_dump(gehash_t * the_table, const char fname []); 80 81 void finalise_vote(gene_vote_t * vote); 82 83 // This function loads a dumpped hash table. 84 // The invoker does not need to initialise the table; it will be initialised in the function. 85 // It returns 0 if success, otherwise -1. 86 int gehash_load(gehash_t * the_table, const char fname []); 87 88 void gehash_prealloc(gehash_t * the_table); 89 90 size_t gehash_update(gehash_t * the_table, gehash_key_t key, gehash_data_t data_new); 91 92 short indel_recorder_copy(gene_vote_number_t *dst, gene_vote_number_t* src); 93 94 void assign_best_vote(gene_vote_t * vote, int i, int j); 95 96 void select_best_vote(gene_vote_t * vote); 97 void gehash_sort(gehash_t * the_table); 98 int gehash_load_option(const char fname [], int option_no, int * result); 99 100 // calculate # of buckets for estimaing their sizes. 101 unsigned int calculate_buckets_by_size(size_t exp_size, int version, int is_small_tab, int index_gap); 102 #endif 103