1 /***************************************************************
2 
3    The Subread software package is free software package:
4    you can redistribute it and/or modify it under the terms
5    of the GNU General Public License as published by the
6    Free Software Foundation, either version 3 of the License,
7    or (at your option) any later version.
8 
9    Subread is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty
11    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 
13    See the GNU General Public License for more details.
14 
15    Authors: Drs Yang Liao and Wei Shi
16 
17   ***************************************************************/
18 
19 
20 #ifndef _SORTED_HASHTABLE_H_
21 #define _SORTED_HASHTABLE_H_
22 #include <stdlib.h>
23 #include <stdio.h>
24 
25 #include "subread.h"
26 #include "gene-algorithms.h"
27 #include "gene-value-index.h"
28 
29 #define GEHASH_DEFAULT_SIZE	2000000000
30 #define GEHASH_BUCKET_LENGTH 100
31 
32 #define gehash_fast_t gehash_t
33 #define gehash_destory_fast gehash_destory
34 
35 
36 // This function creates a new hash table. The invoter may provide the expected size of the table or -1 for a default size (2 billions)
37 // This function returns 0 if success or an errno
38 // The version of the hash table created by this function must be SUBINDEX_VER0.
39 int gehash_create(gehash_t * the_table, size_t expected_size, char is_small_table);
40 
41 // The EX version creates a hashtable with the given version number
42 int gehash_create_ex(gehash_t * the_table, size_t expected_size, char is_small_table, int version_number, int GENE_SLIDING_STEP, int padding);
43 
44 // This function puts a data item into the table. If there is duplication, it insert another copy into the table but do not overlap on the old one.
45 int gehash_insert(gehash_t * the_table, gehash_key_t key, gehash_data_t data, unsigned int * bucket_sizes);
46 void gehash_try_insert_measure(unsigned int * bucket_sizes, int bucket_no, gehash_key_t key);
47 
48 // This function does what gehash_insert does, but insert nothing if the key has occured max_key_occurance times.
49 int gehash_insert_limited(gehash_t * the_table, gehash_key_t key, gehash_data_t data, int max_key_occurance, int prob_replace);
50 
51 // This function queries the table and put the matched data item into data_result.
52 // This function returns 0 if not found, or the number of matched items.
53 // The invoter is in charge of allocating memory for results.
54 size_t gehash_get(gehash_t * the_table, gehash_key_t key, gehash_data_t * data_result, size_t max_result_space);
55 
56 // Test existance, disregarding numbers.
57 // Return 1 if exist, 0 if not.
58 int gehash_exist(gehash_t * the_table, gehash_key_t key);
59 
60 
61 size_t gehash_go_q_CtoT(gehash_t * the_table, gehash_key_t key, int offset, int read_len, int is_reversed, gene_vote_t * vote,gene_vote_number_t weight, int max_match_number, int indel_tolerance, int subread_number,int max_error_bases, unsigned int low_border, unsigned int high_border);
62 
63 size_t gehash_go_q_tolerable(gehash_t * the_table, gehash_key_t key, int offset, int read_len, int is_reversed, gene_vote_t * vote, gene_vote_number_t weight, gene_quality_score_t quality, int max_match_number, int indel_tolerance, int subread_number,int max_error_bases, int subread_len, unsigned int low_border, unsigned int high_border);
64 
65 size_t gehash_go_q(gehash_t * the_table, gehash_key_t key, int offset, int read_len, int is_reversed, gene_vote_t * vote,int indel_tolerance, int subread_number, unsigned int low_border, unsigned int high_border);
66 size_t gehash_go_X(gehash_t * the_table, gehash_key_t key, int offset, int read_len, int is_reversed, gene_vote_t * vote,int indel_tolerance, int subread_number, unsigned int low_border, unsigned int high_border, int run_round, unsigned int * shift_indel_locations, unsigned int * shift_indel_NO);
67 
68 // This function performs the same functionality, but runs only on AMD-64 cpus, and the length of each key must be 4 bytes.
69 size_t gehash_get_hpc(gehash_t * the_table, gehash_key_t key, gehash_data_t * data_result, size_t max_result_space);
70 
71 // This function removes all items under the key. It returns the number of items that has been removed in this call.
72 size_t gehash_remove(gehash_t * the_table, gehash_key_t key);
73 
74 // Free all memory that is allocated for the table. Only the table structure itself is not freed.
75 void gehash_destory(gehash_t * the_table);
76 
77 // This function conpletely dumps a table into a disk file.
78 // It returns 0 if success, otherwise -1.
79 int gehash_dump(gehash_t * the_table, const char fname []);
80 
81 void finalise_vote(gene_vote_t * vote);
82 
83 // This function loads a dumpped hash table.
84 // The invoker does not need to initialise the table; it will be initialised in the function.
85 // It returns 0 if success, otherwise -1.
86 int gehash_load(gehash_t * the_table, const char fname []);
87 
88 void gehash_prealloc(gehash_t * the_table);
89 
90 size_t gehash_update(gehash_t * the_table, gehash_key_t key, gehash_data_t data_new);
91 
92 short indel_recorder_copy(gene_vote_number_t *dst, gene_vote_number_t* src);
93 
94 void assign_best_vote(gene_vote_t * vote, int i, int j);
95 
96 void select_best_vote(gene_vote_t * vote);
97 void gehash_sort(gehash_t * the_table);
98 int gehash_load_option(const char fname [], int option_no, int * result);
99 
100 // calculate # of buckets for estimaing their sizes.
101 unsigned int calculate_buckets_by_size(size_t exp_size, int version, int is_small_tab, int index_gap);
102 #endif
103