1 /***************************************************************
2
3 The Subread software package is free software package:
4 you can redistribute it and/or modify it under the terms
5 of the GNU General Public License as published by the
6 Free Software Foundation, either version 3 of the License,
7 or (at your option) any later version.
8
9 Subread is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty
11 of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12
13 See the GNU General Public License for more details.
14
15 Authors: Drs Yang Liao and Wei Shi
16
17 ***************************************************************/
18
19
20 #include<assert.h>
21 #include <stdlib.h>
22 #include <string.h>
23
24 #ifndef MACOS
25 #if !defined(__FreeBSD__) && !defined(__DragonFly__)
26 #include <malloc.h>
27 #endif
28 #endif
29
30 #include<math.h>
31 #include"core.h"
32 #include"long-hashtable.h"
33
lnhash_create(lnhash_t * the_table,unsigned int num_buckets)34 int lnhash_create(lnhash_t * the_table, unsigned int num_buckets)
35 {
36 int x1;
37 the_table -> num_elements = 0;
38 the_table -> num_buckets = num_buckets;
39 the_table -> is_sorted = 0;
40 the_table -> subread_repeat_max = 25600;
41
42 the_table -> buckets = malloc(num_buckets * sizeof(lnhash_buckct_t));
43 if(sizeof(void*) == 4)
44 the_table -> key_repeated_numbers = malloc(sizeof(short) * 0xfffffff);
45 else
46 the_table -> key_repeated_numbers = malloc(sizeof(short) * 0x100000000llu);
47 for(x1=0; x1<num_buckets; x1++)
48 {
49 the_table -> buckets[x1].num_elements = 0;
50 the_table -> buckets[x1].space_elements = 0;
51 the_table -> buckets[x1].key_array = NULL;
52 the_table -> buckets[x1].value_array = NULL;
53 }
54
55 return 0;
56 }
57
lnhash_destroy(lnhash_t * the_table)58 void lnhash_destroy(lnhash_t * the_table)
59 {
60 int x1;
61 free(the_table -> key_repeated_numbers);
62 for(x1=0; x1<the_table -> num_buckets; x1++)
63 {
64 if(the_table -> buckets[x1].key_array)
65 free(the_table -> buckets[x1].key_array);
66 if(the_table -> buckets[x1].value_array)
67 free(the_table -> buckets[x1].value_array);
68 }
69
70 free(the_table -> buckets);
71 }
72
lnhash_locate_bucket(lnhash_t * the_table,gehash_key_t key)73 unsigned int lnhash_locate_bucket(lnhash_t * the_table, gehash_key_t key)
74 {
75 return key % the_table -> num_buckets;
76 }
77
lnhash_resize_bucket(lnhash_buckct_t * buck,unsigned int new_size)78 void lnhash_resize_bucket(lnhash_buckct_t * buck, unsigned int new_size)
79 {
80 if(buck -> space_elements < new_size)
81 {
82 if(buck -> space_elements>0)
83 {
84 buck -> space_elements *= LNHASH_BUCKET_SIZE_INCREMENT;
85 buck -> key_array = realloc(buck -> key_array, sizeof(gehash_key_t) * buck -> space_elements);
86 buck -> value_array = realloc(buck -> value_array, sizeof(lnhash_data_t) * buck -> space_elements);
87 }
88 else
89 {
90 buck -> space_elements = LNHASH_INIT_BUCKET_SIZE;
91 buck -> key_array = malloc(sizeof(gehash_key_t) * buck -> space_elements);
92 buck -> value_array = malloc(sizeof(lnhash_data_t) * buck -> space_elements);
93 }
94 }
95 }
96
lnhash_insert(lnhash_t * the_table,gehash_key_t key,lnhash_data_t data)97 int lnhash_insert(lnhash_t * the_table, gehash_key_t key, lnhash_data_t data)
98 {
99 int bucket_no = lnhash_locate_bucket(the_table, key);
100 lnhash_buckct_t * buck = &(the_table -> buckets[bucket_no]);
101
102 if(the_table -> key_repeated_numbers[key] < the_table -> subread_repeat_max)
103 {
104 the_table -> key_repeated_numbers[key]++;
105
106 lnhash_resize_bucket(buck, buck -> num_elements +1);
107 buck -> key_array[buck -> num_elements] = key;
108 buck -> value_array[buck -> num_elements] = data;
109 buck -> num_elements++;
110
111 return 0;
112 }
113 else if(the_table -> key_repeated_numbers[key] == the_table -> subread_repeat_max)
114 {
115 return 1;
116
117 int read_c, write_c=0;
118 for(read_c =0; read_c < buck -> num_elements; read_c++)
119 {
120 if(buck -> key_array[read_c] != key)
121 {
122 if(write_c!=read_c)
123 {
124 buck -> key_array[write_c] = buck -> key_array[read_c];
125 buck -> value_array[write_c] = buck -> value_array[read_c];
126 }
127 write_c++;
128 }
129 }
130
131 buck -> num_elements = write_c;
132 the_table -> key_repeated_numbers[key] = 65535;
133 }
134
135 return 1;
136 }
137
138 //void merge_sort(void * arr, int arr_size, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r), void merge(void * arr, int start, int items, int items2));
139 //
lnhash_mergesort_compare(void * arr,int l,int r)140 int lnhash_mergesort_compare(void * arr, int l, int r)
141 {
142 lnhash_buckct_t * buck = arr;
143 gehash_key_t lk = buck->key_array[l];
144 gehash_key_t rk = buck->key_array[r];
145 if(lk==rk)return 0;
146 if(lk> rk)return 1;
147 return -1;
148 }
149
lnhash_mergesort_exchange(void * arr,int l,int r)150 void lnhash_mergesort_exchange(void * arr, int l, int r)
151 {
152 lnhash_buckct_t * buck = arr;
153 gehash_key_t tmpk;
154 lnhash_data_t tmpd;
155 tmpk = buck->key_array[l];
156 tmpd = buck->value_array[l];
157
158 buck->value_array[l] = buck->value_array[r];
159 buck->key_array[l] = buck->key_array[r];
160
161 buck->key_array[r] = tmpk;
162 buck->value_array[r] = tmpd;
163 }
164
lnhash_mergesort_merge(void * arr,int start,int items,int items2)165 void lnhash_mergesort_merge(void * arr, int start, int items, int items2)
166 {
167 lnhash_buckct_t * buck = arr;
168 int i1_cursor = start, i2_cursor = start + items, out_cursor = 0;
169 int end_point = start + items + items2;
170
171 gehash_key_t * tmpk_arr = malloc(sizeof(gehash_key_t) * (items + items2));
172 lnhash_data_t * tmpd_arr = malloc(sizeof(lnhash_data_t) * (items + items2));
173
174 while(1)
175 {
176 if(i1_cursor == start + items && i2_cursor == end_point) break;
177 assert(out_cursor <items + items2);
178
179 int choose_2 = 0;
180
181 if(i1_cursor >= start+items)
182 choose_2 = 1;
183
184 if(i2_cursor < end_point)
185 if(buck->key_array[i1_cursor] > buck->key_array[i2_cursor])
186 choose_2 = 1;
187
188 if(choose_2)
189 {
190 tmpk_arr[out_cursor] = buck->key_array[i2_cursor];
191 tmpd_arr[out_cursor] = buck->value_array[i2_cursor];
192 i2_cursor++;
193 }
194 else
195 {
196 tmpk_arr[out_cursor] = buck->key_array[i1_cursor];
197 tmpd_arr[out_cursor] = buck->value_array[i1_cursor];
198 i1_cursor++;
199 }
200 out_cursor++;
201 }
202
203 memcpy(buck->key_array + start, tmpk_arr, sizeof(gehash_key_t ) * (items + items2));
204 memcpy(buck->value_array + start, tmpd_arr, sizeof(lnhash_data_t) * (items + items2));
205
206 free(tmpk_arr);
207 free(tmpd_arr);
208 }
209
lnhash_sort_bucket(lnhash_buckct_t * buck)210 void lnhash_sort_bucket(lnhash_buckct_t * buck)
211 {
212 merge_sort(buck, buck->num_elements, lnhash_mergesort_compare, lnhash_mergesort_exchange, lnhash_mergesort_merge);
213 }
214
lnhash_resort(lnhash_t * the_table)215 void lnhash_resort(lnhash_t * the_table)
216 {
217 int x1;
218 for(x1 = 0;x1 < the_table -> num_buckets; x1++)
219 {
220 lnhash_buckct_t * buck = &(the_table -> buckets[x1]);
221 lnhash_sort_bucket(buck);
222 }
223
224 the_table -> is_sorted = 1;
225 }
226
lnhash_get_record_column(lnhash_data_t pos)227 int lnhash_get_record_column(lnhash_data_t pos)
228 {
229 return (int)(pos % LNHASH_VOTE_ARRAY_HEIGHT);
230 }
231
lnhash_update_votes(lnhash_vote_t * votes,int record_column,lnhash_data_t test_pos,short dist_to_head)232 int lnhash_update_votes(lnhash_vote_t * votes, int record_column, lnhash_data_t test_pos, short dist_to_head)
233 {
234 int row_cursor;
235
236 for(row_cursor=0; row_cursor<votes->vote_record_items[record_column]; row_cursor++)
237 {
238 if(votes -> vote_records[record_column][row_cursor].head_position == test_pos)
239 {
240 votes -> vote_records[record_column][row_cursor].num_votes++;
241 votes -> vote_records[record_column][row_cursor].coverage_start = min(
242 votes -> vote_records[record_column][row_cursor].coverage_start, dist_to_head);
243 votes -> vote_records[record_column][row_cursor].coverage_end = max(
244 votes -> vote_records[record_column][row_cursor].coverage_end, dist_to_head + 16);
245 return 1;
246 }
247 }
248 return 0;
249 }
250
lnhash_add_votes(lnhash_vote_t * votes,int record_column,lnhash_data_t test_pos,short dist_to_head)251 void lnhash_add_votes(lnhash_vote_t * votes, int record_column, lnhash_data_t test_pos, short dist_to_head)
252 {
253 if(votes->vote_record_items[record_column]<LNHASH_VOTE_ARRAY_WIDTH)
254 {
255 int row_cursor = votes->vote_record_items[record_column];
256 votes -> vote_records[record_column][row_cursor].head_position = test_pos;
257 votes -> vote_records[record_column][row_cursor].num_votes = 1;
258 votes -> vote_records[record_column][row_cursor].coverage_start = dist_to_head;
259 votes -> vote_records[record_column][row_cursor].coverage_end = dist_to_head + 16;
260 votes->vote_record_items[record_column]++;
261 }
262 }
263
264 // no indels are considered.
lnhash_query(lnhash_t * the_table,lnhash_vote_t * votes,gehash_key_t queried_key,short dist_to_head)265 int lnhash_query(lnhash_t * the_table, lnhash_vote_t * votes, gehash_key_t queried_key, short dist_to_head)
266 {
267 // assert(the_table -> is_sorted);
268 unsigned int bucket_no = lnhash_locate_bucket(the_table , queried_key);
269 int lower_cursor, higher_cursor, match_cursor = 0, hits=0;
270
271 lnhash_buckct_t * buck = &(the_table -> buckets[bucket_no]);
272
273 if(the_table -> is_sorted)
274 {
275 lower_cursor = 0;
276 higher_cursor = buck -> num_elements;
277 if(higher_cursor<1) return 0;
278
279 while(1)
280 {
281 if(lower_cursor> higher_cursor)
282 return 0;
283 int middle_no = (lower_cursor+higher_cursor)/2;
284
285 if(buck->key_array[middle_no] == queried_key)
286 {
287 match_cursor = middle_no;
288 break;
289 }
290 else if(buck->key_array[middle_no] > queried_key)
291 higher_cursor = middle_no-1;
292 else lower_cursor = middle_no+1;
293 }
294
295 while( match_cursor > 0 )
296 {
297 if(buck->key_array[match_cursor-1] != queried_key)
298 break;
299 match_cursor--;
300 }
301 }
302
303 for(; match_cursor < buck -> num_elements; match_cursor++)
304 {
305 if(buck->key_array[match_cursor]!=queried_key)
306 {
307 if(the_table -> is_sorted) break;
308 else continue;
309 }
310
311 lnhash_data_t test_pos = buck -> value_array[match_cursor];
312
313 test_pos -= dist_to_head;
314 //if((test_pos & 0x3fff) > 1300) continue;
315
316 int record_column = lnhash_get_record_column(test_pos);
317
318 int found = lnhash_update_votes(votes, record_column, test_pos, dist_to_head);
319 if(!found) lnhash_add_votes (votes, record_column, test_pos, dist_to_head);
320 hits ++;
321 }
322 return hits;
323 }
324
compare_voting_items(void * arrV,int i,int j)325 int compare_voting_items(void * arrV, int i, int j)
326 {
327 lnhash_vote_record_t * arr = (lnhash_vote_record_t*) arrV;
328 if(arr[i].head_position > arr[j].head_position) return 1;
329 else if(arr[i].head_position < arr[j].head_position) return -1;
330 return 0;
331 }
332
exchange_voting_items(void * arrV,int i,int j)333 void exchange_voting_items(void * arrV, int i, int j)
334 {
335 lnhash_vote_record_t * arr = (lnhash_vote_record_t*) arrV;
336 lnhash_vote_record_t tmp;
337 memcpy(&tmp, arr + i, sizeof(lnhash_vote_record_t));
338 memcpy(arr + i, arr + j, sizeof(lnhash_vote_record_t));
339 memcpy(arr + j, &tmp, sizeof(lnhash_vote_record_t));
340 }
341
merge_vorting_items(void * arrV,int start,int items1,int items2)342 void merge_vorting_items(void * arrV, int start, int items1, int items2)
343 {
344 lnhash_vote_record_t * arr = (lnhash_vote_record_t*) arrV;
345 lnhash_vote_record_t * arrtmp = malloc( sizeof(lnhash_vote_record_t) * (items1 + items2));
346
347 int cursor_1 = 0, cursor_2 = 0, xk1;
348
349 for(xk1=0; xk1<items1+items2; xk1++)
350 {
351 lnhash_vote_record_t * curr_pnt1_item = arr + start + cursor_1;
352 lnhash_vote_record_t * curr_pnt2_item = arr + start + items1+ cursor_2;
353 int use_item1 = cursor_1 < items1;
354
355 if(use_item1 && cursor_2 < items2)
356 if(curr_pnt1_item -> head_position >= curr_pnt2_item -> head_position)
357 use_item1 = 0;
358 if(use_item1){
359 memcpy(arrtmp+xk1, arr+start+cursor_1, sizeof(lnhash_vote_record_t));
360 cursor_1++;
361 }
362 else{
363 memcpy(arrtmp+xk1, arr+start+items1+cursor_2, sizeof(lnhash_vote_record_t));
364 cursor_2++;
365 }
366 }
367
368 memcpy(arr + start, arrtmp, sizeof(lnhash_vote_record_t) * (items1 + items2));
369 free(arrtmp);
370 }
371
372
sorted_voting_table_EX(lnhash_vote_t * vote,lnhash_vote_record_t ** result_buffer,int minimum_votes,int offsetted)373 int sorted_voting_table_EX(lnhash_vote_t * vote, lnhash_vote_record_t ** result_buffer, int minimum_votes, int offsetted)
374 {
375 int ret_size = LNHASH_VOTE_ARRAY_HEIGHT * 3;
376 int ret_items = 0;
377 lnhash_vote_record_t * ret = malloc(sizeof(lnhash_vote_record_t) * ret_size);
378
379 int vote_X, vote_Y;
380 for(vote_X = 0; vote_X < LNHASH_VOTE_ARRAY_HEIGHT; vote_X++)
381 for(vote_Y = 0; vote_Y < vote -> vote_record_items[vote_X]; vote_Y++)
382 {
383 lnhash_vote_record_t * rec = &(vote -> vote_records[vote_X][vote_Y]);
384 if(rec -> num_votes >= minimum_votes)
385 {
386 if(ret_size <= ret_items)
387 {
388 ret_size *= 1.3;
389 ret = realloc(ret, sizeof(lnhash_vote_record_t) * ret_size);
390 }
391
392 if(offsetted) rec -> head_position += rec -> coverage_start;
393 memcpy(ret + ret_items, rec, sizeof(lnhash_vote_record_t));
394
395 ret_items ++;
396 }
397 }
398
399
400 merge_sort(ret, ret_items, compare_voting_items , exchange_voting_items, merge_vorting_items);
401
402 (*result_buffer) = ret;
403 return ret_items;
404
405 }
406
sorted_voting_table_offset(lnhash_vote_t * vote,lnhash_vote_record_t ** result_buffer,int minimum_votes)407 int sorted_voting_table_offset(lnhash_vote_t * vote, lnhash_vote_record_t ** result_buffer, int minimum_votes){
408 return sorted_voting_table_EX(vote, result_buffer, minimum_votes, 1);
409 }
sorted_voting_table(lnhash_vote_t * vote,lnhash_vote_record_t ** result_buffer,int minimum_votes)410 int sorted_voting_table(lnhash_vote_t * vote, lnhash_vote_record_t ** result_buffer, int minimum_votes)
411 {
412 return sorted_voting_table_EX(vote, result_buffer, minimum_votes, 0);
413 }
414
415