1 /***************************************************************
2 
3    The Subread software package is free software package:
4    you can redistribute it and/or modify it under the terms
5    of the GNU General Public License as published by the
6    Free Software Foundation, either version 3 of the License,
7    or (at your option) any later version.
8 
9    Subread is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty
11    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 
13    See the GNU General Public License for more details.
14 
15    Authors: Drs Yang Liao and Wei Shi
16 
17   ***************************************************************/
18 
19 
20 #include<assert.h>
21 #include <stdlib.h>
22 #include <string.h>
23 
24 #ifndef MACOS
25 #if !defined(__FreeBSD__) && !defined(__DragonFly__)
26 #include <malloc.h>
27 #endif
28 #endif
29 
30 #include<math.h>
31 #include"core.h"
32 #include"long-hashtable.h"
33 
lnhash_create(lnhash_t * the_table,unsigned int num_buckets)34 int lnhash_create(lnhash_t * the_table, unsigned int num_buckets)
35 {
36 	int x1;
37 	the_table -> num_elements = 0;
38 	the_table -> num_buckets = num_buckets;
39 	the_table -> is_sorted = 0;
40 	the_table -> subread_repeat_max = 25600;
41 
42 	the_table -> buckets = malloc(num_buckets * sizeof(lnhash_buckct_t));
43 	if(sizeof(void*) == 4)
44 		the_table -> key_repeated_numbers = malloc(sizeof(short) * 0xfffffff);
45 	else
46 		the_table -> key_repeated_numbers = malloc(sizeof(short) * 0x100000000llu);
47 	for(x1=0; x1<num_buckets; x1++)
48 	{
49 		the_table -> buckets[x1].num_elements = 0;
50 		the_table -> buckets[x1].space_elements = 0;
51 		the_table -> buckets[x1].key_array = NULL;
52 		the_table -> buckets[x1].value_array = NULL;
53 	}
54 
55 	return 0;
56 }
57 
lnhash_destroy(lnhash_t * the_table)58 void lnhash_destroy(lnhash_t * the_table)
59 {
60 	int x1;
61 	free(the_table -> key_repeated_numbers);
62 	for(x1=0; x1<the_table -> num_buckets; x1++)
63 	{
64 		if(the_table -> buckets[x1].key_array)
65 			free(the_table -> buckets[x1].key_array);
66 		if(the_table -> buckets[x1].value_array)
67 			free(the_table -> buckets[x1].value_array);
68 	}
69 
70 	free(the_table -> buckets);
71 }
72 
lnhash_locate_bucket(lnhash_t * the_table,gehash_key_t key)73 unsigned int lnhash_locate_bucket(lnhash_t * the_table, gehash_key_t key)
74 {
75 	return key % the_table -> num_buckets;
76 }
77 
lnhash_resize_bucket(lnhash_buckct_t * buck,unsigned int new_size)78 void lnhash_resize_bucket(lnhash_buckct_t * buck, unsigned int new_size)
79 {
80 	if(buck -> space_elements < new_size)
81 	{
82 		if(buck -> space_elements>0)
83 		{
84 			buck -> space_elements *= LNHASH_BUCKET_SIZE_INCREMENT;
85 			buck -> key_array = realloc(buck -> key_array, sizeof(gehash_key_t) * buck -> space_elements);
86 			buck -> value_array = realloc(buck -> value_array, sizeof(lnhash_data_t) * buck -> space_elements);
87 		}
88 		else
89 		{
90 			buck -> space_elements = LNHASH_INIT_BUCKET_SIZE;
91 			buck -> key_array = malloc(sizeof(gehash_key_t) * buck -> space_elements);
92 			buck -> value_array = malloc(sizeof(lnhash_data_t) * buck -> space_elements);
93 		}
94 	}
95 }
96 
lnhash_insert(lnhash_t * the_table,gehash_key_t key,lnhash_data_t data)97 int lnhash_insert(lnhash_t * the_table, gehash_key_t key, lnhash_data_t data)
98 {
99 	int bucket_no = lnhash_locate_bucket(the_table, key);
100 	lnhash_buckct_t * buck = &(the_table -> buckets[bucket_no]);
101 
102 	if(the_table -> key_repeated_numbers[key] < the_table -> subread_repeat_max)
103 	{
104 		the_table -> key_repeated_numbers[key]++;
105 
106 		lnhash_resize_bucket(buck, buck -> num_elements +1);
107 		buck -> key_array[buck -> num_elements] = key;
108 		buck -> value_array[buck -> num_elements] = data;
109 		buck -> num_elements++;
110 
111 		return 0;
112 	}
113 	else if(the_table -> key_repeated_numbers[key] == the_table -> subread_repeat_max)
114 	{
115 		return 1;
116 
117 		int read_c, write_c=0;
118 		for(read_c =0; read_c < buck -> num_elements; read_c++)
119 		{
120 			if(buck -> key_array[read_c] != key)
121 			{
122 				if(write_c!=read_c)
123 				{
124 					buck -> key_array[write_c] = buck -> key_array[read_c];
125 					buck -> value_array[write_c] = buck -> value_array[read_c];
126 				}
127 				write_c++;
128 			}
129 		}
130 
131 		buck -> num_elements = write_c;
132 		the_table -> key_repeated_numbers[key] = 65535;
133 	}
134 
135 	return 1;
136 }
137 
138 //void merge_sort(void * arr, int arr_size, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r), void merge(void * arr, int start, int items, int items2));
139 //
lnhash_mergesort_compare(void * arr,int l,int r)140 int lnhash_mergesort_compare(void * arr, int l, int r)
141 {
142 	lnhash_buckct_t  * buck = arr;
143 	gehash_key_t lk = buck->key_array[l];
144 	gehash_key_t rk = buck->key_array[r];
145 	if(lk==rk)return 0;
146 	if(lk> rk)return 1;
147 	return -1;
148 }
149 
lnhash_mergesort_exchange(void * arr,int l,int r)150 void lnhash_mergesort_exchange(void * arr, int l, int r)
151 {
152 	lnhash_buckct_t  * buck = arr;
153 	gehash_key_t tmpk;
154 	lnhash_data_t tmpd;
155 	tmpk = buck->key_array[l];
156 	tmpd = buck->value_array[l];
157 
158 	buck->value_array[l] = buck->value_array[r];
159 	buck->key_array[l] = buck->key_array[r];
160 
161 	buck->key_array[r] = tmpk;
162 	buck->value_array[r] = tmpd;
163 }
164 
lnhash_mergesort_merge(void * arr,int start,int items,int items2)165 void lnhash_mergesort_merge(void * arr, int start, int items, int items2)
166 {
167 	lnhash_buckct_t  * buck = arr;
168 	int i1_cursor = start, i2_cursor = start + items, out_cursor = 0;
169 	int end_point = start + items + items2;
170 
171 	gehash_key_t * tmpk_arr = malloc(sizeof(gehash_key_t) * (items + items2));
172 	lnhash_data_t * tmpd_arr = malloc(sizeof(lnhash_data_t) * (items + items2));
173 
174 	while(1)
175 	{
176 		if(i1_cursor == start + items && i2_cursor == end_point) break;
177 		assert(out_cursor <items + items2);
178 
179 		int choose_2 = 0;
180 
181 		if(i1_cursor >= start+items)
182 			choose_2 = 1;
183 
184 		if(i2_cursor < end_point)
185 			if(buck->key_array[i1_cursor] > buck->key_array[i2_cursor])
186 				choose_2 = 1;
187 
188 		if(choose_2)
189 		{
190 			tmpk_arr[out_cursor] = buck->key_array[i2_cursor];
191 			tmpd_arr[out_cursor] = buck->value_array[i2_cursor];
192 			i2_cursor++;
193 		}
194 		else
195 		{
196 			tmpk_arr[out_cursor] = buck->key_array[i1_cursor];
197 			tmpd_arr[out_cursor] = buck->value_array[i1_cursor];
198 			i1_cursor++;
199 		}
200 		out_cursor++;
201 	}
202 
203 	memcpy(buck->key_array   + start, tmpk_arr, sizeof(gehash_key_t ) * (items + items2));
204 	memcpy(buck->value_array + start, tmpd_arr, sizeof(lnhash_data_t) * (items + items2));
205 
206 	free(tmpk_arr);
207 	free(tmpd_arr);
208 }
209 
lnhash_sort_bucket(lnhash_buckct_t * buck)210 void lnhash_sort_bucket(lnhash_buckct_t * buck)
211 {
212 	merge_sort(buck, buck->num_elements, lnhash_mergesort_compare, lnhash_mergesort_exchange, lnhash_mergesort_merge);
213 }
214 
lnhash_resort(lnhash_t * the_table)215 void lnhash_resort(lnhash_t * the_table)
216 {
217 	int x1;
218 	for(x1 = 0;x1 < the_table -> num_buckets; x1++)
219 	{
220 		lnhash_buckct_t * buck = &(the_table -> buckets[x1]);
221 		lnhash_sort_bucket(buck);
222 	}
223 
224 	the_table -> is_sorted = 1;
225 }
226 
lnhash_get_record_column(lnhash_data_t pos)227 int lnhash_get_record_column(lnhash_data_t pos)
228 {
229 	return (int)(pos % LNHASH_VOTE_ARRAY_HEIGHT);
230 }
231 
lnhash_update_votes(lnhash_vote_t * votes,int record_column,lnhash_data_t test_pos,short dist_to_head)232 int lnhash_update_votes(lnhash_vote_t * votes, int record_column, lnhash_data_t test_pos, short dist_to_head)
233 {
234 	int row_cursor;
235 
236 	for(row_cursor=0; row_cursor<votes->vote_record_items[record_column]; row_cursor++)
237 	{
238 		if(votes -> vote_records[record_column][row_cursor].head_position == test_pos)
239 		{
240 			votes -> vote_records[record_column][row_cursor].num_votes++;
241 			votes -> vote_records[record_column][row_cursor].coverage_start = min(
242 				votes -> vote_records[record_column][row_cursor].coverage_start, dist_to_head);
243 			votes -> vote_records[record_column][row_cursor].coverage_end = max(
244 				votes -> vote_records[record_column][row_cursor].coverage_end, dist_to_head + 16);
245 			return 1;
246 		}
247 	}
248 	return 0;
249 }
250 
lnhash_add_votes(lnhash_vote_t * votes,int record_column,lnhash_data_t test_pos,short dist_to_head)251 void lnhash_add_votes(lnhash_vote_t * votes, int record_column, lnhash_data_t test_pos, short dist_to_head)
252 {
253 	if(votes->vote_record_items[record_column]<LNHASH_VOTE_ARRAY_WIDTH)
254 	{
255 		int row_cursor = votes->vote_record_items[record_column];
256 		votes -> vote_records[record_column][row_cursor].head_position = test_pos;
257 		votes -> vote_records[record_column][row_cursor].num_votes = 1;
258 		votes -> vote_records[record_column][row_cursor].coverage_start = dist_to_head;
259 		votes -> vote_records[record_column][row_cursor].coverage_end = dist_to_head + 16;
260 		votes->vote_record_items[record_column]++;
261 	}
262 }
263 
264 // no indels are considered.
lnhash_query(lnhash_t * the_table,lnhash_vote_t * votes,gehash_key_t queried_key,short dist_to_head)265 int lnhash_query(lnhash_t * the_table, lnhash_vote_t * votes, gehash_key_t queried_key, short dist_to_head)
266 {
267 //	assert(the_table -> is_sorted);
268 	unsigned int bucket_no = lnhash_locate_bucket(the_table , queried_key);
269 	int lower_cursor, higher_cursor, match_cursor = 0, hits=0;
270 
271 	lnhash_buckct_t * buck = &(the_table -> buckets[bucket_no]);
272 
273 	if(the_table -> is_sorted)
274 	{
275 		lower_cursor = 0;
276 		higher_cursor = buck -> num_elements;
277 		if(higher_cursor<1) return 0;
278 
279 		while(1)
280 		{
281 			if(lower_cursor> higher_cursor)
282 				return 0;
283 			int middle_no = (lower_cursor+higher_cursor)/2;
284 
285 			if(buck->key_array[middle_no] == queried_key)
286 			{
287 				match_cursor = middle_no;
288 				break;
289 			}
290 			else if(buck->key_array[middle_no] > queried_key)
291 				higher_cursor = middle_no-1;
292 			else	lower_cursor = middle_no+1;
293 		}
294 
295 		while( match_cursor > 0 )
296 		{
297 			if(buck->key_array[match_cursor-1] != queried_key)
298 				break;
299 			match_cursor--;
300 		}
301 	}
302 
303 	for(; match_cursor < buck -> num_elements; match_cursor++)
304 	{
305 		if(buck->key_array[match_cursor]!=queried_key)
306 		{
307 			if(the_table -> is_sorted) break;
308 			else continue;
309 		}
310 
311 		lnhash_data_t test_pos = buck -> value_array[match_cursor];
312 
313 		test_pos -= dist_to_head;
314 		//if((test_pos & 0x3fff) > 1300) continue;
315 
316 		int record_column = lnhash_get_record_column(test_pos);
317 
318 		int found = lnhash_update_votes(votes, record_column, test_pos, dist_to_head);
319 		if(!found)  lnhash_add_votes   (votes, record_column, test_pos, dist_to_head);
320 		hits ++;
321 	}
322 	return hits;
323 }
324 
compare_voting_items(void * arrV,int i,int j)325 int compare_voting_items(void * arrV, int i, int j)
326 {
327 	lnhash_vote_record_t * arr = (lnhash_vote_record_t*) arrV;
328 	if(arr[i].head_position > arr[j].head_position) return 1;
329 	else if(arr[i].head_position < arr[j].head_position) return -1;
330 	return 0;
331 }
332 
exchange_voting_items(void * arrV,int i,int j)333 void exchange_voting_items(void * arrV, int i, int j)
334 {
335 	lnhash_vote_record_t * arr = (lnhash_vote_record_t*) arrV;
336 	lnhash_vote_record_t tmp;
337 	memcpy(&tmp, arr + i, sizeof(lnhash_vote_record_t));
338 	memcpy(arr + i, arr + j, sizeof(lnhash_vote_record_t));
339 	memcpy(arr + j, &tmp, sizeof(lnhash_vote_record_t));
340 }
341 
merge_vorting_items(void * arrV,int start,int items1,int items2)342 void merge_vorting_items(void * arrV, int start, int items1, int items2)
343 {
344 	lnhash_vote_record_t * arr = (lnhash_vote_record_t*) arrV;
345 	lnhash_vote_record_t * arrtmp = malloc( sizeof(lnhash_vote_record_t) * (items1 + items2));
346 
347 	int cursor_1 = 0, cursor_2 = 0, xk1;
348 
349 	for(xk1=0; xk1<items1+items2; xk1++)
350 	{
351 		lnhash_vote_record_t * curr_pnt1_item = arr + start + cursor_1;
352 		lnhash_vote_record_t * curr_pnt2_item = arr + start + items1+ cursor_2;
353 		int use_item1 = cursor_1 < items1;
354 
355 		if(use_item1 && cursor_2 < items2)
356 			if(curr_pnt1_item -> head_position >= curr_pnt2_item -> head_position)
357 				use_item1 = 0;
358 		if(use_item1){
359 			memcpy(arrtmp+xk1, arr+start+cursor_1, sizeof(lnhash_vote_record_t));
360 			cursor_1++;
361 		}
362 		else{
363 			memcpy(arrtmp+xk1, arr+start+items1+cursor_2, sizeof(lnhash_vote_record_t));
364 			cursor_2++;
365 		}
366 	}
367 
368 	memcpy(arr + start, arrtmp, sizeof(lnhash_vote_record_t) * (items1 + items2));
369 	free(arrtmp);
370 }
371 
372 
sorted_voting_table_EX(lnhash_vote_t * vote,lnhash_vote_record_t ** result_buffer,int minimum_votes,int offsetted)373 int sorted_voting_table_EX(lnhash_vote_t * vote, lnhash_vote_record_t ** result_buffer, int minimum_votes, int offsetted)
374 {
375 	int ret_size = LNHASH_VOTE_ARRAY_HEIGHT * 3;
376 	int ret_items = 0;
377 	lnhash_vote_record_t * ret = malloc(sizeof(lnhash_vote_record_t) * ret_size);
378 
379 	int vote_X, vote_Y;
380 	for(vote_X = 0; vote_X < LNHASH_VOTE_ARRAY_HEIGHT; vote_X++)
381 		for(vote_Y = 0; vote_Y < vote -> vote_record_items[vote_X]; vote_Y++)
382 		{
383 			lnhash_vote_record_t * rec = &(vote -> vote_records[vote_X][vote_Y]);
384 			if(rec -> num_votes >= minimum_votes)
385 			{
386 				if(ret_size <= ret_items)
387 				{
388 					ret_size *= 1.3;
389 					ret = realloc(ret, sizeof(lnhash_vote_record_t) * ret_size);
390 				}
391 
392 				if(offsetted) rec -> head_position += rec -> coverage_start;
393 				memcpy(ret + ret_items, rec, sizeof(lnhash_vote_record_t));
394 
395 				ret_items ++;
396 			}
397 		}
398 
399 
400 	merge_sort(ret, ret_items, compare_voting_items , exchange_voting_items, merge_vorting_items);
401 
402 	(*result_buffer) = ret;
403 	return ret_items;
404 
405 }
406 
sorted_voting_table_offset(lnhash_vote_t * vote,lnhash_vote_record_t ** result_buffer,int minimum_votes)407 int sorted_voting_table_offset(lnhash_vote_t * vote, lnhash_vote_record_t ** result_buffer, int minimum_votes){
408 	return sorted_voting_table_EX(vote, result_buffer, minimum_votes, 1);
409 }
sorted_voting_table(lnhash_vote_t * vote,lnhash_vote_record_t ** result_buffer,int minimum_votes)410 int sorted_voting_table(lnhash_vote_t * vote, lnhash_vote_record_t ** result_buffer, int minimum_votes)
411 {
412 	return sorted_voting_table_EX(vote, result_buffer, minimum_votes, 0);
413 }
414 
415