1 /***************************************************************
2 
3    The Subread software package is free software package:
4    you can redistribute it and/or modify it under the terms
5    of the GNU General Public License as published by the
6    Free Software Foundation, either version 3 of the License,
7    or (at your option) any later version.
8 
9    Subread is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty
11    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 
13    See the GNU General Public License for more details.
14 
15    Authors: Drs Yang Liao and Wei Shi
16 
17   ***************************************************************/
18 
19 
20 #include "sorted-hashtable.h"
21 #include "gene-algorithms.h"
22 #include<assert.h>
23 #include <stdlib.h>
24 #include <string.h>
25 
26 #if !defined(__FreeBSD__) && !defined(__APPLE__) && !defined(__DragonFly__)
27 #include <malloc.h>
28 #endif
29 
30 #include<math.h>
31 #include"core.h"
32 
33 #define _gehash_hash(k) ((unsigned int)(k))
34 
35 
gehash_create(gehash_t * the_table,size_t expected_size,char is_small_table)36 int gehash_create(gehash_t * the_table, size_t expected_size, char is_small_table) {
37 	return gehash_create_ex(the_table, expected_size, is_small_table, SUBINDEX_VER0, 3, 0);
38 }
39 
calculate_buckets_by_size(size_t expected_size,int version_number,int is_small_table,int index_gap)40 unsigned int calculate_buckets_by_size( size_t expected_size, int version_number, int is_small_table, int index_gap ){
41 	if(expected_size ==0)
42 		expected_size = GEHASH_DEFAULT_SIZE;
43 	int expected_bucket_number = expected_size / GEHASH_BUCKET_LENGTH;
44 
45 	if(index_gap >= 3) expected_bucket_number /=3;
46 
47 	if(SUBINDEX_VER1 > version_number) {
48 		if(expected_bucket_number < 10111 && !is_small_table)expected_bucket_number = 10111;
49 		else if(is_small_table)	expected_bucket_number = 4;
50 	} else
51 		if(expected_bucket_number <= 0x3ffff )expected_bucket_number = 0x3ffff+4;
52 
53 
54 	for (;;expected_bucket_number++) {
55 		int j, valid_v;
56 
57 		valid_v = 1;
58 		int test_prime = 13;
59 		if(SUBINDEX_VER1 > version_number && is_small_table) test_prime = 3;
60 		for(j=2; j<=test_prime;j++)
61 		{
62 			if (expected_bucket_number % j == 0)
63 				valid_v = 0;
64 		}
65 
66 		if (valid_v)
67 			break;
68 	}
69 
70 	//SUBREADprintf("EXP_BUCKS=%u\n",expected_bucket_number);
71 
72 	return expected_bucket_number;
73 }
74 
gehash_create_ex(gehash_t * the_table,size_t expected_size,char is_small_table,int version_number,int index_gap,int padding)75 int gehash_create_ex(gehash_t * the_table, size_t expected_size, char is_small_table, int version_number, int index_gap, int padding) {
76 	int expected_bucket_number;
77 	int i;
78 
79 	memset(the_table, 0, sizeof(gehash_t));
80 	if(expected_size ==0)
81 		expected_size = GEHASH_DEFAULT_SIZE;
82 
83 	expected_bucket_number = calculate_buckets_by_size(expected_size, version_number, is_small_table, index_gap);
84 
85 	// calculate the number of buckets for creating the data structure
86 
87 
88 	the_table -> version_number = version_number;
89 	the_table -> current_items = 0;
90 	the_table -> is_small_table = is_small_table;
91 	the_table -> buckets_number = expected_bucket_number;
92 	the_table -> buckets = (struct gehash_bucket *) malloc(
93 			  expected_bucket_number *
94   			  sizeof(struct gehash_bucket )
95 			);
96 	the_table -> padding = padding;
97 
98 	if(!the_table -> buckets)
99 	{
100 		SUBREADputs(MESSAGE_OUT_OF_MEMORY);
101 		return 1;
102 	}
103 
104 	for(i=0; i<expected_bucket_number; i++){
105 		the_table -> buckets [i].item_keys = NULL;
106 		the_table -> buckets [i].current_items = 0;
107 		the_table -> buckets [i].space_size = 0;
108 	}
109 
110 	the_table -> index_gap = index_gap;
111 
112 	return 0;
113 }
114 
_gehash_resize_bucket(gehash_t * the_table,int bucket_no,char is_small_table)115 int _gehash_resize_bucket(gehash_t * the_table , int bucket_no, char is_small_table)
116 {
117 	int new_bucket_length;
118 	struct gehash_bucket * current_bucket = &(the_table -> buckets [bucket_no]);
119 	gehash_key_t * new_item_keys = NULL;
120 	short * new_new_item_keys = NULL;
121 	gehash_data_t * new_item_values;
122 
123 	if (current_bucket->space_size<1)
124 	{
125 		if(is_small_table)
126 			new_bucket_length = 5 ;
127 		else
128 			new_bucket_length = 2 ;
129 
130 		if(the_table->version_number == SUBINDEX_VER0)
131 			new_item_keys = (gehash_key_t *) malloc(sizeof(gehash_key_t) * new_bucket_length);
132 		else
133 			new_new_item_keys = (short *) malloc(sizeof(short)*new_bucket_length);
134 
135 		new_item_values = (gehash_data_t *) malloc(sizeof(gehash_data_t) * new_bucket_length);
136 
137 		if(!((new_item_keys|| new_new_item_keys) && new_item_values))
138 		{
139 			SUBREADputs(MESSAGE_OUT_OF_MEMORY);
140 			return 1;
141 		}
142 
143 		/*if(the_table->version_number == SUBINDEX_VER0)
144 			bzero(new_item_keys, sizeof(gehash_key_t) * new_bucket_length);
145 		else
146 			bzero(new_new_item_keys, sizeof(short) * new_bucket_length);
147 
148 		bzero(new_item_values, sizeof(gehash_data_t) * new_bucket_length);
149 		*/
150 
151 
152 		if(the_table->version_number == SUBINDEX_VER0)
153 			current_bucket->item_keys = new_item_keys;
154 		else
155 			current_bucket->new_item_keys = new_new_item_keys;
156 
157 		current_bucket->item_values = new_item_values;
158 		current_bucket->space_size = new_bucket_length;
159 
160 
161 	}
162 	else
163 	{
164 		if(is_small_table)
165 			//new_bucket_length = (int)max(15,current_bucket->space_size*1.5+1);
166 			new_bucket_length = (int)(current_bucket->space_size*5);
167 		else
168 			new_bucket_length = max((int)(current_bucket->space_size*1.5), (int)(current_bucket->space_size+4));
169 
170 		if(the_table->version_number == SUBINDEX_VER0)
171 			current_bucket->item_keys = (gehash_key_t *) realloc(current_bucket->item_keys ,  sizeof(gehash_key_t) * new_bucket_length);
172 		else
173 			current_bucket->new_item_keys = (short *) realloc(current_bucket->new_item_keys, sizeof(short)*new_bucket_length);
174 
175 		 current_bucket->item_values = (gehash_data_t *) realloc(current_bucket->item_values, sizeof(gehash_data_t) * new_bucket_length);
176 
177 
178 		if(!((current_bucket->new_item_keys||current_bucket->new_item_keys) && current_bucket->item_values))
179 		{
180 			SUBREADputs(MESSAGE_OUT_OF_MEMORY);
181 			return 1;
182 		}
183 
184 		current_bucket->space_size = new_bucket_length;
185 	}
186 	return 0;
187 }
188 
189 #define _gehash_get_bucket(tab, key)  ( (tab) -> buckets + _gehash_hash( key ) % (tab) -> buckets_number )
190 #define _gehash_get_bucketNO(tab, key)  ( _gehash_hash( key ) % (tab) -> buckets_number )
191 
gehash_try_insert_measure(unsigned int * bucket_sizes,int bucket_no,gehash_key_t key)192 void gehash_try_insert_measure(unsigned int * bucket_sizes, int bucket_no, gehash_key_t key){
193 	unsigned int buck_this = _gehash_hash(key)%bucket_no;
194 	//if(buck_this == 16305862)SUBREADprintf("Insert bucket : %u/%u -> %u at PTR=%p\n", buck_this, bucket_no, bucket_sizes[buck_this] , bucket_sizes);
195 	bucket_sizes[buck_this]++;
196 }
197 
gehash_insert(gehash_t * the_table,gehash_key_t key,gehash_data_t data,unsigned int * bucket_sizes)198 int gehash_insert(gehash_t * the_table, gehash_key_t key, gehash_data_t data, unsigned int * bucket_sizes)
199 {
200 	struct gehash_bucket * current_bucket;
201 	int is_fault = 0;
202 
203 	//SUBREADprintf("Insert subread : %u/%u\n",   _gehash_hash(key)% the_table->buckets_number,  the_table->buckets_number);
204 	current_bucket = _gehash_get_bucket (the_table, key);
205 
206 	if(the_table->version_number == SUBINDEX_VER0)
207 	{
208 		if (current_bucket->current_items >= current_bucket->space_size) {
209 			int bucket_number;
210 			if(bucket_sizes){
211 				SUBREADprintf("Bucket size was wrongly calculated.\n");
212 				return 1;
213 			}
214 
215 			bucket_number = _gehash_hash(key) % the_table -> buckets_number;
216 			is_fault = _gehash_resize_bucket(the_table, bucket_number, the_table->is_small_table);
217 			if(is_fault)
218 				return 1;
219 		}
220 		current_bucket->item_keys[current_bucket->current_items] = key;
221 	}
222 	else
223 	{
224 		short step_number = key/the_table -> buckets_number;
225 		if(current_bucket-> new_item_keys == NULL && bucket_sizes){
226 			//this will go only once when all the bucktes are NULL.
227 
228 			unsigned int * memory_sizes = malloc(sizeof(int) * the_table -> buckets_number);
229 			memset(memory_sizes, 0xff, sizeof(int) * the_table -> buckets_number);
230 
231 			unsigned int grp_bytes = 0, grps = 0, this_grp_bucks = 0, xk1, bucks_per_grp = the_table -> buckets_number / GEHASH_MEM_PTR_NO +2;
232 			for(xk1=0; xk1<  the_table -> buckets_number ; xk1++){
233 				unsigned int buck_size =(sizeof(short) + sizeof(gehash_data_t)) * bucket_sizes[xk1];
234 				grp_bytes += buck_size;
235 				this_grp_bucks ++;
236 				if(  this_grp_bucks >= bucks_per_grp ){
237 					memory_sizes[grps++] = grp_bytes;
238 					grp_bytes=0;
239 					this_grp_bucks=0;
240 				}
241 			}
242 			if(this_grp_bucks)
243 				memory_sizes[grps] = grp_bytes;
244 
245 			for(xk1=0; xk1 < GEHASH_MEM_PTR_NO; xk1++){
246 				unsigned int current_bytes = memory_sizes[xk1];
247 				if(current_bytes<0xff000000u){
248 					if(the_table -> malloc_ptr[xk1]!=NULL){
249 						SUBREADprintf("ERROR : no-NULL ptr : %p\n",the_table -> malloc_ptr[xk1]);
250 					}
251 					the_table -> malloc_ptr[xk1] = malloc(current_bytes);
252 					if(!the_table -> malloc_ptr[xk1]){
253 						SUBREADputs(MESSAGE_OUT_OF_MEMORY);
254 						return 1;
255 					}
256 				}
257 			}
258 
259 			grp_bytes = 0; grps = 0; this_grp_bucks = 0;
260 			for(xk1=0; xk1<  the_table -> buckets_number ; xk1++){
261 				struct gehash_bucket * mem_bucket = the_table -> buckets+xk1;
262 				memset(mem_bucket, 0, sizeof(struct gehash_bucket));
263 				mem_bucket -> new_item_keys = (short*)((char*)the_table -> malloc_ptr[grps] + grp_bytes);
264 				mem_bucket -> item_values = (gehash_data_t*)((char *)mem_bucket -> new_item_keys + sizeof(short) * bucket_sizes[xk1]);
265 				mem_bucket -> space_size = bucket_sizes[xk1];
266 
267 				unsigned int buck_size =(sizeof(short) + sizeof(gehash_data_t)) * bucket_sizes[xk1];
268 				grp_bytes += buck_size;
269 				this_grp_bucks ++;
270 				if(  this_grp_bucks >= bucks_per_grp ){
271 					grps++;
272 					grp_bytes=0;
273 					this_grp_bucks=0;
274 				}
275 			}
276 
277 			the_table -> free_item_only = 2;
278 			free(memory_sizes);
279 		}
280 
281 		if (current_bucket->current_items >= current_bucket->space_size) {
282 			if(bucket_sizes){
283 				SUBREADprintf("Bucket [%d] size was wrongly calculated : %d >= %u.\n",  _gehash_get_bucketNO (the_table, key), current_bucket->current_items,  current_bucket->space_size);
284 				return 1;
285 			}
286 
287 			int bucket_number;
288 			bucket_number = _gehash_hash(key) % the_table -> buckets_number;
289 
290 			is_fault = _gehash_resize_bucket(the_table, bucket_number, the_table->is_small_table);
291 			if(is_fault)
292 				return 1;
293 		}
294 		current_bucket->new_item_keys[current_bucket->current_items] = step_number;
295 	}
296 
297 	current_bucket->item_values[current_bucket->current_items] = data;
298 	current_bucket->current_items ++;
299 	the_table ->current_items ++;
300 	return 0;
301 }
302 
gehash_insert_limited(gehash_t * the_table,gehash_key_t key,gehash_data_t data,int limit,int replace_prob)303 int gehash_insert_limited(gehash_t * the_table, gehash_key_t key, gehash_data_t data, int limit, int replace_prob)
304 {
305 	struct gehash_bucket * current_bucket;
306 	int occurance = 0, xk1 ;
307 
308 	assert(the_table->version_number == SUBINDEX_VER0);
309 	current_bucket = _gehash_get_bucket (the_table, key);
310 
311 
312 	for(xk1=0; xk1<current_bucket->current_items ; xk1++)
313 		if(current_bucket->item_keys[xk1]==key) occurance++;
314 
315 	if(occurance >= limit)
316 	{
317 
318 		if(myrand_rand()%32768 < replace_prob)
319 			return 1;
320 		int replacement_id = myrand_rand()%occurance;
321 		occurance = 0;
322 		for(xk1=0; xk1<current_bucket->current_items ; xk1++)
323 			if(current_bucket->item_keys[xk1]==key)
324 			{
325 				if(occurance == replacement_id)
326 				{
327 					current_bucket->item_values[xk1]=data;
328 					return 0;
329 				}
330 				occurance++;
331 			}
332 
333 	}
334 
335 	gehash_insert(the_table, key, data, NULL);
336 	return 0;
337 }
338 
339 
340 
341 
342 #define INDEL_SEGMENT_SIZE 5
343 
344 #define _index_vote(key) (((unsigned int)(key))%GENE_VOTE_TABLE_SIZE)
345 #define _index_vote_tol(key) (((unsigned int)(key)/INDEL_SEGMENT_SIZE)%GENE_VOTE_TABLE_SIZE)
346 
347 
348 #define is_quality_subread(scr)	((scr)>15?1:0)
349 
gehash_go_q_tolerable(gehash_t * the_table,gehash_key_t key,int offset,int read_len,int is_reversed,gene_vote_t * vote,gene_vote_number_t weight,gene_quality_score_t quality,int max_match_number,int indel_tolerance,int subread_number,int max_error_bases,int subread_len,unsigned int low_border,unsigned int high_border)350 size_t gehash_go_q_tolerable(gehash_t * the_table, gehash_key_t key, int offset, int read_len, int is_reversed, gene_vote_t * vote,gene_vote_number_t weight, gene_quality_score_t quality ,int max_match_number, int indel_tolerance, int subread_number, int max_error_bases, int subread_len, unsigned int low_border, unsigned int high_border)
351 {
352 	if(max_error_bases >= 10) return 0;
353 	char error_pos_stack[10];	// max error bases = 10;
354 
355 	gehash_key_t mutation_key;
356 	int ret = 0;
357 
358 	ret+=gehash_go_q(the_table, key, offset, read_len, is_reversed, vote, indel_tolerance, subread_number, low_border, high_border);
359 	int error_bases ;
360 	for (error_bases=1; error_bases <= max_error_bases; error_bases++)
361 	{
362 		int i, j;
363 		for(i=0; i<error_bases; i++)
364 			error_pos_stack [i] = i;
365 		while (1)
366 		{
367 
368 			char mutation_stack[10];
369 			memset(mutation_stack, 0 , error_bases);
370 			while(1)
371 			{
372 				int is_end2 = 1;
373 				for (i = error_bases-1; i>=0; i--)
374 					if (mutation_stack[i] <= 3)
375 					{
376 						int base_to_change=-1;
377 						mutation_key = key;
378 
379 						for (j = 0; j < error_bases; j++)
380 						{
381 							base_to_change = error_pos_stack[j];
382 							int new_value = mutation_stack[j];
383 							mutation_key = mutation_key & ~(0x3 << (2*base_to_change));
384 							mutation_key = mutation_key | (new_value << (2*base_to_change));
385 						}
386 						for (j = i+1; j<error_bases; j++)
387 							mutation_stack[j] = 0;
388 						is_end2 = 0;
389 						if(key != mutation_key)
390 							ret+=gehash_go_q(the_table, mutation_key, offset, read_len, is_reversed, vote, indel_tolerance, subread_number, low_border, high_border);
391 						mutation_stack[i] ++;
392 						break;
393 					}
394 				if (is_end2)break;
395 			}
396 
397 
398 			int is_end = 1;
399 			for (i = error_bases-1; i>=0; i--)
400 				if (error_pos_stack[i] < subread_len - (error_bases - i))
401 				{
402 					error_pos_stack[i] ++;
403 					for (j = i+1; j<error_bases; j++)
404 						error_pos_stack[j] = error_pos_stack[i] + (j-i);
405 					is_end = 0;
406 					break;
407 				}
408 
409 			if(is_end) break;
410 		}
411 	}
412 	return ret;
413 }
414 
415 
gehash_go_q_CtoT(gehash_t * the_table,gehash_key_t key,int offset,int read_len,int is_reversed,gene_vote_t * vote,gene_vote_number_t weight,int max_match_number,int indel_tolerance,int subread_number,int max_error_bases,unsigned int low_border,unsigned int high_border)416 size_t gehash_go_q_CtoT(gehash_t * the_table, gehash_key_t key, int offset, int read_len, int is_reversed, gene_vote_t * vote, gene_vote_number_t weight,int max_match_number, int indel_tolerance, int subread_number, int max_error_bases, unsigned int low_border, unsigned int high_border)
417 {
418 	assert(max_error_bases < 5);
419 	int error_pos_stack[10];	// max error bases = 10;
420 	int C_poses[16];
421 	int  availableC=0;
422 
423 	gehash_key_t mutation_key;
424 	int ret = 0,i;
425 
426 	ret+=gehash_go_q(the_table, key, offset, read_len, is_reversed, vote, indel_tolerance, subread_number, low_border, high_border);
427 	int error_bases ;
428 
429 	for(i=0; i<16; i++)
430 	{
431 		if(((key >> ((15-i)*2) )&3) == 3)	// if it is 'T' at i-th base
432 		{
433 			 C_poses[availableC++] = i;
434 		}
435 	}
436 
437 
438 
439 	for (error_bases= min(availableC,max_error_bases); error_bases <= min(availableC,max_error_bases); error_bases++)
440 	{
441 		assert(error_bases < 5);
442 		int j;
443 		for(i=0; i<error_bases; i++)
444 		{
445 			error_pos_stack [i] = i;
446 		}
447 
448 		while (1)
449 		{
450 			char mutation_stack[10];
451 			memset(mutation_stack, 0 , error_bases);
452 			while(1)
453 			{
454 				int is_end2 = 1;
455 				for (i = error_bases-1; i>=0; i--)
456 					if (mutation_stack[i] <1)
457 					{
458 						int base_to_change=-1;
459 						mutation_key = key;
460 
461 						for (j = 0; j < error_bases; j++)
462 						{
463 							base_to_change = C_poses[error_pos_stack[j]];
464 							int new_value = mutation_stack[j]?2:3; //(C or T)
465 							mutation_key = mutation_key & ~(0x3 << (2*(15-base_to_change)));
466 							mutation_key = mutation_key | (new_value << (2*(15-base_to_change)));
467 						}
468 						for (j = i+1; j<error_bases; j++)
469 							mutation_stack[j] = 0;
470 						is_end2 = 0;
471 
472 						if(key != mutation_key){
473 							int retone;
474 							retone = gehash_go_q(the_table, mutation_key, offset, read_len, is_reversed, vote, indel_tolerance, subread_number, low_border, high_border);
475 							ret+=retone;
476 						}
477 						mutation_stack[i] ++;
478 						break;
479 					}
480 				if (is_end2)break;
481 			}
482 
483 
484 			int is_end = 1;
485 			for (i = error_bases-1; i>=0; i--)
486 				if (error_pos_stack[i] < availableC - (error_bases - i))
487 				{
488 					error_pos_stack[i] ++;
489 					for (j = i+1; j<error_bases; j++)
490 						error_pos_stack[j] = error_pos_stack[i] + (j-i);
491 					is_end = 0;
492 					break;
493 				}
494 
495 			if(is_end) break;
496 		}
497 	}
498 	return ret;
499 }
500 
501 
assign_best_vote(gene_vote_t * vote,int i,int j)502 void assign_best_vote(gene_vote_t * vote, int i, int j)
503 {
504 	vote->max_mask = vote->masks[i][j];
505 	vote->max_vote = vote->votes[i][j];
506 	vote->max_position =vote->pos[i][j];
507 	vote->max_coverage_start = vote->coverage_start [i][j];
508 	vote->max_coverage_end = vote->coverage_end [i][j];
509 	memcpy(vote->max_indel_recorder, vote->indel_recorder[i][j], 3*MAX_INDEL_TOLERANCE * sizeof(*vote->max_indel_recorder));
510 }
511 
512 
gehash_go_q(gehash_t * the_table,gehash_key_t raw_key,int offset,int read_len,int is_reversed,gene_vote_t * vote,int indel_tolerance,int subread_number,unsigned int low_border,unsigned int high_border)513 size_t gehash_go_q(gehash_t * the_table, gehash_key_t raw_key, int offset, int read_len, int is_reversed, gene_vote_t * vote, int indel_tolerance, int subread_number, unsigned int low_border, unsigned int high_border)
514 {
515 	//SUBREADprintf("Q=%u, OFFSET=%d, B=%u ~ %u\n", raw_key, offset, low_border, high_border);
516 
517 	if(the_table->version_number == SUBINDEX_VER0)
518 	{
519 		gehash_key_t key = raw_key;
520 		struct gehash_bucket * current_bucket;
521 		int i=0, items;
522 
523 		gehash_key_t  *current_keys;//, *endp12;
524 
525 		current_bucket = _gehash_get_bucket (the_table, key);
526 		items = current_bucket -> current_items;
527 		current_keys = current_bucket -> item_keys;
528 
529 		if(!items) return 0;
530 
531 		#define SPEED_UP_DENOMINAOR 3
532 		int jump_step = items / SPEED_UP_DENOMINAOR;
533 		int last_accepted_index = 0;
534 
535 		if(jump_step<1) jump_step=1;
536 
537 		if(key > current_keys[0])
538 		{
539 			while(1)
540 			{
541 				while(1)
542 				{
543 					int next_p = last_accepted_index + jump_step;
544 					if(next_p >= items) break;
545 					if(current_keys[next_p]>=key) break;
546 					last_accepted_index = next_p;
547 				}
548 				if(jump_step>SPEED_UP_DENOMINAOR)
549 					jump_step /= SPEED_UP_DENOMINAOR;
550 				else if(jump_step>1)
551 					jump_step = 1;
552 				else
553 					break;
554 			}
555 			last_accepted_index++;
556 		}
557 
558 		if(current_keys[last_accepted_index]!=key) return 0;
559 
560 		short offset_from_5 = offset;
561 		//short offset_from_5 = is_reversed?(read_len - offset - 16):offset ;
562 
563 		if(*(current_bucket -> item_values+last_accepted_index) > 0xffff0000)	// no position should be greater than this.
564 		{
565 			// assumed to be non-informative subread.
566 			vote -> noninformative_subreads++;
567 			return 0;
568 		}
569 
570 		if (indel_tolerance <1)
571 			for (; last_accepted_index<items && current_keys[last_accepted_index] == key ; last_accepted_index++)
572 			{
573 				unsigned int kv = current_bucket->item_values[last_accepted_index] - offset;
574 				int offsetX = _index_vote(kv);
575 				int datalen = vote -> items[offsetX];
576 				unsigned int * dat = vote -> pos[offsetX];
577 
578 				if (kv > 0xffff0000)
579 					continue;
580 				for (i=0;i<datalen;i++)
581 				{
582 					if (dat[i] == kv && (subread_number + 1 > vote -> last_subread_cluster[offsetX][i]))
583 					{
584 						gene_vote_number_t test_max = (vote->votes[offsetX][i]);
585 						test_max += 1;
586 						vote->votes[offsetX][i] = test_max;
587 						if (offset_from_5 <  vote->coverage_start [offsetX][i])
588 							vote->coverage_start [offsetX][i] = offset_from_5;
589 						if (offset_from_5 +16 > vote->coverage_end [offsetX][i])
590 							vote->coverage_end [offsetX][i] = offset_from_5+16;
591 
592 						vote -> last_subread_cluster[offsetX][i] = subread_number + 1;
593 
594 						vote->max_vote = max(vote->max_vote , test_max);
595 						i = 9999999;
596 					}
597 				}
598 
599 				if (i < 9999999 && datalen<GENE_VOTE_SPACE)
600 				{
601 
602 					if (kv < low_border || kv > high_border)
603 						continue;
604 
605 
606 					vote -> items[offsetX] ++;
607 					dat[i] = kv;
608 					vote->votes[offsetX][i]=1;
609 					vote->masks[offsetX][i]= (is_reversed?IS_NEGATIVE_STRAND:0);
610 					vote->coverage_start [offsetX][i] = offset_from_5;
611 					vote->coverage_end [offsetX][i] = offset_from_5+16;
612 					vote -> last_subread_cluster[offsetX][i] = subread_number + 1;
613 
614 					if(vote->max_vote==0)
615 						vote->max_vote = 1;
616 				}
617 			}
618 		else
619 		{
620 			// We duplicated all codes for indel_tolerance >= 1 for the minimal impact to performance.
621 			//int ii_end = (indel_tolerance % INDEL_SEGMENT_SIZE)?(indel_tolerance - indel_tolerance%INDEL_SEGMENT_SIZE+INDEL_SEGMENT_SIZE):indel_tolerance;
622 
623 			for (; last_accepted_index<items && current_keys[last_accepted_index] == key ; last_accepted_index++)
624 			{
625 				unsigned int kv = current_bucket->item_values[last_accepted_index] - offset;
626 				int ii_end = (indel_tolerance % INDEL_SEGMENT_SIZE)?(indel_tolerance - indel_tolerance%INDEL_SEGMENT_SIZE+INDEL_SEGMENT_SIZE):indel_tolerance;
627 				int iix;
628 				i=0;
629 
630 				for(iix = 0; iix<=ii_end; iix = iix>0?-iix:(-iix+INDEL_SEGMENT_SIZE))
631 				{
632 					int offsetX = _index_vote_tol(kv+iix);
633 					int datalen = vote -> items[offsetX];
634 					if(!datalen)continue;
635 
636 					unsigned int * dat = vote -> pos[offsetX];
637 
638 					for (i=0;i<datalen;i++)
639 					{
640 						int di = dat[i];
641 						int dist0 = kv-di;
642 						if( dist0 >= -indel_tolerance && dist0 <= indel_tolerance && is_reversed  == (0!=(vote -> masks[offsetX][i]&IS_NEGATIVE_STRAND))) {
643 							int toli =  vote -> toli[offsetX][i];
644 
645 							if(di >= 46494104 && di <= 46496104 ){
646 								SUBREADprintf("VOTES: at %u, subread_no = %d , last_cluster = %d , toli = %d\n", di, subread_number , vote -> last_subread_cluster[offsetX][i] , toli);
647 							}
648 
649 							if( toli > 0 && subread_number + 1 == vote -> last_subread_cluster[offsetX][i] ){
650 								int move_dist = 0;
651 								if( toli >=3 ) move_dist = vote -> indel_recorder[offsetX][i][toli-3+2];
652 								int new_dist = move_dist;
653 								move_dist -= vote -> indel_recorder[offsetX][i][toli+2];
654 								new_dist -= dist0;
655 								if(abs(move_dist) > abs(new_dist)){
656 									toli -= 3;
657 									vote -> toli[offsetX][i] = toli;
658 									vote -> last_subread_cluster[offsetX][i]--;
659 									vote -> votes[offsetX][i] --;
660 								}
661 							}
662 							if(subread_number + 1 <= vote -> last_subread_cluster[offsetX][i])
663 								continue;
664 
665 
666 							gene_vote_number_t test_max = (vote->votes[offsetX][i]);
667 							test_max += 1;
668 							vote -> votes[offsetX][i] = test_max;
669 
670 							/*
671 							if (offset_from_5 <  vote->coverage_start [offsetX][i])
672 							{
673 								vote->coverage_start [offsetX][i] = offset_from_5;
674 							}*/
675 							if (offset_from_5 +16 > vote->coverage_end [offsetX][i])
676 							{
677 								vote->coverage_end [offsetX][i] = offset_from_5+16;
678 							}
679 
680 
681 							if (dist0 !=  vote->current_indel_cursor[offsetX][i])
682 							{
683 								toli +=3;
684 								if (toli < indel_tolerance*3)
685 								{
686 									vote -> toli[offsetX][i] = toli;
687 									vote -> indel_recorder[offsetX][i][toli] = subread_number+1;
688 									vote -> indel_recorder[offsetX][i][toli+1] = subread_number+1;
689 									vote -> indel_recorder[offsetX][i][toli+2] = dist0;
690 
691 									if(toli < indel_tolerance*3-3) vote -> indel_recorder[offsetX][i][toli+3]=0;
692 								}
693 								vote->current_indel_cursor [offsetX][i] = (char)dist0;
694 							}
695 							else
696 								vote -> indel_recorder[offsetX][i][toli+1] = subread_number+1;
697 
698 							vote -> last_subread_cluster[offsetX][i] = subread_number + 1;
699 
700 							vote->max_vote = max(vote->max_vote , test_max);
701 							i = 9999999;
702 						}
703 					}
704 					if (i==9999999){
705 						break;
706 					}
707 				}
708 
709 				if (i < 9999999)
710 				{
711 					if (kv < low_border || kv > high_border)
712 						continue;
713 
714 					int offsetX2 = _index_vote_tol(kv);
715 					int datalen2 = vote -> items[offsetX2];
716 					unsigned int * dat2 = vote -> pos[offsetX2];
717 
718 					if (datalen2<GENE_VOTE_SPACE)
719 					{
720 						vote -> items[offsetX2] ++;
721 						dat2[datalen2] = kv;
722 						vote -> masks[offsetX2][datalen2]=(is_reversed?IS_NEGATIVE_STRAND:0);
723 						vote -> votes[offsetX2][datalen2]=1;
724 						vote -> toli[offsetX2][datalen2]=0;
725 						vote -> last_subread_cluster[offsetX2][datalen2] = subread_number + 1;
726 
727 						// data structure of recorder:
728 						// {unsigned char subread_start; unsigned char subread_end, char indel_offset_from_start}
729 						// All subread numbers are added with 1 for not being 0.
730 
731 						vote -> indel_recorder[offsetX2][datalen2][0] = vote -> indel_recorder[offsetX2][datalen2][1] = subread_number+1;
732 						vote -> indel_recorder[offsetX2][datalen2][2] = 0;
733 						vote -> indel_recorder[offsetX2][datalen2][3] = 0;
734 						vote->current_indel_cursor [offsetX2][datalen2] = 0;
735 						vote->coverage_start [offsetX2][datalen2] = offset_from_5;
736 						vote->coverage_end [offsetX2][datalen2] = offset_from_5+16;
737 
738 						if (vote->max_vote==0)
739 							vote->max_vote = 1;
740 					}
741 				}
742 			}
743 		}
744 
745 		return 1;
746 		//return match_end-match_start;
747 	}
748 	else
749 	{
750 
751 		// VER_1
752 		// VER_2
753 
754 		struct gehash_bucket * current_bucket;
755 		int i = 0, items;
756 
757 		short *current_keys;//, *endp12;
758 		short key = raw_key / the_table->buckets_number;
759 
760 		current_bucket = _gehash_get_bucket (the_table, raw_key);
761 		items = current_bucket -> current_items;
762 		current_keys = current_bucket -> new_item_keys;
763 
764 		if(!items) return 0;
765 
766 //#warning "======== MAKE SURE THAT '-1' IS CORRECT ============"
767 		int imin=0, imax=items - 1;
768 		int last_accepted_index;
769 
770 		while(1)
771 		{
772 			last_accepted_index=(imin+imax)/2;
773 			short current_key = current_keys[last_accepted_index];
774 			if(current_key>key)
775 			{
776 				imax = last_accepted_index - 1;
777 			}
778 			else if(current_key<key)
779 			{
780 				imin = last_accepted_index + 1;
781 			}
782 			else
783 				break;
784 
785 			if(imax<imin)
786 				return 0;
787 
788 		}
789 
790 		while(last_accepted_index){
791 			if(current_keys[last_accepted_index-1] == key) last_accepted_index-=1;
792 			else break;
793 		}
794 
795 
796 		/*if(*(current_bucket -> item_values+last_accepted_index) > 0xffff0000)	// no position should be greater than this.
797 		{
798 			// assumed to be non-informative subread.
799 			vote -> noninformative_subreads++;
800 			return 0;
801 		}*/
802 
803 		int subread_number_P1 =  subread_number + 1;
804 		int of_p_16 = offset + 16;
805 
806 		{
807 			int ii_end = INDEL_SEGMENT_SIZE;
808 			if(indel_tolerance>5) ii_end=(indel_tolerance % INDEL_SEGMENT_SIZE)?(indel_tolerance - indel_tolerance%INDEL_SEGMENT_SIZE+INDEL_SEGMENT_SIZE):indel_tolerance;
809 
810 			for (;  last_accepted_index<items && current_keys[last_accepted_index] == key ; last_accepted_index++)
811 			{
812 				int iix, offsetX2, offsetX, datalen, datalen2;
813 				unsigned int kv = current_bucket->item_values[last_accepted_index] - offset;
814 				offsetX2 = _index_vote_tol(kv);
815 				datalen = vote -> items[offsetX2];
816 
817 				datalen2 = datalen;
818 				offsetX = offsetX2;
819 				unsigned int * dat2, *dat;
820 				dat = dat2 = vote -> pos[offsetX2];
821 
822 				//SUBREADprintf("You can find KV at %u\n", kv);
823 
824 				for(iix = 0; iix<=ii_end; iix = iix>0?-iix:(-iix+INDEL_SEGMENT_SIZE))
825 				{
826 					if(iix)
827 					{
828 						offsetX = _index_vote_tol(kv+iix);
829 						datalen = vote -> items[offsetX];
830 						if(!datalen)continue;
831 
832 						dat = vote -> pos[offsetX];
833 					}else if(!datalen)continue;
834 
835 					for (i=0;i<datalen;i++)
836 					{
837 						int dist0 = kv-dat[i];
838 
839 
840 						if( dist0 >= -indel_tolerance && dist0 <= indel_tolerance && is_reversed  == (0!= vote->masks[offsetX][i]))
841 						{
842 
843 //							if(di >= 46494104 && di <= 46496104 ){
844 //								SUBREADprintf("VOTES: at %u, subread_no = %d , last_cluster = %d , toli = %d\n", di, subread_number , vote -> last_subread_cluster[offsetX][i] , toli);
845 //							}
846 
847 							int toli =  vote -> toli[offsetX][i];
848 							if( subread_number_P1 == vote -> last_subread_cluster[offsetX][i]  && toli >0){
849 								int move_dist = 0;
850 								if( toli >=3 ) move_dist = vote -> indel_recorder[offsetX][i][toli-3+2];
851 								int new_dist = move_dist;
852 								move_dist -= vote -> indel_recorder[offsetX][i][toli+2];
853 								new_dist -= dist0;
854 								if(abs(move_dist) > abs(new_dist)){
855 									toli -= 3;
856 									vote -> toli[offsetX][i] = toli;
857 									vote -> last_subread_cluster[offsetX][i]--;
858 									vote -> votes[offsetX][i] --;
859 								}
860 							}
861 
862 							if(subread_number_P1 <= vote -> last_subread_cluster[offsetX][i]) continue;
863 							gene_vote_number_t test_max = (vote->votes[offsetX][i]);
864 							test_max += 1;
865 							vote -> votes[offsetX][i] = test_max;
866 
867 							if (offset +16 > vote->coverage_end [offsetX][i])
868 								vote->coverage_end [offsetX][i] = of_p_16;
869 
870 
871 							if (dist0 ==  vote->current_indel_cursor[offsetX][i])
872 								vote -> indel_recorder[offsetX][i][toli+1] = subread_number_P1;
873 							else {
874 								toli +=3;
875 								if (toli < MAX_INDEL_SECTIONS*3)
876 								{
877 									vote -> toli[offsetX][i] = toli;
878 									vote -> indel_recorder[offsetX][i][toli] = subread_number_P1;
879 									vote -> indel_recorder[offsetX][i][toli+1] = subread_number_P1;
880 									vote -> indel_recorder[offsetX][i][toli+2] = dist0;
881 
882 									if(toli < MAX_INDEL_SECTIONS*3-3) vote -> indel_recorder[offsetX][i][toli+3]=0;
883 								}
884 								vote->current_indel_cursor [offsetX][i] = (char)dist0;
885 							}
886 
887 							vote -> last_subread_cluster[offsetX][i] = subread_number_P1;
888 							if(vote->max_vote < test_max)vote->max_vote = test_max;
889 							i = 9999999;
890 							break;
891 						}
892 					}
893 					if (i==9999999)break;
894 
895 				}
896 
897 				if (i < 9999999)
898 				{
899 					if (kv < low_border || kv > high_border)
900 						continue;
901 
902 					if (datalen2<GENE_VOTE_SPACE)
903 					{
904 						vote -> items[offsetX2] ++;
905 						dat2[datalen2] = kv;
906 						vote -> masks[offsetX2][datalen2]=(is_reversed?IS_NEGATIVE_STRAND:0);
907 						vote -> votes[offsetX2][datalen2]=1;
908 						vote -> toli[offsetX2][datalen2]=0;
909 
910 						// data structure of recorder:
911 						// {unsigned char subread_start; unsigned char subread_end, char indel_offset_from_start}
912 						// All subread numbers are added with 1 for not being 0.
913 
914 						vote -> indel_recorder[offsetX2][datalen2][0] = vote -> indel_recorder[offsetX2][datalen2][1] = subread_number_P1;
915 						vote -> indel_recorder[offsetX2][datalen2][2] = 0;
916 						vote -> indel_recorder[offsetX2][datalen2][3] = 0;
917 						vote->current_indel_cursor [offsetX2][datalen2] = 0;
918 						vote->coverage_start [offsetX2][datalen2] = offset;
919 						vote->coverage_end [offsetX2][datalen2] = of_p_16;
920 						vote -> last_subread_cluster[offsetX2][datalen2] = subread_number_P1;
921 
922 						if (vote->max_vote==0)
923 							vote->max_vote = 1;
924 					}
925 				}
926 				else i=0;
927 			}
928 		}
929 		return 1;
930 	}
931 }
932 
933 
934 
gehash_go_X(gehash_t * the_table,gehash_key_t raw_key,int offset,int read_len,int is_reversed,gene_vote_t * vote,int indel_tolerance,int subread_number,unsigned int low_border,unsigned int high_border,int run_round,unsigned int * shift_indel_locs,unsigned int * shift_indel_NO)935 size_t gehash_go_X(gehash_t * the_table, gehash_key_t raw_key, int offset, int read_len, int is_reversed, gene_vote_t * vote, int indel_tolerance, int subread_number, unsigned int low_border, unsigned int high_border, int run_round, unsigned int * shift_indel_locs, unsigned int * shift_indel_NO){
936 	if(0)if(the_table->version_number == SUBINDEX_VER0){
937 		SUBREADprintf("ERROR: the version of the index is too old.\n");
938 		assert(the_table->version_number != SUBINDEX_VER0);
939 		return -1;
940 	}
941 
942 	// VER_1
943 	// VER_2
944 
945 	struct gehash_bucket * current_bucket;
946 	int i = 0, items;
947 
948 	short *current_keys;//, *endp12;
949 	short key = raw_key / the_table->buckets_number;
950 
951 	current_bucket = _gehash_get_bucket (the_table, raw_key);
952 	items = current_bucket -> current_items;
953 	current_keys = current_bucket -> new_item_keys;
954 
955 	if(!items) return 0;
956 
957 //#warning "======== MAKE SURE THAT '-1' IS CORRECT ============"
958 	int imin=0, imax=items - 1;
959 	int last_accepted_index;
960 
961 	while(1)
962 	{
963 		last_accepted_index=(imin+imax)/2;
964 		short current_key = current_keys[last_accepted_index];
965 		if(current_key>key)
966 		{
967 			imax = last_accepted_index - 1;
968 		}
969 		else if(current_key<key)
970 		{
971 			imin = last_accepted_index + 1;
972 		}
973 		else
974 			break;
975 
976 		if(imax<imin)
977 			return 0;
978 
979 	}
980 
981 
982 	int subread_number_P1 =  subread_number + 1;
983 	int of_p_16 = offset + 16;
984 	is_reversed = is_reversed?IS_NEGATIVE_STRAND:0;
985 	int start_scan_idx = last_accepted_index, scan_step = 0;
986 
987 	{
988 		int ii_end = INDEL_SEGMENT_SIZE;
989 		if(indel_tolerance>5) ii_end=(indel_tolerance % INDEL_SEGMENT_SIZE)?(indel_tolerance - indel_tolerance%INDEL_SEGMENT_SIZE+INDEL_SEGMENT_SIZE):indel_tolerance;
990 
991 		//for (; last_accepted_index<items && current_keys[last_accepted_index] == key ; last_accepted_index++){
992 		while(1){
993 			int iix, offsetX2, offsetX, datalen, datalen2;
994 			unsigned int kv = current_bucket->item_values[last_accepted_index] - offset;
995 
996 			offsetX = offsetX2 = _index_vote_tol(kv);
997 			datalen = datalen2 = vote -> items[offsetX];
998 
999 			unsigned int * dat2, *dat;
1000 			dat = dat2 = vote -> pos[offsetX2];
1001 			int found_some = 0;
1002 
1003 			//SUBREADprintf("You can find KV at %u\n", kv);
1004 
1005 			for(iix = 0; iix<=ii_end; iix = iix>0?-iix:(-iix+INDEL_SEGMENT_SIZE)) {
1006 				if(iix) {
1007 					offsetX = _index_vote_tol(kv+iix);
1008 					datalen = vote -> items[offsetX];
1009 					if(!datalen)continue;
1010 					dat = vote -> pos[offsetX];
1011 				} else if(!datalen) continue;
1012 
1013 				for (i=0;i<datalen;i++)
1014 				{
1015 					int dist0 = kv-dat[i];
1016 					int applied_indel_tol = ( run_round>0 && vote -> marked_shift_indel[offsetX][i])? 0: indel_tolerance ;
1017 					if( dist0 >= -applied_indel_tol && dist0 <= applied_indel_tol && is_reversed == vote->masks[offsetX][i]){
1018 						int toli =  vote -> toli[offsetX][i];
1019 
1020 						if(run_round == 0 && toli >0 && dist0 ==0 && ! vote -> marked_shift_indel[offsetX][i]){
1021 							vote -> marked_shift_indel[offsetX][i] = 1;
1022 							shift_indel_locs[(* shift_indel_NO)++] = dat[i];
1023 						}
1024 
1025 						if( subread_number_P1 == vote -> last_subread_cluster[offsetX][i]  && toli >0){
1026 							int move_dist = 0;
1027 							if( toli >=3 ) move_dist = vote -> indel_recorder[offsetX][i][toli-3+2];
1028 							int new_dist = move_dist;
1029 							move_dist -= vote -> indel_recorder[offsetX][i][toli+2];
1030 							new_dist -= dist0;
1031 							if(abs(move_dist) > abs(new_dist)){
1032 								toli -= 3;
1033 								vote -> toli[offsetX][i] = toli;
1034 								vote -> last_subread_cluster[offsetX][i]--;
1035 								vote -> votes[offsetX][i] --;
1036 							}
1037 						}
1038 
1039 						if(subread_number_P1 <= vote -> last_subread_cluster[offsetX][i]) continue;
1040 						gene_vote_number_t test_max = (vote->votes[offsetX][i]);
1041 						test_max += 1;
1042 						vote -> votes[offsetX][i] = test_max;
1043 
1044 						if (offset +16 > vote->coverage_end [offsetX][i])
1045 							vote->coverage_end [offsetX][i] = of_p_16;
1046 
1047 						if (dist0 ==  vote->current_indel_cursor[offsetX][i]){
1048 							vote -> indel_recorder[offsetX][i][toli+1] = subread_number_P1;
1049 						} else {
1050 							toli +=3;
1051 							if (toli < MAX_INDEL_SECTIONS*3)
1052 							{
1053 								vote -> toli[offsetX][i] = toli;
1054 								vote -> indel_recorder[offsetX][i][toli] = subread_number_P1;
1055 								vote -> indel_recorder[offsetX][i][toli+1] = subread_number_P1;
1056 								vote -> indel_recorder[offsetX][i][toli+2] = dist0;
1057 
1058 								if(toli < MAX_INDEL_SECTIONS*3-3) vote -> indel_recorder[offsetX][i][toli+3]=0;
1059 							}
1060 							vote->current_indel_cursor [offsetX][i] = (char)dist0;
1061 						}
1062 
1063 						vote -> last_subread_cluster[offsetX][i] = subread_number_P1;
1064 						if(vote->max_vote < test_max)vote->max_vote = test_max;
1065 						found_some = 1;
1066 						break;
1067 					}
1068 				}
1069 				if(found_some) break;
1070 			}
1071 
1072 			if (!found_some) {
1073 				if (kv >=low_border && kv <= high_border && datalen2<GENE_VOTE_SPACE) {
1074 					vote -> items[offsetX2] ++;
1075 					dat2[datalen2] = kv;
1076 					vote -> masks[offsetX2][datalen2] = is_reversed;
1077 					vote -> votes[offsetX2][datalen2] = 1;
1078 					vote -> toli[offsetX2][datalen2] = 0;
1079 					vote -> marked_shift_indel[offsetX2][datalen2] = 0;
1080 					if(run_round>0){
1081 						int kk;
1082 						for(kk = 0; kk < * shift_indel_NO ; kk++){
1083 							if( kv >= shift_indel_locs[kk] - indel_tolerance && kv <= shift_indel_locs[kk] + indel_tolerance ){
1084 								vote -> marked_shift_indel[offsetX2][datalen2] = 1;
1085 								break;
1086 							}
1087 						}
1088 					}
1089 
1090 					// data structure of recorder:
1091 					// {unsigned char subread_start; unsigned char subread_end, char indel_offset_from_start}
1092 					// All subread numbers are added with 1 for not being 0.
1093 
1094 					vote -> indel_recorder[offsetX2][datalen2][0] = vote -> indel_recorder[offsetX2][datalen2][1] = subread_number_P1;
1095 					vote -> indel_recorder[offsetX2][datalen2][2] = 0;
1096 					vote -> indel_recorder[offsetX2][datalen2][3] = 0;
1097 					vote->current_indel_cursor [offsetX2][datalen2] = 0;
1098 					vote->coverage_start [offsetX2][datalen2] = offset;
1099 					vote->coverage_end [offsetX2][datalen2] = of_p_16;
1100 					vote -> last_subread_cluster[offsetX2][datalen2] = subread_number_P1;
1101 
1102 					if (vote->max_vote==0)
1103 						vote->max_vote = 1;
1104 				}
1105 			}
1106 
1107 			if(! scan_step){
1108 				last_accepted_index++;
1109 				if(last_accepted_index == items || current_keys[last_accepted_index]!= key){
1110 					scan_step = 1;
1111 					last_accepted_index = start_scan_idx;
1112 				}
1113 			}
1114 			if(scan_step){
1115 				last_accepted_index --;
1116 				if(last_accepted_index <0 || current_keys[last_accepted_index]!= key) break;
1117 			}
1118 		}
1119 	}
1120 	return 1;
1121 }
1122 
1123 
1124 
1125 
select_best_vote(gene_vote_t * vote)1126 void select_best_vote(gene_vote_t * vote)
1127 {
1128 	int i,j;
1129 	for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
1130 		for(j=0; j< vote->items[i]; j++)
1131 		{
1132 			if(vote -> votes[i][j] == vote->max_vote)
1133 			{
1134 				vote -> max_position = vote -> pos[i][j];
1135 				vote -> max_mask = vote -> masks[i][j];
1136 				vote -> max_coverage_start = vote->coverage_start[i][j];
1137 				vote -> max_coverage_end = vote->coverage_end[i][j];
1138 			}
1139 		}
1140 }
1141 
indel_recorder_copy(gene_vote_number_t * dst,gene_vote_number_t * src)1142 short indel_recorder_copy(gene_vote_number_t *dst, gene_vote_number_t * src)
1143 {
1144 	short all_indels = 0;
1145 //	memcpy(dst, src, 3*MAX_INDEL_TOLERANCE);  return;
1146 
1147 
1148 	int i=0;
1149 	while(src[i] && (i<3*MAX_INDEL_TOLERANCE-2))
1150 	{
1151 		dst[i] = src[i];
1152 		i++;
1153 		dst[i] = src[i];
1154 		i++;
1155 		dst[i] = src[i];
1156 		all_indels = dst[i];
1157 		i++;
1158 	}
1159 	dst[i] = 0;
1160 	return all_indels;
1161 
1162 }
1163 
finalise_vote(gene_vote_t * vote)1164 void finalise_vote(gene_vote_t * vote)
1165 {
1166 	if(vote->max_tmp_indel_recorder)
1167 	{
1168 		indel_recorder_copy(vote->max_indel_recorder, vote->max_tmp_indel_recorder);
1169 	}
1170 //		memcpy(vote->max_indel_recorder, vote->max_tmp_indel_recorder,  sizeof(char)*3 * MAX_INDEL_TOLERANCE);
1171 }
1172 
gehash_exist(gehash_t * the_table,gehash_key_t key)1173 int gehash_exist(gehash_t * the_table, gehash_key_t key)
1174 {
1175 	struct gehash_bucket * current_bucket;
1176 	int  items;
1177 	gehash_key_t * keyp, *endkp;
1178 
1179 	assert(the_table->version_number == SUBINDEX_VER0);
1180 	current_bucket = _gehash_get_bucket (the_table, key);
1181 	items = current_bucket -> current_items;
1182 
1183 	if(items <1)return 0;
1184 
1185 	keyp = current_bucket -> item_keys;
1186 	endkp = keyp + items;
1187 
1188 	while(1)
1189 	{
1190 		if(*keyp == key)
1191 			return 1;
1192 		if(++keyp >= endkp) break;
1193 	}
1194 	return 0;
1195 }
1196 
1197 
gehash_update(gehash_t * the_table,gehash_key_t key,gehash_data_t data_new)1198 size_t gehash_update(gehash_t * the_table, gehash_key_t key, gehash_data_t data_new)
1199 {
1200 	struct gehash_bucket * current_bucket;
1201 	size_t matched;
1202 	int items;
1203 	gehash_key_t * keyp, *endkp;
1204 
1205 	assert(the_table->version_number == SUBINDEX_VER0);
1206 	current_bucket = _gehash_get_bucket (the_table, key);
1207 
1208 	matched = 0;
1209 	items = current_bucket -> current_items;
1210 	keyp = current_bucket -> item_keys;
1211 	endkp = keyp + items;
1212 
1213 	while(1)
1214 	{
1215 		if(*keyp == key)
1216 		{
1217 			current_bucket -> item_values[keyp-current_bucket -> item_keys] = data_new;
1218 			matched++;
1219 		}
1220 		keyp +=1;
1221 		if(keyp >= endkp)
1222 			break;
1223 	}
1224 	return matched;
1225 }
1226 
gehash_get(gehash_t * the_table,gehash_key_t key,gehash_data_t * data_result,size_t max_result_space)1227 size_t gehash_get(gehash_t * the_table, gehash_key_t key, gehash_data_t * data_result, size_t max_result_space)
1228 {
1229 	struct gehash_bucket * current_bucket;
1230 	size_t matched;
1231 	int items;
1232 	gehash_key_t * keyp, *endkp;
1233 
1234 	if(max_result_space<1)
1235 		return 0;
1236 
1237 	assert(the_table->version_number == SUBINDEX_VER0);
1238 	current_bucket = _gehash_get_bucket (the_table, key);
1239 
1240 	matched = 0;
1241 	items = current_bucket -> current_items;
1242 	if(items<1) return 0;
1243 
1244 	keyp = current_bucket -> item_keys;
1245 	endkp = keyp + items;
1246 
1247 	while(1)
1248 	{
1249 		if(*keyp == key)
1250 		{
1251 			data_result [matched] = current_bucket -> item_values[keyp-current_bucket -> item_keys];
1252 			matched +=1;
1253 			if(matched >= max_result_space)
1254 				break;
1255 		}
1256 		keyp +=1;
1257 		if(keyp >= endkp)
1258 			break;
1259 	}
1260 	return matched;
1261 }
1262 
1263 
gehash_remove(gehash_t * the_table,gehash_key_t key)1264 size_t gehash_remove(gehash_t * the_table, gehash_key_t key)
1265 {
1266 	struct gehash_bucket * current_bucket;
1267 	int i;
1268 	size_t removed;
1269 
1270 	assert(the_table->version_number == SUBINDEX_VER0);
1271 	current_bucket = _gehash_get_bucket (the_table, key);
1272 
1273 	if(current_bucket -> current_items < 1)
1274 		return 0;
1275 
1276 	removed = 0;
1277 	for(i=0; ; i++)
1278 	{
1279 		while(current_bucket -> item_keys [i+removed] == key && i+removed < current_bucket -> current_items)
1280 			removed += 1;
1281 
1282 		if(i+removed >= current_bucket -> current_items)
1283 			break;
1284 
1285 		if(removed)
1286 		{
1287 			current_bucket -> item_keys [i] =
1288 				current_bucket -> item_keys [i + removed];
1289 
1290 			current_bucket -> item_values [i] =
1291 				current_bucket -> item_values [i + removed];
1292 		}
1293 
1294 	}
1295 
1296 	current_bucket -> current_items -= removed;
1297 	the_table-> current_items -= removed;
1298 
1299 	return removed;
1300 }
1301 
gehash_get_hpc(gehash_t * the_table,gehash_key_t key,gehash_data_t * data_result,size_t max_result_space)1302 size_t gehash_get_hpc(gehash_t * the_table, gehash_key_t key, gehash_data_t * data_result, size_t max_result_space)
1303 {
1304 	return -1;
1305 }
1306 
1307 
gehash_insert_sorted(gehash_t * the_table,gehash_key_t key,gehash_data_t data)1308 void gehash_insert_sorted(gehash_t * the_table, gehash_key_t key, gehash_data_t data)
1309 {
1310 
1311 	SUBREADprintf("UNIMPLEMENTED! gehash_insert_sorted \n");
1312 }
1313 
1314 
1315 
1316 // Data Struct of dumpping:
1317 // {
1318 //      size_t current_items;
1319 //      size_t buckets_number;
1320 //      struct
1321 //      {
1322 //	      size_t current_items;
1323 //	      size_t space_size;
1324 //	      gehash_key_t item_keys [current_items];
1325 //	      gehash_data_t item_values [current_items]
1326 //      } [buckets_number];
1327 // }
1328 //
1329 
load_int32(FILE * fp)1330 unsigned int load_int32(FILE * fp)
1331 {
1332 	int ret;
1333 	int read_length;
1334 	read_length = fread(&ret, sizeof(int), 1, fp);
1335 	if(read_length<=0)assert(0);
1336 	return ret;
1337 }
1338 
load_int64(FILE * fp)1339 srInt_64 load_int64(FILE * fp)
1340 {
1341 	srInt_64 ret;
1342 	int read_length;
1343 	read_length = fread(&ret, sizeof(srInt_64), 1, fp);
1344 	if(read_length<=0)assert(0);
1345 	return ret;
1346 }
1347 
1348 
gehash_load_option(const char fname[],int option_no,int * result)1349 int gehash_load_option(const char fname [], int option_no, int * result){
1350 	char tabname[MAX_FILE_NAME_LENGTH];
1351 	char magic_chars[8];
1352 	int found = 0, rrtv;
1353 	sprintf(tabname, "%s.00.b.tab", fname);
1354 	FILE * fp = f_subr_open(tabname, "rb");
1355 	if(fp == NULL){
1356 		sprintf(tabname, "%s.00.c.tab", fname);
1357 		fp = f_subr_open(tabname, "rb");
1358 	}
1359 	if(fp){
1360 		rrtv = fread(magic_chars,1,8,fp);
1361 		if(rrtv < 8) return -1;
1362 		if(memcmp(magic_chars, "2subindx",7)==0) {
1363 			while(1) {
1364 				short option_key, option_length;
1365 
1366 				rrtv = fread(&option_key, 2, 1, fp);
1367 				if(rrtv < 1) return -1;
1368 				if(!option_key) break;
1369 
1370 				rrtv = fread(&option_length, 2, 1, fp);
1371 				if(rrtv < 1) return -1;
1372 
1373 				if(option_key == option_no){
1374 					*result = 0;
1375 					rrtv = fread(result ,option_length,1,fp);
1376 					if(rrtv < 1) return -1;
1377 					found = 1;
1378 				}
1379 				else
1380 					fseek(fp, option_length, SEEK_CUR);
1381 			}
1382 		}
1383 		fclose(fp);
1384 	}
1385 	return found;
1386 }
1387 
gehash_load(gehash_t * the_table,const char fname[])1388 int gehash_load(gehash_t * the_table, const char fname [])
1389 {
1390 	int i, read_length, rrtv;
1391 	char magic_chars[8];
1392 	magic_chars[7]=0;
1393 
1394 	if(0)if(sizeof( size_t ) != 8|| sizeof(long int ) != 8|| sizeof(int ) != 4){
1395 		SUBREADprintf("LINT: %zd , INT: %zd , SIZD : %zd\n", sizeof(long int ), sizeof(int ), sizeof(size_t));
1396 		//return -1;
1397 	}
1398 
1399 	memset(the_table -> malloc_ptr ,0, sizeof(void*) * GEHASH_MEM_PTR_NO);
1400 	the_table -> index_gap = 0;
1401 
1402 	FILE * fp = f_subr_open(fname, "rb");
1403 	if (!fp)
1404 	{
1405 		SUBREADprintf ("Table file '%s' is not found.\n", fname);
1406 		return 1;
1407 	}
1408 
1409 	rrtv = fread(magic_chars,1,8,fp);
1410 	if(rrtv !=8){
1411 		SUBREADprintf("Error: the index magic string cannot be found. It may contain format errors or file '%s' may be truncated.\n", fname);
1412 		assert(rrtv==8);
1413 	}
1414 
1415 	if(memcmp(magic_chars, "2subindx",8)!=0)
1416 	{
1417 		print_in_box(80,0,0,"");
1418 		print_in_box(80,0,0,"WARNING your reference index was built under an old version of subread");
1419 		print_in_box(80,0,0,"        package. Please rebuild the index for the reference genome.");
1420 		print_in_box(80,0,0,"");
1421 	}
1422 	if(memcmp(magic_chars+1, "subindx",7)==0)
1423 	{
1424 		if('1'==magic_chars[0])
1425 			the_table -> version_number = SUBINDEX_VER1;
1426 		else if('2'==magic_chars[0])
1427 			the_table -> version_number = SUBINDEX_VER2;
1428 		else	assert(0);
1429 
1430 		if(SUBINDEX_VER2 == the_table -> version_number)
1431 		{
1432 			while(1)
1433 			{
1434 				short option_key, option_length;
1435 
1436 				rrtv = fread(&option_key, 2, 1, fp);
1437 				if(rrtv <1){
1438 					SUBREADprintf("Error: the index header cannot be fully loaded. It may contain format errors or file '%s' may be truncated.\n", fname);
1439 					return -1;
1440 				}
1441 				//SUBREADprintf("READOPT_TAB = %04X\n", option_key);
1442 
1443 				if(!option_key) break;
1444 
1445 				rrtv = fread(&option_length, 2, 1, fp);
1446 				if(rrtv <1){
1447 					SUBREADprintf("Error: the index header cannot be fully loaded. It may contain format errors or file '%s' may be truncated.\n", fname);
1448 					return -1;
1449 				}
1450 
1451 				rrtv = 999;
1452 				if(option_key == SUBREAD_INDEX_OPTION_INDEX_GAP)
1453 					rrtv = fread(&(the_table -> index_gap),2,1,fp);
1454 				else if (option_key ==SUBREAD_INDEX_OPTION_INDEX_PADDING)
1455 					rrtv = fread(&(the_table -> padding),2,1,fp);
1456 				else
1457 					fseek(fp, option_length, SEEK_CUR);
1458 				if(rrtv <1){
1459 					SUBREADprintf("Error: the index header cannot be fully loaded. It may contain format errors or file '%s' may be truncated.\n", fname);
1460 					return -1;
1461 				}
1462 			}
1463 			assert(the_table -> index_gap);
1464 		}
1465 		else if(SUBINDEX_VER1 == the_table -> version_number)
1466 			the_table -> index_gap = 3;
1467 
1468 		the_table -> current_items = load_int64(fp);
1469 		if(the_table -> current_items < 1 || the_table -> current_items > 0xffffffffllu){
1470 			SUBREADputs("ERROR: the index format is unrecognizable.");
1471 			assert(the_table -> current_items >0 && the_table -> current_items <=0xffffffffllu);
1472 		}
1473 
1474 		unsigned int * bucket_bytes = malloc(sizeof(int) * GEHASH_MEM_PTR_NO);
1475 		memset(bucket_bytes, 0xff, sizeof(int) * GEHASH_MEM_PTR_NO);
1476 		the_table -> buckets_number = load_int32(fp);
1477 		the_table -> buckets = (struct gehash_bucket * )malloc(sizeof(struct gehash_bucket) * the_table -> buckets_number);
1478 		if(!the_table -> buckets)
1479 		{
1480 			SUBREADputs("Creating buckets.");
1481 			SUBREADputs(MESSAGE_OUT_OF_MEMORY);
1482 			return 1;
1483 		}
1484 
1485 		srInt_64 fp_curr = ftello(fp);
1486 		unsigned int accued_bytes = 0, grp_i = 0, curr_bucks = 0, per_group_bucks = the_table -> buckets_number/ GEHASH_MEM_PTR_NO + 2;
1487 
1488 		for(i=0; i<the_table -> buckets_number ; i++){
1489 			unsigned int current_items = load_int32(fp);
1490 			load_int32(fp);//useless for loading : space size
1491 			unsigned int current_bytes = current_items*( sizeof(short) + sizeof(gehash_data_t) );
1492 			accued_bytes += current_bytes;
1493 			curr_bucks ++;
1494 			if(curr_bucks >= per_group_bucks){
1495 				//SUBREADprintf("Allocating %d : %d buckets : %u\n", grp_i, i, accued_bytes);
1496 				bucket_bytes[grp_i++] = accued_bytes;
1497 				accued_bytes = 0;
1498 				curr_bucks = 0;
1499 			}
1500 			#ifdef __MINGW32__
1501 			{
1502 //if(i%500000 == 0)SUBREADprintf("ESTMHUGE %d/%d\n", i, the_table -> buckets_number);
1503 
1504 				char * buffkk = malloc(current_bytes );
1505 				fread(buffkk,current_bytes ,1, fp);
1506 				free(buffkk);
1507 			}
1508 			#else
1509 			fseeko(fp, current_bytes, SEEK_CUR);
1510 			#endif
1511 		}
1512 		if(curr_bucks)
1513 			bucket_bytes[grp_i++] = accued_bytes;
1514 		fseeko(fp, (off_t)fp_curr, SEEK_SET);
1515 
1516 		for(i=0; i<GEHASH_MEM_PTR_NO ; i++){
1517 
1518 //if(i%50000 == 0)SUBREADprintf("MEMHUGE %d/%d\n", i, the_table -> buckets_number);
1519 			unsigned int current_bytes = bucket_bytes[i];
1520 			if(current_bytes<0xff000000u){
1521 				the_table -> malloc_ptr[i] = malloc(current_bytes);
1522 				//SUBREADprintf("Allocating %d buckets : %u\n", i, current_bytes);
1523 				if(!the_table -> malloc_ptr[i]){
1524 					SUBREADputs(MESSAGE_OUT_OF_MEMORY);
1525 					return 1;
1526 				}
1527 			}
1528 		}
1529 
1530 		grp_i = 0;
1531 		curr_bucks = 0;
1532 		accued_bytes = 0;
1533 		for(i=0; i<the_table -> buckets_number ; i++){
1534 
1535 //if(i%50000 == 0)SUBREADprintf("FILLHUGE %d/%d\n", i, the_table -> buckets_number);
1536 
1537 			struct gehash_bucket * current_bucket = the_table -> buckets+i;
1538 			current_bucket -> current_items = load_int32(fp);
1539 			load_int32(fp);//useless for lo: space size
1540 			current_bucket -> space_size =  current_bucket -> current_items;
1541 			unsigned int current_bytes = current_bucket -> current_items*( sizeof(short) + sizeof(gehash_data_t) );
1542 
1543 			current_bucket -> new_item_keys = (short*)(the_table -> malloc_ptr[grp_i] + accued_bytes);
1544 			current_bucket -> item_values = (gehash_data_t*)(the_table -> malloc_ptr[grp_i] + accued_bytes + current_bucket -> current_items * sizeof(short));
1545 			read_length = fread(current_bucket -> new_item_keys, sizeof(short), current_bucket -> current_items, fp);
1546 			read_length += fread(current_bucket -> item_values, sizeof(gehash_data_t), current_bucket -> current_items, fp);
1547 			if(read_length < 2* current_bucket -> current_items){
1548 				SUBREADprintf("ERROR: the index is incomplete.\n");
1549 				return 1;
1550 			}
1551 
1552 			accued_bytes += current_bytes;
1553 			curr_bucks ++;
1554 			if(curr_bucks >= per_group_bucks){
1555 				curr_bucks=0;
1556 				accued_bytes=0;
1557 				grp_i++;
1558 			}
1559 		}
1560 
1561 		int rval = fread(&(the_table -> is_small_table), sizeof(char), 1, fp);
1562 		if (rval != 1)SUBREADprintf("ERROR: cannot find the table table.\n");
1563 		free(bucket_bytes);
1564 		fclose(fp);
1565 		return 0;
1566 
1567 	}
1568 	else
1569 	{
1570 		fclose(fp);
1571 		the_table -> index_gap = 3;
1572 		fp = f_subr_open(fname, "rb");
1573 		the_table -> version_number = SUBINDEX_VER0;
1574 		the_table -> current_items = load_int64(fp);
1575 		the_table -> buckets_number = load_int32(fp);
1576 		the_table -> buckets = (struct gehash_bucket * )malloc(sizeof(struct gehash_bucket) * the_table -> buckets_number);
1577 		if(!the_table -> buckets)
1578 		{
1579 			SUBREADputs(MESSAGE_OUT_OF_MEMORY);
1580 			return 1;
1581 		}
1582 
1583 		for (i=0; i<the_table -> buckets_number; i++)
1584 		{
1585 			struct gehash_bucket * current_bucket = &(the_table -> buckets[i]);
1586 			current_bucket -> current_items = load_int32(fp);
1587 			current_bucket -> space_size = load_int32(fp);
1588 			current_bucket -> space_size = current_bucket -> current_items;
1589 			current_bucket -> item_keys = (gehash_key_t *) malloc ( sizeof(gehash_key_t) * current_bucket -> space_size);
1590 			current_bucket -> item_values = (gehash_data_t *) malloc ( sizeof(gehash_data_t) * current_bucket -> space_size);
1591 
1592 			if(!(current_bucket -> item_keys&&current_bucket -> item_values))
1593 			{
1594 				SUBREADputs(MESSAGE_OUT_OF_MEMORY);
1595 				return 1;
1596 
1597 			}
1598 
1599 			if(current_bucket -> current_items > 0)
1600 			{
1601 				read_length = fread(current_bucket -> item_keys, sizeof(gehash_key_t), current_bucket -> current_items, fp);
1602 				if(read_length < current_bucket -> current_items){
1603 					SUBREADprintf("ERROR: the index is incomplete.\n");
1604 					return 1;
1605 				}
1606 				read_length = fread(current_bucket -> item_values, sizeof(gehash_data_t), current_bucket -> current_items, fp);
1607 				if(read_length < current_bucket -> current_items){
1608 					SUBREADprintf("ERROR: the index is incomplete.\n");
1609 					return 1;
1610 				}
1611 			}
1612 
1613 		}
1614 
1615 		read_length = fread(&(the_table -> is_small_table), sizeof(char), 1, fp);
1616 		if(read_length!=1){
1617 			SUBREADprintf("ERROR: the index is incomplete.\n");
1618 			return 1;
1619 		}
1620 		fclose(fp);
1621 		return 0;
1622 	}
1623 }
1624 
1625 
gehash_sort(gehash_t * the_table)1626 void gehash_sort(gehash_t * the_table)
1627 {
1628 	int i;
1629 	for (i=0; i<the_table -> buckets_number; i++)
1630 	{
1631 		struct gehash_bucket * current_bucket = &(the_table -> buckets[i]);
1632 		int ii, jj;
1633 		gehash_key_t tmp_key;
1634 		gehash_data_t tmp_val;
1635 
1636 		if(current_bucket -> current_items>=1)
1637 		{
1638 			for(ii=0;ii<current_bucket -> current_items -1; ii++)
1639 			{
1640 				for (jj = ii+1; jj < current_bucket -> current_items; jj++)
1641 				{
1642 					if (current_bucket -> item_keys[ii] > current_bucket -> item_keys[jj])
1643 					{
1644 						tmp_key = current_bucket -> item_keys[ii];
1645 						current_bucket -> item_keys[ii] = current_bucket -> item_keys[jj];
1646 						current_bucket -> item_keys[jj] = tmp_key;
1647 
1648 						tmp_val = current_bucket -> item_values[ii];
1649 						current_bucket -> item_values[ii] = current_bucket -> item_values[jj];
1650 						current_bucket -> item_values[jj] = tmp_val;
1651 					}
1652 				}
1653 			}
1654 		}
1655 	}
1656 }
1657 
write_cell(short option_key,short option_length,char * option_value,FILE * fp)1658 void write_cell(short option_key, short option_length, char * option_value, FILE * fp)
1659 {
1660 	fwrite(&option_key, 2, 1, fp);
1661 	fwrite(&option_length, 2, 1, fp);
1662 	fwrite(option_value, option_length, 1, fp);
1663 }
1664 
1665 
write_options(FILE * fp,gehash_t * the_table)1666 void write_options(FILE * fp, gehash_t * the_table)
1667 {
1668 	short option_key, option_length;
1669 	short option_value;
1670 
1671 	option_key = SUBREAD_INDEX_OPTION_INDEX_PADDING;
1672 	option_length = 2;
1673 	option_value = the_table -> padding;
1674 
1675 	write_cell(option_key, option_length, (char *) &option_value,fp);
1676 
1677 	option_key = SUBREAD_INDEX_OPTION_INDEX_GAP;
1678 	option_length = 2;
1679 	option_value = the_table -> index_gap;
1680 
1681 	write_cell(option_key, option_length, (char *) &option_value,fp);
1682 
1683 	option_key = 0;
1684 	fwrite(&option_key, 2, 1, fp);
1685 }
1686 
is_1_greater_than_2(int bucket_i,int all_buckets,short k1,gehash_data_t v1,short k2,gehash_data_t v2)1687 int is_1_greater_than_2(int bucket_i, int all_buckets , short k1, gehash_data_t v1, short k2, gehash_data_t v2){
1688 	if(k1 > k2) return 1;
1689 	if(k1 == k2){
1690 		unsigned int real_key = 1u*k1*1u*all_buckets;
1691 		real_key += bucket_i;
1692 		if((real_key%791)%2==0 && v1>v2) return 1;
1693 		if((real_key%791)%2==1 && v1<v2) return 1;
1694 	}
1695 	return 0;
1696 }
1697 
gehash_dump(gehash_t * the_table,const char fname[])1698 int gehash_dump(gehash_t * the_table, const char fname [])
1699 {
1700 	int ii, jj, xx;
1701 	int i, scroll_counter = 0;
1702 	FILE * fp = f_subr_open(fname, "wb");
1703 	int maximum_bucket_size = 0;
1704 	if (!fp)
1705 	{
1706 		SUBREADprintf ("Table file '%s' is not able to open.\n", fname);
1707 		return -1;
1708 	}
1709 
1710 	if(the_table->version_number == SUBINDEX_VER2)
1711 	{
1712 		fwrite("2subindx",1,8,fp);
1713 		write_options(fp, the_table);
1714 	}
1715 
1716 	assert(sizeof(srInt_64 ) == 8);
1717 	assert(sizeof(int ) == 4);
1718 	fwrite(& (the_table -> current_items ), sizeof(srInt_64), 1, fp);
1719 	fwrite(& (the_table -> buckets_number), sizeof(int), 1, fp);
1720 
1721 	print_in_box(80,0,0,"Save current index block...              ");
1722 
1723 	for (i=0; i<the_table -> buckets_number; i++)
1724 	{
1725 		struct gehash_bucket * current_bucket = &(the_table -> buckets[i]);
1726 		if(current_bucket -> current_items > maximum_bucket_size)
1727 			maximum_bucket_size = current_bucket -> current_items;
1728 	}
1729 
1730 	#define SORT_LANE_NUMBER 16
1731 	short * sort_space_new_key[SORT_LANE_NUMBER];
1732 	gehash_data_t * sort_space_data [SORT_LANE_NUMBER];
1733 	int items_in_sort[SORT_LANE_NUMBER] ;
1734 	int items_in_merge[SORT_LANE_NUMBER] ;
1735 	if(the_table->version_number > SUBINDEX_VER0)
1736 	{
1737 		for(xx=0;xx<SORT_LANE_NUMBER;xx++)
1738 		{
1739 			sort_space_new_key[xx] = (short *)malloc(sizeof(short)*(maximum_bucket_size/SORT_LANE_NUMBER+2));
1740 			sort_space_data[xx] = (gehash_data_t *)malloc(sizeof(gehash_data_t)*(maximum_bucket_size/SORT_LANE_NUMBER+2));
1741 		}
1742 	}
1743 
1744 	for (i=0; i<the_table -> buckets_number; i++)
1745 	{
1746 		struct gehash_bucket * current_bucket = &(the_table -> buckets[i]);
1747 		gehash_data_t tmp_val=0;
1748 
1749 		if(i % (the_table -> buckets_number/10) == 0)
1750 			print_window_scrolling_bar("", 1.0*i/the_table -> buckets_number, 74, &scroll_counter);
1751 
1752 		if(current_bucket -> current_items>=1)
1753 		{
1754 			if(the_table->version_number == SUBINDEX_VER0)
1755 			{
1756 				for(ii=0;ii<current_bucket -> current_items -1; ii++)
1757 				{
1758 					gehash_key_t tmp_key;
1759 					for (jj = ii+1; jj < current_bucket -> current_items; jj++)
1760 					{
1761 
1762 						if(is_1_greater_than_2(i, the_table -> buckets_number, current_bucket -> item_keys[ii], current_bucket -> item_values[ii],
1763 											    current_bucket -> item_keys[jj], current_bucket -> item_values[jj] )) {
1764 							tmp_key = current_bucket -> item_keys[ii];
1765 							current_bucket -> item_keys[ii] = current_bucket -> item_keys[jj];
1766 							current_bucket -> item_keys[jj] = tmp_key;
1767 
1768 							tmp_val = current_bucket -> item_values[ii];
1769 							current_bucket -> item_values[ii] = current_bucket -> item_values[jj];
1770 							current_bucket -> item_values[jj] = tmp_val;
1771 						}
1772 					}
1773 				}
1774 			}
1775 			else
1776 			{
1777 				if(1){
1778 					memset(items_in_sort, 0, sizeof(int)*SORT_LANE_NUMBER);
1779 					for(ii=0;ii<current_bucket -> current_items;ii++)
1780 					{
1781 						int sort_lane_no = ii%SORT_LANE_NUMBER;
1782 						int xx_th_item = items_in_sort[sort_lane_no] ++;
1783 						sort_space_new_key[sort_lane_no][xx_th_item] = current_bucket -> new_item_keys[ii];
1784 						sort_space_data[sort_lane_no][xx_th_item] = current_bucket -> item_values[ii];
1785 					}
1786 
1787 					// sort the individual lanes.
1788 					for(xx=0;xx<SORT_LANE_NUMBER;xx++)
1789 					{
1790 						for(ii=0;ii<items_in_sort[xx]-1; ii++)
1791 						{
1792 							for(jj = ii+1; jj < items_in_sort[xx]; jj++)
1793 							{
1794 								short tmp_key;
1795 								if(is_1_greater_than_2(i, the_table -> buckets_number,   sort_space_new_key[xx][ii], sort_space_data[xx][ii], sort_space_new_key[xx][jj], sort_space_data[xx][jj] )) {
1796 									tmp_key = sort_space_new_key[xx][ii];
1797 									sort_space_new_key[xx][ii] = sort_space_new_key[xx][jj];
1798 									sort_space_new_key[xx][jj] = tmp_key;
1799 
1800 									tmp_val = sort_space_data[xx][ii];
1801 									sort_space_data[xx][ii]=sort_space_data[xx][jj];
1802 									sort_space_data[xx][jj]=tmp_val;
1803 								}
1804 							}
1805 						}
1806 					}
1807 
1808 					// items_in_sort : array, each element for a lane
1809 					// items_in_merge : how many items in each lane that have been merged (ie, read_ptr for each lane)
1810 					// sort_space_new_key : N lanes each has been sorted, each has many keys (ie subreads)
1811 					// sort_space_data : N lanes, each has many values (ie positions)
1812 					// current_bucket -> new_item_keys : One array containing sorted keys (write_ptr is ii)
1813 					// current_bucket -> item_values:  One array containing sorted values (write_ptr is ii))
1814 					memset(items_in_merge, 0, sizeof(int)*SORT_LANE_NUMBER);
1815 
1816 					// tmp_key : the current key being examed. A larger key is put AFTER a smaller key.
1817 					for(ii=0;ii<current_bucket -> current_items;ii++)
1818 					{
1819 						int tmp_key=0x7fffffff;
1820 						int selected_lane = 0, trying_lane;
1821 						for(trying_lane=0;trying_lane<SORT_LANE_NUMBER;trying_lane++)
1822 						{
1823 							int ii_in_trying_lane = items_in_merge[trying_lane];
1824 							if(ii_in_trying_lane >= items_in_sort[trying_lane]) continue;
1825 
1826 							if(tmp_key >= 0x10000 ||is_1_greater_than_2(
1827 									i, the_table -> buckets_number,
1828 									sort_space_new_key[selected_lane][items_in_merge[selected_lane]],
1829 									sort_space_data[selected_lane][items_in_merge[selected_lane]],
1830 									sort_space_new_key[trying_lane][items_in_merge[trying_lane]],
1831 									sort_space_data[trying_lane][items_in_merge[trying_lane]]
1832 								)){
1833 								selected_lane=trying_lane;
1834 								tmp_key = sort_space_new_key[trying_lane][ii_in_trying_lane];
1835 							}
1836 						}
1837 
1838 						current_bucket -> new_item_keys[ii] = sort_space_new_key[selected_lane][items_in_merge[selected_lane]];
1839 						current_bucket -> item_values[ii] = sort_space_data[selected_lane][items_in_merge[selected_lane]];
1840 
1841 						items_in_merge[selected_lane]++;
1842 						//printf("V [%d] [%d] =%d\n",i , ii, tmp_key);
1843 					}
1844 				}
1845 			}
1846 		}
1847 
1848 
1849 		if(0) // debug
1850 		{
1851 			int inidx=0;
1852 			srInt_64 real_old = -1;
1853 			for(xx = 0; xx < current_bucket -> current_items; xx++){
1854 				unsigned int real_key = 1u * current_bucket -> new_item_keys[xx] *1u * the_table -> buckets_number;
1855 				real_key += i;
1856 
1857 				if(real_old != real_key) inidx=0;
1858 
1859 				if(1||real_key >= 0xffffff00 || real_key <5000)
1860 				SUBREADprintf("SUBREAD_%010u POS_%010u INIDX_%04d BUCKET_%04d_ITEM_%08d\n",real_key ,current_bucket -> item_values[xx], inidx, i, xx);
1861 				real_old = real_key;
1862 				inidx++;
1863 			}
1864 		}
1865 
1866 		int is_full = 0;
1867 		int write_len = fwrite(& (current_bucket -> current_items), sizeof(int), 1, fp);
1868 		if(write_len<1) is_full = 1;
1869 		write_len = fwrite(& (current_bucket -> space_size), sizeof(int), 1, fp);
1870 		if(write_len<1) is_full = 1;
1871 
1872 		if(the_table->version_number == SUBINDEX_VER0)
1873 			fwrite(current_bucket -> item_keys, sizeof(gehash_key_t), current_bucket -> current_items, fp);
1874 		else{
1875 			write_len = fwrite(current_bucket -> new_item_keys, sizeof(short), current_bucket -> current_items, fp);
1876 			if(write_len < current_bucket -> current_items) is_full = 1;
1877 		}
1878 		write_len = fwrite(current_bucket -> item_values, sizeof(gehash_data_t), current_bucket -> current_items, fp);
1879 		if(write_len < current_bucket -> current_items) is_full = 1;
1880 		if(is_full){
1881 			fclose(fp);
1882 			SUBREADprintf("ERROR: Unable to write into the output file. Please check the disk space in the output directory.\n");
1883 			return 1;
1884 		}
1885 	}
1886 
1887 	if(the_table->version_number > SUBINDEX_VER0)
1888 	{
1889 		for(xx=0;xx<SORT_LANE_NUMBER;xx++)
1890 		{
1891 			free(sort_space_new_key[xx]);
1892 			free(sort_space_data[xx]);
1893 		}
1894 	}
1895 
1896 
1897 	int write_len = fwrite(&(the_table -> is_small_table), sizeof(char), 1, fp);
1898 	fclose(fp);
1899 
1900 	if(write_len < 1){
1901 		SUBREADprintf("ERROR: Unable to write into the output file. Please check the disk space in the output directory.\n");
1902 		return 1;
1903 	}
1904 	print_in_box(80,0,0,"");
1905 	return 0;
1906 }
1907 
gehash_destory(gehash_t * the_table)1908 void gehash_destory(gehash_t * the_table)
1909 {
1910 	int i, is_ptr = 0;
1911 
1912 	for(i = 0; i < GEHASH_MEM_PTR_NO; i++) if(the_table -> malloc_ptr[i]){
1913 		free(the_table -> malloc_ptr[i]);
1914 		is_ptr=1;
1915 	}
1916 
1917 	if(!is_ptr) for (i=0; i<the_table -> buckets_number; i++)
1918 	{
1919 		struct gehash_bucket * current_bucket = &(the_table -> buckets[i]);
1920 		if (current_bucket -> space_size > 0)
1921 		{
1922 			if(0==the_table->free_item_only)free (current_bucket -> item_keys);
1923 			free (current_bucket -> item_values);
1924 		}
1925 	}
1926 
1927 	free (the_table -> buckets);
1928 
1929 	the_table -> current_items = 0;
1930 	the_table -> buckets_number = 0;
1931 }
1932