1 /***************************************************************
2
3 The Subread software package is free software package:
4 you can redistribute it and/or modify it under the terms
5 of the GNU General Public License as published by the
6 Free Software Foundation, either version 3 of the License,
7 or (at your option) any later version.
8
9 Subread is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty
11 of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12
13 See the GNU General Public License for more details.
14
15 Authors: Drs Yang Liao and Wei Shi
16
17 ***************************************************************/
18
19
20 #include "sorted-hashtable.h"
21 #include "gene-algorithms.h"
22 #include<assert.h>
23 #include <stdlib.h>
24 #include <string.h>
25
26 #if !defined(__FreeBSD__) && !defined(__APPLE__) && !defined(__DragonFly__)
27 #include <malloc.h>
28 #endif
29
30 #include<math.h>
31 #include"core.h"
32
33 #define _gehash_hash(k) ((unsigned int)(k))
34
35
gehash_create(gehash_t * the_table,size_t expected_size,char is_small_table)36 int gehash_create(gehash_t * the_table, size_t expected_size, char is_small_table) {
37 return gehash_create_ex(the_table, expected_size, is_small_table, SUBINDEX_VER0, 3, 0);
38 }
39
calculate_buckets_by_size(size_t expected_size,int version_number,int is_small_table,int index_gap)40 unsigned int calculate_buckets_by_size( size_t expected_size, int version_number, int is_small_table, int index_gap ){
41 if(expected_size ==0)
42 expected_size = GEHASH_DEFAULT_SIZE;
43 int expected_bucket_number = expected_size / GEHASH_BUCKET_LENGTH;
44
45 if(index_gap >= 3) expected_bucket_number /=3;
46
47 if(SUBINDEX_VER1 > version_number) {
48 if(expected_bucket_number < 10111 && !is_small_table)expected_bucket_number = 10111;
49 else if(is_small_table) expected_bucket_number = 4;
50 } else
51 if(expected_bucket_number <= 0x3ffff )expected_bucket_number = 0x3ffff+4;
52
53
54 for (;;expected_bucket_number++) {
55 int j, valid_v;
56
57 valid_v = 1;
58 int test_prime = 13;
59 if(SUBINDEX_VER1 > version_number && is_small_table) test_prime = 3;
60 for(j=2; j<=test_prime;j++)
61 {
62 if (expected_bucket_number % j == 0)
63 valid_v = 0;
64 }
65
66 if (valid_v)
67 break;
68 }
69
70 //SUBREADprintf("EXP_BUCKS=%u\n",expected_bucket_number);
71
72 return expected_bucket_number;
73 }
74
gehash_create_ex(gehash_t * the_table,size_t expected_size,char is_small_table,int version_number,int index_gap,int padding)75 int gehash_create_ex(gehash_t * the_table, size_t expected_size, char is_small_table, int version_number, int index_gap, int padding) {
76 int expected_bucket_number;
77 int i;
78
79 memset(the_table, 0, sizeof(gehash_t));
80 if(expected_size ==0)
81 expected_size = GEHASH_DEFAULT_SIZE;
82
83 expected_bucket_number = calculate_buckets_by_size(expected_size, version_number, is_small_table, index_gap);
84
85 // calculate the number of buckets for creating the data structure
86
87
88 the_table -> version_number = version_number;
89 the_table -> current_items = 0;
90 the_table -> is_small_table = is_small_table;
91 the_table -> buckets_number = expected_bucket_number;
92 the_table -> buckets = (struct gehash_bucket *) malloc(
93 expected_bucket_number *
94 sizeof(struct gehash_bucket )
95 );
96 the_table -> padding = padding;
97
98 if(!the_table -> buckets)
99 {
100 SUBREADputs(MESSAGE_OUT_OF_MEMORY);
101 return 1;
102 }
103
104 for(i=0; i<expected_bucket_number; i++){
105 the_table -> buckets [i].item_keys = NULL;
106 the_table -> buckets [i].current_items = 0;
107 the_table -> buckets [i].space_size = 0;
108 }
109
110 the_table -> index_gap = index_gap;
111
112 return 0;
113 }
114
_gehash_resize_bucket(gehash_t * the_table,int bucket_no,char is_small_table)115 int _gehash_resize_bucket(gehash_t * the_table , int bucket_no, char is_small_table)
116 {
117 int new_bucket_length;
118 struct gehash_bucket * current_bucket = &(the_table -> buckets [bucket_no]);
119 gehash_key_t * new_item_keys = NULL;
120 short * new_new_item_keys = NULL;
121 gehash_data_t * new_item_values;
122
123 if (current_bucket->space_size<1)
124 {
125 if(is_small_table)
126 new_bucket_length = 5 ;
127 else
128 new_bucket_length = 2 ;
129
130 if(the_table->version_number == SUBINDEX_VER0)
131 new_item_keys = (gehash_key_t *) malloc(sizeof(gehash_key_t) * new_bucket_length);
132 else
133 new_new_item_keys = (short *) malloc(sizeof(short)*new_bucket_length);
134
135 new_item_values = (gehash_data_t *) malloc(sizeof(gehash_data_t) * new_bucket_length);
136
137 if(!((new_item_keys|| new_new_item_keys) && new_item_values))
138 {
139 SUBREADputs(MESSAGE_OUT_OF_MEMORY);
140 return 1;
141 }
142
143 /*if(the_table->version_number == SUBINDEX_VER0)
144 bzero(new_item_keys, sizeof(gehash_key_t) * new_bucket_length);
145 else
146 bzero(new_new_item_keys, sizeof(short) * new_bucket_length);
147
148 bzero(new_item_values, sizeof(gehash_data_t) * new_bucket_length);
149 */
150
151
152 if(the_table->version_number == SUBINDEX_VER0)
153 current_bucket->item_keys = new_item_keys;
154 else
155 current_bucket->new_item_keys = new_new_item_keys;
156
157 current_bucket->item_values = new_item_values;
158 current_bucket->space_size = new_bucket_length;
159
160
161 }
162 else
163 {
164 if(is_small_table)
165 //new_bucket_length = (int)max(15,current_bucket->space_size*1.5+1);
166 new_bucket_length = (int)(current_bucket->space_size*5);
167 else
168 new_bucket_length = max((int)(current_bucket->space_size*1.5), (int)(current_bucket->space_size+4));
169
170 if(the_table->version_number == SUBINDEX_VER0)
171 current_bucket->item_keys = (gehash_key_t *) realloc(current_bucket->item_keys , sizeof(gehash_key_t) * new_bucket_length);
172 else
173 current_bucket->new_item_keys = (short *) realloc(current_bucket->new_item_keys, sizeof(short)*new_bucket_length);
174
175 current_bucket->item_values = (gehash_data_t *) realloc(current_bucket->item_values, sizeof(gehash_data_t) * new_bucket_length);
176
177
178 if(!((current_bucket->new_item_keys||current_bucket->new_item_keys) && current_bucket->item_values))
179 {
180 SUBREADputs(MESSAGE_OUT_OF_MEMORY);
181 return 1;
182 }
183
184 current_bucket->space_size = new_bucket_length;
185 }
186 return 0;
187 }
188
189 #define _gehash_get_bucket(tab, key) ( (tab) -> buckets + _gehash_hash( key ) % (tab) -> buckets_number )
190 #define _gehash_get_bucketNO(tab, key) ( _gehash_hash( key ) % (tab) -> buckets_number )
191
gehash_try_insert_measure(unsigned int * bucket_sizes,int bucket_no,gehash_key_t key)192 void gehash_try_insert_measure(unsigned int * bucket_sizes, int bucket_no, gehash_key_t key){
193 unsigned int buck_this = _gehash_hash(key)%bucket_no;
194 //if(buck_this == 16305862)SUBREADprintf("Insert bucket : %u/%u -> %u at PTR=%p\n", buck_this, bucket_no, bucket_sizes[buck_this] , bucket_sizes);
195 bucket_sizes[buck_this]++;
196 }
197
gehash_insert(gehash_t * the_table,gehash_key_t key,gehash_data_t data,unsigned int * bucket_sizes)198 int gehash_insert(gehash_t * the_table, gehash_key_t key, gehash_data_t data, unsigned int * bucket_sizes)
199 {
200 struct gehash_bucket * current_bucket;
201 int is_fault = 0;
202
203 //SUBREADprintf("Insert subread : %u/%u\n", _gehash_hash(key)% the_table->buckets_number, the_table->buckets_number);
204 current_bucket = _gehash_get_bucket (the_table, key);
205
206 if(the_table->version_number == SUBINDEX_VER0)
207 {
208 if (current_bucket->current_items >= current_bucket->space_size) {
209 int bucket_number;
210 if(bucket_sizes){
211 SUBREADprintf("Bucket size was wrongly calculated.\n");
212 return 1;
213 }
214
215 bucket_number = _gehash_hash(key) % the_table -> buckets_number;
216 is_fault = _gehash_resize_bucket(the_table, bucket_number, the_table->is_small_table);
217 if(is_fault)
218 return 1;
219 }
220 current_bucket->item_keys[current_bucket->current_items] = key;
221 }
222 else
223 {
224 short step_number = key/the_table -> buckets_number;
225 if(current_bucket-> new_item_keys == NULL && bucket_sizes){
226 //this will go only once when all the bucktes are NULL.
227
228 unsigned int * memory_sizes = malloc(sizeof(int) * the_table -> buckets_number);
229 memset(memory_sizes, 0xff, sizeof(int) * the_table -> buckets_number);
230
231 unsigned int grp_bytes = 0, grps = 0, this_grp_bucks = 0, xk1, bucks_per_grp = the_table -> buckets_number / GEHASH_MEM_PTR_NO +2;
232 for(xk1=0; xk1< the_table -> buckets_number ; xk1++){
233 unsigned int buck_size =(sizeof(short) + sizeof(gehash_data_t)) * bucket_sizes[xk1];
234 grp_bytes += buck_size;
235 this_grp_bucks ++;
236 if( this_grp_bucks >= bucks_per_grp ){
237 memory_sizes[grps++] = grp_bytes;
238 grp_bytes=0;
239 this_grp_bucks=0;
240 }
241 }
242 if(this_grp_bucks)
243 memory_sizes[grps] = grp_bytes;
244
245 for(xk1=0; xk1 < GEHASH_MEM_PTR_NO; xk1++){
246 unsigned int current_bytes = memory_sizes[xk1];
247 if(current_bytes<0xff000000u){
248 if(the_table -> malloc_ptr[xk1]!=NULL){
249 SUBREADprintf("ERROR : no-NULL ptr : %p\n",the_table -> malloc_ptr[xk1]);
250 }
251 the_table -> malloc_ptr[xk1] = malloc(current_bytes);
252 if(!the_table -> malloc_ptr[xk1]){
253 SUBREADputs(MESSAGE_OUT_OF_MEMORY);
254 return 1;
255 }
256 }
257 }
258
259 grp_bytes = 0; grps = 0; this_grp_bucks = 0;
260 for(xk1=0; xk1< the_table -> buckets_number ; xk1++){
261 struct gehash_bucket * mem_bucket = the_table -> buckets+xk1;
262 memset(mem_bucket, 0, sizeof(struct gehash_bucket));
263 mem_bucket -> new_item_keys = (short*)((char*)the_table -> malloc_ptr[grps] + grp_bytes);
264 mem_bucket -> item_values = (gehash_data_t*)((char *)mem_bucket -> new_item_keys + sizeof(short) * bucket_sizes[xk1]);
265 mem_bucket -> space_size = bucket_sizes[xk1];
266
267 unsigned int buck_size =(sizeof(short) + sizeof(gehash_data_t)) * bucket_sizes[xk1];
268 grp_bytes += buck_size;
269 this_grp_bucks ++;
270 if( this_grp_bucks >= bucks_per_grp ){
271 grps++;
272 grp_bytes=0;
273 this_grp_bucks=0;
274 }
275 }
276
277 the_table -> free_item_only = 2;
278 free(memory_sizes);
279 }
280
281 if (current_bucket->current_items >= current_bucket->space_size) {
282 if(bucket_sizes){
283 SUBREADprintf("Bucket [%d] size was wrongly calculated : %d >= %u.\n", _gehash_get_bucketNO (the_table, key), current_bucket->current_items, current_bucket->space_size);
284 return 1;
285 }
286
287 int bucket_number;
288 bucket_number = _gehash_hash(key) % the_table -> buckets_number;
289
290 is_fault = _gehash_resize_bucket(the_table, bucket_number, the_table->is_small_table);
291 if(is_fault)
292 return 1;
293 }
294 current_bucket->new_item_keys[current_bucket->current_items] = step_number;
295 }
296
297 current_bucket->item_values[current_bucket->current_items] = data;
298 current_bucket->current_items ++;
299 the_table ->current_items ++;
300 return 0;
301 }
302
gehash_insert_limited(gehash_t * the_table,gehash_key_t key,gehash_data_t data,int limit,int replace_prob)303 int gehash_insert_limited(gehash_t * the_table, gehash_key_t key, gehash_data_t data, int limit, int replace_prob)
304 {
305 struct gehash_bucket * current_bucket;
306 int occurance = 0, xk1 ;
307
308 assert(the_table->version_number == SUBINDEX_VER0);
309 current_bucket = _gehash_get_bucket (the_table, key);
310
311
312 for(xk1=0; xk1<current_bucket->current_items ; xk1++)
313 if(current_bucket->item_keys[xk1]==key) occurance++;
314
315 if(occurance >= limit)
316 {
317
318 if(myrand_rand()%32768 < replace_prob)
319 return 1;
320 int replacement_id = myrand_rand()%occurance;
321 occurance = 0;
322 for(xk1=0; xk1<current_bucket->current_items ; xk1++)
323 if(current_bucket->item_keys[xk1]==key)
324 {
325 if(occurance == replacement_id)
326 {
327 current_bucket->item_values[xk1]=data;
328 return 0;
329 }
330 occurance++;
331 }
332
333 }
334
335 gehash_insert(the_table, key, data, NULL);
336 return 0;
337 }
338
339
340
341
342 #define INDEL_SEGMENT_SIZE 5
343
344 #define _index_vote(key) (((unsigned int)(key))%GENE_VOTE_TABLE_SIZE)
345 #define _index_vote_tol(key) (((unsigned int)(key)/INDEL_SEGMENT_SIZE)%GENE_VOTE_TABLE_SIZE)
346
347
348 #define is_quality_subread(scr) ((scr)>15?1:0)
349
gehash_go_q_tolerable(gehash_t * the_table,gehash_key_t key,int offset,int read_len,int is_reversed,gene_vote_t * vote,gene_vote_number_t weight,gene_quality_score_t quality,int max_match_number,int indel_tolerance,int subread_number,int max_error_bases,int subread_len,unsigned int low_border,unsigned int high_border)350 size_t gehash_go_q_tolerable(gehash_t * the_table, gehash_key_t key, int offset, int read_len, int is_reversed, gene_vote_t * vote,gene_vote_number_t weight, gene_quality_score_t quality ,int max_match_number, int indel_tolerance, int subread_number, int max_error_bases, int subread_len, unsigned int low_border, unsigned int high_border)
351 {
352 if(max_error_bases >= 10) return 0;
353 char error_pos_stack[10]; // max error bases = 10;
354
355 gehash_key_t mutation_key;
356 int ret = 0;
357
358 ret+=gehash_go_q(the_table, key, offset, read_len, is_reversed, vote, indel_tolerance, subread_number, low_border, high_border);
359 int error_bases ;
360 for (error_bases=1; error_bases <= max_error_bases; error_bases++)
361 {
362 int i, j;
363 for(i=0; i<error_bases; i++)
364 error_pos_stack [i] = i;
365 while (1)
366 {
367
368 char mutation_stack[10];
369 memset(mutation_stack, 0 , error_bases);
370 while(1)
371 {
372 int is_end2 = 1;
373 for (i = error_bases-1; i>=0; i--)
374 if (mutation_stack[i] <= 3)
375 {
376 int base_to_change=-1;
377 mutation_key = key;
378
379 for (j = 0; j < error_bases; j++)
380 {
381 base_to_change = error_pos_stack[j];
382 int new_value = mutation_stack[j];
383 mutation_key = mutation_key & ~(0x3 << (2*base_to_change));
384 mutation_key = mutation_key | (new_value << (2*base_to_change));
385 }
386 for (j = i+1; j<error_bases; j++)
387 mutation_stack[j] = 0;
388 is_end2 = 0;
389 if(key != mutation_key)
390 ret+=gehash_go_q(the_table, mutation_key, offset, read_len, is_reversed, vote, indel_tolerance, subread_number, low_border, high_border);
391 mutation_stack[i] ++;
392 break;
393 }
394 if (is_end2)break;
395 }
396
397
398 int is_end = 1;
399 for (i = error_bases-1; i>=0; i--)
400 if (error_pos_stack[i] < subread_len - (error_bases - i))
401 {
402 error_pos_stack[i] ++;
403 for (j = i+1; j<error_bases; j++)
404 error_pos_stack[j] = error_pos_stack[i] + (j-i);
405 is_end = 0;
406 break;
407 }
408
409 if(is_end) break;
410 }
411 }
412 return ret;
413 }
414
415
gehash_go_q_CtoT(gehash_t * the_table,gehash_key_t key,int offset,int read_len,int is_reversed,gene_vote_t * vote,gene_vote_number_t weight,int max_match_number,int indel_tolerance,int subread_number,int max_error_bases,unsigned int low_border,unsigned int high_border)416 size_t gehash_go_q_CtoT(gehash_t * the_table, gehash_key_t key, int offset, int read_len, int is_reversed, gene_vote_t * vote, gene_vote_number_t weight,int max_match_number, int indel_tolerance, int subread_number, int max_error_bases, unsigned int low_border, unsigned int high_border)
417 {
418 assert(max_error_bases < 5);
419 int error_pos_stack[10]; // max error bases = 10;
420 int C_poses[16];
421 int availableC=0;
422
423 gehash_key_t mutation_key;
424 int ret = 0,i;
425
426 ret+=gehash_go_q(the_table, key, offset, read_len, is_reversed, vote, indel_tolerance, subread_number, low_border, high_border);
427 int error_bases ;
428
429 for(i=0; i<16; i++)
430 {
431 if(((key >> ((15-i)*2) )&3) == 3) // if it is 'T' at i-th base
432 {
433 C_poses[availableC++] = i;
434 }
435 }
436
437
438
439 for (error_bases= min(availableC,max_error_bases); error_bases <= min(availableC,max_error_bases); error_bases++)
440 {
441 assert(error_bases < 5);
442 int j;
443 for(i=0; i<error_bases; i++)
444 {
445 error_pos_stack [i] = i;
446 }
447
448 while (1)
449 {
450 char mutation_stack[10];
451 memset(mutation_stack, 0 , error_bases);
452 while(1)
453 {
454 int is_end2 = 1;
455 for (i = error_bases-1; i>=0; i--)
456 if (mutation_stack[i] <1)
457 {
458 int base_to_change=-1;
459 mutation_key = key;
460
461 for (j = 0; j < error_bases; j++)
462 {
463 base_to_change = C_poses[error_pos_stack[j]];
464 int new_value = mutation_stack[j]?2:3; //(C or T)
465 mutation_key = mutation_key & ~(0x3 << (2*(15-base_to_change)));
466 mutation_key = mutation_key | (new_value << (2*(15-base_to_change)));
467 }
468 for (j = i+1; j<error_bases; j++)
469 mutation_stack[j] = 0;
470 is_end2 = 0;
471
472 if(key != mutation_key){
473 int retone;
474 retone = gehash_go_q(the_table, mutation_key, offset, read_len, is_reversed, vote, indel_tolerance, subread_number, low_border, high_border);
475 ret+=retone;
476 }
477 mutation_stack[i] ++;
478 break;
479 }
480 if (is_end2)break;
481 }
482
483
484 int is_end = 1;
485 for (i = error_bases-1; i>=0; i--)
486 if (error_pos_stack[i] < availableC - (error_bases - i))
487 {
488 error_pos_stack[i] ++;
489 for (j = i+1; j<error_bases; j++)
490 error_pos_stack[j] = error_pos_stack[i] + (j-i);
491 is_end = 0;
492 break;
493 }
494
495 if(is_end) break;
496 }
497 }
498 return ret;
499 }
500
501
assign_best_vote(gene_vote_t * vote,int i,int j)502 void assign_best_vote(gene_vote_t * vote, int i, int j)
503 {
504 vote->max_mask = vote->masks[i][j];
505 vote->max_vote = vote->votes[i][j];
506 vote->max_position =vote->pos[i][j];
507 vote->max_coverage_start = vote->coverage_start [i][j];
508 vote->max_coverage_end = vote->coverage_end [i][j];
509 memcpy(vote->max_indel_recorder, vote->indel_recorder[i][j], 3*MAX_INDEL_TOLERANCE * sizeof(*vote->max_indel_recorder));
510 }
511
512
gehash_go_q(gehash_t * the_table,gehash_key_t raw_key,int offset,int read_len,int is_reversed,gene_vote_t * vote,int indel_tolerance,int subread_number,unsigned int low_border,unsigned int high_border)513 size_t gehash_go_q(gehash_t * the_table, gehash_key_t raw_key, int offset, int read_len, int is_reversed, gene_vote_t * vote, int indel_tolerance, int subread_number, unsigned int low_border, unsigned int high_border)
514 {
515 //SUBREADprintf("Q=%u, OFFSET=%d, B=%u ~ %u\n", raw_key, offset, low_border, high_border);
516
517 if(the_table->version_number == SUBINDEX_VER0)
518 {
519 gehash_key_t key = raw_key;
520 struct gehash_bucket * current_bucket;
521 int i=0, items;
522
523 gehash_key_t *current_keys;//, *endp12;
524
525 current_bucket = _gehash_get_bucket (the_table, key);
526 items = current_bucket -> current_items;
527 current_keys = current_bucket -> item_keys;
528
529 if(!items) return 0;
530
531 #define SPEED_UP_DENOMINAOR 3
532 int jump_step = items / SPEED_UP_DENOMINAOR;
533 int last_accepted_index = 0;
534
535 if(jump_step<1) jump_step=1;
536
537 if(key > current_keys[0])
538 {
539 while(1)
540 {
541 while(1)
542 {
543 int next_p = last_accepted_index + jump_step;
544 if(next_p >= items) break;
545 if(current_keys[next_p]>=key) break;
546 last_accepted_index = next_p;
547 }
548 if(jump_step>SPEED_UP_DENOMINAOR)
549 jump_step /= SPEED_UP_DENOMINAOR;
550 else if(jump_step>1)
551 jump_step = 1;
552 else
553 break;
554 }
555 last_accepted_index++;
556 }
557
558 if(current_keys[last_accepted_index]!=key) return 0;
559
560 short offset_from_5 = offset;
561 //short offset_from_5 = is_reversed?(read_len - offset - 16):offset ;
562
563 if(*(current_bucket -> item_values+last_accepted_index) > 0xffff0000) // no position should be greater than this.
564 {
565 // assumed to be non-informative subread.
566 vote -> noninformative_subreads++;
567 return 0;
568 }
569
570 if (indel_tolerance <1)
571 for (; last_accepted_index<items && current_keys[last_accepted_index] == key ; last_accepted_index++)
572 {
573 unsigned int kv = current_bucket->item_values[last_accepted_index] - offset;
574 int offsetX = _index_vote(kv);
575 int datalen = vote -> items[offsetX];
576 unsigned int * dat = vote -> pos[offsetX];
577
578 if (kv > 0xffff0000)
579 continue;
580 for (i=0;i<datalen;i++)
581 {
582 if (dat[i] == kv && (subread_number + 1 > vote -> last_subread_cluster[offsetX][i]))
583 {
584 gene_vote_number_t test_max = (vote->votes[offsetX][i]);
585 test_max += 1;
586 vote->votes[offsetX][i] = test_max;
587 if (offset_from_5 < vote->coverage_start [offsetX][i])
588 vote->coverage_start [offsetX][i] = offset_from_5;
589 if (offset_from_5 +16 > vote->coverage_end [offsetX][i])
590 vote->coverage_end [offsetX][i] = offset_from_5+16;
591
592 vote -> last_subread_cluster[offsetX][i] = subread_number + 1;
593
594 vote->max_vote = max(vote->max_vote , test_max);
595 i = 9999999;
596 }
597 }
598
599 if (i < 9999999 && datalen<GENE_VOTE_SPACE)
600 {
601
602 if (kv < low_border || kv > high_border)
603 continue;
604
605
606 vote -> items[offsetX] ++;
607 dat[i] = kv;
608 vote->votes[offsetX][i]=1;
609 vote->masks[offsetX][i]= (is_reversed?IS_NEGATIVE_STRAND:0);
610 vote->coverage_start [offsetX][i] = offset_from_5;
611 vote->coverage_end [offsetX][i] = offset_from_5+16;
612 vote -> last_subread_cluster[offsetX][i] = subread_number + 1;
613
614 if(vote->max_vote==0)
615 vote->max_vote = 1;
616 }
617 }
618 else
619 {
620 // We duplicated all codes for indel_tolerance >= 1 for the minimal impact to performance.
621 //int ii_end = (indel_tolerance % INDEL_SEGMENT_SIZE)?(indel_tolerance - indel_tolerance%INDEL_SEGMENT_SIZE+INDEL_SEGMENT_SIZE):indel_tolerance;
622
623 for (; last_accepted_index<items && current_keys[last_accepted_index] == key ; last_accepted_index++)
624 {
625 unsigned int kv = current_bucket->item_values[last_accepted_index] - offset;
626 int ii_end = (indel_tolerance % INDEL_SEGMENT_SIZE)?(indel_tolerance - indel_tolerance%INDEL_SEGMENT_SIZE+INDEL_SEGMENT_SIZE):indel_tolerance;
627 int iix;
628 i=0;
629
630 for(iix = 0; iix<=ii_end; iix = iix>0?-iix:(-iix+INDEL_SEGMENT_SIZE))
631 {
632 int offsetX = _index_vote_tol(kv+iix);
633 int datalen = vote -> items[offsetX];
634 if(!datalen)continue;
635
636 unsigned int * dat = vote -> pos[offsetX];
637
638 for (i=0;i<datalen;i++)
639 {
640 int di = dat[i];
641 int dist0 = kv-di;
642 if( dist0 >= -indel_tolerance && dist0 <= indel_tolerance && is_reversed == (0!=(vote -> masks[offsetX][i]&IS_NEGATIVE_STRAND))) {
643 int toli = vote -> toli[offsetX][i];
644
645 if(di >= 46494104 && di <= 46496104 ){
646 SUBREADprintf("VOTES: at %u, subread_no = %d , last_cluster = %d , toli = %d\n", di, subread_number , vote -> last_subread_cluster[offsetX][i] , toli);
647 }
648
649 if( toli > 0 && subread_number + 1 == vote -> last_subread_cluster[offsetX][i] ){
650 int move_dist = 0;
651 if( toli >=3 ) move_dist = vote -> indel_recorder[offsetX][i][toli-3+2];
652 int new_dist = move_dist;
653 move_dist -= vote -> indel_recorder[offsetX][i][toli+2];
654 new_dist -= dist0;
655 if(abs(move_dist) > abs(new_dist)){
656 toli -= 3;
657 vote -> toli[offsetX][i] = toli;
658 vote -> last_subread_cluster[offsetX][i]--;
659 vote -> votes[offsetX][i] --;
660 }
661 }
662 if(subread_number + 1 <= vote -> last_subread_cluster[offsetX][i])
663 continue;
664
665
666 gene_vote_number_t test_max = (vote->votes[offsetX][i]);
667 test_max += 1;
668 vote -> votes[offsetX][i] = test_max;
669
670 /*
671 if (offset_from_5 < vote->coverage_start [offsetX][i])
672 {
673 vote->coverage_start [offsetX][i] = offset_from_5;
674 }*/
675 if (offset_from_5 +16 > vote->coverage_end [offsetX][i])
676 {
677 vote->coverage_end [offsetX][i] = offset_from_5+16;
678 }
679
680
681 if (dist0 != vote->current_indel_cursor[offsetX][i])
682 {
683 toli +=3;
684 if (toli < indel_tolerance*3)
685 {
686 vote -> toli[offsetX][i] = toli;
687 vote -> indel_recorder[offsetX][i][toli] = subread_number+1;
688 vote -> indel_recorder[offsetX][i][toli+1] = subread_number+1;
689 vote -> indel_recorder[offsetX][i][toli+2] = dist0;
690
691 if(toli < indel_tolerance*3-3) vote -> indel_recorder[offsetX][i][toli+3]=0;
692 }
693 vote->current_indel_cursor [offsetX][i] = (char)dist0;
694 }
695 else
696 vote -> indel_recorder[offsetX][i][toli+1] = subread_number+1;
697
698 vote -> last_subread_cluster[offsetX][i] = subread_number + 1;
699
700 vote->max_vote = max(vote->max_vote , test_max);
701 i = 9999999;
702 }
703 }
704 if (i==9999999){
705 break;
706 }
707 }
708
709 if (i < 9999999)
710 {
711 if (kv < low_border || kv > high_border)
712 continue;
713
714 int offsetX2 = _index_vote_tol(kv);
715 int datalen2 = vote -> items[offsetX2];
716 unsigned int * dat2 = vote -> pos[offsetX2];
717
718 if (datalen2<GENE_VOTE_SPACE)
719 {
720 vote -> items[offsetX2] ++;
721 dat2[datalen2] = kv;
722 vote -> masks[offsetX2][datalen2]=(is_reversed?IS_NEGATIVE_STRAND:0);
723 vote -> votes[offsetX2][datalen2]=1;
724 vote -> toli[offsetX2][datalen2]=0;
725 vote -> last_subread_cluster[offsetX2][datalen2] = subread_number + 1;
726
727 // data structure of recorder:
728 // {unsigned char subread_start; unsigned char subread_end, char indel_offset_from_start}
729 // All subread numbers are added with 1 for not being 0.
730
731 vote -> indel_recorder[offsetX2][datalen2][0] = vote -> indel_recorder[offsetX2][datalen2][1] = subread_number+1;
732 vote -> indel_recorder[offsetX2][datalen2][2] = 0;
733 vote -> indel_recorder[offsetX2][datalen2][3] = 0;
734 vote->current_indel_cursor [offsetX2][datalen2] = 0;
735 vote->coverage_start [offsetX2][datalen2] = offset_from_5;
736 vote->coverage_end [offsetX2][datalen2] = offset_from_5+16;
737
738 if (vote->max_vote==0)
739 vote->max_vote = 1;
740 }
741 }
742 }
743 }
744
745 return 1;
746 //return match_end-match_start;
747 }
748 else
749 {
750
751 // VER_1
752 // VER_2
753
754 struct gehash_bucket * current_bucket;
755 int i = 0, items;
756
757 short *current_keys;//, *endp12;
758 short key = raw_key / the_table->buckets_number;
759
760 current_bucket = _gehash_get_bucket (the_table, raw_key);
761 items = current_bucket -> current_items;
762 current_keys = current_bucket -> new_item_keys;
763
764 if(!items) return 0;
765
766 //#warning "======== MAKE SURE THAT '-1' IS CORRECT ============"
767 int imin=0, imax=items - 1;
768 int last_accepted_index;
769
770 while(1)
771 {
772 last_accepted_index=(imin+imax)/2;
773 short current_key = current_keys[last_accepted_index];
774 if(current_key>key)
775 {
776 imax = last_accepted_index - 1;
777 }
778 else if(current_key<key)
779 {
780 imin = last_accepted_index + 1;
781 }
782 else
783 break;
784
785 if(imax<imin)
786 return 0;
787
788 }
789
790 while(last_accepted_index){
791 if(current_keys[last_accepted_index-1] == key) last_accepted_index-=1;
792 else break;
793 }
794
795
796 /*if(*(current_bucket -> item_values+last_accepted_index) > 0xffff0000) // no position should be greater than this.
797 {
798 // assumed to be non-informative subread.
799 vote -> noninformative_subreads++;
800 return 0;
801 }*/
802
803 int subread_number_P1 = subread_number + 1;
804 int of_p_16 = offset + 16;
805
806 {
807 int ii_end = INDEL_SEGMENT_SIZE;
808 if(indel_tolerance>5) ii_end=(indel_tolerance % INDEL_SEGMENT_SIZE)?(indel_tolerance - indel_tolerance%INDEL_SEGMENT_SIZE+INDEL_SEGMENT_SIZE):indel_tolerance;
809
810 for (; last_accepted_index<items && current_keys[last_accepted_index] == key ; last_accepted_index++)
811 {
812 int iix, offsetX2, offsetX, datalen, datalen2;
813 unsigned int kv = current_bucket->item_values[last_accepted_index] - offset;
814 offsetX2 = _index_vote_tol(kv);
815 datalen = vote -> items[offsetX2];
816
817 datalen2 = datalen;
818 offsetX = offsetX2;
819 unsigned int * dat2, *dat;
820 dat = dat2 = vote -> pos[offsetX2];
821
822 //SUBREADprintf("You can find KV at %u\n", kv);
823
824 for(iix = 0; iix<=ii_end; iix = iix>0?-iix:(-iix+INDEL_SEGMENT_SIZE))
825 {
826 if(iix)
827 {
828 offsetX = _index_vote_tol(kv+iix);
829 datalen = vote -> items[offsetX];
830 if(!datalen)continue;
831
832 dat = vote -> pos[offsetX];
833 }else if(!datalen)continue;
834
835 for (i=0;i<datalen;i++)
836 {
837 int dist0 = kv-dat[i];
838
839
840 if( dist0 >= -indel_tolerance && dist0 <= indel_tolerance && is_reversed == (0!= vote->masks[offsetX][i]))
841 {
842
843 // if(di >= 46494104 && di <= 46496104 ){
844 // SUBREADprintf("VOTES: at %u, subread_no = %d , last_cluster = %d , toli = %d\n", di, subread_number , vote -> last_subread_cluster[offsetX][i] , toli);
845 // }
846
847 int toli = vote -> toli[offsetX][i];
848 if( subread_number_P1 == vote -> last_subread_cluster[offsetX][i] && toli >0){
849 int move_dist = 0;
850 if( toli >=3 ) move_dist = vote -> indel_recorder[offsetX][i][toli-3+2];
851 int new_dist = move_dist;
852 move_dist -= vote -> indel_recorder[offsetX][i][toli+2];
853 new_dist -= dist0;
854 if(abs(move_dist) > abs(new_dist)){
855 toli -= 3;
856 vote -> toli[offsetX][i] = toli;
857 vote -> last_subread_cluster[offsetX][i]--;
858 vote -> votes[offsetX][i] --;
859 }
860 }
861
862 if(subread_number_P1 <= vote -> last_subread_cluster[offsetX][i]) continue;
863 gene_vote_number_t test_max = (vote->votes[offsetX][i]);
864 test_max += 1;
865 vote -> votes[offsetX][i] = test_max;
866
867 if (offset +16 > vote->coverage_end [offsetX][i])
868 vote->coverage_end [offsetX][i] = of_p_16;
869
870
871 if (dist0 == vote->current_indel_cursor[offsetX][i])
872 vote -> indel_recorder[offsetX][i][toli+1] = subread_number_P1;
873 else {
874 toli +=3;
875 if (toli < MAX_INDEL_SECTIONS*3)
876 {
877 vote -> toli[offsetX][i] = toli;
878 vote -> indel_recorder[offsetX][i][toli] = subread_number_P1;
879 vote -> indel_recorder[offsetX][i][toli+1] = subread_number_P1;
880 vote -> indel_recorder[offsetX][i][toli+2] = dist0;
881
882 if(toli < MAX_INDEL_SECTIONS*3-3) vote -> indel_recorder[offsetX][i][toli+3]=0;
883 }
884 vote->current_indel_cursor [offsetX][i] = (char)dist0;
885 }
886
887 vote -> last_subread_cluster[offsetX][i] = subread_number_P1;
888 if(vote->max_vote < test_max)vote->max_vote = test_max;
889 i = 9999999;
890 break;
891 }
892 }
893 if (i==9999999)break;
894
895 }
896
897 if (i < 9999999)
898 {
899 if (kv < low_border || kv > high_border)
900 continue;
901
902 if (datalen2<GENE_VOTE_SPACE)
903 {
904 vote -> items[offsetX2] ++;
905 dat2[datalen2] = kv;
906 vote -> masks[offsetX2][datalen2]=(is_reversed?IS_NEGATIVE_STRAND:0);
907 vote -> votes[offsetX2][datalen2]=1;
908 vote -> toli[offsetX2][datalen2]=0;
909
910 // data structure of recorder:
911 // {unsigned char subread_start; unsigned char subread_end, char indel_offset_from_start}
912 // All subread numbers are added with 1 for not being 0.
913
914 vote -> indel_recorder[offsetX2][datalen2][0] = vote -> indel_recorder[offsetX2][datalen2][1] = subread_number_P1;
915 vote -> indel_recorder[offsetX2][datalen2][2] = 0;
916 vote -> indel_recorder[offsetX2][datalen2][3] = 0;
917 vote->current_indel_cursor [offsetX2][datalen2] = 0;
918 vote->coverage_start [offsetX2][datalen2] = offset;
919 vote->coverage_end [offsetX2][datalen2] = of_p_16;
920 vote -> last_subread_cluster[offsetX2][datalen2] = subread_number_P1;
921
922 if (vote->max_vote==0)
923 vote->max_vote = 1;
924 }
925 }
926 else i=0;
927 }
928 }
929 return 1;
930 }
931 }
932
933
934
gehash_go_X(gehash_t * the_table,gehash_key_t raw_key,int offset,int read_len,int is_reversed,gene_vote_t * vote,int indel_tolerance,int subread_number,unsigned int low_border,unsigned int high_border,int run_round,unsigned int * shift_indel_locs,unsigned int * shift_indel_NO)935 size_t gehash_go_X(gehash_t * the_table, gehash_key_t raw_key, int offset, int read_len, int is_reversed, gene_vote_t * vote, int indel_tolerance, int subread_number, unsigned int low_border, unsigned int high_border, int run_round, unsigned int * shift_indel_locs, unsigned int * shift_indel_NO){
936 if(0)if(the_table->version_number == SUBINDEX_VER0){
937 SUBREADprintf("ERROR: the version of the index is too old.\n");
938 assert(the_table->version_number != SUBINDEX_VER0);
939 return -1;
940 }
941
942 // VER_1
943 // VER_2
944
945 struct gehash_bucket * current_bucket;
946 int i = 0, items;
947
948 short *current_keys;//, *endp12;
949 short key = raw_key / the_table->buckets_number;
950
951 current_bucket = _gehash_get_bucket (the_table, raw_key);
952 items = current_bucket -> current_items;
953 current_keys = current_bucket -> new_item_keys;
954
955 if(!items) return 0;
956
957 //#warning "======== MAKE SURE THAT '-1' IS CORRECT ============"
958 int imin=0, imax=items - 1;
959 int last_accepted_index;
960
961 while(1)
962 {
963 last_accepted_index=(imin+imax)/2;
964 short current_key = current_keys[last_accepted_index];
965 if(current_key>key)
966 {
967 imax = last_accepted_index - 1;
968 }
969 else if(current_key<key)
970 {
971 imin = last_accepted_index + 1;
972 }
973 else
974 break;
975
976 if(imax<imin)
977 return 0;
978
979 }
980
981
982 int subread_number_P1 = subread_number + 1;
983 int of_p_16 = offset + 16;
984 is_reversed = is_reversed?IS_NEGATIVE_STRAND:0;
985 int start_scan_idx = last_accepted_index, scan_step = 0;
986
987 {
988 int ii_end = INDEL_SEGMENT_SIZE;
989 if(indel_tolerance>5) ii_end=(indel_tolerance % INDEL_SEGMENT_SIZE)?(indel_tolerance - indel_tolerance%INDEL_SEGMENT_SIZE+INDEL_SEGMENT_SIZE):indel_tolerance;
990
991 //for (; last_accepted_index<items && current_keys[last_accepted_index] == key ; last_accepted_index++){
992 while(1){
993 int iix, offsetX2, offsetX, datalen, datalen2;
994 unsigned int kv = current_bucket->item_values[last_accepted_index] - offset;
995
996 offsetX = offsetX2 = _index_vote_tol(kv);
997 datalen = datalen2 = vote -> items[offsetX];
998
999 unsigned int * dat2, *dat;
1000 dat = dat2 = vote -> pos[offsetX2];
1001 int found_some = 0;
1002
1003 //SUBREADprintf("You can find KV at %u\n", kv);
1004
1005 for(iix = 0; iix<=ii_end; iix = iix>0?-iix:(-iix+INDEL_SEGMENT_SIZE)) {
1006 if(iix) {
1007 offsetX = _index_vote_tol(kv+iix);
1008 datalen = vote -> items[offsetX];
1009 if(!datalen)continue;
1010 dat = vote -> pos[offsetX];
1011 } else if(!datalen) continue;
1012
1013 for (i=0;i<datalen;i++)
1014 {
1015 int dist0 = kv-dat[i];
1016 int applied_indel_tol = ( run_round>0 && vote -> marked_shift_indel[offsetX][i])? 0: indel_tolerance ;
1017 if( dist0 >= -applied_indel_tol && dist0 <= applied_indel_tol && is_reversed == vote->masks[offsetX][i]){
1018 int toli = vote -> toli[offsetX][i];
1019
1020 if(run_round == 0 && toli >0 && dist0 ==0 && ! vote -> marked_shift_indel[offsetX][i]){
1021 vote -> marked_shift_indel[offsetX][i] = 1;
1022 shift_indel_locs[(* shift_indel_NO)++] = dat[i];
1023 }
1024
1025 if( subread_number_P1 == vote -> last_subread_cluster[offsetX][i] && toli >0){
1026 int move_dist = 0;
1027 if( toli >=3 ) move_dist = vote -> indel_recorder[offsetX][i][toli-3+2];
1028 int new_dist = move_dist;
1029 move_dist -= vote -> indel_recorder[offsetX][i][toli+2];
1030 new_dist -= dist0;
1031 if(abs(move_dist) > abs(new_dist)){
1032 toli -= 3;
1033 vote -> toli[offsetX][i] = toli;
1034 vote -> last_subread_cluster[offsetX][i]--;
1035 vote -> votes[offsetX][i] --;
1036 }
1037 }
1038
1039 if(subread_number_P1 <= vote -> last_subread_cluster[offsetX][i]) continue;
1040 gene_vote_number_t test_max = (vote->votes[offsetX][i]);
1041 test_max += 1;
1042 vote -> votes[offsetX][i] = test_max;
1043
1044 if (offset +16 > vote->coverage_end [offsetX][i])
1045 vote->coverage_end [offsetX][i] = of_p_16;
1046
1047 if (dist0 == vote->current_indel_cursor[offsetX][i]){
1048 vote -> indel_recorder[offsetX][i][toli+1] = subread_number_P1;
1049 } else {
1050 toli +=3;
1051 if (toli < MAX_INDEL_SECTIONS*3)
1052 {
1053 vote -> toli[offsetX][i] = toli;
1054 vote -> indel_recorder[offsetX][i][toli] = subread_number_P1;
1055 vote -> indel_recorder[offsetX][i][toli+1] = subread_number_P1;
1056 vote -> indel_recorder[offsetX][i][toli+2] = dist0;
1057
1058 if(toli < MAX_INDEL_SECTIONS*3-3) vote -> indel_recorder[offsetX][i][toli+3]=0;
1059 }
1060 vote->current_indel_cursor [offsetX][i] = (char)dist0;
1061 }
1062
1063 vote -> last_subread_cluster[offsetX][i] = subread_number_P1;
1064 if(vote->max_vote < test_max)vote->max_vote = test_max;
1065 found_some = 1;
1066 break;
1067 }
1068 }
1069 if(found_some) break;
1070 }
1071
1072 if (!found_some) {
1073 if (kv >=low_border && kv <= high_border && datalen2<GENE_VOTE_SPACE) {
1074 vote -> items[offsetX2] ++;
1075 dat2[datalen2] = kv;
1076 vote -> masks[offsetX2][datalen2] = is_reversed;
1077 vote -> votes[offsetX2][datalen2] = 1;
1078 vote -> toli[offsetX2][datalen2] = 0;
1079 vote -> marked_shift_indel[offsetX2][datalen2] = 0;
1080 if(run_round>0){
1081 int kk;
1082 for(kk = 0; kk < * shift_indel_NO ; kk++){
1083 if( kv >= shift_indel_locs[kk] - indel_tolerance && kv <= shift_indel_locs[kk] + indel_tolerance ){
1084 vote -> marked_shift_indel[offsetX2][datalen2] = 1;
1085 break;
1086 }
1087 }
1088 }
1089
1090 // data structure of recorder:
1091 // {unsigned char subread_start; unsigned char subread_end, char indel_offset_from_start}
1092 // All subread numbers are added with 1 for not being 0.
1093
1094 vote -> indel_recorder[offsetX2][datalen2][0] = vote -> indel_recorder[offsetX2][datalen2][1] = subread_number_P1;
1095 vote -> indel_recorder[offsetX2][datalen2][2] = 0;
1096 vote -> indel_recorder[offsetX2][datalen2][3] = 0;
1097 vote->current_indel_cursor [offsetX2][datalen2] = 0;
1098 vote->coverage_start [offsetX2][datalen2] = offset;
1099 vote->coverage_end [offsetX2][datalen2] = of_p_16;
1100 vote -> last_subread_cluster[offsetX2][datalen2] = subread_number_P1;
1101
1102 if (vote->max_vote==0)
1103 vote->max_vote = 1;
1104 }
1105 }
1106
1107 if(! scan_step){
1108 last_accepted_index++;
1109 if(last_accepted_index == items || current_keys[last_accepted_index]!= key){
1110 scan_step = 1;
1111 last_accepted_index = start_scan_idx;
1112 }
1113 }
1114 if(scan_step){
1115 last_accepted_index --;
1116 if(last_accepted_index <0 || current_keys[last_accepted_index]!= key) break;
1117 }
1118 }
1119 }
1120 return 1;
1121 }
1122
1123
1124
1125
select_best_vote(gene_vote_t * vote)1126 void select_best_vote(gene_vote_t * vote)
1127 {
1128 int i,j;
1129 for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
1130 for(j=0; j< vote->items[i]; j++)
1131 {
1132 if(vote -> votes[i][j] == vote->max_vote)
1133 {
1134 vote -> max_position = vote -> pos[i][j];
1135 vote -> max_mask = vote -> masks[i][j];
1136 vote -> max_coverage_start = vote->coverage_start[i][j];
1137 vote -> max_coverage_end = vote->coverage_end[i][j];
1138 }
1139 }
1140 }
1141
indel_recorder_copy(gene_vote_number_t * dst,gene_vote_number_t * src)1142 short indel_recorder_copy(gene_vote_number_t *dst, gene_vote_number_t * src)
1143 {
1144 short all_indels = 0;
1145 // memcpy(dst, src, 3*MAX_INDEL_TOLERANCE); return;
1146
1147
1148 int i=0;
1149 while(src[i] && (i<3*MAX_INDEL_TOLERANCE-2))
1150 {
1151 dst[i] = src[i];
1152 i++;
1153 dst[i] = src[i];
1154 i++;
1155 dst[i] = src[i];
1156 all_indels = dst[i];
1157 i++;
1158 }
1159 dst[i] = 0;
1160 return all_indels;
1161
1162 }
1163
finalise_vote(gene_vote_t * vote)1164 void finalise_vote(gene_vote_t * vote)
1165 {
1166 if(vote->max_tmp_indel_recorder)
1167 {
1168 indel_recorder_copy(vote->max_indel_recorder, vote->max_tmp_indel_recorder);
1169 }
1170 // memcpy(vote->max_indel_recorder, vote->max_tmp_indel_recorder, sizeof(char)*3 * MAX_INDEL_TOLERANCE);
1171 }
1172
gehash_exist(gehash_t * the_table,gehash_key_t key)1173 int gehash_exist(gehash_t * the_table, gehash_key_t key)
1174 {
1175 struct gehash_bucket * current_bucket;
1176 int items;
1177 gehash_key_t * keyp, *endkp;
1178
1179 assert(the_table->version_number == SUBINDEX_VER0);
1180 current_bucket = _gehash_get_bucket (the_table, key);
1181 items = current_bucket -> current_items;
1182
1183 if(items <1)return 0;
1184
1185 keyp = current_bucket -> item_keys;
1186 endkp = keyp + items;
1187
1188 while(1)
1189 {
1190 if(*keyp == key)
1191 return 1;
1192 if(++keyp >= endkp) break;
1193 }
1194 return 0;
1195 }
1196
1197
gehash_update(gehash_t * the_table,gehash_key_t key,gehash_data_t data_new)1198 size_t gehash_update(gehash_t * the_table, gehash_key_t key, gehash_data_t data_new)
1199 {
1200 struct gehash_bucket * current_bucket;
1201 size_t matched;
1202 int items;
1203 gehash_key_t * keyp, *endkp;
1204
1205 assert(the_table->version_number == SUBINDEX_VER0);
1206 current_bucket = _gehash_get_bucket (the_table, key);
1207
1208 matched = 0;
1209 items = current_bucket -> current_items;
1210 keyp = current_bucket -> item_keys;
1211 endkp = keyp + items;
1212
1213 while(1)
1214 {
1215 if(*keyp == key)
1216 {
1217 current_bucket -> item_values[keyp-current_bucket -> item_keys] = data_new;
1218 matched++;
1219 }
1220 keyp +=1;
1221 if(keyp >= endkp)
1222 break;
1223 }
1224 return matched;
1225 }
1226
gehash_get(gehash_t * the_table,gehash_key_t key,gehash_data_t * data_result,size_t max_result_space)1227 size_t gehash_get(gehash_t * the_table, gehash_key_t key, gehash_data_t * data_result, size_t max_result_space)
1228 {
1229 struct gehash_bucket * current_bucket;
1230 size_t matched;
1231 int items;
1232 gehash_key_t * keyp, *endkp;
1233
1234 if(max_result_space<1)
1235 return 0;
1236
1237 assert(the_table->version_number == SUBINDEX_VER0);
1238 current_bucket = _gehash_get_bucket (the_table, key);
1239
1240 matched = 0;
1241 items = current_bucket -> current_items;
1242 if(items<1) return 0;
1243
1244 keyp = current_bucket -> item_keys;
1245 endkp = keyp + items;
1246
1247 while(1)
1248 {
1249 if(*keyp == key)
1250 {
1251 data_result [matched] = current_bucket -> item_values[keyp-current_bucket -> item_keys];
1252 matched +=1;
1253 if(matched >= max_result_space)
1254 break;
1255 }
1256 keyp +=1;
1257 if(keyp >= endkp)
1258 break;
1259 }
1260 return matched;
1261 }
1262
1263
gehash_remove(gehash_t * the_table,gehash_key_t key)1264 size_t gehash_remove(gehash_t * the_table, gehash_key_t key)
1265 {
1266 struct gehash_bucket * current_bucket;
1267 int i;
1268 size_t removed;
1269
1270 assert(the_table->version_number == SUBINDEX_VER0);
1271 current_bucket = _gehash_get_bucket (the_table, key);
1272
1273 if(current_bucket -> current_items < 1)
1274 return 0;
1275
1276 removed = 0;
1277 for(i=0; ; i++)
1278 {
1279 while(current_bucket -> item_keys [i+removed] == key && i+removed < current_bucket -> current_items)
1280 removed += 1;
1281
1282 if(i+removed >= current_bucket -> current_items)
1283 break;
1284
1285 if(removed)
1286 {
1287 current_bucket -> item_keys [i] =
1288 current_bucket -> item_keys [i + removed];
1289
1290 current_bucket -> item_values [i] =
1291 current_bucket -> item_values [i + removed];
1292 }
1293
1294 }
1295
1296 current_bucket -> current_items -= removed;
1297 the_table-> current_items -= removed;
1298
1299 return removed;
1300 }
1301
gehash_get_hpc(gehash_t * the_table,gehash_key_t key,gehash_data_t * data_result,size_t max_result_space)1302 size_t gehash_get_hpc(gehash_t * the_table, gehash_key_t key, gehash_data_t * data_result, size_t max_result_space)
1303 {
1304 return -1;
1305 }
1306
1307
gehash_insert_sorted(gehash_t * the_table,gehash_key_t key,gehash_data_t data)1308 void gehash_insert_sorted(gehash_t * the_table, gehash_key_t key, gehash_data_t data)
1309 {
1310
1311 SUBREADprintf("UNIMPLEMENTED! gehash_insert_sorted \n");
1312 }
1313
1314
1315
1316 // Data Struct of dumpping:
1317 // {
1318 // size_t current_items;
1319 // size_t buckets_number;
1320 // struct
1321 // {
1322 // size_t current_items;
1323 // size_t space_size;
1324 // gehash_key_t item_keys [current_items];
1325 // gehash_data_t item_values [current_items]
1326 // } [buckets_number];
1327 // }
1328 //
1329
load_int32(FILE * fp)1330 unsigned int load_int32(FILE * fp)
1331 {
1332 int ret;
1333 int read_length;
1334 read_length = fread(&ret, sizeof(int), 1, fp);
1335 if(read_length<=0)assert(0);
1336 return ret;
1337 }
1338
load_int64(FILE * fp)1339 srInt_64 load_int64(FILE * fp)
1340 {
1341 srInt_64 ret;
1342 int read_length;
1343 read_length = fread(&ret, sizeof(srInt_64), 1, fp);
1344 if(read_length<=0)assert(0);
1345 return ret;
1346 }
1347
1348
gehash_load_option(const char fname[],int option_no,int * result)1349 int gehash_load_option(const char fname [], int option_no, int * result){
1350 char tabname[MAX_FILE_NAME_LENGTH];
1351 char magic_chars[8];
1352 int found = 0, rrtv;
1353 sprintf(tabname, "%s.00.b.tab", fname);
1354 FILE * fp = f_subr_open(tabname, "rb");
1355 if(fp == NULL){
1356 sprintf(tabname, "%s.00.c.tab", fname);
1357 fp = f_subr_open(tabname, "rb");
1358 }
1359 if(fp){
1360 rrtv = fread(magic_chars,1,8,fp);
1361 if(rrtv < 8) return -1;
1362 if(memcmp(magic_chars, "2subindx",7)==0) {
1363 while(1) {
1364 short option_key, option_length;
1365
1366 rrtv = fread(&option_key, 2, 1, fp);
1367 if(rrtv < 1) return -1;
1368 if(!option_key) break;
1369
1370 rrtv = fread(&option_length, 2, 1, fp);
1371 if(rrtv < 1) return -1;
1372
1373 if(option_key == option_no){
1374 *result = 0;
1375 rrtv = fread(result ,option_length,1,fp);
1376 if(rrtv < 1) return -1;
1377 found = 1;
1378 }
1379 else
1380 fseek(fp, option_length, SEEK_CUR);
1381 }
1382 }
1383 fclose(fp);
1384 }
1385 return found;
1386 }
1387
gehash_load(gehash_t * the_table,const char fname[])1388 int gehash_load(gehash_t * the_table, const char fname [])
1389 {
1390 int i, read_length, rrtv;
1391 char magic_chars[8];
1392 magic_chars[7]=0;
1393
1394 if(0)if(sizeof( size_t ) != 8|| sizeof(long int ) != 8|| sizeof(int ) != 4){
1395 SUBREADprintf("LINT: %zd , INT: %zd , SIZD : %zd\n", sizeof(long int ), sizeof(int ), sizeof(size_t));
1396 //return -1;
1397 }
1398
1399 memset(the_table -> malloc_ptr ,0, sizeof(void*) * GEHASH_MEM_PTR_NO);
1400 the_table -> index_gap = 0;
1401
1402 FILE * fp = f_subr_open(fname, "rb");
1403 if (!fp)
1404 {
1405 SUBREADprintf ("Table file '%s' is not found.\n", fname);
1406 return 1;
1407 }
1408
1409 rrtv = fread(magic_chars,1,8,fp);
1410 if(rrtv !=8){
1411 SUBREADprintf("Error: the index magic string cannot be found. It may contain format errors or file '%s' may be truncated.\n", fname);
1412 assert(rrtv==8);
1413 }
1414
1415 if(memcmp(magic_chars, "2subindx",8)!=0)
1416 {
1417 print_in_box(80,0,0,"");
1418 print_in_box(80,0,0,"WARNING your reference index was built under an old version of subread");
1419 print_in_box(80,0,0," package. Please rebuild the index for the reference genome.");
1420 print_in_box(80,0,0,"");
1421 }
1422 if(memcmp(magic_chars+1, "subindx",7)==0)
1423 {
1424 if('1'==magic_chars[0])
1425 the_table -> version_number = SUBINDEX_VER1;
1426 else if('2'==magic_chars[0])
1427 the_table -> version_number = SUBINDEX_VER2;
1428 else assert(0);
1429
1430 if(SUBINDEX_VER2 == the_table -> version_number)
1431 {
1432 while(1)
1433 {
1434 short option_key, option_length;
1435
1436 rrtv = fread(&option_key, 2, 1, fp);
1437 if(rrtv <1){
1438 SUBREADprintf("Error: the index header cannot be fully loaded. It may contain format errors or file '%s' may be truncated.\n", fname);
1439 return -1;
1440 }
1441 //SUBREADprintf("READOPT_TAB = %04X\n", option_key);
1442
1443 if(!option_key) break;
1444
1445 rrtv = fread(&option_length, 2, 1, fp);
1446 if(rrtv <1){
1447 SUBREADprintf("Error: the index header cannot be fully loaded. It may contain format errors or file '%s' may be truncated.\n", fname);
1448 return -1;
1449 }
1450
1451 rrtv = 999;
1452 if(option_key == SUBREAD_INDEX_OPTION_INDEX_GAP)
1453 rrtv = fread(&(the_table -> index_gap),2,1,fp);
1454 else if (option_key ==SUBREAD_INDEX_OPTION_INDEX_PADDING)
1455 rrtv = fread(&(the_table -> padding),2,1,fp);
1456 else
1457 fseek(fp, option_length, SEEK_CUR);
1458 if(rrtv <1){
1459 SUBREADprintf("Error: the index header cannot be fully loaded. It may contain format errors or file '%s' may be truncated.\n", fname);
1460 return -1;
1461 }
1462 }
1463 assert(the_table -> index_gap);
1464 }
1465 else if(SUBINDEX_VER1 == the_table -> version_number)
1466 the_table -> index_gap = 3;
1467
1468 the_table -> current_items = load_int64(fp);
1469 if(the_table -> current_items < 1 || the_table -> current_items > 0xffffffffllu){
1470 SUBREADputs("ERROR: the index format is unrecognizable.");
1471 assert(the_table -> current_items >0 && the_table -> current_items <=0xffffffffllu);
1472 }
1473
1474 unsigned int * bucket_bytes = malloc(sizeof(int) * GEHASH_MEM_PTR_NO);
1475 memset(bucket_bytes, 0xff, sizeof(int) * GEHASH_MEM_PTR_NO);
1476 the_table -> buckets_number = load_int32(fp);
1477 the_table -> buckets = (struct gehash_bucket * )malloc(sizeof(struct gehash_bucket) * the_table -> buckets_number);
1478 if(!the_table -> buckets)
1479 {
1480 SUBREADputs("Creating buckets.");
1481 SUBREADputs(MESSAGE_OUT_OF_MEMORY);
1482 return 1;
1483 }
1484
1485 srInt_64 fp_curr = ftello(fp);
1486 unsigned int accued_bytes = 0, grp_i = 0, curr_bucks = 0, per_group_bucks = the_table -> buckets_number/ GEHASH_MEM_PTR_NO + 2;
1487
1488 for(i=0; i<the_table -> buckets_number ; i++){
1489 unsigned int current_items = load_int32(fp);
1490 load_int32(fp);//useless for loading : space size
1491 unsigned int current_bytes = current_items*( sizeof(short) + sizeof(gehash_data_t) );
1492 accued_bytes += current_bytes;
1493 curr_bucks ++;
1494 if(curr_bucks >= per_group_bucks){
1495 //SUBREADprintf("Allocating %d : %d buckets : %u\n", grp_i, i, accued_bytes);
1496 bucket_bytes[grp_i++] = accued_bytes;
1497 accued_bytes = 0;
1498 curr_bucks = 0;
1499 }
1500 #ifdef __MINGW32__
1501 {
1502 //if(i%500000 == 0)SUBREADprintf("ESTMHUGE %d/%d\n", i, the_table -> buckets_number);
1503
1504 char * buffkk = malloc(current_bytes );
1505 fread(buffkk,current_bytes ,1, fp);
1506 free(buffkk);
1507 }
1508 #else
1509 fseeko(fp, current_bytes, SEEK_CUR);
1510 #endif
1511 }
1512 if(curr_bucks)
1513 bucket_bytes[grp_i++] = accued_bytes;
1514 fseeko(fp, (off_t)fp_curr, SEEK_SET);
1515
1516 for(i=0; i<GEHASH_MEM_PTR_NO ; i++){
1517
1518 //if(i%50000 == 0)SUBREADprintf("MEMHUGE %d/%d\n", i, the_table -> buckets_number);
1519 unsigned int current_bytes = bucket_bytes[i];
1520 if(current_bytes<0xff000000u){
1521 the_table -> malloc_ptr[i] = malloc(current_bytes);
1522 //SUBREADprintf("Allocating %d buckets : %u\n", i, current_bytes);
1523 if(!the_table -> malloc_ptr[i]){
1524 SUBREADputs(MESSAGE_OUT_OF_MEMORY);
1525 return 1;
1526 }
1527 }
1528 }
1529
1530 grp_i = 0;
1531 curr_bucks = 0;
1532 accued_bytes = 0;
1533 for(i=0; i<the_table -> buckets_number ; i++){
1534
1535 //if(i%50000 == 0)SUBREADprintf("FILLHUGE %d/%d\n", i, the_table -> buckets_number);
1536
1537 struct gehash_bucket * current_bucket = the_table -> buckets+i;
1538 current_bucket -> current_items = load_int32(fp);
1539 load_int32(fp);//useless for lo: space size
1540 current_bucket -> space_size = current_bucket -> current_items;
1541 unsigned int current_bytes = current_bucket -> current_items*( sizeof(short) + sizeof(gehash_data_t) );
1542
1543 current_bucket -> new_item_keys = (short*)(the_table -> malloc_ptr[grp_i] + accued_bytes);
1544 current_bucket -> item_values = (gehash_data_t*)(the_table -> malloc_ptr[grp_i] + accued_bytes + current_bucket -> current_items * sizeof(short));
1545 read_length = fread(current_bucket -> new_item_keys, sizeof(short), current_bucket -> current_items, fp);
1546 read_length += fread(current_bucket -> item_values, sizeof(gehash_data_t), current_bucket -> current_items, fp);
1547 if(read_length < 2* current_bucket -> current_items){
1548 SUBREADprintf("ERROR: the index is incomplete.\n");
1549 return 1;
1550 }
1551
1552 accued_bytes += current_bytes;
1553 curr_bucks ++;
1554 if(curr_bucks >= per_group_bucks){
1555 curr_bucks=0;
1556 accued_bytes=0;
1557 grp_i++;
1558 }
1559 }
1560
1561 int rval = fread(&(the_table -> is_small_table), sizeof(char), 1, fp);
1562 if (rval != 1)SUBREADprintf("ERROR: cannot find the table table.\n");
1563 free(bucket_bytes);
1564 fclose(fp);
1565 return 0;
1566
1567 }
1568 else
1569 {
1570 fclose(fp);
1571 the_table -> index_gap = 3;
1572 fp = f_subr_open(fname, "rb");
1573 the_table -> version_number = SUBINDEX_VER0;
1574 the_table -> current_items = load_int64(fp);
1575 the_table -> buckets_number = load_int32(fp);
1576 the_table -> buckets = (struct gehash_bucket * )malloc(sizeof(struct gehash_bucket) * the_table -> buckets_number);
1577 if(!the_table -> buckets)
1578 {
1579 SUBREADputs(MESSAGE_OUT_OF_MEMORY);
1580 return 1;
1581 }
1582
1583 for (i=0; i<the_table -> buckets_number; i++)
1584 {
1585 struct gehash_bucket * current_bucket = &(the_table -> buckets[i]);
1586 current_bucket -> current_items = load_int32(fp);
1587 current_bucket -> space_size = load_int32(fp);
1588 current_bucket -> space_size = current_bucket -> current_items;
1589 current_bucket -> item_keys = (gehash_key_t *) malloc ( sizeof(gehash_key_t) * current_bucket -> space_size);
1590 current_bucket -> item_values = (gehash_data_t *) malloc ( sizeof(gehash_data_t) * current_bucket -> space_size);
1591
1592 if(!(current_bucket -> item_keys&¤t_bucket -> item_values))
1593 {
1594 SUBREADputs(MESSAGE_OUT_OF_MEMORY);
1595 return 1;
1596
1597 }
1598
1599 if(current_bucket -> current_items > 0)
1600 {
1601 read_length = fread(current_bucket -> item_keys, sizeof(gehash_key_t), current_bucket -> current_items, fp);
1602 if(read_length < current_bucket -> current_items){
1603 SUBREADprintf("ERROR: the index is incomplete.\n");
1604 return 1;
1605 }
1606 read_length = fread(current_bucket -> item_values, sizeof(gehash_data_t), current_bucket -> current_items, fp);
1607 if(read_length < current_bucket -> current_items){
1608 SUBREADprintf("ERROR: the index is incomplete.\n");
1609 return 1;
1610 }
1611 }
1612
1613 }
1614
1615 read_length = fread(&(the_table -> is_small_table), sizeof(char), 1, fp);
1616 if(read_length!=1){
1617 SUBREADprintf("ERROR: the index is incomplete.\n");
1618 return 1;
1619 }
1620 fclose(fp);
1621 return 0;
1622 }
1623 }
1624
1625
gehash_sort(gehash_t * the_table)1626 void gehash_sort(gehash_t * the_table)
1627 {
1628 int i;
1629 for (i=0; i<the_table -> buckets_number; i++)
1630 {
1631 struct gehash_bucket * current_bucket = &(the_table -> buckets[i]);
1632 int ii, jj;
1633 gehash_key_t tmp_key;
1634 gehash_data_t tmp_val;
1635
1636 if(current_bucket -> current_items>=1)
1637 {
1638 for(ii=0;ii<current_bucket -> current_items -1; ii++)
1639 {
1640 for (jj = ii+1; jj < current_bucket -> current_items; jj++)
1641 {
1642 if (current_bucket -> item_keys[ii] > current_bucket -> item_keys[jj])
1643 {
1644 tmp_key = current_bucket -> item_keys[ii];
1645 current_bucket -> item_keys[ii] = current_bucket -> item_keys[jj];
1646 current_bucket -> item_keys[jj] = tmp_key;
1647
1648 tmp_val = current_bucket -> item_values[ii];
1649 current_bucket -> item_values[ii] = current_bucket -> item_values[jj];
1650 current_bucket -> item_values[jj] = tmp_val;
1651 }
1652 }
1653 }
1654 }
1655 }
1656 }
1657
write_cell(short option_key,short option_length,char * option_value,FILE * fp)1658 void write_cell(short option_key, short option_length, char * option_value, FILE * fp)
1659 {
1660 fwrite(&option_key, 2, 1, fp);
1661 fwrite(&option_length, 2, 1, fp);
1662 fwrite(option_value, option_length, 1, fp);
1663 }
1664
1665
write_options(FILE * fp,gehash_t * the_table)1666 void write_options(FILE * fp, gehash_t * the_table)
1667 {
1668 short option_key, option_length;
1669 short option_value;
1670
1671 option_key = SUBREAD_INDEX_OPTION_INDEX_PADDING;
1672 option_length = 2;
1673 option_value = the_table -> padding;
1674
1675 write_cell(option_key, option_length, (char *) &option_value,fp);
1676
1677 option_key = SUBREAD_INDEX_OPTION_INDEX_GAP;
1678 option_length = 2;
1679 option_value = the_table -> index_gap;
1680
1681 write_cell(option_key, option_length, (char *) &option_value,fp);
1682
1683 option_key = 0;
1684 fwrite(&option_key, 2, 1, fp);
1685 }
1686
is_1_greater_than_2(int bucket_i,int all_buckets,short k1,gehash_data_t v1,short k2,gehash_data_t v2)1687 int is_1_greater_than_2(int bucket_i, int all_buckets , short k1, gehash_data_t v1, short k2, gehash_data_t v2){
1688 if(k1 > k2) return 1;
1689 if(k1 == k2){
1690 unsigned int real_key = 1u*k1*1u*all_buckets;
1691 real_key += bucket_i;
1692 if((real_key%791)%2==0 && v1>v2) return 1;
1693 if((real_key%791)%2==1 && v1<v2) return 1;
1694 }
1695 return 0;
1696 }
1697
gehash_dump(gehash_t * the_table,const char fname[])1698 int gehash_dump(gehash_t * the_table, const char fname [])
1699 {
1700 int ii, jj, xx;
1701 int i, scroll_counter = 0;
1702 FILE * fp = f_subr_open(fname, "wb");
1703 int maximum_bucket_size = 0;
1704 if (!fp)
1705 {
1706 SUBREADprintf ("Table file '%s' is not able to open.\n", fname);
1707 return -1;
1708 }
1709
1710 if(the_table->version_number == SUBINDEX_VER2)
1711 {
1712 fwrite("2subindx",1,8,fp);
1713 write_options(fp, the_table);
1714 }
1715
1716 assert(sizeof(srInt_64 ) == 8);
1717 assert(sizeof(int ) == 4);
1718 fwrite(& (the_table -> current_items ), sizeof(srInt_64), 1, fp);
1719 fwrite(& (the_table -> buckets_number), sizeof(int), 1, fp);
1720
1721 print_in_box(80,0,0,"Save current index block... ");
1722
1723 for (i=0; i<the_table -> buckets_number; i++)
1724 {
1725 struct gehash_bucket * current_bucket = &(the_table -> buckets[i]);
1726 if(current_bucket -> current_items > maximum_bucket_size)
1727 maximum_bucket_size = current_bucket -> current_items;
1728 }
1729
1730 #define SORT_LANE_NUMBER 16
1731 short * sort_space_new_key[SORT_LANE_NUMBER];
1732 gehash_data_t * sort_space_data [SORT_LANE_NUMBER];
1733 int items_in_sort[SORT_LANE_NUMBER] ;
1734 int items_in_merge[SORT_LANE_NUMBER] ;
1735 if(the_table->version_number > SUBINDEX_VER0)
1736 {
1737 for(xx=0;xx<SORT_LANE_NUMBER;xx++)
1738 {
1739 sort_space_new_key[xx] = (short *)malloc(sizeof(short)*(maximum_bucket_size/SORT_LANE_NUMBER+2));
1740 sort_space_data[xx] = (gehash_data_t *)malloc(sizeof(gehash_data_t)*(maximum_bucket_size/SORT_LANE_NUMBER+2));
1741 }
1742 }
1743
1744 for (i=0; i<the_table -> buckets_number; i++)
1745 {
1746 struct gehash_bucket * current_bucket = &(the_table -> buckets[i]);
1747 gehash_data_t tmp_val=0;
1748
1749 if(i % (the_table -> buckets_number/10) == 0)
1750 print_window_scrolling_bar("", 1.0*i/the_table -> buckets_number, 74, &scroll_counter);
1751
1752 if(current_bucket -> current_items>=1)
1753 {
1754 if(the_table->version_number == SUBINDEX_VER0)
1755 {
1756 for(ii=0;ii<current_bucket -> current_items -1; ii++)
1757 {
1758 gehash_key_t tmp_key;
1759 for (jj = ii+1; jj < current_bucket -> current_items; jj++)
1760 {
1761
1762 if(is_1_greater_than_2(i, the_table -> buckets_number, current_bucket -> item_keys[ii], current_bucket -> item_values[ii],
1763 current_bucket -> item_keys[jj], current_bucket -> item_values[jj] )) {
1764 tmp_key = current_bucket -> item_keys[ii];
1765 current_bucket -> item_keys[ii] = current_bucket -> item_keys[jj];
1766 current_bucket -> item_keys[jj] = tmp_key;
1767
1768 tmp_val = current_bucket -> item_values[ii];
1769 current_bucket -> item_values[ii] = current_bucket -> item_values[jj];
1770 current_bucket -> item_values[jj] = tmp_val;
1771 }
1772 }
1773 }
1774 }
1775 else
1776 {
1777 if(1){
1778 memset(items_in_sort, 0, sizeof(int)*SORT_LANE_NUMBER);
1779 for(ii=0;ii<current_bucket -> current_items;ii++)
1780 {
1781 int sort_lane_no = ii%SORT_LANE_NUMBER;
1782 int xx_th_item = items_in_sort[sort_lane_no] ++;
1783 sort_space_new_key[sort_lane_no][xx_th_item] = current_bucket -> new_item_keys[ii];
1784 sort_space_data[sort_lane_no][xx_th_item] = current_bucket -> item_values[ii];
1785 }
1786
1787 // sort the individual lanes.
1788 for(xx=0;xx<SORT_LANE_NUMBER;xx++)
1789 {
1790 for(ii=0;ii<items_in_sort[xx]-1; ii++)
1791 {
1792 for(jj = ii+1; jj < items_in_sort[xx]; jj++)
1793 {
1794 short tmp_key;
1795 if(is_1_greater_than_2(i, the_table -> buckets_number, sort_space_new_key[xx][ii], sort_space_data[xx][ii], sort_space_new_key[xx][jj], sort_space_data[xx][jj] )) {
1796 tmp_key = sort_space_new_key[xx][ii];
1797 sort_space_new_key[xx][ii] = sort_space_new_key[xx][jj];
1798 sort_space_new_key[xx][jj] = tmp_key;
1799
1800 tmp_val = sort_space_data[xx][ii];
1801 sort_space_data[xx][ii]=sort_space_data[xx][jj];
1802 sort_space_data[xx][jj]=tmp_val;
1803 }
1804 }
1805 }
1806 }
1807
1808 // items_in_sort : array, each element for a lane
1809 // items_in_merge : how many items in each lane that have been merged (ie, read_ptr for each lane)
1810 // sort_space_new_key : N lanes each has been sorted, each has many keys (ie subreads)
1811 // sort_space_data : N lanes, each has many values (ie positions)
1812 // current_bucket -> new_item_keys : One array containing sorted keys (write_ptr is ii)
1813 // current_bucket -> item_values: One array containing sorted values (write_ptr is ii))
1814 memset(items_in_merge, 0, sizeof(int)*SORT_LANE_NUMBER);
1815
1816 // tmp_key : the current key being examed. A larger key is put AFTER a smaller key.
1817 for(ii=0;ii<current_bucket -> current_items;ii++)
1818 {
1819 int tmp_key=0x7fffffff;
1820 int selected_lane = 0, trying_lane;
1821 for(trying_lane=0;trying_lane<SORT_LANE_NUMBER;trying_lane++)
1822 {
1823 int ii_in_trying_lane = items_in_merge[trying_lane];
1824 if(ii_in_trying_lane >= items_in_sort[trying_lane]) continue;
1825
1826 if(tmp_key >= 0x10000 ||is_1_greater_than_2(
1827 i, the_table -> buckets_number,
1828 sort_space_new_key[selected_lane][items_in_merge[selected_lane]],
1829 sort_space_data[selected_lane][items_in_merge[selected_lane]],
1830 sort_space_new_key[trying_lane][items_in_merge[trying_lane]],
1831 sort_space_data[trying_lane][items_in_merge[trying_lane]]
1832 )){
1833 selected_lane=trying_lane;
1834 tmp_key = sort_space_new_key[trying_lane][ii_in_trying_lane];
1835 }
1836 }
1837
1838 current_bucket -> new_item_keys[ii] = sort_space_new_key[selected_lane][items_in_merge[selected_lane]];
1839 current_bucket -> item_values[ii] = sort_space_data[selected_lane][items_in_merge[selected_lane]];
1840
1841 items_in_merge[selected_lane]++;
1842 //printf("V [%d] [%d] =%d\n",i , ii, tmp_key);
1843 }
1844 }
1845 }
1846 }
1847
1848
1849 if(0) // debug
1850 {
1851 int inidx=0;
1852 srInt_64 real_old = -1;
1853 for(xx = 0; xx < current_bucket -> current_items; xx++){
1854 unsigned int real_key = 1u * current_bucket -> new_item_keys[xx] *1u * the_table -> buckets_number;
1855 real_key += i;
1856
1857 if(real_old != real_key) inidx=0;
1858
1859 if(1||real_key >= 0xffffff00 || real_key <5000)
1860 SUBREADprintf("SUBREAD_%010u POS_%010u INIDX_%04d BUCKET_%04d_ITEM_%08d\n",real_key ,current_bucket -> item_values[xx], inidx, i, xx);
1861 real_old = real_key;
1862 inidx++;
1863 }
1864 }
1865
1866 int is_full = 0;
1867 int write_len = fwrite(& (current_bucket -> current_items), sizeof(int), 1, fp);
1868 if(write_len<1) is_full = 1;
1869 write_len = fwrite(& (current_bucket -> space_size), sizeof(int), 1, fp);
1870 if(write_len<1) is_full = 1;
1871
1872 if(the_table->version_number == SUBINDEX_VER0)
1873 fwrite(current_bucket -> item_keys, sizeof(gehash_key_t), current_bucket -> current_items, fp);
1874 else{
1875 write_len = fwrite(current_bucket -> new_item_keys, sizeof(short), current_bucket -> current_items, fp);
1876 if(write_len < current_bucket -> current_items) is_full = 1;
1877 }
1878 write_len = fwrite(current_bucket -> item_values, sizeof(gehash_data_t), current_bucket -> current_items, fp);
1879 if(write_len < current_bucket -> current_items) is_full = 1;
1880 if(is_full){
1881 fclose(fp);
1882 SUBREADprintf("ERROR: Unable to write into the output file. Please check the disk space in the output directory.\n");
1883 return 1;
1884 }
1885 }
1886
1887 if(the_table->version_number > SUBINDEX_VER0)
1888 {
1889 for(xx=0;xx<SORT_LANE_NUMBER;xx++)
1890 {
1891 free(sort_space_new_key[xx]);
1892 free(sort_space_data[xx]);
1893 }
1894 }
1895
1896
1897 int write_len = fwrite(&(the_table -> is_small_table), sizeof(char), 1, fp);
1898 fclose(fp);
1899
1900 if(write_len < 1){
1901 SUBREADprintf("ERROR: Unable to write into the output file. Please check the disk space in the output directory.\n");
1902 return 1;
1903 }
1904 print_in_box(80,0,0,"");
1905 return 0;
1906 }
1907
gehash_destory(gehash_t * the_table)1908 void gehash_destory(gehash_t * the_table)
1909 {
1910 int i, is_ptr = 0;
1911
1912 for(i = 0; i < GEHASH_MEM_PTR_NO; i++) if(the_table -> malloc_ptr[i]){
1913 free(the_table -> malloc_ptr[i]);
1914 is_ptr=1;
1915 }
1916
1917 if(!is_ptr) for (i=0; i<the_table -> buckets_number; i++)
1918 {
1919 struct gehash_bucket * current_bucket = &(the_table -> buckets[i]);
1920 if (current_bucket -> space_size > 0)
1921 {
1922 if(0==the_table->free_item_only)free (current_bucket -> item_keys);
1923 free (current_bucket -> item_values);
1924 }
1925 }
1926
1927 free (the_table -> buckets);
1928
1929 the_table -> current_items = 0;
1930 the_table -> buckets_number = 0;
1931 }
1932