1 /***************************************************************
2
3 The Subread software package is free software package:
4 you can redistribute it and/or modify it under the terms
5 of the GNU General Public License as published by the
6 Free Software Foundation, either version 3 of the License,
7 or (at your option) any later version.
8
9 Subread is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty
11 of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12
13 See the GNU General Public License for more details.
14
15 Authors: Drs Yang Liao and Wei Shi
16
17 ***************************************************************/
18
19
20 #include <stdlib.h>
21 #include <math.h>
22 #include <signal.h>
23 #include <dirent.h>
24 #include <string.h>
25 #include <assert.h>
26 #include <unistd.h>
27 #include "hashtable.h"
28 #include "HelperFunctions.h"
29 #include "gene-value-index.h"
30 #include "gene-algorithms.h"
31 #include "input-files.h"
32 #include "core.h"
33 #include "core-indel.h"
34 #include "core-junction.h"
35 #include "core-bigtable.h"
36 #include "sublog.h"
37
38
39 //#define indel_debug
40 //#define is_target_window_X(x) 0
41
42 int DIGIN_COUNTER=0;
43 int FRESH_COUNTER=0;
44 extern unsigned int BASE_BLOCK_LENGTH;
45
is_bad_read(char * r,int len,char * w)46 void is_bad_read(char * r, int len, char * w)
47 {
48 }
49
merge_event_sides(void * arr,int start,int items,int items2)50 void merge_event_sides(void * arr, int start, int items, int items2)
51 {
52 void ** arrl = (void **)arr;
53 chromosome_event_t * event_space = (chromosome_event_t*) arrl[1];
54 int * event_id_list = (int *)arrl[0];
55 int is_small_side = (arrl[2]!=NULL);
56
57 int * merge_tmp = malloc(sizeof(int)* (items+items2));
58 int x1, first_cursor = start, second_cursor = start+items;
59
60 for(x1=0;x1<items+items2;x1++)
61 {
62 int choose_2 = 0;
63 if(first_cursor >= start+items)
64 choose_2 = 1;
65 else if(second_cursor < start+items+items2)
66 {
67
68 chromosome_event_t * levent = event_space + event_id_list[first_cursor];
69 chromosome_event_t * revent = event_space + event_id_list[second_cursor];
70
71 if(is_small_side){
72 if(levent->event_small_side > revent ->event_small_side)
73 choose_2 = 1;
74 }else
75 {
76 if(levent->event_large_side > revent ->event_large_side)
77 choose_2 = 1;
78 }
79
80 }
81
82 if(choose_2)
83 merge_tmp[x1] = event_id_list[second_cursor++];
84 else
85 merge_tmp[x1] = event_id_list[first_cursor++];
86
87 }
88
89 memcpy(event_id_list + start, merge_tmp, sizeof(int)* (items+items2));
90
91 free(merge_tmp);
92 }
93
exchange_event_sides(void * arr,int l,int r)94 void exchange_event_sides(void * arr, int l, int r)
95 {
96 void ** arrl = (void **)arr;
97 int * event_id_list = (int *)arrl[0];
98
99 int tmp ;
100 tmp = event_id_list[l];
101 event_id_list[l] = event_id_list[r];
102 event_id_list[r] = tmp;
103
104 }
105
compare_event_inner(chromosome_event_t * event_space,int l,int r,int is_small_side)106 int compare_event_inner(chromosome_event_t * event_space, int l, int r, int is_small_side)
107 {
108
109 chromosome_event_t * revent = event_space +r;
110 chromosome_event_t * levent = event_space +l;
111 if(is_small_side){
112 if(levent->event_small_side <revent ->event_small_side) return -1;
113 else if(levent->event_small_side ==revent ->event_small_side) return 0;
114 else return 1;
115 }else
116 {
117 if(levent->event_large_side <revent ->event_large_side) return -1;
118 else if(levent->event_large_side ==revent ->event_large_side) return 0;
119 else return 1;
120 }
121
122
123 }
124
compare_event_sides(void * arr,int l,int r)125 int compare_event_sides(void * arr, int l, int r)
126 {
127 void ** arrl = (void **)arr;
128 chromosome_event_t * event_space = (chromosome_event_t*) arrl[1];
129 int * event_id_list = (int *)arrl[0];
130 int is_small_side = (arrl[2]!=NULL);
131
132 return compare_event_inner(event_space, event_id_list[l], event_id_list[r], is_small_side);
133 }
134
BINsearch_event(chromosome_event_t * event_space,int * event_ids,int is_small_side,unsigned int pos,int events)135 int BINsearch_event(chromosome_event_t * event_space, int * event_ids, int is_small_side, unsigned int pos, int events)
136 {
137 int imax, imin;
138 assert(events > 0);
139 imax = events - 1; imin = 0;
140
141 while (1)
142 {
143 int imid = (imin+ imax)/2;
144 unsigned tested_mid_pos;
145 //assert(imax<events);
146 //assert(imin<events);
147 //assert(imid<events);
148 assert(event_ids[imid]<events);
149 chromosome_event_t * mid_event = event_space+event_ids[imid];
150 tested_mid_pos = is_small_side?mid_event->event_small_side:mid_event->event_large_side;
151
152 if(tested_mid_pos == pos)
153 return imid;
154 else if(tested_mid_pos < pos)
155 imin = imid +1;
156 else imax = imid -1;
157
158 if(imin > imax)
159 return imax;
160 }
161 }
162
163
164 typedef struct{
165 int thread_id;
166 int block_start;
167 int block_end;
168 HashTable * result_tab;
169 int * small_side_ordered_event_ids, * large_side_ordered_event_ids;
170 chromosome_event_t * event_space;
171 global_context_t * global_context;
172 } AT_context_t;
173
174
175 #define ANTI_SUPPORTING_READ_LIMIT 100
176
anti_support_thread_run(void * v_cont)177 void * anti_support_thread_run(void * v_cont){
178 AT_context_t * atcont = v_cont;
179
180 int * cancelled_event_list = malloc(sizeof(int)*ANTI_SUPPORTING_READ_LIMIT), x2;
181 int * small_side_ordered_event_ids = atcont -> small_side_ordered_event_ids, * large_side_ordered_event_ids = atcont -> large_side_ordered_event_ids;
182 global_context_t * global_context = atcont -> global_context;
183 chromosome_event_t * event_space = atcont -> event_space;
184 indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
185 if(indel_context->total_events<1)return NULL;
186
187 int current_read_number, cancelled_events, x1;
188
189 for(current_read_number = atcont -> block_start; current_read_number < atcont -> block_end; current_read_number++)
190 {
191 int is_second_read;
192 for (is_second_read = 0; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read ++)
193 {
194 int best_read_id;
195 for(best_read_id = 0; best_read_id < global_context -> config.multi_best_reads; best_read_id++)
196 {
197 mapping_result_t *current_result = _global_retrieve_alignment_ptr(global_context, current_read_number, is_second_read, best_read_id);
198 if(current_result -> selected_votes<1) break;
199 if(!global_context->config.report_multi_mapping_reads)if(current_result -> result_flags & CORE_IS_BREAKEVEN) continue;
200 //if(current_result -> result_flags & CORE_IS_GAPPED_READ) continue;
201 if(current_result->selected_votes < global_context->config.minimum_subread_for_first_read)
202 continue;
203
204 cancelled_events=0;
205 unsigned int mapping_head = current_result -> selected_position;
206 unsigned int coverage_start = mapping_head + current_result -> confident_coverage_start;
207 unsigned int coverage_end = mapping_head + current_result -> confident_coverage_end;
208
209 int small_side_left_event = BINsearch_event(event_space, small_side_ordered_event_ids,1, coverage_start - 1, indel_context->total_events) + 1;
210 int large_side_left_event = BINsearch_event(event_space, large_side_ordered_event_ids,0, coverage_start - 1, indel_context->total_events) + 1;
211
212 int small_side_right_event = BINsearch_event(event_space, small_side_ordered_event_ids,1, coverage_end, indel_context->total_events)+20;
213 int large_side_right_event = BINsearch_event(event_space, large_side_ordered_event_ids,0, coverage_end, indel_context->total_events)+20;
214
215 for(x1 = small_side_left_event ; x1 <= small_side_right_event ; x1++)
216 {
217 if(x1 >= indel_context->total_events) break;
218 if(ANTI_SUPPORTING_READ_LIMIT <= cancelled_events) break;
219 chromosome_event_t * event_body = event_space+ small_side_ordered_event_ids[x1];
220 if(event_body -> event_small_side <= coverage_start + 5) continue;
221 if(event_body -> event_small_side >= coverage_end - 5) continue;
222
223 long long cur_count = HashTableGet(atcont -> result_tab , NULL+small_side_ordered_event_ids[x1]+1 ) - NULL;
224 cur_count++;
225 HashTablePut( atcont -> result_tab , NULL+small_side_ordered_event_ids[x1]+1 , NULL+cur_count );
226
227 cancelled_event_list[cancelled_events++] = small_side_ordered_event_ids[x1];
228 }
229
230 for(x1 = large_side_left_event ; x1 <= large_side_right_event ; x1++)
231 {
232 if(x1 >= indel_context->total_events) break;
233 chromosome_event_t * event_body = event_space+ large_side_ordered_event_ids[x1];
234 if(event_body -> event_large_side <= coverage_start + 5) continue;
235 if(event_body -> event_large_side >= coverage_end- 5) continue;
236
237 int to_be_add = 1;
238 for(x2=0; x2<cancelled_events; x2++)
239 if(cancelled_event_list[x2] == large_side_ordered_event_ids[x1])
240 {
241 to_be_add = 0;
242 break;
243 }
244
245 if(to_be_add){
246 // printf("OCT27-ANTISUP-READ @ %u has LARGE @ %u~%u, INDELS = %d, ASUP = %d\n", coverage_end, event_body -> event_small_side, event_body -> event_large_side, event_body -> indel_length, event_body -> anti_supporting_reads);
247 long long cur_count = HashTableGet(atcont -> result_tab , NULL+large_side_ordered_event_ids[x1]+1 ) - NULL;
248 cur_count++;
249 HashTablePut( atcont -> result_tab , NULL+large_side_ordered_event_ids[x1]+1 , NULL+cur_count );
250 }
251 }
252 }
253 }
254 }
255
256 free(cancelled_event_list);
257 return NULL;
258 }
259
260
anti_support_add_count(void * ky,void * va,HashTable * tab)261 void anti_support_add_count(void * ky, void * va, HashTable * tab){
262 chromosome_event_t * event_space = tab -> appendix1 ;
263 int eno = ky-NULL-1;
264 chromosome_event_t * event_body = event_space + eno;
265 event_body -> anti_supporting_reads+= (va - NULL);
266 }
267
anti_supporting_read_scan(global_context_t * global_context)268 int anti_supporting_read_scan(global_context_t * global_context)
269 {
270
271 indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
272 if(indel_context->total_events<1)return 0;
273
274 chromosome_event_t * event_space = indel_context -> event_space_dynamic;
275
276 int x1, * small_side_ordered_event_ids, * large_side_ordered_event_ids;
277
278 small_side_ordered_event_ids = malloc(sizeof(int)*indel_context -> total_events);
279 large_side_ordered_event_ids = malloc(sizeof(int)*indel_context -> total_events);
280
281 for(x1=0; x1<indel_context->total_events; x1++) {
282 small_side_ordered_event_ids[x1]=x1;
283 large_side_ordered_event_ids[x1]=x1;
284 }
285
286 void * sort_data[3];
287 sort_data[0] = small_side_ordered_event_ids;
288 sort_data[1] = event_space;
289 sort_data[2] = (void *)0xffff; // small_side_ordering
290
291 merge_sort(sort_data, indel_context->total_events, compare_event_sides, exchange_event_sides, merge_event_sides);
292
293 sort_data[0] = large_side_ordered_event_ids;
294 sort_data[2] = NULL; // large_side_ordering
295
296 merge_sort(sort_data, indel_context->total_events, compare_event_sides, exchange_event_sides, merge_event_sides);
297
298
299 AT_context_t ATconts[64];
300 pthread_t AThreads[64];
301
302 int thread_no, last_end = 0;
303 for(thread_no=0; thread_no< global_context -> config.all_threads; thread_no++){
304 AT_context_t * atc = ATconts + thread_no;
305 atc -> thread_id = thread_no;
306 atc -> block_start = last_end;
307 atc -> block_end = last_end = global_context -> processed_reads_in_chunk / global_context -> config.all_threads * thread_no;
308 if(thread_no == global_context -> config.all_threads-1) atc -> block_end = global_context -> processed_reads_in_chunk;
309
310 atc -> global_context = global_context;
311 atc -> result_tab = HashTableCreate(200000);
312 atc -> small_side_ordered_event_ids = small_side_ordered_event_ids;
313 atc -> large_side_ordered_event_ids = large_side_ordered_event_ids;
314 atc -> event_space = event_space;
315
316 pthread_create(AThreads + thread_no , NULL, anti_support_thread_run, atc);
317 }
318
319 for(thread_no=0; thread_no< global_context -> config.all_threads; thread_no++){
320 pthread_join(AThreads[thread_no], NULL);
321 ATconts[thread_no].result_tab -> appendix1 = event_space;
322 HashTableIteration( ATconts[thread_no].result_tab, anti_support_add_count );
323 HashTableDestroy(ATconts[thread_no].result_tab);
324 }
325
326 free(small_side_ordered_event_ids);
327 free(large_side_ordered_event_ids);
328
329 return 0;
330 }
331
reallocate_event_space(global_context_t * global_context,thread_context_t * thread_context,int event_no)332 chromosome_event_t * reallocate_event_space( global_context_t* global_context,thread_context_t * thread_context,int event_no)
333 {
334 int max_event_no;
335
336 if(thread_context) {
337 max_event_no = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> current_max_event_number;
338 if(max_event_no<=event_no)
339 {
340 ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> current_max_event_number *= 1.6;
341 ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic =
342 realloc(((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic, sizeof(chromosome_event_t) *
343 ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> current_max_event_number);
344 }
345 return ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
346 } else {
347 max_event_no = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> current_max_event_number;
348 if(max_event_no<=event_no)
349 {
350 ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> current_max_event_number *= 1.6;
351 ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic =
352 realloc(((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic, sizeof(chromosome_event_t) *
353 ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> current_max_event_number);
354 }
355 return ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
356 }
357
358 }
359
is_ambiguous_indel_score(chromosome_event_t * e)360 int is_ambiguous_indel_score(chromosome_event_t * e)
361 {
362 return 0;
363 //if(e -> indel_length == 1 && e->is_ambiguous>4)return 1;
364 //if(e -> indel_length <= 3 && e->is_ambiguous>3 && (e-> event_small_side % 7 < 3))return 1;
365 //if(e -> indel_length == 4) return e->is_ambiguous>4;
366 //return 0;
367 }
368
event_neighbour_sort_compare(void * arr,int l,int r)369 int event_neighbour_sort_compare(void * arr, int l, int r){
370 unsigned int ** sort_data = (unsigned int **) arr;
371 if(sort_data[1][l] > sort_data[1][r]) return 1;
372 if(sort_data[1][l] < sort_data[1][r]) return -1;
373 return 0;
374 }
375
event_neighbour_sort_exchange(void * arr,int l,int r)376 void event_neighbour_sort_exchange(void * arr, int l, int r){
377 unsigned int ** sort_data = (unsigned int **) arr;
378 unsigned int tmpi;
379
380 tmpi = sort_data[0][l];
381 sort_data[0][l] = sort_data[0][r];
382 sort_data[0][r] = tmpi;
383
384 tmpi = sort_data[1][l];
385 sort_data[1][l] = sort_data[1][r];
386 sort_data[1][r] = tmpi;
387 }
388
389 // smallFirst
event_neighbour_sort_merge(void * arr,int start,int items,int items2)390 void event_neighbour_sort_merge(void * arr, int start, int items, int items2){
391 unsigned int ** sort_data = (unsigned int **) arr;
392
393 unsigned int * tmp_global_id_list = malloc(sizeof(int) * (items+items2));
394 unsigned int * tmp_offset_list = malloc(sizeof(int) * (items+items2));
395
396 int i1_cursor = start, i2_cursor = items + start;
397 int tmp_cursor = 0;
398
399 while(1){
400 if(i1_cursor == items + start && i2_cursor == items + items2 + start )break;
401 int select_items_1 = (i1_cursor < items + start && event_neighbour_sort_compare(arr, i1_cursor, i2_cursor) <= 0) || (i2_cursor == start + items + items2);
402 if(select_items_1){
403 tmp_global_id_list[tmp_cursor] = sort_data[0][i1_cursor];
404 tmp_offset_list[tmp_cursor ++ ] = sort_data[1][i1_cursor ++];
405 }else{
406 tmp_global_id_list[tmp_cursor] = sort_data[0][i2_cursor];
407 tmp_offset_list[tmp_cursor ++ ] = sort_data[1][i2_cursor ++];
408 }
409 }
410
411 memcpy( sort_data[0] + start, tmp_global_id_list, sizeof(int) * (items+items2) );
412 memcpy( sort_data[1] + start, tmp_offset_list, sizeof(int) * (items+items2) );
413 free(tmp_global_id_list);
414 free(tmp_offset_list);
415 }
416
test_redundant_event(global_context_t * global_context,chromosome_event_t * testee,chromosome_event_t * neighbour)417 int test_redundant_event( global_context_t * global_context, chromosome_event_t* testee, chromosome_event_t* neighbour ){
418 int length_delta = 3, is_same = 0;
419 if(testee -> event_type == CHRO_EVENT_TYPE_INDEL) length_delta = 0;
420
421 long long dist = testee -> event_large_side;
422 dist -= neighbour -> event_large_side;
423
424 if(abs(dist) <= length_delta){
425 if(testee -> event_type == CHRO_EVENT_TYPE_INDEL){
426 int indel_diff = testee -> indel_length;
427 indel_diff -= neighbour -> indel_length;
428
429 if(abs(indel_diff) <= length_delta)
430 is_same = 1;
431 }else is_same = 1;
432 }
433
434 int is_redund = 0;
435 if(is_same){
436 if(testee -> indel_at_junction > neighbour -> indel_at_junction) is_redund = 1;
437 else if(testee -> indel_at_junction == neighbour -> indel_at_junction){
438 if(testee -> supporting_reads < neighbour -> supporting_reads) is_redund = 1;
439 else if( testee -> supporting_reads == neighbour -> supporting_reads ){
440 if(testee -> event_small_side < neighbour -> event_small_side) is_redund = 1;
441 }
442 }
443 }
444
445 return is_redund;
446 }
447
448
449 #define reallocate_to_be_removed_ids if(to_be_removed_number >= remove_id_size-1){\
450 remove_id_size = remove_id_size*3/2;\
451 to_be_removed_ids = realloc(to_be_removed_ids, remove_id_size);\
452 }
453
454 #define MIN_EVENT_SUPPORT_NO 2
455
remove_neighbour(global_context_t * global_context)456 void remove_neighbour(global_context_t * global_context)
457 {
458 indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
459
460 HashTable * event_table = indel_context -> event_entry_table;
461 chromosome_event_t * event_space = indel_context -> event_space_dynamic;
462 int xk1,xk2,xk3;
463 int * to_be_removed_ids;
464 int to_be_removed_number = 0, all_junctions = 0;
465 int remove_id_size = 999999;
466
467 to_be_removed_ids = malloc(sizeof(int) * (1+remove_id_size));
468 for(xk1=0; xk1<indel_context->total_events; xk1++)
469 {
470 chromosome_event_t * event_body = &event_space[xk1];
471 if(CHRO_EVENT_TYPE_REMOVED == event_body->event_type) continue;
472 if(global_context -> config.do_remove_neighbour_for_scRNA && global_context -> config.scRNA_input_mode && event_body -> supporting_reads < MIN_EVENT_SUPPORT_NO){
473 reallocate_to_be_removed_ids;
474 to_be_removed_ids[to_be_removed_number++] = event_body -> global_event_id;
475 continue;
476 }
477 all_junctions++;
478 int event_type;
479 for(event_type = 0; event_type < 2; event_type++)
480 {
481 if(event_type) // for INDELs
482 {
483 int neighbour_range = 3;
484 if(event_body->event_type != CHRO_EVENT_TYPE_INDEL) continue;
485
486 reallocate_to_be_removed_ids;
487 if(is_ambiguous_indel_score(event_body))to_be_removed_ids[to_be_removed_number++] = event_body -> global_event_id;
488 else for(xk2=-neighbour_range; xk2<=neighbour_range; xk2++)
489 {
490 //if(!xk2)continue;
491 unsigned int test_pos_small = event_body -> event_small_side + xk2;
492 chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
493
494 int found_events = search_event(global_context, event_table, event_space, test_pos_small , EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_INDEL, search_return);
495 //if(test_pos_small > 284136 && test_pos_small < 284146) printf("POS=%u\tTRES=%d\n", test_pos_small, found_events);
496
497 for(xk3 = 0; xk3<found_events; xk3++)
498 {
499 reallocate_to_be_removed_ids;
500 chromosome_event_t * tested_neighbour = search_return[xk3];
501 long long int length_diff = tested_neighbour -> indel_length;
502 length_diff -= event_body -> indel_length;
503 if(length_diff == 0 && xk2==0) continue;
504 if(is_ambiguous_indel_score(tested_neighbour))
505 to_be_removed_ids[to_be_removed_number++] = event_body -> global_event_id;
506 else
507 if(length_diff==0)
508 {
509 if(event_body -> event_small_side - event_body->connected_previous_event_distance + 1 == tested_neighbour-> event_large_side) continue;
510 if(tested_neighbour -> event_small_side - tested_neighbour->connected_previous_event_distance + 1 == event_body-> event_large_side) continue;
511
512 if(event_body -> event_large_side + event_body->connected_next_event_distance - 1 == tested_neighbour-> event_small_side) continue;
513 if(tested_neighbour -> event_large_side + tested_neighbour->connected_next_event_distance - 1 == event_body-> event_small_side) continue;
514
515 if(event_body->event_quality < tested_neighbour -> event_quality)
516 to_be_removed_ids[to_be_removed_number++] = event_body -> global_event_id;
517 else if(event_body->event_quality == tested_neighbour -> event_quality &&
518 (event_body -> supporting_reads < tested_neighbour -> supporting_reads||(event_body -> supporting_reads == tested_neighbour -> supporting_reads && xk2<0)))
519 {
520 to_be_removed_ids[to_be_removed_number++] = event_body -> global_event_id;
521 }
522 }
523 }
524 }
525 }
526 else // for junctions/fusions
527 {
528 int neighbour_range = 11;
529 int indel_range = 4 , delta_small = 0;// max_range = 0;
530 if(event_body -> event_type == CHRO_EVENT_TYPE_INDEL) continue;
531 if(event_body -> is_donor_found_or_annotation & 64) continue; // do not remove known events
532 //if(! global_context->config.check_donor_at_junctions )max_range = 10;
533
534 //for(delta_small = - max_range; delta_small <= max_range ; delta_small ++)
535 for(xk2=-neighbour_range; xk2<=neighbour_range; xk2++)
536 {
537 if(!xk2)continue;
538 unsigned int test_pos_small = event_body -> event_small_side + xk2;
539 chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
540
541 int found_events = search_event(global_context,event_table, event_space, test_pos_small + delta_small , EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_JUNCTION|CHRO_EVENT_TYPE_FUSION, search_return);
542 for(xk3 = 0; xk3<found_events; xk3++)
543 {
544
545 reallocate_to_be_removed_ids;
546 chromosome_event_t * tested_neighbour = search_return[xk3];
547
548 if(tested_neighbour -> indel_at_junction > event_body -> indel_at_junction) continue;
549 if(event_body -> indel_at_junction > tested_neighbour -> indel_at_junction && abs(tested_neighbour -> event_large_side - tested_neighbour -> event_small_side + tested_neighbour -> indel_at_junction - event_body -> event_large_side + event_body -> event_small_side - event_body -> indel_at_junction) <= 16)
550 to_be_removed_ids[to_be_removed_number++] = event_body -> global_event_id;
551 else if(tested_neighbour -> event_large_side >= event_body -> event_large_side -indel_range + xk2 && tested_neighbour -> event_large_side <= event_body -> event_large_side + indel_range + xk2 && (event_body -> supporting_reads < tested_neighbour -> supporting_reads || (event_body -> supporting_reads == tested_neighbour -> supporting_reads && xk2<0)))
552 to_be_removed_ids[to_be_removed_number++] = event_body -> global_event_id;
553 }
554 }
555
556 if((global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection) && event_body -> event_type == CHRO_EVENT_TYPE_FUSION)
557 {
558 for(xk2=-10 ; xk2 < 10 ; xk2++)
559 {
560 if(!xk2)continue;
561
562 reallocate_to_be_removed_ids;
563 unsigned int test_pos_small = event_body -> event_small_side + xk2;
564 chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
565 int found_events = search_event(global_context,event_table, event_space, test_pos_small + delta_small , EVENT_SEARCH_BY_BOTH_SIDES, CHRO_EVENT_TYPE_JUNCTION|CHRO_EVENT_TYPE_FUSION, search_return);
566
567 for(xk3 = 0; xk3<found_events; xk3++)
568 {
569 chromosome_event_t * tested_neighbour = search_return[xk3];
570 if(found_events && event_body -> supporting_reads < tested_neighbour -> supporting_reads)
571 {
572 to_be_removed_ids[to_be_removed_number++] = event_body -> global_event_id;
573 break;
574 }
575 }
576 }
577 }
578 }
579 }
580 }
581
582 for(xk1=0; xk1<to_be_removed_number; xk1++) {
583 int event_no = to_be_removed_ids[xk1];
584 chromosome_event_t * deleted_event = &event_space[event_no];
585
586 for(xk2=0;xk2<2;xk2++){
587 unsigned int pos = xk2?deleted_event->event_large_side:deleted_event->event_small_side;
588 unsigned int * res = HashTableGet(event_table, NULL+pos);
589 if(res){
590 int current_size = res[0]&0x0fffffff;
591 int wrt_ptr = 1;
592 for(xk3 = 1 ; xk3< current_size+1 ; xk3++){
593 if(!res[xk3])break;
594 if(res[xk3] -1 == event_no) continue;
595 if(wrt_ptr != xk3) res[wrt_ptr]=res[xk3];
596 wrt_ptr++;
597 }
598 if(0 && wrt_ptr==1){
599 HashTableRemove(event_table, NULL+pos);
600 free(res);
601 }else if(wrt_ptr < current_size +1)res[wrt_ptr]=0;
602 }
603 }
604 if(deleted_event -> event_type == CHRO_EVENT_TYPE_INDEL && deleted_event -> inserted_bases)
605 free(deleted_event -> inserted_bases);
606 deleted_event -> event_type = CHRO_EVENT_TYPE_REMOVED;
607 }
608
609 free(to_be_removed_ids);
610 }
611
612
613
localPointerCmp_forEventEntry(const void * pointer1,const void * pointer2)614 int localPointerCmp_forEventEntry(const void *pointer1, const void *pointer2)
615 {
616 unsigned int v1 = (unsigned int)(pointer1-NULL);
617 unsigned int v2 = (unsigned int)(pointer2-NULL);
618 return v1!=v2;
619 }
620
localPointerHashFunction_forEventEntry(const void * pointer)621 srUInt_64 localPointerHashFunction_forEventEntry(const void *pointer)
622 {
623 unsigned int v = (pointer-NULL);
624 return (unsigned long) (v << 24) ^ (v>>8);
625 }
626
627
init_indel_tables(global_context_t * context)628 int init_indel_tables(global_context_t * context)
629 {
630 indel_context_t * indel_context = (indel_context_t*)malloc(sizeof(indel_context_t));
631
632 // event_entry_table is a table from an absolute chromosome location to a list of integers.
633 // each integer is the index of event in the event array + 1.
634 // unused values are set to 0.
635
636 indel_context -> event_entry_table = NULL;
637
638 context -> module_contexts[MODULE_INDEL_ID] = indel_context;
639
640 indel_context -> total_events = 0;
641 indel_context -> current_max_event_number = 0;
642 //indel_context -> current_max_event_number = context->config.init_max_event_number;
643 //indel_context -> event_space_dynamic = (chromosome_event_t *)malloc(sizeof(chromosome_event_t)*indel_context -> current_max_event_number);
644
645 indel_context -> event_space_dynamic = NULL;
646
647 if(context -> config.all_threads < 2) {
648 indel_context -> event_entry_table = HashTableCreate(399997);
649
650 indel_context -> event_entry_table -> appendix1=malloc(1024 * 1024 * 64);
651 indel_context -> event_entry_table -> appendix2=malloc(1024 * 1024 * 64);
652 memset(indel_context -> event_entry_table -> appendix1, 0, 1024 * 1024 * 64);
653 memset(indel_context -> event_entry_table -> appendix2, 0, 1024 * 1024 * 64);
654
655 HashTableSetKeyComparisonFunction(indel_context->event_entry_table, localPointerCmp_forEventEntry);
656 HashTableSetHashFunction(indel_context->event_entry_table, localPointerHashFunction_forEventEntry);
657
658 indel_context -> total_events = 0;
659 indel_context -> current_max_event_number = context->config.init_max_event_number;
660 indel_context -> event_space_dynamic = malloc(sizeof(chromosome_event_t)*indel_context -> current_max_event_number);
661 if(!indel_context -> event_space_dynamic)
662 {
663 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_FATAL, "Cannot allocate memory for threads. Please try to reduce the thread number.");
664 return 1;
665 }
666 }
667 if(context->config.is_third_iteration_running) {
668 char * fns = malloc(200);
669 fns[0]=0;
670 exec_cmd("ulimit -n", fns, 200);
671 int max_open_file = atoi(fns);
672 //SUBREADprintf("SYS FILE LIMIT=%d\n", max_open_file);
673 free(fns);
674 max_open_file = max(100, max_open_file);
675 max_open_file = min(3000, max_open_file);
676
677
678 indel_context -> local_reassembly_pileup_files = HashTableCreate(100);
679 indel_context -> local_reassembly_pileup_files -> appendix1 = NULL + max_open_file * 2/ 3;
680 HashTableSetDeallocationFunctions(indel_context -> local_reassembly_pileup_files, NULL, NULL);
681 HashTableSetKeyComparisonFunction(indel_context -> local_reassembly_pileup_files, my_strcmp);
682 HashTableSetHashFunction(indel_context -> local_reassembly_pileup_files ,HashTableStringHashFunction);
683 }
684
685 indel_context -> dynamic_align_table = malloc(sizeof(short*)*MAX_READ_LENGTH);
686 indel_context -> dynamic_align_table_mask = malloc(sizeof(char *)*MAX_READ_LENGTH);
687
688 int xk1;
689 for(xk1=0;xk1<MAX_READ_LENGTH; xk1++) {
690 indel_context -> dynamic_align_table[xk1] = malloc(sizeof(short)*MAX_READ_LENGTH);
691 indel_context -> dynamic_align_table_mask[xk1] = malloc(sizeof(char)*MAX_READ_LENGTH);
692 }
693
694 for(xk1=0; xk1<EVENT_BODY_LOCK_BUCKETS; xk1++)
695 subread_init_lock(indel_context -> event_body_locks+xk1);
696
697 return 0;
698 }
699
init_indel_thread_contexts(global_context_t * global_context,thread_context_t * thread_context,int task)700 int init_indel_thread_contexts(global_context_t * global_context, thread_context_t * thread_context, int task)
701 {
702 indel_thread_context_t * indel_thread_context = (indel_thread_context_t*)malloc(sizeof(indel_thread_context_t));
703 indel_context_t * indel_context = (indel_context_t *) global_context -> module_contexts[MODULE_INDEL_ID];
704
705 if(task == STEP_VOTING) {
706 indel_thread_context -> event_entry_table = HashTableCreate(399997 * ( global_context -> config.scRNA_input_mode?18:2 ));
707 indel_thread_context -> event_entry_table -> appendix1=NULL;//indel_context -> event_entry_table-> appendix1;
708 indel_thread_context -> event_entry_table -> appendix2=NULL;//indel_context -> event_entry_table-> appendix2;
709 HashTableSetKeyComparisonFunction(indel_thread_context->event_entry_table, localPointerCmp_forEventEntry);
710 HashTableSetHashFunction(indel_thread_context->event_entry_table, localPointerHashFunction_forEventEntry);
711
712 indel_thread_context -> total_events = 0;
713 indel_thread_context -> current_max_event_number = global_context->config.init_max_event_number;
714 indel_thread_context -> event_space_dynamic = malloc(sizeof(chromosome_event_t)*indel_thread_context -> current_max_event_number);
715 if(!indel_thread_context -> event_space_dynamic)
716 {
717 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_FATAL, "Cannot allocate memory for threads. Please try to reduce the thread number.");
718 return 1;
719 }
720
721 indel_thread_context -> dynamic_align_table = malloc(sizeof(short*)*MAX_READ_LENGTH);
722 indel_thread_context -> dynamic_align_table_mask = malloc(sizeof(char *)*MAX_READ_LENGTH);
723
724 int xk1;
725 for(xk1=0;xk1<MAX_READ_LENGTH; xk1++)
726 {
727 indel_thread_context -> dynamic_align_table[xk1] = malloc(sizeof(short)*MAX_READ_LENGTH);
728 indel_thread_context -> dynamic_align_table_mask[xk1] = malloc(sizeof(char)*MAX_READ_LENGTH);
729 }
730 }
731 else if(task == STEP_ITERATION_TWO)
732 {
733 indel_thread_context -> event_entry_table = indel_context -> event_entry_table;
734 indel_thread_context -> total_events = indel_context -> total_events;
735 indel_thread_context -> event_space_dynamic = indel_context -> event_space_dynamic;
736 indel_thread_context -> final_reads_mismatches_array = (unsigned short*)malloc(sizeof(unsigned short)*indel_context -> total_events);
737 indel_thread_context -> final_counted_reads_array = (unsigned short*)malloc(sizeof(unsigned short)*indel_context -> total_events);
738 assert(indel_thread_context -> final_counted_reads_array);
739
740 memset(indel_thread_context -> final_reads_mismatches_array , 0, sizeof(unsigned short)*indel_context -> total_events);
741 memset(indel_thread_context -> final_counted_reads_array , 0, sizeof(unsigned short)*indel_context -> total_events);
742
743 subread_init_lock(&thread_context -> output_lock);
744
745 }
746
747 thread_context -> module_thread_contexts[MODULE_INDEL_ID] = indel_thread_context;
748 return 0;
749 }
750
751
destory_event_entry_table(HashTable * old)752 void destory_event_entry_table(HashTable * old)
753 {
754 int bucket;
755 KeyValuePair * cursor;
756 for(bucket=0; bucket<old -> numOfBuckets; bucket++)
757 {
758 cursor = old -> bucketArray[bucket];
759 while(1)
760 {
761 if (!cursor) break;
762 int * id_list = (int *) cursor ->value;
763 //#warning "=============== UNCOMMENT THE NEXT LINE IN THE RELEASE!!! THIS IS FOR REMOVING A SEGFAULT ERROR =================="
764 free(id_list);
765
766 cursor = cursor->next;
767 }
768 }
769 }
770
771 typedef struct {
772 unsigned int scanning_positons;
773 unsigned int thread_bodytable_number;
774 } scanning_events_record_t;
775
776
777
778 #define _get_global_body(iii) ( indel_context -> event_space_dynamic + records[iii].thread_bodytable_number)
779
scanning_events_compare(void * arr,int l,int r)780 int scanning_events_compare(void * arr, int l, int r){
781 void ** arrr = (void **) arr;
782 indel_context_t * indel_context = arrr[0];
783 scanning_events_record_t * records = arrr[1];
784 chromosome_event_t * body_l = _get_global_body(l);
785 chromosome_event_t * body_r = _get_global_body(r);
786
787 if(records[l].scanning_positons > records[r].scanning_positons)return 1;
788 if(records[l].scanning_positons < records[r].scanning_positons)return -1;
789
790 if((body_l -> is_donor_found_or_annotation & 64)!=0 && (body_r -> is_donor_found_or_annotation & 64)==0) return 1; // prefer known truth.
791 if((body_l -> is_donor_found_or_annotation & 64)==0 && (body_r -> is_donor_found_or_annotation & 64)!=0) return -1;
792 if(body_l -> supporting_reads > body_r -> supporting_reads) return -1;
793 if(body_l -> supporting_reads < body_r -> supporting_reads) return 1;
794 if(abs(body_l -> indel_length) < abs(body_r -> indel_length)) return 1;
795 if(abs(body_l -> indel_length) > abs(body_r -> indel_length)) return -1;
796 if(body_l -> indel_length > body_r -> indel_length) return -1; // same length, but L is del and R is ins -- prefer del than ins
797 if(body_l -> indel_length < body_r -> indel_length) return 1;
798
799 if(body_l -> event_small_side > body_r -> event_small_side)return 1;
800 if(body_l -> event_small_side < body_r -> event_small_side)return -1;
801 if(body_l -> event_large_side > body_r -> event_large_side)return 1;
802 return -1;
803 }
804
null_scanning_events_merge(void * arr,int start,int items,int items2)805 void null_scanning_events_merge(void * arr, int start, int items, int items2){}
scanning_events_merge(void * arr,int start,int items,int items2)806 void scanning_events_merge(void * arr, int start, int items, int items2){
807 void ** arrr = (void **) arr;
808 scanning_events_record_t * records = arrr[1];
809
810 int read_1_ptr = start, read_2_ptr = start+items, write_ptr;
811 scanning_events_record_t * merged_records = malloc(sizeof(scanning_events_record_t) * (items+items2));
812
813 for(write_ptr=0; write_ptr<items+items2; write_ptr++){
814 if((read_1_ptr >= start+items)||(read_2_ptr < start+items+items2 && scanning_events_compare(arr, read_1_ptr, read_2_ptr) > 0))
815 memcpy(merged_records+write_ptr, records+(read_2_ptr++), sizeof(scanning_events_record_t));
816 else
817 memcpy(merged_records+write_ptr, records+(read_1_ptr++), sizeof(scanning_events_record_t));
818 }
819 memcpy(records + start, merged_records, sizeof(scanning_events_record_t) * (items+items2));
820 free(merged_records);
821 }
822
scanning_events_exchange(void * arr,int l,int r)823 void scanning_events_exchange(void * arr, int l, int r){
824 void ** arrr = (void **) arr;
825 scanning_events_record_t * records = arrr[1];
826
827 unsigned long tmpi;
828
829 tmpi = records[l].scanning_positons;
830 records[l].scanning_positons = records[r].scanning_positons;
831 records[r].scanning_positons = tmpi;
832
833 tmpi = records[l].thread_bodytable_number;
834 records[l].thread_bodytable_number = records[r].thread_bodytable_number;
835 records[r].thread_bodytable_number = tmpi;
836 }
837
838
839
840 #define _test_record_size if(current_record_number >= current_record_size - 2){\
841 current_record_size *= 1.5;\
842 records=realloc(records, sizeof(scanning_events_record_t)*current_record_size);\
843 if(NULL == records) return -1;\
844 }\
845
846 #define _add_record records[current_record_number].scanning_positons = body -> event_small_side;\
847 records[current_record_number].thread_bodytable_number = xx1;\
848 current_record_number++;\
849 records[current_record_number].scanning_positons = body -> event_large_side;\
850 records[current_record_number].thread_bodytable_number = xx1;\
851 current_record_number++;\
852
853
sort_junction_entry_table(global_context_t * global_context)854 int sort_junction_entry_table(global_context_t * global_context){
855 indel_context_t * indel_context = (indel_context_t*)global_context -> module_contexts[MODULE_INDEL_ID];
856 chromosome_event_t * event_space = indel_context -> event_space_dynamic;
857
858 if(indel_context -> event_entry_table){
859 if(indel_context -> event_entry_table->appendix1)
860 {
861 free(indel_context -> event_entry_table -> appendix1);
862 free(indel_context -> event_entry_table -> appendix2);
863 }
864
865 // free the entry table: the pointers.
866 destory_event_entry_table(indel_context -> event_entry_table);
867 HashTableDestroy(indel_context -> event_entry_table);
868 }
869
870 indel_context -> event_entry_table = HashTableCreate(399997);
871 HashTableSetKeyComparisonFunction(indel_context->event_entry_table, localPointerCmp_forEventEntry);
872 HashTableSetHashFunction(indel_context->event_entry_table, localPointerHashFunction_forEventEntry);
873
874 if(global_context -> config.use_bitmap_event_table) {
875 indel_context -> event_entry_table -> appendix1=malloc(1024 * 1024 * 64);
876 indel_context -> event_entry_table -> appendix2=malloc(1024 * 1024 * 64);
877 memset(indel_context -> event_entry_table -> appendix1, 0, 1024 * 1024 * 64);
878 memset(indel_context -> event_entry_table -> appendix2, 0, 1024 * 1024 * 64);
879 } else {
880 indel_context -> event_entry_table -> appendix1=NULL;
881 indel_context -> event_entry_table -> appendix2=NULL;
882 }
883
884 int xx1, current_record_number=0, current_record_size=10000;
885
886 scanning_events_record_t * records = malloc(sizeof(scanning_events_record_t)*current_record_size);
887
888 for(xx1 = 0; xx1 < indel_context -> total_events; xx1++){
889 chromosome_event_t * body = event_space + xx1;
890 _test_record_size;
891 _add_record;
892 }
893
894 void * sort_arr[2];
895 sort_arr[0] = indel_context;
896 sort_arr[1] = records;
897
898 // many repeated elements
899 // do not use quick sort.
900 merge_sort(sort_arr, current_record_number, scanning_events_compare, scanning_events_exchange, scanning_events_merge);
901 unsigned int last_scannping_pos = records[0].scanning_positons;
902 int merge_start = 0;
903 HashTable * event_table = indel_context -> event_entry_table;
904
905 for(xx1 =0; xx1 <= current_record_number; xx1++){
906 scanning_events_record_t * this_record = NULL;
907 if(xx1<current_record_number) this_record = records+xx1;
908
909 if(xx1>0){
910 if(xx1 == current_record_number || last_scannping_pos!= this_record -> scanning_positons){
911 int merge_end = xx1, merge_i;
912 if( merge_end-merge_start > MAX_EVENT_ENTRIES_PER_SITE )
913 merge_end = merge_start + MAX_EVENT_ENTRIES_PER_SITE;
914 unsigned int * id_list = malloc(sizeof(int) * (1+merge_end-merge_start));
915 assert(id_list);
916 id_list[0] = merge_end-merge_start;
917 for(merge_i = merge_start; merge_i < merge_end; merge_i++){
918 chromosome_event_t * body = _get_global_body(merge_i);
919 id_list[merge_i - merge_start + 1] = records[merge_i].thread_bodytable_number + 1;
920
921 mark_event_bitmap(event_table->appendix1, body -> event_small_side);
922 mark_event_bitmap(event_table->appendix2, body -> event_large_side);
923 }
924 merge_start = xx1;
925
926 //#warning "======= UNCOMMENT NEXT ======="
927 HashTablePut(indel_context -> event_entry_table, NULL + last_scannping_pos, id_list);
928 }
929 }
930 if(xx1 == current_record_number) break;
931 last_scannping_pos = this_record -> scanning_positons;
932 }
933
934 free(records);
935 return 0;
936 }
937
938 #define _get_event_body(iii) (records[iii].thread_no<0?( indel_context -> event_space_dynamic + records[iii].thread_bodytable_number ):(((indel_thread_context_t*)(thread_contexts[records[iii].thread_no].module_thread_contexts[MODULE_INDEL_ID]))-> event_space_dynamic + records[iii].thread_bodytable_number ))
939
940 typedef struct {
941 unsigned int thread_bodytable_number;
942 short thread_no;
943 } concatinating_events_record_t;
944
945 #define _test_conc_size if(conc_rec_items >= conc_rec_size - 1){\
946 conc_rec_size *= 1.5;\
947 records = realloc(records, conc_rec_size * sizeof(concatinating_events_record_t));\
948 if(NULL == records) return -1;\
949 }
950
conc_sort_compare(void * arr,int l,int r)951 int conc_sort_compare(void * arr, int l, int r){
952 void ** arrr = (void **)arr;
953 concatinating_events_record_t * records = arrr[0];
954 indel_context_t * indel_context = arrr[1];
955 thread_context_t * thread_contexts = arrr[2];
956
957 chromosome_event_t * body_l = _get_event_body(l);
958 chromosome_event_t * body_r = _get_event_body(r);
959
960
961 if(body_l -> event_small_side > body_r -> event_small_side)return 3;
962 if(body_l -> event_small_side < body_r -> event_small_side)return -3;
963 if(body_l -> event_large_side > body_r -> event_large_side)return 3;
964 if(body_l -> event_large_side < body_r -> event_large_side)return -3;
965
966 if(abs(body_l -> indel_length) < abs(body_r -> indel_length)) return 2;
967 if(abs(body_l -> indel_length) > abs(body_r -> indel_length)) return -2;
968 if(body_l -> indel_length > body_r -> indel_length) return -2; // same length, but L is del and R is ins -- prefer del than ins
969 if(body_l -> indel_length < body_r -> indel_length) return 2;
970
971 if((body_l -> is_donor_found_or_annotation & 64)!=0 && (body_r -> is_donor_found_or_annotation & 64)==0) return 1; // prefer known truth.
972 if((body_l -> is_donor_found_or_annotation & 64)==0 && (body_r -> is_donor_found_or_annotation & 64)!=0) return -1;
973 if(body_l -> supporting_reads > body_r -> supporting_reads) return -1;
974 if(body_l -> supporting_reads < body_r -> supporting_reads) return 1;
975 return 0;
976
977 }
978
979
conc_sort_merge(void * arr,int start,int items,int items2)980 void conc_sort_merge(void * arr, int start, int items, int items2){
981 void ** arrr = (void **) arr;
982 concatinating_events_record_t * records = arrr[0];
983
984 int read_1_ptr = start, read_2_ptr = start+items, write_ptr;
985 concatinating_events_record_t * merged_records = malloc(sizeof(concatinating_events_record_t) * (items+items2));
986
987 for(write_ptr=0; write_ptr<items+items2; write_ptr++){
988 if((read_1_ptr >= start+items)||(read_2_ptr < start+items+items2 && conc_sort_compare(arr, read_1_ptr, read_2_ptr) > 0))
989 memcpy(merged_records+write_ptr, records+(read_2_ptr++), sizeof(concatinating_events_record_t));
990 else
991 memcpy(merged_records+write_ptr, records+(read_1_ptr++), sizeof(concatinating_events_record_t));
992 }
993 memcpy(records + start, merged_records, sizeof(concatinating_events_record_t) * (items+items2));
994 free(merged_records);
995 }
996
997
998
conc_sort_exchange(void * arr,int l,int r)999 void conc_sort_exchange(void * arr, int l, int r){
1000
1001 void ** arrr = (void **)arr;
1002 concatinating_events_record_t * records = arrr[0];
1003
1004 unsigned int tmpi;
1005 tmpi = records[l].thread_bodytable_number;
1006 records[l].thread_bodytable_number = records[r].thread_bodytable_number;
1007 records[r].thread_bodytable_number = tmpi;
1008
1009 tmpi = records[l].thread_no;
1010 records[l].thread_no = records[r].thread_no;
1011 records[r].thread_no = tmpi;
1012 }
1013
1014 // sort and merge events from all threads and the global event space.
sort_global_event_table(global_context_t * global_context)1015 int sort_global_event_table(global_context_t * global_context){
1016 return finalise_indel_and_junction_thread(global_context, NULL, STEP_VOTING);
1017 }
1018
finalise_indel_and_junction_thread(global_context_t * global_context,thread_context_t * thread_contexts,int task)1019 int finalise_indel_and_junction_thread(global_context_t * global_context, thread_context_t * thread_contexts, int task)
1020 {
1021 indel_context_t * indel_context = (indel_context_t*)global_context -> module_contexts[MODULE_INDEL_ID];
1022 if(task == STEP_VOTING) {
1023 concatinating_events_record_t * records;
1024 int conc_rec_size = 10000, conc_rec_items = 0;
1025 records = malloc(sizeof(concatinating_events_record_t) * conc_rec_size);
1026
1027 int xk1, thn;
1028 if(thread_contexts)
1029 for(thn =0; thn < global_context->config.all_threads; thn++){
1030 thread_context_t * thread_context = thread_contexts+thn;
1031 indel_thread_context_t * indel_thread_context = (indel_thread_context_t*)thread_context -> module_thread_contexts[MODULE_INDEL_ID];
1032
1033 for(xk1 = 0; xk1 < indel_thread_context -> total_events; xk1++){
1034 chromosome_event_t * old_body = indel_thread_context -> event_space_dynamic + xk1;
1035 if(old_body -> event_type == CHRO_EVENT_TYPE_REMOVED) continue;
1036
1037 _test_conc_size;
1038 records[conc_rec_items].thread_no=thn;
1039 records[conc_rec_items++].thread_bodytable_number=xk1;
1040 }
1041 }
1042
1043 for(xk1 = 0; xk1 < indel_context -> total_events; xk1++){
1044 chromosome_event_t * old_body = indel_context -> event_space_dynamic + xk1;
1045 if(old_body -> event_type == CHRO_EVENT_TYPE_REMOVED) continue;
1046 _test_conc_size;
1047 records[conc_rec_items].thread_no=-1;
1048 records[conc_rec_items++].thread_bodytable_number=xk1;
1049 }
1050
1051 void * sort_arr[3];
1052 sort_arr[0] = records;
1053 sort_arr[1] = indel_context;
1054 sort_arr[2] = thread_contexts;
1055
1056 // many repeated elements -- do not use quick sort.
1057 merge_sort(sort_arr, conc_rec_items, conc_sort_compare, conc_sort_exchange, conc_sort_merge);
1058
1059 chromosome_event_t * prev_env = NULL;
1060 int merge_target_size = 10000;
1061 int merge_target_items = 0;
1062 chromosome_event_t * merge_target = malloc(sizeof(chromosome_event_t) * merge_target_size);
1063
1064 int merge_start = 0;
1065 for(xk1 = 0; xk1 <= conc_rec_items; xk1++){
1066 chromosome_event_t * this_event = (xk1 == conc_rec_items)?NULL:_get_event_body(xk1);
1067 if(xk1 > 0){
1068 int compret = 0;
1069 if(xk1 < conc_rec_items) compret = conc_sort_compare(sort_arr, xk1-1, xk1);
1070 if(abs(compret)>1 || xk1 == conc_rec_items){// different events or last one -- merge [ prev_event_record_no , xk1 - 1 ]
1071 int xk_merge;
1072 // find a new slot in the target space
1073 if(merge_target_items >= merge_target_size - 1){
1074 merge_target_size *= 1.5;
1075 merge_target = realloc(merge_target, sizeof(chromosome_event_t) * merge_target_size);
1076 }
1077
1078 chromosome_event_t * merged_body = merge_target + merge_target_items;
1079 memcpy( merged_body, prev_env, sizeof(chromosome_event_t) );
1080 merged_body -> global_event_id = merge_target_items++;
1081
1082 for(xk_merge = merge_start; xk_merge < xk1 - 1; xk_merge++){
1083 chromosome_event_t * old_body = _get_event_body(xk_merge);
1084
1085 assert(merged_body -> event_type == old_body -> event_type);
1086 merged_body -> supporting_reads += old_body -> supporting_reads;
1087 merged_body -> anti_supporting_reads += old_body -> anti_supporting_reads;
1088 merged_body -> final_counted_reads += old_body -> final_counted_reads;
1089 merged_body -> final_reads_mismatches += old_body -> final_reads_mismatches;
1090 merged_body -> critical_supporting_reads += old_body -> critical_supporting_reads;
1091 merged_body -> junction_flanking_left = max(merged_body -> junction_flanking_left, old_body -> junction_flanking_left);
1092 merged_body -> junction_flanking_right = max(merged_body -> junction_flanking_right, old_body -> junction_flanking_right);
1093 merged_body -> is_donor_found_or_annotation |= old_body -> is_donor_found_or_annotation;
1094
1095 if(merged_body -> connected_next_event_distance > 0 && old_body -> connected_next_event_distance > 0)
1096 merged_body -> connected_next_event_distance = min(merged_body -> connected_next_event_distance, old_body -> connected_next_event_distance);
1097 else merged_body -> connected_next_event_distance = max(merged_body -> connected_next_event_distance, old_body -> connected_next_event_distance);
1098
1099 if(merged_body -> connected_previous_event_distance > 0 && old_body -> connected_previous_event_distance > 0)
1100 merged_body -> connected_previous_event_distance = min(merged_body -> connected_previous_event_distance, old_body -> connected_previous_event_distance);
1101 else merged_body -> connected_previous_event_distance = max(merged_body -> connected_previous_event_distance, old_body -> connected_previous_event_distance);
1102
1103 if(old_body->inserted_bases && old_body -> event_type == CHRO_EVENT_TYPE_INDEL && old_body -> indel_length<0){
1104 //printf("OCT27-FREEMEM-INS [%d] : thread-%d + %d %u-%p\n", xk_merge, records[xk_merge].thread_no, records[xk_merge].thread_bodytable_number, old_body->event_small_side, old_body->inserted_bases );
1105 if(merged_body -> inserted_bases && merged_body -> inserted_bases != old_body -> inserted_bases){
1106 // SUBREADprintf("FREE PTR=%p\n", old_body -> inserted_bases);
1107 free(old_body -> inserted_bases);
1108 }
1109 else merged_body -> inserted_bases = old_body -> inserted_bases;
1110 old_body -> inserted_bases = NULL;
1111 }
1112 }
1113 merge_start = xk1;
1114 }
1115 }
1116 if(xk1 == conc_rec_items) break;
1117 prev_env = this_event;
1118 }
1119
1120 free(records);
1121
1122 if(thread_contexts)
1123 for(thn =0; thn < global_context->config.all_threads; thn++){
1124 thread_context_t * thread_context = thread_contexts+thn;
1125 indel_thread_context_t * indel_thread_context = (indel_thread_context_t*)thread_context -> module_thread_contexts[MODULE_INDEL_ID];
1126
1127 destory_event_entry_table(indel_thread_context -> event_entry_table);
1128 HashTableDestroy(indel_thread_context -> event_entry_table);
1129
1130 free(indel_thread_context -> event_space_dynamic);
1131
1132 for(xk1=0;xk1<MAX_READ_LENGTH; xk1++)
1133 {
1134 free(indel_thread_context -> dynamic_align_table[xk1]);
1135 free(indel_thread_context -> dynamic_align_table_mask[xk1]);
1136 }
1137
1138 free(indel_thread_context->dynamic_align_table);
1139 free(indel_thread_context->dynamic_align_table_mask);
1140 free(indel_thread_context);
1141 }
1142
1143 if(indel_context -> event_space_dynamic ) free(indel_context -> event_space_dynamic);
1144 indel_context -> event_space_dynamic = NULL;
1145
1146 indel_context -> event_space_dynamic = merge_target;
1147 indel_context -> current_max_event_number = merge_target_size;
1148 indel_context -> total_events = merge_target_items;
1149 } else if(task == STEP_ITERATION_TWO) {
1150 int xk1, thn;
1151 if(thread_contexts)
1152 for(thn =0; thn < global_context->config.all_threads; thn++){
1153 thread_context_t * thread_context = thread_contexts+thn;
1154 indel_thread_context_t * indel_thread_context = (indel_thread_context_t*)thread_context -> module_thread_contexts[MODULE_INDEL_ID];
1155
1156 for(xk1 = 0; xk1 < indel_thread_context -> total_events; xk1++)
1157 {
1158 indel_context -> event_space_dynamic [xk1] . final_counted_reads +=indel_thread_context -> final_counted_reads_array[xk1];
1159 indel_context -> event_space_dynamic [xk1] . final_reads_mismatches +=indel_thread_context -> final_reads_mismatches_array[xk1];
1160 }
1161 free(indel_thread_context -> final_counted_reads_array);
1162 free(indel_thread_context -> final_reads_mismatches_array);
1163 free(indel_thread_context);
1164 }
1165 }
1166 return 0;
1167 }
1168
1169
add_annotation_to_junctions(void * key,void * val,HashTable * tab)1170 void add_annotation_to_junctions(void * key, void * val, HashTable * tab){
1171 global_context_t * global_context = tab->appendix1;
1172 indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
1173 char * gene_chr = key;
1174 int * pos_strands = val;
1175 int current_lagre_end = -1;
1176 char chroname[MAX_CHROMOSOME_NAME_LEN];
1177
1178 int x1, ww=0, wi=0;
1179 for(x1 =0 ; ; x1++){
1180 if(ww){
1181 chroname[wi++]=gene_chr[x1];
1182 chroname[wi]=0;
1183 }
1184 if(0==gene_chr[x1]) break;
1185 if(gene_chr[x1]==':')
1186 ww=1;
1187 }
1188
1189 //#warning ">>>>>>> COMMENt NEXT <<<<<<<<<<<<<<"
1190 //if(1) SUBREADprintf("SEE LOCATION: %s has %d\n", gene_chr , pos_strands[0]+1 );
1191
1192 for(x1 = 1; x1 < pos_strands[0]+1; x1+=3){
1193 if(pos_strands[x1]==0)break;
1194 //#warning ">>>>>>> COMMENt NEXT <<<<<<<<<<<<<<"
1195 //SUBREADprintf("SEE LOCATION ___ : %d >0 && %d < %d %d %d\n", current_lagre_end , current_lagre_end, pos_strands[x1], pos_strands[x1+1], pos_strands[x1+2]);
1196 if(current_lagre_end > 0 && current_lagre_end < pos_strands[x1]){
1197 // add a junction between current_lagre_end and pos_strands[x1]
1198 if(indel_context -> total_events >= indel_context -> current_max_event_number -1){
1199 indel_context -> current_max_event_number*=1.2;
1200 indel_context -> event_space_dynamic = realloc(indel_context -> event_space_dynamic , sizeof( chromosome_event_t )*indel_context -> current_max_event_number);
1201 }
1202 chromosome_event_t * newbody = indel_context -> event_space_dynamic + indel_context -> total_events ;
1203 memset(newbody, 0, sizeof(chromosome_event_t));
1204 newbody -> global_event_id = indel_context -> total_events++;
1205 newbody -> event_type = CHRO_EVENT_TYPE_JUNCTION;
1206
1207 newbody -> event_small_side = linear_gene_position(&global_context->chromosome_table , chroname, current_lagre_end);
1208 newbody -> event_large_side = linear_gene_position(&global_context->chromosome_table , chroname, pos_strands[x1]);
1209
1210 //#warning ">>>>>>> COMMENt NEXT <<<<<<<<<<<<<<"
1211 if(0){
1212 if ( 1 || newbody -> event_small_side == 625683723){
1213 SUBREADprintf("Loaded event :");
1214 debug_show_event(global_context, newbody);
1215 }
1216 }
1217 newbody -> is_negative_strand = pos_strands[x1+2];
1218 newbody -> is_donor_found_or_annotation = 127;
1219 // printf("09NOV: add_annotation_to_junctions: %u -> %u '%s', %d,%d\n", newbody -> event_small_side , newbody -> event_large_side, chroname, current_lagre_end, pos_strands[x1]);
1220 }
1221
1222 if(current_lagre_end < pos_strands[x1+1])
1223 current_lagre_end = pos_strands[x1+1];
1224 }
1225 }
1226
1227
1228 typedef struct {
1229 global_context_t * global_context;
1230 HashTable * feature_sorting_table;
1231 } do_load_juncs_context_t;
1232
do_juncs_add_feature(char * gene_name,char * transcript_id,char * chro_name,unsigned int feature_start,unsigned int feature_end,int is_negative_strand,void * context)1233 int do_juncs_add_feature(char * gene_name, char * transcript_id, char * chro_name, unsigned int feature_start, unsigned int feature_end, int is_negative_strand, void * context){
1234 //#warning ">>>>>>>>>>>>>>>>> COMMENT NEXT <<<<<<<<<<<<<<<<<<<<"
1235 //return 0;
1236 //#warning ">>>>>>>>>>>>>>>>> COMMENT NEXT <<<<<<<<<<<<<<<<<<<<"
1237 //SUBREADprintf("INJ LOCS: %s : %u, %u\n", chro_name, feature_start, feature_end);
1238 do_load_juncs_context_t * do_load_juncs_context = context;
1239 HashTable * feature_sorting_table = do_load_juncs_context -> feature_sorting_table;
1240
1241 global_context_t * global_context = do_load_juncs_context -> global_context;
1242 char tmp_chro_name[MAX_CHROMOSOME_NAME_LEN];
1243 if(global_context -> sam_chro_to_anno_chr_alias){
1244 char * sam_chro = get_sam_chro_name_from_alias(global_context -> sam_chro_to_anno_chr_alias, chro_name);
1245 if(sam_chro!=NULL) chro_name = sam_chro;
1246 }
1247 int access_n = HashTableGet( global_context -> chromosome_table.read_name_to_index, chro_name ) - NULL;
1248 if(access_n < 1){
1249 if(chro_name[0]=='c' && chro_name[1]=='h' && chro_name[2]=='r'){
1250 chro_name += 3;
1251 }else{
1252 strcpy(tmp_chro_name, "chr");
1253 strcat(tmp_chro_name, chro_name);
1254 chro_name = tmp_chro_name;
1255 }
1256 }
1257
1258 char sort_key[ MAX_CHROMOSOME_NAME_LEN * 3 + 2 ];
1259 sprintf(sort_key, "%s:%s", gene_name, chro_name);
1260 int * old_features = HashTableGet(feature_sorting_table, sort_key), x1;
1261 int written_space = -1, written_last_item = 0;
1262 if(old_features){
1263 int has_space = 0;
1264 for(x1= 1; x1 < old_features[0] + 1 ; x1+=3){
1265 if(old_features[x1]==0){
1266 has_space = 1;
1267 break;
1268 }
1269 }
1270 if(!has_space){
1271 int old_size = old_features[0];
1272 old_features[0] *= 1.5;
1273 old_features[0] -= old_features[0]%3;
1274
1275 // I used malloc but not realloc here, because the old memory block will be freed by HashTablePutReplace.
1276 int * old_features2 = malloc( sizeof(int) * (old_features[0] +1));
1277 memcpy(old_features2, old_features, (old_size + 1) * sizeof(int));
1278 old_features2[old_size + 1] = 0;
1279
1280 HashTablePutReplace(feature_sorting_table, sort_key, old_features2, 0);
1281 old_features = old_features2;
1282 }
1283 for(x1= 1; x1 < old_features[0] + 1 ; x1+=3){
1284 if(old_features[x1]> feature_start || old_features[x1]==0){
1285 written_space = x1;
1286 if(old_features[x1]> feature_start){
1287 int x2;
1288 for(x2 = x1+3; x2 < old_features[0] + 1;x2+=3){
1289 if(old_features[x2]==0)break;
1290 }
1291 x2 += 2;
1292 assert(x2 < old_features[0] + 1);
1293 if(x2 + 1 < old_features[0] + 1) old_features[x2+1]=0;
1294
1295 for(; x2 >= x1 + 3; x2 --)
1296 old_features[x2] = old_features[x2-3];
1297 } else written_last_item = 1;
1298 break;
1299 }
1300 }
1301 }else{
1302 int init_space_sort = 15;
1303 old_features = malloc((init_space_sort + 1) * sizeof(int));
1304 old_features[0]=init_space_sort;
1305 char * mm_sort_key = malloc(strlen(sort_key)+1);
1306 strcpy(mm_sort_key, sort_key);
1307 HashTablePut(feature_sorting_table, mm_sort_key, old_features);
1308 written_space = 1;
1309 written_last_item = 1;
1310 }
1311 assert(written_space >0);
1312 old_features[written_space]=feature_start - 1;
1313 old_features[written_space+1]=feature_end - 1;
1314 old_features[written_space+2]=is_negative_strand;
1315 if(written_last_item && written_space + 3 < old_features[0]+1) old_features[written_space+3]=0;
1316
1317 return 0;
1318 }
1319
load_known_junctions(global_context_t * global_context)1320 int load_known_junctions(global_context_t * global_context){
1321
1322 HashTable * feature_sorting_table = HashTableCreate(90239);
1323 HashTableSetKeyComparisonFunction(feature_sorting_table, my_strcmp);
1324 HashTableSetHashFunction(feature_sorting_table, HashTableStringHashFunction);
1325 HashTableSetDeallocationFunctions(feature_sorting_table, free, free);
1326
1327 do_load_juncs_context_t do_load_juncs_context;
1328 memset(&do_load_juncs_context, 0, sizeof(do_load_juncs_context));
1329
1330 do_load_juncs_context.global_context = global_context;
1331 do_load_juncs_context.feature_sorting_table = feature_sorting_table;
1332
1333 int features = load_features_annotation(global_context->config.exon_annotation_file , global_context->config.exon_annotation_file_type, global_context->config.exon_annotation_gene_id_column, NULL, global_context->config.exon_annotation_feature_name_column, &do_load_juncs_context, do_juncs_add_feature);
1334
1335 feature_sorting_table -> appendix1 = global_context;
1336 HashTableIteration(feature_sorting_table, add_annotation_to_junctions);
1337 HashTableDestroy(feature_sorting_table);
1338
1339 if(features <0)return -1;
1340 else{
1341 return 0;
1342 }
1343 }
1344
1345
there_are_events_in_range(char * bitmap,unsigned int pos,int sec_len)1346 int there_are_events_in_range(char * bitmap, unsigned int pos, int sec_len)
1347 {
1348 int ret = 0;
1349 if(!bitmap) return 1;
1350 unsigned int offset = pos >> 3;
1351 unsigned int offset_end = (pos + sec_len) >> 3;
1352 unsigned int offset_byte = (offset >> 3)&0x3ffffff;
1353 unsigned int offset_byte_end =1+((offset_end >> 3)&0x3ffffff);
1354 for(; offset_byte < offset_byte_end ; offset_byte++)
1355 {
1356 if(bitmap[offset_byte]){
1357 ret = 1;
1358 break;
1359 }
1360 }
1361 // printf("TEST_JUMP: THERE ARE %d at %u\n" , ret , pos);
1362 return ret;
1363
1364 }
1365 // bitmap has 64M bytes.
check_event_bitmap(unsigned char * bitmap,unsigned int pos)1366 int check_event_bitmap(unsigned char * bitmap, unsigned int pos)
1367 {
1368 unsigned int offset = pos >> 3;
1369 unsigned int offset_byte = (offset >> 3)&0x3ffffff;
1370 char keybyte = bitmap[offset_byte];
1371 if(!keybyte) return 0;
1372
1373 int offset_bit = offset & 7;
1374 return (keybyte & (1<<offset_bit))!=0;
1375 }
1376
mark_event_bitmap(unsigned char * bitmap,unsigned int pos)1377 void mark_event_bitmap(unsigned char * bitmap, unsigned int pos)
1378 {
1379
1380 // bitmap is as large as 64MBytes
1381 // pos_all_bit = pos / 8
1382 // pos_byte = pos_all_bit / 8
1383 // pos_bit = pos_all_bit % 8
1384
1385 unsigned int offset = pos >> 3;
1386 unsigned int offset_byte = (offset >> 3)&0x3ffffff;
1387 int offset_bit = offset & 7;
1388 bitmap[offset_byte] |= (1<<offset_bit);
1389 }
1390
1391
put_new_event(HashTable * event_table,chromosome_event_t * new_event,int event_no)1392 void put_new_event(HashTable * event_table, chromosome_event_t * new_event , int event_no)
1393 {
1394 unsigned int sides[2];
1395 sides[0] = new_event->event_small_side;
1396 sides[1] = new_event->event_large_side;
1397
1398 new_event -> global_event_id = event_no;
1399 int xk1, xk2;
1400
1401 for(xk1=0; xk1<2; xk1++)
1402 {
1403 if(0 == sides[xk1])continue ;
1404 unsigned int * id_list = HashTableGet(event_table, NULL+sides[xk1]);
1405 if(!id_list)
1406 {
1407 id_list = malloc(sizeof(unsigned int)*(1+EVENT_ENTRIES_INIT_SIZE));
1408 id_list[0]=EVENT_ENTRIES_INIT_SIZE;
1409 id_list[1]=0;
1410 HashTablePut(event_table , NULL+sides[xk1] , id_list);
1411 }
1412
1413 for(xk2=1;xk2< id_list[0] ; xk2++)
1414 if(id_list[xk2]==0) break;
1415
1416 if(xk2 < id_list[0])
1417 id_list[xk2] = event_no+1;
1418 if(xk2 < id_list[0]) id_list[xk2+1] = 0;
1419 }
1420
1421 if(event_table->appendix1)
1422 {
1423 mark_event_bitmap(event_table->appendix1, sides[0]);
1424 mark_event_bitmap(event_table->appendix2, sides[1]);
1425 }
1426 }
search_event(global_context_t * global_context,HashTable * event_table,chromosome_event_t * event_space,unsigned int pos,int search_type,unsigned char event_type,chromosome_event_t ** return_buffer)1427 int search_event(global_context_t * global_context, HashTable * event_table, chromosome_event_t * event_space, unsigned int pos, int search_type, unsigned char event_type, chromosome_event_t ** return_buffer)
1428 {
1429 int ret = 0;
1430
1431 if(pos < 1 || pos > 0xffff0000)return 0;
1432
1433 if(event_table->appendix1)
1434 {
1435 if(search_type == EVENT_SEARCH_BY_SMALL_SIDE)
1436 if(! check_event_bitmap(event_table -> appendix1, pos))return 0;
1437
1438 if(search_type == EVENT_SEARCH_BY_LARGE_SIDE)
1439 if(! check_event_bitmap(event_table -> appendix2, pos))return 0;
1440
1441 if(search_type == EVENT_SEARCH_BY_BOTH_SIDES)
1442 if(!(check_event_bitmap(event_table -> appendix1, pos) || check_event_bitmap(event_table -> appendix2, pos)))
1443 return 0;
1444 }
1445
1446 unsigned int * res = HashTableGet(event_table, NULL+pos);
1447 if(res)
1448 {
1449 int xk2;
1450 int current_size = res[0]&0x0fffffff;
1451 for(xk2=1; xk2< current_size+1 ; xk2++)
1452 {
1453 if(!res[xk2])break;
1454 //if(res[xk2] - 1>= ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> current_max_event_number ) { SUBREADprintf("FATAL ERROR: Event id out-of-boundary: %u > %u.\n", res[xk2], ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> current_max_event_number ); continue;}
1455 chromosome_event_t * event_body = &event_space[res[xk2]-1];
1456
1457 if((event_body -> event_type & event_type) == 0)continue;
1458 //SUBREADprintf("VB1:%u\n", res[xk2]);
1459 if(search_type == EVENT_SEARCH_BY_SMALL_SIDE && event_body -> event_small_side != pos)continue;
1460 if(search_type == EVENT_SEARCH_BY_LARGE_SIDE && event_body -> event_large_side != pos)continue;
1461 if(search_type == EVENT_SEARCH_BY_BOTH_SIDES && event_body -> event_small_side != pos && event_body -> event_large_side != pos)continue;
1462
1463 return_buffer[ret++] = event_body;
1464 }
1465 }
1466
1467 return ret;
1468 }
1469
1470
get_insertion_sequence(global_context_t * global_context,thread_context_t * thread_context,char * binary_bases,char * read_text,int insertions)1471 void get_insertion_sequence(global_context_t * global_context, thread_context_t * thread_context , char * binary_bases , char * read_text , int insertions)
1472 {
1473 int xk1;
1474 read_text[0]=0;
1475
1476 for(xk1=0; xk1<insertions; xk1++)
1477 {
1478 int byte_no = xk1/4;
1479 int bit_no = 2*(xk1%4);
1480
1481 read_text[xk1] = int2base((3&(binary_bases[byte_no]>> bit_no)));
1482 }
1483 read_text[insertions]=0;
1484 }
1485
set_insertion_sequence(global_context_t * global_context,thread_context_t * thread_context,char ** binary_bases,char * read_text,int insertions)1486 void set_insertion_sequence(global_context_t * global_context, thread_context_t * thread_context , char ** binary_bases , char * read_text , int insertions)
1487 {
1488 int xk1;
1489
1490 (*binary_bases) = malloc((1+insertions)/4+2);
1491 //SUBREADprintf("ALLOC PTR=%p\n", (*binary_bases) );
1492
1493 assert(insertions <= MAX_INSERTION_LENGTH);
1494 memset((*binary_bases),0, (1+insertions)/4+2);
1495
1496 for(xk1=0; xk1<insertions;xk1++)
1497 {
1498 int byte_no = xk1/4;
1499 int bit_no = 2*(xk1%4);
1500
1501 *((*binary_bases)+byte_no) |= (base2int(read_text[xk1]))<<bit_no;
1502 }
1503 }
1504
local_add_indel_event(global_context_t * global_context,thread_context_t * thread_context,HashTable * event_table,char * read_text,unsigned int left_edge,int indels,int score_supporting_read_added,int is_ambiguous,int mismatched_bases,int * old_event_id)1505 chromosome_event_t * local_add_indel_event(global_context_t * global_context, thread_context_t * thread_context, HashTable * event_table, char * read_text, unsigned int left_edge, int indels, int score_supporting_read_added, int is_ambiguous, int mismatched_bases,int * old_event_id)
1506 {
1507 chromosome_event_t * found = NULL;
1508 chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
1509
1510 chromosome_event_t * event_space = NULL;
1511 if(thread_context)
1512 event_space = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
1513 else
1514 event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
1515
1516
1517 int found_events = search_event(global_context, event_table, event_space, left_edge, EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_INDEL|CHRO_EVENT_TYPE_LONG_INDEL, search_return);
1518
1519 //#warning ">>>>>>>>>>>>>>>>>> COMMENt THIS <<<<<<<<<<<<<<<<<<<<<<<"
1520 //printf("OCT27-STEPRR-TST LEFT=%u, INDEL=%d; FOUND=%d\n", left_edge , indels , found_events);
1521
1522
1523 if(found_events)
1524 {
1525 int kx1;
1526 for(kx1 = 0; kx1 < found_events ; kx1++)
1527 {
1528 if(search_return[kx1] -> indel_length == indels)
1529 {
1530 found = search_return[kx1];
1531 break;
1532 }
1533 }
1534 }
1535
1536 if(found){
1537 if(old_event_id) (*old_event_id)=found->global_event_id;
1538 found -> supporting_reads += score_supporting_read_added;
1539 //found -> is_ambiguous = max(is_ambiguous , found -> is_ambiguous );
1540 return NULL;
1541 }
1542 else
1543 {
1544 int event_no;
1545 //event_no =(thread_context)?((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> total_events: ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> total_events;
1546
1547 if(thread_context)
1548 event_no = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> total_events ++;
1549 else
1550 event_no = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> total_events ++;
1551
1552 event_space = reallocate_event_space(global_context, thread_context, event_no);
1553
1554
1555 chromosome_event_t * new_event = event_space+event_no;
1556 memset(new_event,0,sizeof(chromosome_event_t));
1557 if(indels <0) // if it is an insertion:
1558 set_insertion_sequence(global_context, thread_context , &new_event -> inserted_bases , read_text, -indels);
1559
1560 new_event -> event_small_side = left_edge;
1561 new_event -> event_large_side = left_edge + 1 + max(0, indels);
1562 //assert(new_event -> event_small_side && new_event -> event_large_side);
1563 new_event -> event_type = CHRO_EVENT_TYPE_INDEL;
1564 new_event -> indel_length = indels;
1565 new_event -> supporting_reads = 1;
1566 new_event -> event_quality = 1;//pow(0.5 , 3*mismatched_bases);
1567 //new_event -> is_ambiguous = is_ambiguous;
1568
1569 //#warning ">>>>>>>>>>>>>>>>>> COMMENt THIS <<<<<<<<<<<<<<<<<<<<<<<"
1570 //printf("OCT27-STEPRR-NEW LEFT=%u RIGHT=%u, INDEL=%d\n", new_event -> event_small_side , new_event -> event_large_side, new_event -> indel_length );
1571
1572 put_new_event(event_table, new_event , event_no);
1573 return new_event;
1574 }
1575
1576 }
1577
1578 float EXON_RECOVER_MATCHING_RATE = 0.9;
1579 float EXON_INDEL_MATCHING_RATE_TAIL = 0.8;
1580 float EXON_INDEL_MATCHING_RATE_HEAD = 0.8;
1581
1582
1583
1584
core_extend_covered_region_15(global_context_t * global_context,gene_value_index_t * array_index,unsigned int read_start_pos,char * read,int read_len,int cover_start,int cover_end,int window_size,int req_match_5end,int req_match_3end,int indel_tolerance,int space_type,int tail_indel,short * head_indel_pos,int * head_indel_movement,short * tail_indel_pos,int * tail_indel_movement,int is_head_high_quality,char * qual_txt,int qual_format,float head_matching_rate,float tail_matching_rate)1585 int core_extend_covered_region_15(global_context_t * global_context, gene_value_index_t *array_index, unsigned int read_start_pos, char * read, int read_len, int cover_start, int cover_end, int window_size, int req_match_5end , int req_match_3end, int indel_tolerance, int space_type, int tail_indel, short * head_indel_pos, int * head_indel_movement, short * tail_indel_pos, int * tail_indel_movement, int is_head_high_quality, char * qual_txt, int qual_format, float head_matching_rate, float tail_matching_rate){
1586 int is_head;
1587
1588 if(0){
1589 char posout[100];
1590 absoffset_to_posstr(global_context, read_start_pos, posout);
1591 SUBREADprintf("RTXT=%s, MAPP=%s\n", read, posout);
1592 }
1593
1594 for(is_head = 0; is_head < 2; is_head ++){
1595 int move_i;
1596 int best_match_n = -1, best_movement = 0;
1597 for(move_i = 0; move_i < 2* indel_tolerance-1; move_i ++){
1598 int indel_move = (move_i+1)/2 * (move_i %2?1:-1);
1599 int indel_movement = indel_move + (is_head?0:tail_indel);
1600 int this_match_n;
1601
1602 if(is_head)
1603 this_match_n = match_chro(read, array_index, read_start_pos - indel_movement, window_size, 0, space_type );
1604 else
1605 this_match_n = match_chro(read + read_len - window_size, array_index, read_start_pos + read_len - window_size + indel_movement, window_size, 0, space_type );
1606
1607 // indel_movement : negativ = insertion, positive = deletion before or after the covered region
1608
1609 //SUBREADprintf("MOVE: HEAD=%d, MATCH=%d, MOV=%d\n", is_head, this_match_n, indel_movement);
1610 if(this_match_n > best_match_n){
1611 best_match_n = this_match_n;
1612 best_movement = indel_movement;
1613 }
1614 }
1615
1616 int max_score=-1, best_splice_on_read=0;
1617 if(best_match_n >0 && best_movement!=0){
1618 int test_start = is_head?window_size:(cover_end);
1619 int test_end = is_head?cover_start - max(0, -best_movement):(read_len - window_size - max(0, -best_movement));
1620 int indel_first_piece_end_on_read, indel_second_piece_start_on_read;
1621
1622 for(indel_first_piece_end_on_read = test_start ; indel_first_piece_end_on_read < test_end; indel_first_piece_end_on_read++){
1623 unsigned int indel_first_piece_end_on_chro = read_start_pos + indel_first_piece_end_on_read + (is_head? -best_movement :tail_indel);
1624 indel_second_piece_start_on_read = indel_first_piece_end_on_read - min(0, best_movement);
1625 unsigned int indel_second_piece_start_on_chro = indel_first_piece_end_on_chro + max(0,best_movement);
1626 int first_piece_match = match_chro(read + indel_first_piece_end_on_read - window_size , array_index , indel_first_piece_end_on_chro - window_size, window_size, 0, space_type);
1627 int second_piece_match = match_chro(read + indel_second_piece_start_on_read , array_index , indel_second_piece_start_on_chro, window_size, 0, space_type);
1628 int score = first_piece_match + second_piece_match;
1629 if(score > max_score){
1630 max_score = score;
1631 best_splice_on_read = indel_first_piece_end_on_read;
1632 }
1633 if(score == 2*window_size)break;
1634 //SUBREADprintf("FIND_SPLICE_POINT: IS_HEAD=%d, FIRST_PIECE_END=%d-%d, CHRO=%u-%u MATCHED=%d,%d\n", is_head, indel_first_piece_end_on_read, indel_second_piece_start_on_read, indel_first_piece_end_on_chro, indel_second_piece_start_on_chro, first_piece_match, second_piece_match);
1635 }
1636 }
1637
1638 if(max_score >= 2*window_size - 1){
1639 if(is_head){
1640 (*head_indel_pos) = best_splice_on_read;
1641 (*head_indel_movement) = best_movement;
1642 } else {
1643 (*tail_indel_pos) = best_splice_on_read;
1644 (*tail_indel_movement) = best_movement;
1645 }
1646 }
1647 }
1648
1649 return 0;
1650
1651 }
1652
1653
1654
core_extend_covered_region_13(gene_value_index_t * array_index,unsigned int read_start_pos,char * read,int read_len,int cover_start,int cover_end,int window_size,int req_match_5end,int req_match_3end,int indel_tolerance,int space_type,int tail_indel,short * head_indel_pos,int * head_indel_movement,short * tail_indel_pos,int * tail_indel_movement,int is_head_high_quality,char * qual_txt,int qual_format,float head_matching_rate,float tail_matching_rate)1655 int core_extend_covered_region_13(gene_value_index_t *array_index, unsigned int read_start_pos, char * read, int read_len, int cover_start, int cover_end, int window_size, int req_match_5end , int req_match_3end, int indel_tolerance, int space_type, int tail_indel, short * head_indel_pos, int * head_indel_movement, short * tail_indel_pos, int * tail_indel_movement, int is_head_high_quality, char * qual_txt, int qual_format, float head_matching_rate, float tail_matching_rate)
1656 {
1657 int ret = 0;
1658 *head_indel_pos = -1;
1659 *tail_indel_pos = -1;
1660 if (cover_start >= window_size && head_matching_rate < 1.0001)
1661 {
1662 int head_test_len = cover_start;
1663 int roughly_mapped = match_chro(read, array_index, read_start_pos, head_test_len , 0, space_type);
1664 if (roughly_mapped >= head_test_len * EXON_RECOVER_MATCHING_RATE - 0.0001)
1665 {
1666 ret |= 1;
1667 }
1668 else
1669 {
1670 int window_end_pos = cover_start + window_size-1;
1671 int not_too_bad = 1;
1672 int right_match_number=0;
1673
1674 while (1)
1675 {
1676 int matched_bases_in_window = match_chro_wronglen(read+ window_end_pos - window_size, array_index, read_start_pos + window_end_pos - window_size, window_size, space_type, NULL, &right_match_number);
1677 int best_indel_movement = -1000;
1678 int best_indel_pos = -1;
1679
1680 if (matched_bases_in_window >= req_match_5end) // this window is not bad enough so that an indel is considered
1681 window_end_pos--;
1682 else
1683 {
1684 roughly_mapped = match_chro(read, array_index, read_start_pos, window_end_pos - right_match_number , 0, space_type);
1685 if (roughly_mapped < (int)(0.5+ (window_end_pos - right_match_number) * EXON_RECOVER_MATCHING_RATE))
1686 {
1687 // the quality of this window is too low (at most 1 base is matched). I must consider if it is an indel.
1688 int indel_movement_i ;
1689
1690 int best_matched_bases_after_indel = -1;
1691
1692 not_too_bad = 0;
1693 for (indel_movement_i = 0; indel_movement_i < 2* indel_tolerance-1 ; indel_movement_i ++)
1694 {
1695 int indel_movement = (indel_movement_i+1)/2 * (indel_movement_i %2?1:-1);
1696 int test_length = window_end_pos /*- 1*/ - max(0, indel_movement) - right_match_number;
1697
1698 if (test_length < window_size) continue;
1699 //if (test_length <= 1+ abs(indel_movement)/4) continue;
1700 //test_length = min(10, test_length);
1701 if (abs(indel_movement) > indel_tolerance) continue;
1702
1703 int test_start =0;// window_end_pos - max(0, indel_movement) - right_match_number - test_length;
1704
1705 float matched_bases_after_indel = match_chro_support(read +test_start, array_index, read_start_pos + indel_movement +test_start, test_length,0, space_type, qual_txt, qual_format);
1706 SUBREADprintf("HEAD : MATCHED_AFTER_INDEL = %f ; MVMT=%d ; WINDOW_END=%d\n", matched_bases_after_indel, indel_movement, window_end_pos);
1707
1708 float test_rate = head_matching_rate;
1709
1710 if(test_length < 3) test_rate = 1;
1711
1712 if(best_matched_bases_after_indel <matched_bases_after_indel && matched_bases_after_indel >= (int)(0.5+test_length * test_rate))
1713 {
1714 not_too_bad = 1;
1715
1716 best_matched_bases_after_indel = matched_bases_after_indel;
1717 best_indel_movement = indel_movement;
1718 best_indel_pos = window_end_pos - right_match_number;
1719
1720
1721
1722 *head_indel_pos = best_indel_pos;
1723 *head_indel_movement = best_indel_movement;
1724 }
1725 }
1726 if(best_indel_pos<0) *head_indel_pos = window_end_pos - right_match_number;
1727 // break;
1728 }
1729 window_end_pos--;
1730 }
1731 if (window_end_pos - window_size <= 0) break;
1732 }
1733 if(not_too_bad)
1734 ret |=1;
1735 //else
1736 //*head_indel_pos = -1;
1737 }
1738 }
1739 else ret |= 1;
1740
1741
1742
1743 if (cover_end <= read_len - window_size && tail_matching_rate < 1.0001)
1744 {
1745 int tail_test_len = read_len - cover_end;
1746 int roughly_mapped = match_chro(read + cover_end, array_index, read_start_pos + cover_end + tail_indel, tail_test_len , 0, space_type);
1747 if (roughly_mapped >= tail_test_len * EXON_RECOVER_MATCHING_RATE - 0.0001)
1748 {
1749 ret |= 2;
1750 }
1751 else
1752 {
1753 int window_start_pos = cover_end - window_size +1;
1754 int not_too_bad = 1;
1755
1756 while (1)
1757 {
1758 int left_match_number = 0;
1759 int matched_bases_in_window = match_chro_wronglen(read+ window_start_pos, array_index, read_start_pos + window_start_pos + tail_indel, window_size, space_type, &left_match_number, NULL);
1760 int best_indel_movement = -1000;
1761 int best_indel_pos = -1;
1762
1763 //printf("MOVE 4-bp WINDOW : %d ; matched = %d >=? %d\n", window_start_pos, matched_bases_in_window, req_match_3end);
1764
1765 if (matched_bases_in_window >= req_match_3end) // this window is not bad enough so that an indel is considered
1766 window_start_pos++;
1767 else
1768 {
1769 roughly_mapped = match_chro(read+window_start_pos + left_match_number, array_index, read_start_pos + window_start_pos + tail_indel + left_match_number, read_len - window_start_pos - left_match_number , 0, space_type);
1770
1771 if (roughly_mapped < (int)(0.5+(read_len - window_start_pos -left_match_number ) * EXON_RECOVER_MATCHING_RATE))
1772 {
1773 // the quality of this window is too low (at most 1 base is matched). I must consider if it is an indel.
1774 int indel_movement_i ;
1775 int best_matched_bases_after_indel = -1;
1776 not_too_bad = 0;
1777 for (indel_movement_i = 0; indel_movement_i < 2* indel_tolerance; indel_movement_i ++)
1778 {
1779
1780 int indel_adjustment = (indel_movement_i+1)/2 * (indel_movement_i %2?1:-1);
1781 int indel_movement = tail_indel + indel_adjustment;
1782 int test_length = read_len - window_start_pos - left_match_number + min(0, indel_adjustment);
1783 //test_length = min(10, test_length);
1784
1785
1786 if (test_length < window_size) continue;
1787 //if (test_length <= 1 + abs(indel_movement)/4) continue;
1788 if (abs(indel_movement) > indel_tolerance) continue;
1789
1790 char * qual_txt_moved = qual_txt;
1791 if(qual_txt[0]) qual_txt_moved+=window_start_pos - min(0, indel_movement) + left_match_number;
1792
1793 float matched_bases_after_indel = match_chro_support(read + window_start_pos - min(0, indel_movement) + left_match_number, array_index, read_start_pos + window_start_pos + max(0,indel_movement) +left_match_number , test_length,0, space_type, qual_txt_moved , qual_format);
1794 SUBREADprintf("TAIL : MATCHED_AFTER_INDEL = %f ; MVMT=%d ; WINDOW_END=%d\n", matched_bases_after_indel, indel_movement, window_start_pos - min(0, indel_movement) + left_match_number);
1795
1796 float test_rate = tail_matching_rate;
1797
1798 if(test_length < 3) test_rate = 1;
1799
1800 //printf("%d <? %d && %d >=? %d\n", best_matched_bases_after_indel, matched_bases_after_indel, matched_bases_after_indel, (int)(0.5+test_length * test_rate));
1801 if(best_matched_bases_after_indel <matched_bases_after_indel && matched_bases_after_indel >= (int)(0.5+test_length * test_rate))
1802 {
1803 not_too_bad = 1;
1804 best_matched_bases_after_indel = matched_bases_after_indel;
1805 best_indel_movement = indel_movement;
1806 best_indel_pos = window_start_pos + left_match_number ;//-1;
1807 *tail_indel_movement = best_indel_movement ;
1808 }
1809 }
1810
1811 if(best_indel_pos<0)
1812 *tail_indel_pos = window_start_pos + left_match_number ;
1813 else
1814 *tail_indel_pos = best_indel_pos;
1815 }
1816 window_start_pos++;
1817 }
1818 if (window_start_pos + window_size >= read_len) break;
1819 }
1820 if(not_too_bad)
1821 ret |=2;
1822 //else
1823 // *tail_indel_pos =-1;
1824 }
1825 }
1826 else ret |= 2;
1827
1828 return ret;
1829 }
1830
1831
1832
1833
1834
1835
1836 int core_dynamic_align(global_context_t * global_context, thread_context_t * thread_context, char * read, int read_len, unsigned int begin_position, char * movement_buffer, int expected_offset, char * read_name);
1837
find_new_indels(global_context_t * global_context,thread_context_t * thread_context,int pair_number,char * read_name,char * read_text,char * qual_text,int read_len,int is_second_read,int best_read_id)1838 int find_new_indels(global_context_t * global_context, thread_context_t * thread_context, int pair_number, char * read_name, char * read_text, char * qual_text, int read_len, int is_second_read, int best_read_id)
1839 {
1840 int i,j;
1841
1842 int last_correct_subread = 0;
1843 int last_indel = 0;
1844
1845 gene_vote_number_t * indel_recorder;
1846 unsigned int voting_position;
1847
1848 HashTable * event_table = NULL;
1849
1850 if(thread_context)
1851 event_table = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_entry_table;
1852 else
1853 event_table = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_entry_table;
1854
1855
1856 mapping_result_t *current_result = _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, best_read_id);
1857
1858 if(global_context->config.do_big_margin_filtering_for_reads)
1859 if(is_ambiguous_voting(global_context, pair_number, is_second_read , current_result->selected_votes, current_result->confident_coverage_start, current_result->confident_coverage_end, read_len, (current_result->result_flags & CORE_IS_NEGATIVE_STRAND)?1:0))return 0;
1860
1861
1862
1863 indel_recorder = current_result -> selected_indel_record;
1864 voting_position = current_result -> selected_position;
1865
1866
1867 if(!indel_recorder[0])
1868 return 0;
1869
1870 gene_value_index_t * current_value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
1871
1872 if(0){ // test "shifting indels" -- not needed when "NEW" go_Q used
1873 int last_offset = 0;
1874 for(i=0; indel_recorder[i] && (i<MAX_INDEL_SECTIONS); i+=3){
1875 last_offset = indel_recorder[i+2];
1876 }
1877
1878 if(0==last_offset) return 0;
1879 }
1880
1881 for(i=0; indel_recorder[i] && (i<MAX_INDEL_SECTIONS); i+=3)
1882 {
1883 int indels = indel_recorder[i+2] - last_indel;
1884 // -1 : 1 insert; 1: 1 delete
1885 int next_correct_subread = indel_recorder[i] -1;
1886 //int next_correct_subread_last = indel_recorder[i+1] -1;
1887
1888 if(indels)
1889 {
1890 int last_correct_base = find_subread_end(read_len, global_context->config.total_subreads , last_correct_subread) - 9;
1891 int first_correct_base = find_subread_end(read_len, global_context->config.total_subreads , next_correct_subread) - 16 + 9;
1892
1893 last_correct_base = max(0, last_correct_base);
1894 last_correct_base = min(read_len-1, last_correct_base);
1895 first_correct_base = min(first_correct_base, read_len-1);
1896 first_correct_base = max(0, first_correct_base);
1897 first_correct_base = max(first_correct_base, last_correct_base);
1898
1899 if(first_correct_base < last_correct_base || first_correct_base > last_correct_base + 3000)
1900 SUBREADprintf("WRONG ORDER: F=%u, L=%d\n", first_correct_base , last_correct_base);
1901
1902 //printf("OCT27-STEPRR %s POS=%u, I=%d; INDELS=%d; COVER=%d -- %d (vote_no : %d - %d)\n", read_name, current_result->selected_position, i, indels, last_correct_base, first_correct_base, last_correct_subread, next_correct_subread);
1903
1904 if(global_context -> config.use_dynamic_programming_indel || read_len > EXON_LONG_READ_LENGTH)
1905 {
1906 char movement_buffer[MAX_READ_LENGTH * 10 / 7];
1907 //chromosome_event_t * last_event = NULL;
1908 int last_event_id = -1;
1909
1910 first_correct_base = min(first_correct_base+10, read_len);
1911 int x1, dyna_steps;
1912
1913 dyna_steps = core_dynamic_align(global_context, thread_context, read_text + last_correct_base, first_correct_base - last_correct_base, voting_position + last_correct_base + last_indel, movement_buffer, indels, read_name);
1914
1915 movement_buffer[dyna_steps]=0;
1916
1917 if(0 && FIXLENstrcmp("simulated.2467286", read_name) == 0){
1918 SUBREADprintf("DYNAMIC: indels=%d, %s\n", indels, movement_buffer);
1919 }
1920
1921 //#warning ">>>>>>> COMMENT NEXT BLOCK IN RELEASE <<<<<<<<"
1922 if(0) {
1923 char outstr[1000];
1924 sprintf(outstr, "OCT27-STEPDD-IR %s %d %d~%d ", read_name, dyna_steps, last_correct_base, first_correct_base);
1925
1926 for(x1=0; x1<dyna_steps;x1++)
1927 {
1928 int mc, mv=movement_buffer[x1];
1929 if(mv==0)mc='=';
1930 else if(mv==1)mc='D';
1931 else if(mv==2)mc='I';
1932 else mc='X';
1933 sprintf(outstr+strlen(outstr),"%c",mc);
1934 }
1935 puts(outstr);
1936 }
1937 unsigned int cursor_on_chromosome = voting_position + last_correct_base + last_indel, cursor_on_read = last_correct_base;
1938 int last_mv = 0;
1939 unsigned int indel_left_boundary = 0;
1940 int is_in_indel = 0, current_indel_len = 0, total_mismatch = 0;
1941
1942 for(x1=0; x1<dyna_steps;x1++)
1943 {
1944 int mv=movement_buffer[x1];
1945 if(mv==3) total_mismatch++;
1946 }
1947
1948 if(total_mismatch<=2 || (global_context->config.maximise_sensitivity_indel && total_mismatch <= 2 ))
1949 for(x1=0; x1<dyna_steps;x1++)
1950 {
1951 int mv=movement_buffer[x1];
1952
1953 if(last_mv != mv)
1954 {
1955 if( ( mv==1 || mv==2 ) && ! is_in_indel)
1956 {
1957 indel_left_boundary = cursor_on_chromosome;
1958 is_in_indel = 1;
1959 current_indel_len = 0;
1960 }
1961 else if ( is_in_indel && (mv == 0 || mv == 3) )
1962 {
1963 gene_value_index_t * current_value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
1964
1965 int ambiguous_count=0;
1966 if(0){
1967 int ambiguous_i, best_matched_bases = match_chro(read_text + cursor_on_read - 6, current_value_index, indel_left_boundary - 6, 6, 0, global_context->config.space_type) +
1968 match_chro(read_text + cursor_on_read - min(current_indel_len,0), current_value_index, indel_left_boundary + max(0, current_indel_len), 6, 0, global_context->config.space_type);
1969 for(ambiguous_i=-5; ambiguous_i<=5; ambiguous_i++)
1970 {
1971 int left_match = match_chro(read_text + cursor_on_read - 6, current_value_index, indel_left_boundary - 6, 6+ambiguous_i, 0, global_context->config.space_type);
1972 int right_match = match_chro(read_text + cursor_on_read + ambiguous_i - min(current_indel_len,0), current_value_index, indel_left_boundary + ambiguous_i + max(0, current_indel_len), 6-ambiguous_i, 0,global_context->config.space_type);
1973 if(left_match+right_match == best_matched_bases) ambiguous_count ++;
1974 }
1975 }
1976
1977 if(abs(current_indel_len)<=global_context -> config.max_indel_length)
1978 {
1979 int old_event_id = -1;
1980
1981 if(0)if(indel_left_boundary >= 46481340 + 1210 - 200 && indel_left_boundary <= 46481340 + 1210 + 200 && abs(current_indel_len )==4){
1982 SUBREADprintf("ADD AN INDEL: %s : %u ; len = %d\n", read_name, indel_left_boundary, current_indel_len);
1983 }
1984 chromosome_event_t * new_event = local_add_indel_event(global_context, thread_context, event_table, read_text + cursor_on_read + min(0,current_indel_len), indel_left_boundary - 1, current_indel_len, 1, ambiguous_count, 0, &old_event_id);
1985 mark_gapped_read(current_result);
1986
1987 chromosome_event_t * event_space = NULL;
1988 if(thread_context)
1989 event_space = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
1990 else
1991 event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
1992
1993 if(last_event_id>=0){
1994 chromosome_event_t * last_event = event_space + last_event_id;
1995 int dist = indel_left_boundary - last_event -> event_large_side ;
1996
1997 if(last_event -> connected_next_event_distance<1)
1998 last_event -> connected_next_event_distance = dist;
1999 else last_event -> connected_next_event_distance = min(dist, last_event -> connected_next_event_distance);
2000
2001 chromosome_event_t * old_new_event = new_event;
2002 if(old_event_id>=0){
2003 assert(!new_event);
2004 old_new_event = event_space + old_event_id;
2005 }
2006
2007 if(old_new_event -> connected_previous_event_distance < 1)
2008 old_new_event -> connected_previous_event_distance = dist;
2009 else old_new_event -> connected_previous_event_distance = min(dist, old_new_event -> connected_previous_event_distance);
2010 }
2011
2012 if (new_event)
2013 last_event_id = new_event -> global_event_id;
2014 else last_event_id = old_event_id;
2015 }
2016 }
2017
2018
2019 if(mv == 0 || mv == 3)
2020 is_in_indel = 0;
2021 }
2022
2023 if(is_in_indel && mv == 1)
2024 current_indel_len += 1;
2025 if(is_in_indel && mv == 2)
2026 current_indel_len -= 1;
2027
2028 if(mv == 1 || mv == 3 || mv == 0) cursor_on_chromosome++;
2029 if(mv == 2 || mv == 3 || mv == 0) cursor_on_read++;
2030
2031 last_mv = mv;
2032 }
2033
2034 }
2035 else
2036 {
2037 if(first_correct_base - last_correct_base < -min(0,indels)) continue;
2038
2039 int best_score=-99999, is_ambiguous_indel= 0;
2040 unsigned int best_pos = 0;
2041 int cutoff = first_correct_base-last_correct_base + min(0, indels) - global_context -> config.flanking_subread_indel_mismatch;
2042 int mid_j = (first_correct_base-last_correct_base )/2, min_j=99990;
2043 int mismatched_bases = 0;
2044
2045 if(first_correct_base > last_correct_base + 4)
2046 for(j=0; j<(first_correct_base-last_correct_base + min(0, indels)); j++)
2047 {
2048 // j is the first UNWANTED base after the first matched part.
2049 int score , s2, s1;
2050
2051 s1 = match_chro(read_text + last_correct_base, current_value_index, voting_position + last_correct_base + last_indel, j, 0, global_context->config.space_type);
2052 if(s1 >= j - global_context -> config.flanking_subread_indel_mismatch)
2053 {
2054 s2 = match_chro(read_text + last_correct_base + j - min(0, indels) , current_value_index, voting_position + last_correct_base + j + max(0,indels) + last_indel, first_correct_base-last_correct_base - j + min(0, indels), 0, global_context->config.space_type);
2055
2056 score = s2+s1;
2057 if(score >= cutoff)
2058 {
2059 if(score == best_score)
2060 is_ambiguous_indel ++;
2061
2062 if(score>best_score || (abs(mid_j-j)>min_j && best_score == score))
2063 {
2064 mismatched_bases = first_correct_base-last_correct_base + min(0, indels) - score;
2065 best_score = score;
2066 // best_pos here is the absolute offset of the FIRST UNWANTED BASE.
2067 best_pos = voting_position + last_correct_base + j + last_indel;
2068 min_j = abs(mid_j-j);
2069 if(score>best_score)
2070 is_ambiguous_indel = 0;
2071 }
2072 }
2073 }
2074 }
2075
2076
2077
2078 /*
2079 SUBREADprintf("\n%s\nRANGE:%d - %d ; BSCORE:%d @ %d ; AMB=%d ; INDELS=%d ; MM=%d\n", read_name, first_correct_base, last_correct_base, best_score, last_indel + last_correct_base + best_j, is_ambiguous_indel, indels, mismatched_bases);
2080
2081 SUBREADprintf("%s\n", read_text);
2082 for(j=0;j< last_indel + last_correct_base + best_j; j++)
2083 putc(' ' , stderr);
2084 putc('\\' , stderr);
2085 for(j=0;j< indels; j++)
2086 putc(gvindex_get( current_value_index, best_pos + j) , stderr);
2087 putc('\n', stderr);
2088
2089 */
2090
2091 if(best_score >0) {
2092 //#warning ">>>>>>>>>>>>>>>>>> COMMEN THIS <<<<<<<<<<<<<<<<<<<<<<"
2093 //printf("OCT27-STEPAD-%s POS=%u INDEL=%d\n", read_name, best_pos , indels);
2094 local_add_indel_event(global_context, thread_context, event_table, read_text + last_correct_base + last_indel, best_pos - 1, indels, 1, is_ambiguous_indel, mismatched_bases, NULL);
2095 mark_gapped_read(current_result);
2096 }
2097 }
2098 }
2099
2100 last_correct_subread = indel_recorder[i+1]-1;
2101 last_indel = indel_recorder[i+2];
2102 }
2103
2104
2105
2106 if(global_context -> config.extending_search_indels && global_context -> config.max_indel_length>0)
2107 {
2108 // USING EXTENDING TO FIND INDELS
2109 // THIS IS UNSTABLE!
2110 int s_head = match_chro(read_text, current_value_index, voting_position, 8, 0, global_context->config.space_type);
2111 int s_tail = match_chro(read_text+read_len - 8, current_value_index, voting_position + read_len + current_result -> indels_in_confident_coverage - 8, 8, 0, global_context->config.space_type);
2112
2113 //SUBREADprintf("EXT_START %d, %d\n", s_head, s_tail);
2114
2115 //#warning "============ REMOVE THE FIRST '1' FROM THE NEXT LINE! =========="
2116 if(s_head<=6 || s_tail<=6)
2117 {
2118 int is_head_high_quality = (current_result -> result_flags & CORE_IS_NEGATIVE_STRAND)?0:1;
2119 int k;
2120 int cover_start = find_subread_end(read_len, global_context -> config.total_subreads, current_result -> selected_indel_record[0]-1)- 15;
2121
2122 for (k=0; current_result -> selected_indel_record[k]; k+=3);
2123 int cover_end = find_subread_end(read_len, global_context -> config.total_subreads, current_result -> selected_indel_record[k-2]-1) + max(0,-current_result -> selected_indel_record[k-1]);
2124
2125 int head_qual = is_head_high_quality?0x7fffffff:0, tail_qual = is_head_high_quality?0:0x7fffffff;
2126
2127
2128 if(qual_text && qual_text[0])
2129 {
2130 int j;
2131 head_qual = 0;
2132 tail_qual = 0;
2133 for (j = 0; j<cover_start; j++)
2134 {
2135 if(FASTQ_PHRED64 == global_context -> config.phred_score_format)
2136 {
2137 head_qual += (1000000-get_base_error_prob64i(qual_text[j]));
2138 }
2139 else
2140 {
2141 head_qual += (1000000-get_base_error_prob33i(qual_text[j]));
2142 }
2143 }
2144 for (j = 0; j<read_len - cover_end; j++)
2145 {
2146 if(FASTQ_PHRED64 == global_context -> config.phred_score_format)
2147 {
2148 tail_qual += (1000000-get_base_error_prob64i(qual_text[read_len - j -1]));
2149 }
2150 else
2151 {
2152 tail_qual += (1000000-get_base_error_prob33i(qual_text[read_len - j -1]));
2153 }
2154 }
2155 }
2156
2157 float exon_tail_matching_rate=0.9, exon_head_matching_rate=0.9;
2158
2159 int head_must_correct = 4;
2160 if(cover_start > 0)
2161 {
2162 if (head_qual / cover_start < 850000)
2163 {
2164 exon_head_matching_rate = 0.75;
2165 head_must_correct =2;
2166 }
2167 else if (head_qual / cover_start < 950000 )
2168 {
2169 exon_head_matching_rate = 0.85;
2170 head_must_correct =3;
2171 }
2172 else exon_head_matching_rate = 0.92;
2173 }else exon_head_matching_rate = 9999;
2174
2175
2176 int tail_must_correct = 4;
2177 if(read_len-cover_end > 0)
2178 {
2179 if (tail_qual / (read_len-cover_end) < 850000)
2180 {
2181 exon_tail_matching_rate = 0.75;
2182 tail_must_correct =2;
2183 }
2184 else if (tail_qual / (read_len-cover_end) < 950000)
2185 {
2186 exon_tail_matching_rate = 0.85;
2187 tail_must_correct =3;
2188 }
2189 else exon_tail_matching_rate = 0.92;
2190 }else exon_tail_matching_rate = 9999;
2191
2192 short head_indel_pos=-1 , tail_indel_pos=-1;
2193 int head_indel_movement=0, tail_indel_movement=0;
2194
2195 if(0)SUBREADprintf("HQ=%.4f; TQ=%.4f; HM=%d; TM=%d; COVG = %d ~ %d\n", exon_head_matching_rate, exon_tail_matching_rate, head_must_correct, tail_must_correct, cover_start, cover_end);
2196 /*
2197 if(exon_head_matching_rate<1)
2198 exon_head_matching_rate = 0.97;
2199 if(exon_tail_matching_rate<1)
2200 exon_tail_matching_rate = 0.97;
2201 */
2202
2203 core_extend_covered_region_15(global_context, current_value_index, current_result->selected_position, read_text, read_len, cover_start, cover_end, 7, head_must_correct,tail_must_correct, global_context -> config.max_indel_length + 1, global_context -> config.space_type, current_result -> selected_indel_record[k-1], & head_indel_pos, & head_indel_movement, & tail_indel_pos, & tail_indel_movement, is_head_high_quality, qual_text, global_context -> config.phred_score_format, exon_head_matching_rate, exon_tail_matching_rate);
2204
2205 //head_indel_movement = -head_indel_movement;
2206
2207 if(0)SUBREADprintf("HMV=%d; TMV=%d; TPOS=%d\n", head_indel_movement, tail_indel_movement, tail_indel_pos);
2208
2209 if(head_indel_movement && s_head < 7)
2210 {
2211 unsigned int head_indel_left_edge = head_indel_pos + current_result->selected_position - 1;
2212 //head_indel_left_edge -= max(0, head_indel_movement);
2213 if(abs(head_indel_movement)<=global_context -> config.max_indel_length)
2214 {
2215 local_add_indel_event(global_context, thread_context, event_table, read_text + head_indel_pos, head_indel_left_edge, head_indel_movement, 1, 1, 0,NULL);
2216 mark_gapped_read(current_result);
2217 }
2218 }
2219 if(tail_indel_movement && s_tail < 7)
2220 {
2221 unsigned int tail_indel_left_edge = tail_indel_pos + current_result->selected_position + current_result -> indels_in_confident_coverage - 1;
2222
2223 if(abs(tail_indel_movement)<=global_context -> config.max_indel_length)
2224 {
2225 local_add_indel_event(global_context, thread_context, event_table, read_text + tail_indel_pos, tail_indel_left_edge, tail_indel_movement, 1, 1, 0,NULL);
2226 mark_gapped_read(current_result);
2227 }
2228
2229 }
2230 }
2231
2232 }
2233 return 0;
2234 }
2235
print_indel_table(global_context_t * global_context)2236 void print_indel_table(global_context_t * global_context){
2237 }
2238
write_indel_final_results(global_context_t * global_context)2239 int write_indel_final_results(global_context_t * global_context)
2240 {
2241
2242 int xk1, disk_is_full = 0;
2243 char * inserted_bases = NULL;
2244 indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
2245 char * fn2, *ref_bases, *alt_bases;
2246 FILE * ofp = NULL;
2247
2248 fn2 = malloc(MAX_FILE_NAME_LENGTH+30);
2249 sprintf(fn2, "%s.indel.vcf", global_context->config.output_prefix);
2250
2251 ofp = f_subr_open(fn2, "wb");
2252
2253 //if(!ofp)
2254 // printf("HOW??? %s\n", fn2);
2255 inserted_bases = malloc(MAX_INSERTION_LENGTH + 2);
2256 ref_bases = malloc(1000);
2257 alt_bases = malloc(1000);
2258
2259 fputs("##fileformat=VCFv4.0\n##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">\n##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">\n##INFO=<ID=SR,Number=1,Type=String,Description=\"Number of supporting reads for variants\">\n", ofp);
2260 fputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", ofp);
2261
2262 for(xk1 = 0; xk1 < indel_context -> total_events ; xk1++)
2263 {
2264 char * chro_name;
2265 int chro_pos;
2266
2267 chromosome_event_t * event_body = indel_context -> event_space_dynamic +xk1;
2268
2269
2270 // #warning " ================= REMOVE '- 1' from the next LINE!!! ========================="
2271 if((event_body -> event_type != CHRO_EVENT_TYPE_INDEL && event_body->event_type != CHRO_EVENT_TYPE_LONG_INDEL && event_body -> event_type != CHRO_EVENT_TYPE_POTENTIAL_INDEL)|| (event_body -> final_counted_reads < 1 && event_body -> event_type == CHRO_EVENT_TYPE_INDEL) )
2272 continue;
2273
2274 //assert((-event_body -> indel_length) < MAX_INSERTION_LENGTH);
2275
2276 if(event_body -> indel_length<0)
2277 {
2278 get_insertion_sequence(global_context, NULL , event_body -> inserted_bases , inserted_bases , -event_body -> indel_length);
2279 free(event_body -> inserted_bases);
2280 }
2281
2282 //assert(event_body -> event_small_side < event_body -> event_large_side );
2283 locate_gene_position( event_body -> event_small_side , &global_context -> chromosome_table, &chro_name, &chro_pos);
2284
2285 // VCF format: chr21 1000001 . AACC AAGGGGGCC 29 . INDEL;DP=20
2286 if(event_body -> event_type == CHRO_EVENT_TYPE_INDEL || event_body -> event_type == CHRO_EVENT_TYPE_LONG_INDEL)
2287 {
2288 ref_bases[0]=0;
2289 alt_bases[0]=0;
2290
2291 gene_value_index_t * current_index = find_current_value_index(global_context , event_body -> event_small_side-1 , max(0, event_body -> indel_length) + 2);
2292 if(current_index)
2293 {
2294 int rlen = max(0, event_body -> indel_length) + 2;
2295 assert(rlen<900);
2296 gvindex_get_string(ref_bases, current_index, event_body -> event_small_side-1 , rlen, 0);
2297 ref_bases[rlen] = 0;
2298
2299 if(event_body -> indel_length > 0) // deletion
2300 {
2301 alt_bases[0] = ref_bases[0];
2302 alt_bases[1] = ref_bases[event_body -> indel_length+1];
2303 alt_bases[2] = 0;
2304 }
2305 else // insertion
2306 {
2307 alt_bases[0] = ref_bases[0];
2308 strcpy(alt_bases+1,inserted_bases);
2309 strcat(alt_bases,ref_bases+1);
2310 }
2311 }
2312
2313 if(CHRO_EVENT_TYPE_INDEL == event_body -> event_type)
2314 {
2315 if(event_body -> final_counted_reads<30)
2316 event_body -> event_quality = pow(0.5,(30-event_body -> final_counted_reads));
2317 else event_body -> event_quality = 1;
2318
2319 }
2320 int write_len = fprintf(ofp, "%s\t%u\t.\t%s\t%s\t%d\t.\tINDEL;DP=%d;SR=%d\n", chro_name, chro_pos, ref_bases, alt_bases, (int)(max(1, 250 + 10*log(event_body -> event_quality)/log(10))), event_body -> final_counted_reads + event_body -> anti_supporting_reads, event_body -> final_counted_reads);
2321 if(write_len < 10) disk_is_full = 1;
2322 }
2323
2324 global_context->all_indels++;
2325 }
2326
2327 fclose(ofp);
2328 free(ref_bases);
2329 free(alt_bases);
2330 free(inserted_bases);
2331 if(disk_is_full){
2332 unlink(fn2);
2333 SUBREADprintf("ERROR: disk is full. Unable to write into the indel list.\n");
2334 }
2335 free(fn2);
2336
2337 return 0;
2338 }
2339
destroy_indel_module(global_context_t * global_context)2340 int destroy_indel_module(global_context_t * global_context)
2341 {
2342 int xk1;
2343 indel_context_t * indel_context = global_context -> module_contexts[MODULE_INDEL_ID];
2344 if(indel_context -> event_entry_table->appendix1)
2345 {
2346 free(indel_context -> event_entry_table -> appendix1);
2347 free(indel_context -> event_entry_table -> appendix2);
2348 }
2349
2350 // free the entry table: the pointers.
2351 destory_event_entry_table(indel_context -> event_entry_table);
2352 HashTableDestroy(indel_context -> event_entry_table);
2353
2354 free(indel_context -> event_space_dynamic);
2355
2356 for(xk1=0;xk1<MAX_READ_LENGTH; xk1++)
2357 {
2358 free(indel_context -> dynamic_align_table[xk1]);
2359 free(indel_context -> dynamic_align_table_mask[xk1]);
2360 }
2361
2362 for(xk1=0; xk1<EVENT_BODY_LOCK_BUCKETS; xk1++)
2363 subread_destroy_lock(indel_context -> event_body_locks+xk1);
2364
2365 free(indel_context -> dynamic_align_table);
2366 free(indel_context -> dynamic_align_table_mask);
2367 return 0;
2368 }
2369
2370
2371
trim_read(global_context_t * global_context,thread_context_t * thread_context,char * read_text,char * qual_text,int read_len,int * delta_position)2372 int trim_read(global_context_t *global_context , thread_context_t *thread_context, char * read_text, char * qual_text, int read_len, int * delta_position)
2373 {
2374 int xk1, bad_bases = 0;
2375 if(qual_text[0]==0) return read_len;
2376
2377
2378 int tail_trim_pos, last_ok = read_len /2;
2379
2380 bad_bases=0;
2381 for(xk1=read_len/2 ; xk1< read_len ; xk1++)
2382 {
2383 if(qual_text[xk1] <= '#'+2)
2384 bad_bases++;
2385 else
2386 last_ok = xk1;
2387 if(bad_bases>=2)
2388 break;
2389 }
2390 tail_trim_pos = last_ok;
2391
2392 int head_trim_pos;
2393
2394 bad_bases=0;
2395 last_ok = read_len /2;
2396 for(xk1=read_len/2 ; xk1>=0 ; xk1--)
2397 {
2398 if(qual_text[xk1] <= '#'+2)
2399 bad_bases++;
2400 else
2401 last_ok = xk1;
2402 if(bad_bases>=2)
2403 break;
2404 }
2405 if(bad_bases>=2)
2406 head_trim_pos = last_ok;
2407 else head_trim_pos = 0;
2408
2409 if(3*(tail_trim_pos - head_trim_pos )<read_len)return -1;
2410
2411 for(xk1=0; xk1<tail_trim_pos - head_trim_pos; xk1++)
2412 {
2413 read_text[xk1] = read_text[xk1+head_trim_pos];
2414 qual_text[xk1] = qual_text[xk1+head_trim_pos];
2415 }
2416
2417 (* delta_position) = head_trim_pos;
2418 read_text[xk1]=0;
2419 qual_text[xk1]=0;
2420
2421 return tail_trim_pos - head_trim_pos;
2422
2423 }
2424
2425 #define READ_HEAD_TAIL_LEN 22
write_local_reassembly(global_context_t * global_context,HashTable * pileup_fp_table,unsigned int anchor_pos,char * read_name,char * read_text,char * qual_text,int read_len,int is_anchor_certain)2426 int write_local_reassembly(global_context_t *global_context, HashTable *pileup_fp_table, unsigned int anchor_pos, char * read_name , char * read_text ,char * qual_text , int read_len, int is_anchor_certain)
2427 {
2428
2429 char * chro_name;
2430 int chro_offset;
2431 int delta_pos = 0;
2432 int new_read_len = trim_read(global_context, NULL, read_text, qual_text, read_len, &delta_pos);
2433
2434
2435 anchor_pos += delta_pos;
2436 if(new_read_len * 3<read_len * 2)
2437 return 0;
2438
2439 read_len = new_read_len;
2440
2441 gene_value_index_t * base_index = &global_context->all_value_indexes[0] ;
2442
2443 if(is_anchor_certain)
2444 {
2445 int head_match = match_chro(read_text, base_index , anchor_pos, READ_HEAD_TAIL_LEN, 0, global_context -> config.space_type);
2446 int tail_match = match_chro(read_text + read_len - READ_HEAD_TAIL_LEN, base_index , anchor_pos + read_len - READ_HEAD_TAIL_LEN, READ_HEAD_TAIL_LEN, 0, global_context -> config.space_type);
2447 if(head_match > READ_HEAD_TAIL_LEN-3 && tail_match> READ_HEAD_TAIL_LEN-3)
2448 return 0;
2449 }
2450
2451
2452
2453
2454 if(0 == locate_gene_position_max(anchor_pos, &global_context -> chromosome_table, &chro_name, &chro_offset, NULL, NULL, read_len))
2455 {
2456 char temp_file_name[MAX_FILE_NAME_LENGTH+40];
2457 int close_now = 0;
2458
2459 sprintf(temp_file_name,"%s@%s-%04u.bin", global_context -> config.temp_file_prefix, chro_name , chro_offset / BASE_BLOCK_LENGTH );
2460
2461 FILE * pileup_fp = get_temp_file_pointer(temp_file_name, pileup_fp_table, &close_now);
2462 //assert(read_len == strlen(read_text) && read_len > 90);
2463 write_read_block_file(pileup_fp , 0, read_name, 0, chro_name , chro_offset, NULL, 0, read_text , qual_text, read_len , 1 , is_anchor_certain , anchor_pos, read_len, 0);
2464 if(close_now) fclose(pileup_fp);
2465 }
2466
2467
2468 return 0;
2469 }
2470
build_local_reassembly(global_context_t * global_context,thread_context_t * thread_context,int pair_number,char * read_name,char * read_text,char * qual_text,int read_len,int mate_read_len,int is_second_read,int best_read_id,int use_mate_pos,mapping_result_t * current_result,mapping_result_t * mate_result)2471 int build_local_reassembly(global_context_t *global_context , thread_context_t *thread_context, int pair_number , char * read_name , char * read_text ,char * qual_text , int read_len , int mate_read_len , int is_second_read, int best_read_id, int use_mate_pos, mapping_result_t *current_result, mapping_result_t *mate_result)
2472 {
2473 unsigned int read_anchor_position;
2474 if(!read_text) return 0;
2475 int is_position_certain = 1;
2476
2477 indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
2478 //mapping_result_t *current_result = _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, best_read_id);
2479 //mapping_result_t * mate_result = _global_retrieve_alignment_ptr(global_context, pair_number, !is_second_read, best_read_id);
2480
2481
2482
2483 // No complex cigar strings are allowed.
2484 if(use_mate_pos)
2485 {
2486 read_anchor_position = mate_result -> selected_position;
2487
2488 if(is_second_read + ((mate_result -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0) == 1)
2489 // I'm the second read so I'm after the first one.
2490 read_anchor_position += global_context -> config.expected_pair_distance;
2491 else
2492 // I'm the first read so I'm ahead the second one.
2493 read_anchor_position -= global_context -> config.expected_pair_distance;
2494
2495 is_position_certain = 0;
2496
2497 }
2498 else
2499 {
2500 read_anchor_position = current_result -> selected_position;
2501 }
2502
2503 //gene_alue_index_t * value_index = &global_context->all_value_indexes[0] ;
2504
2505 if(qual_text[0] &&global_context -> config.phred_score_format == FASTQ_PHRED64)
2506 fastq_64_to_33(qual_text);
2507
2508 //int window_no = 2*(read_anchor_position / REASSEMBLY_WINDOW_LENGTH);
2509
2510
2511 write_local_reassembly(global_context, indel_context -> local_reassembly_pileup_files, read_anchor_position, read_name , read_text , qual_text , read_len, is_position_certain);
2512
2513 return 0;
2514 }
2515
insert_pileup_read(global_context_t * global_context,reassembly_block_context_t * block_context,unsigned int first_base_pos,char * read_text,char * qual_text,int read_len,int is_read_head_aligned)2516 int insert_pileup_read(global_context_t * global_context , reassembly_block_context_t * block_context , unsigned int first_base_pos ,char * read_text , char * qual_text , int read_len, int is_read_head_aligned)
2517 {
2518 int window_i,xk1;
2519
2520 for(window_i= -1; window_i<1; window_i++)
2521 {
2522 int de_brujin_grp_id = 2*(first_base_pos / REASSEMBLY_WINDOW_LENGTH) + window_i;
2523 int read_pos;
2524 if(de_brujin_grp_id<0)continue;
2525
2526 HashTable * current_table = block_context -> de_bruijn_graphs [de_brujin_grp_id];
2527 for(read_pos = 0; read_pos <= read_len - global_context -> config.k_mer_length; read_pos +=1)
2528 {
2529 unsigned long long int node_key = 0;
2530 for(xk1=0; xk1<global_context ->config.k_mer_length; xk1++)
2531 {
2532 node_key = (node_key << 2) | (base2int(read_text[xk1 + read_pos]));
2533 }
2534
2535
2536 unsigned long long int next_key_prefix = 0x8000000000000000ull | (node_key >> 2);
2537
2538 if(!read_pos && is_read_head_aligned)
2539 {
2540 int start_offset = (first_base_pos - de_brujin_grp_id * REASSEMBLY_WINDOW_LENGTH/2);
2541 if(start_offset < block_context -> start_offsets [de_brujin_grp_id])
2542 {
2543 block_context -> start_offsets [de_brujin_grp_id] = start_offset;
2544 block_context -> start_keys[1+10*de_brujin_grp_id]=0;
2545 block_context -> start_keys[10*de_brujin_grp_id]=next_key_prefix;
2546 }
2547 else if(start_offset == block_context -> start_offsets [de_brujin_grp_id])
2548 {
2549 int start_key_id;
2550 for(start_key_id=0; start_key_id<10&&block_context -> start_keys[de_brujin_grp_id * 10 + start_key_id]; start_key_id++);
2551 if(start_key_id<10) block_context -> start_keys[start_key_id + 10 * de_brujin_grp_id] = next_key_prefix;
2552 }
2553 }
2554
2555 int next_node_type = node_key&0x03;
2556 unsigned int next_jump_array = HashTableGet(current_table ,NULL+next_key_prefix) - NULL;
2557 if(next_jump_array) next_jump_array-=1;
2558
2559 unsigned int current_read_count = 0xff&(next_jump_array >> (next_node_type*8));
2560 if(current_read_count<254) current_read_count ++;
2561
2562 next_jump_array = 1+((next_jump_array & ~(0xff << (next_node_type*8))) | ((current_read_count) << (next_node_type*8)));
2563
2564 HashTablePut(current_table, NULL + next_key_prefix , NULL + next_jump_array);
2565 }
2566 }
2567 return 0;
2568 }
2569
2570
finalise_db_graph(global_context_t * global_context,reassembly_block_context_t * block_context,int de_brujin_graph_id,HashTable * de_brujin_graph,unsigned long long int start_key,short start_offset)2571 int finalise_db_graph(global_context_t * global_context, reassembly_block_context_t * block_context, int de_brujin_graph_id, HashTable * de_brujin_graph, unsigned long long int start_key, short start_offset)
2572 {
2573
2574 unsigned long long int current_key = start_key;
2575 char inserted_sequence[MAX_INSERTION_LENGTH+REASSEMBLY_WINDOW_LENGTH];
2576 unsigned long long int used_keys[MAX_INSERTION_LENGTH+REASSEMBLY_WINDOW_LENGTH];
2577 int inserted_length = 0;
2578 int is_looped = 0;
2579
2580 int xk1;
2581
2582 for(xk1=0; xk1<global_context -> config.k_mer_length-1; xk1++)
2583 {
2584 int intbase = (start_key >> (2*(global_context -> config.k_mer_length-xk1-2))) & 3;
2585 inserted_sequence[xk1] = "AGCT"[intbase];
2586 }
2587 inserted_length = global_context -> config.k_mer_length-1;
2588
2589 while(1)
2590 {
2591 unsigned int next_moves;
2592 int max_move=-1;
2593 next_moves = HashTableGet(de_brujin_graph , NULL + current_key) - NULL;
2594 if(!next_moves) break;
2595 else
2596 {
2597 int test_next;
2598 int all_reads = 0;
2599 int ignored_moves=0;
2600 int max_move_reads=0;
2601 unsigned long long int previous_key = current_key<<2;
2602
2603 next_moves-=1;
2604 for(test_next=0; test_next<4; test_next++)
2605 all_reads += 0xff&(next_moves>>(8*test_next));
2606
2607 while(1)
2608 {
2609 max_move = -1;
2610 max_move_reads = -1;
2611 for(test_next=0; test_next<4; test_next++)
2612 {
2613 if(ignored_moves & (1<<test_next)) continue;
2614 int test_move_reads=0xff&(next_moves>>(8*test_next));
2615 if(test_move_reads > max_move_reads)
2616 {
2617 max_move_reads = test_move_reads;
2618 max_move = test_next;
2619 }
2620 }
2621
2622 if(max_move < 0)
2623 break;
2624
2625 inserted_sequence[inserted_length] = "AGCT"[max_move];
2626 int xk2, is_used_key = 0;
2627 current_key = (previous_key | max_move);
2628
2629 for(xk2=0; xk2<inserted_length- global_context -> config.k_mer_length + 1; xk2++)
2630 {
2631 if(current_key == used_keys[xk2])
2632 {
2633 is_used_key = 1;
2634 is_looped = 1;
2635 ignored_moves |= (1<<max_move);
2636 break;
2637 }
2638 }
2639 if(!is_used_key || is_looped) break;
2640 }
2641 used_keys[inserted_length - global_context -> config.k_mer_length + 1] = current_key;
2642 current_key &= (0x3fffffffffffffffull >> (2*(32 - global_context -> config.k_mer_length)));
2643 current_key |= 0x8000000000000000ull;
2644 inserted_length++;
2645 }
2646 if(inserted_length > MAX_INSERTION_LENGTH + REASSEMBLY_WINDOW_LENGTH - 1 || max_move<0|| is_looped)
2647 {
2648 is_looped = 1;
2649 inserted_sequence[inserted_length]=0;
2650
2651 break;
2652 }
2653 }
2654
2655 if(!is_looped)
2656 {
2657 inserted_sequence[inserted_length]=0;
2658
2659
2660 assert(global_context -> index_block_number == 1);
2661 gene_value_index_t * base_index = &global_context->all_value_indexes[0] ;
2662 unsigned int window_start = block_context -> block_start_linear_pos + de_brujin_graph_id * REASSEMBLY_WINDOW_LENGTH / 2;
2663 int best_matched_bases = -1, best_end_pos = -1, best_start_pos = start_offset;
2664
2665 #define END_POS_CONFIRM_LENGTH 24
2666
2667 for(xk1 = inserted_length + start_offset - END_POS_CONFIRM_LENGTH ; xk1>= start_offset+15 ; xk1--) //find the end of insertion
2668 {
2669 if(inserted_length - xk1 - END_POS_CONFIRM_LENGTH + best_start_pos >= MAX_INSERTION_LENGTH) break;
2670 int matched_bases = match_chro(inserted_sequence + inserted_length - END_POS_CONFIRM_LENGTH, base_index , window_start + xk1, END_POS_CONFIRM_LENGTH, 0, global_context -> config.space_type );
2671 if(matched_bases>best_matched_bases)
2672 {
2673 best_matched_bases = matched_bases;
2674 best_end_pos = xk1 + END_POS_CONFIRM_LENGTH;
2675 }
2676 }
2677
2678
2679 int new_base_length = inserted_length - best_end_pos + best_start_pos;
2680 if(new_base_length>0 && best_matched_bases >= END_POS_CONFIRM_LENGTH - 1)
2681 {
2682 best_matched_bases = -1;
2683 int best_insertion_point = -1;
2684 for(xk1 = 15; xk1<best_end_pos - best_start_pos; xk1++)
2685 {
2686 int insertion_matched =
2687 match_chro(inserted_sequence + xk1 - 15, base_index , window_start + best_start_pos + xk1 - 15, 15, 0, global_context -> config.space_type) +
2688 match_chro(inserted_sequence + xk1 + new_base_length , base_index , window_start + best_start_pos + xk1 , 15, 0 , global_context -> config.space_type );
2689 if(insertion_matched>best_matched_bases && insertion_matched>=29)
2690 {
2691 best_insertion_point = xk1;
2692 best_matched_bases = insertion_matched;
2693 }
2694 }
2695 if(best_insertion_point>=0)
2696 {
2697 unsigned int best_pos = window_start + best_start_pos + best_insertion_point;
2698 int indels = -new_base_length;
2699
2700 //inserted_sequence[best_insertion_point + new_base_length] = 0;
2701
2702
2703 indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
2704 HashTable * event_table = indel_context -> event_entry_table;
2705 chromosome_event_t * event_space = indel_context -> event_space_dynamic;
2706
2707 chromosome_event_t * found = NULL;
2708 chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
2709 int found_events = search_event(global_context, event_table, event_space, best_pos - 1, EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_INDEL, search_return);
2710
2711 if(found_events)
2712 {
2713 int kx1;
2714 for(kx1 = 0; kx1 < found_events ; kx1++)
2715 {
2716 if(search_return[kx1] -> indel_length == indels)
2717 {
2718 found = search_return[kx1];
2719 break;
2720 }
2721 }
2722 }
2723
2724 if(found){
2725 found -> supporting_reads ++;
2726 }
2727 else
2728 {
2729 int event_no;
2730 event_no = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> total_events ++;
2731
2732 event_space = reallocate_event_space(global_context, NULL, event_no);
2733
2734 chromosome_event_t * new_event = event_space+event_no;
2735 memset(new_event,0,sizeof(chromosome_event_t));
2736
2737 set_insertion_sequence(global_context, NULL , &new_event -> inserted_bases , inserted_sequence+best_insertion_point, -indels);
2738
2739 new_event -> event_small_side = best_pos - 1;
2740 new_event -> event_large_side = best_pos + max(0, indels);
2741 assert(new_event -> event_small_side && new_event -> event_large_side);
2742 new_event -> event_type = CHRO_EVENT_TYPE_INDEL;
2743 new_event -> indel_length = indels;
2744 new_event -> supporting_reads = 1;
2745
2746 put_new_event(event_table, new_event , event_no);
2747
2748 }
2749
2750 fprintf(global_context -> long_insertion_FASTA_fp, ">BEGIN%u_%dM%dI%dM\n", window_start + best_start_pos, best_insertion_point , -indels , inserted_length + indels - best_insertion_point);
2751 int write_cursor;
2752 for(write_cursor = 0 ; write_cursor < inserted_length ; write_cursor += 70)
2753 {
2754 fwrite(inserted_sequence + write_cursor , min(70, inserted_length - write_cursor), 1, global_context -> long_insertion_FASTA_fp);
2755 fputs("\n",global_context -> long_insertion_FASTA_fp);
2756 }
2757 }
2758 }
2759
2760 }
2761 return 0;
2762 }
2763
2764
2765 #define finalise_pileup_file_by_voting finalise_pileup_file
2766
2767 #define REASSEMBLY_SUBREAD_GAP 1
2768
2769 int search_window_once(global_context_t * global_context, reassembly_by_voting_block_context_t * block_context, int window_no, int start_read_no, int search_direction, int anti_loop, float allele_quality);
2770 // search_direction = 1 : from 5' end to 3' end
2771 // search_direction = -1 : from 3'end to 5' end
search_window(global_context_t * global_context,reassembly_by_voting_block_context_t * block_context,int window_no,int start_read_no,int search_direction)2772 void search_window(global_context_t * global_context, reassembly_by_voting_block_context_t * block_context, int window_no, int start_read_no, int search_direction)
2773 {
2774
2775 block_context->rebuilt_window[0] = 0;
2776 block_context->rebuilt_window_size = 0;
2777
2778 memset(block_context->final_alleles, 0, sizeof(struct reassmebly_window_allele ) * global_context -> config.reassembly_window_alleles);
2779
2780 block_context -> search_cost = 0;
2781 block_context -> used_read_number = 0;
2782 block_context -> total_matched_bases = 0;
2783 block_context -> max_matched_bases = 0;
2784
2785 /*int ret = */search_window_once(global_context, block_context, window_no, start_read_no, search_direction, 0, 1);
2786
2787 }
2788
2789
2790 // this function returns if the read is not rectifiable (i.e. likely to be misalignment)
rectify_read_text(global_context_t * global_context,reassembly_by_voting_block_context_t * block_context,int window_no,gehash_t * index,char * next_read_txt,int really_make_change)2791 int rectify_read_text(global_context_t * global_context, reassembly_by_voting_block_context_t * block_context, int window_no, gehash_t * index, char * next_read_txt, int really_make_change)
2792 {
2793 // return 1;
2794 int read_offset,i,j;
2795 int read_len = strlen(next_read_txt);
2796
2797 //int is_test_target = (strcmp("AATGGAACAACCGCCTCAAATGGAATGGAATGGCCTCCAAAGGAAGGGAGTGACATTTAATGAATGCATTCTAATGGAATGGCATGGAATGGACTGGAA", next_read_txt)==0);
2798 int is_test_target = 0;
2799
2800 init_gene_vote(block_context -> vote_list_rectify);
2801 for(read_offset = 0; read_offset < read_len - global_context -> config.reassembly_subread_length+1; read_offset += 1)
2802 {
2803 unsigned int subread_int = 0, xk2;
2804 for(xk2=0; xk2< global_context -> config.reassembly_subread_length; xk2++)
2805 subread_int = (subread_int << 2) | base2int(next_read_txt[read_offset + xk2]);
2806 gehash_go_q(index, subread_int , read_offset, read_len, 0, block_context -> vote_list_rectify, 22, read_offset, 0, 0x7fffffff);
2807 }
2808
2809 memset(block_context -> read_rectify_space, 0, MAX_READ_LENGTH * 4 * sizeof(short));
2810
2811 for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
2812 for(j=0; j< block_context -> vote_list_rectify -> items[i]; j++)
2813 {
2814 int xk1;
2815
2816 int neighbour_read_no = (block_context -> vote_list_rectify -> pos[i][j] + MAX_READ_LENGTH+1)/10000;
2817 int neighbour_read_mapped_start, this_read_mapping_start, overlapped_length;
2818 unsigned long long int neighbour_read_key = (window_no * 1llu +1)<<32 ;
2819 neighbour_read_key |= (neighbour_read_no& 0xfffff)<<12;
2820 char * neighbour_read_text = HashTableGet(block_context->read_sequence_table,NULL+neighbour_read_key);
2821 assert(neighbour_read_text);
2822 int neighbour_read_len = strlen(neighbour_read_text);
2823
2824 // neighbour_read_mapped_start is the location on the neigibour read of the first base of the current read.
2825 neighbour_read_mapped_start = - neighbour_read_no*10000 + block_context -> vote_list_rectify -> pos[i][j];// + block_context -> vote_list -> coverage_start[i][j];
2826
2827 if(neighbour_read_mapped_start>=0)
2828 {
2829 this_read_mapping_start = 0;
2830 overlapped_length = min(neighbour_read_len - neighbour_read_mapped_start, read_len);
2831 }
2832 else
2833 {
2834 // this_read_mapping_start is the location on this read of the first base of the neighbour read.
2835 this_read_mapping_start = -neighbour_read_mapped_start;
2836 neighbour_read_mapped_start = 0;
2837 overlapped_length = min(read_len - this_read_mapping_start, neighbour_read_len);
2838 if(is_test_target)
2839 {
2840 for(xk1 = 0; xk1 < 3+this_read_mapping_start; xk1++) fputc(' ',stdout );
2841 printf("%s\n", neighbour_read_text);
2842 }
2843 }
2844
2845
2846 for(xk1 = 0; xk1 < overlapped_length; xk1++)
2847 {
2848 int this_read_pos = this_read_mapping_start + xk1;
2849 int nb_read_pos = neighbour_read_mapped_start + xk1;
2850 block_context -> read_rectify_space[this_read_pos * 4 +base2int(neighbour_read_text[nb_read_pos])] ++;
2851 }
2852 }
2853
2854 if(is_test_target)
2855 printf(" %s\n", next_read_txt);
2856 char ori[2000];
2857 strcpy(ori, next_read_txt);
2858 is_bad_read(ori, read_len, "RECT");
2859 int rectified = 0;
2860
2861
2862 if(is_test_target)
2863 printf("R0=%s\n", next_read_txt);
2864 char changed[200];
2865 // memset(changed, ' ', 200);
2866 for(i=0; i<read_len; i++)
2867 {
2868 int max_support = block_context -> read_rectify_space[i*4+base2int(next_read_txt[i])];
2869 int base_sel = -1;
2870 for(j=0; j<4; j++)
2871 {
2872 if(j!=base2int(next_read_txt[i]) && block_context -> read_rectify_space[i*4+j] > max_support)
2873 {
2874 base_sel = int2base(j);
2875 max_support = block_context -> read_rectify_space[i*4+j];
2876 }
2877 }
2878
2879 if(base_sel>0)
2880 {
2881 if(really_make_change)
2882 {
2883 next_read_txt[i]=base_sel;
2884 // changed[i] = '~';
2885 }
2886 rectified ++;
2887 }
2888 }
2889
2890 changed[i]=0;
2891
2892 if(is_test_target)
2893 printf("R1=%s\n %s\n\n", next_read_txt, changed);
2894 is_bad_read(next_read_txt, read_len, "REC2");
2895 return rectified<=2;
2896 }
2897
search_window_once(global_context_t * global_context,reassembly_by_voting_block_context_t * block_context,int window_no,int start_read_no,int search_direction,int anti_loop,float allele_quality)2898 int search_window_once(global_context_t * global_context, reassembly_by_voting_block_context_t * block_context, int window_no, int start_read_no, int search_direction, int anti_loop, float allele_quality)
2899 {
2900
2901 block_context -> search_cost++;
2902 if(block_context -> search_cost > 1000) return 0;
2903
2904
2905 int is_start_certain = 1, is_digged_in = 0;
2906 unsigned long long int starter_read_key = (window_no * 1llu +1)<<32 ;
2907 starter_read_key |= (start_read_no& 0xfffff)<<12;
2908
2909 char * start_read_txt = HashTableGet(block_context->read_sequence_table,NULL+starter_read_key);
2910 assert(start_read_txt);
2911
2912
2913 int start_read_pos = HashTableGet(block_context->read_position_table, NULL+starter_read_key) - NULL ;
2914 if(start_read_pos == 0) is_start_certain = 0;
2915 else start_read_pos --;
2916
2917
2918 int read_len = strlen(start_read_txt), read_offset;
2919 init_gene_vote(block_context -> vote_list);
2920
2921 int minimum_overlap_length = read_len /4;
2922 //if(!is_start_certain) minimum_overlap_length *= 1.5;
2923 //minimum_overlap_length = max(1,16-global_context ->config.reassembly_subread_length);
2924
2925
2926 for(read_offset = 0; read_offset < read_len - global_context -> config.reassembly_subread_length+1; read_offset += 1)
2927 {
2928 unsigned int subread_int = 0, xk2;
2929 for(xk2=0; xk2< global_context -> config.reassembly_subread_length; xk2++)
2930 subread_int = (subread_int << 2) | base2int(start_read_txt[read_offset + xk2]);
2931
2932
2933 if(global_context -> config.reassembly_tolerable_voting)
2934 gehash_go_q_tolerable(&block_context->voting_indexes[window_no], subread_int , read_offset, read_len, 0, block_context -> vote_list, 1, 22, 24, 0, read_offset, global_context -> config.reassembly_tolerable_voting, global_context -> config.reassembly_subread_length, 0, 0x7fffffff);
2935 else
2936 gehash_go_q(&block_context->voting_indexes[window_no], subread_int , read_offset, read_len, 0, block_context -> vote_list,0,read_offset, 0, 0x7fffffff);
2937 }
2938
2939 int WINDOW_SEARCH_TREE_WIDTH = 1;
2940 int i,j;
2941
2942 int max_read_scoring [8];
2943 unsigned int max_read_ids [8];
2944 int max_read_nonoverlap [8];
2945 float max_allele_quality [8];
2946
2947 memset(max_read_scoring, 0, sizeof(int)* 8);
2948 memset(max_allele_quality, 0, sizeof(float)* 8);
2949 memset(max_read_ids, 0, sizeof(int)* 8);
2950 memset(max_read_nonoverlap, 0, sizeof(int)* 8);
2951
2952 //intif(is_target_window_X(window_no))
2953 //if(block_context -> window_start_pos > 1124357402 && block_context -> window_start_pos < 1124359402)
2954 // print_votes(block_context -> vote_list, global_context -> config.index_prefix);
2955
2956 for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
2957 for(j=0; j< block_context -> vote_list -> items[i]; j++)
2958 {
2959 //if(is_digged_in)break;
2960 //if((block_context -> vote_list -> coverage_end[i][j] - block_context -> vote_list -> coverage_start[i][j]) * 6 > read_len )
2961 // if(block_context -> window_start_pos > 1124357402 && block_context -> window_start_pos < 1124359402)
2962 // SUBREADprintf("LLLX PRINT_VOTE @%u : V=%d\n", block_context -> vote_list -> pos[i][j], block_context -> vote_list -> votes[i][j] );
2963
2964 if( block_context -> vote_list -> votes[i][j] > 4)
2965 {
2966 int next_read_no = (block_context -> vote_list -> pos[i][j] + MAX_READ_LENGTH +1 )/10000;
2967 if(next_read_no == start_read_no) continue;
2968 int next_read_nonoverlap = next_read_no * 10000 - block_context -> vote_list -> pos[i][j];
2969 float next_read_allele_quality = allele_quality;
2970 int anchor_to_index = 0;
2971
2972 if(abs(next_read_nonoverlap) < 2)
2973 anchor_to_index = 0; // PARALLEL or VALUELESS
2974 else if(next_read_nonoverlap < 0)
2975 anchor_to_index = -1; // INDEX -> ANCHOR
2976 else anchor_to_index = 1; // ANCHOR -> INDEX
2977
2978
2979 unsigned long long int test_next_read_key = (window_no * 1llu +1)<<32 ;
2980 test_next_read_key |= ( next_read_no& 0xfffff)<<12;
2981 int is_next_certain = 1;
2982 int next_read_pos = HashTableGet(block_context->read_position_table, NULL+test_next_read_key) - NULL;
2983 if(next_read_pos==0)
2984 is_next_certain =0;
2985 else next_read_pos--;
2986
2987 /*
2988 if(!is_start_certain || !is_next_certain)
2989 {
2990 if(anchor_to_index == 1 && (block_context -> vote_list -> coverage_start[i][j] - next_read_nonoverlap) > 5)
2991 anchor_to_index = 0; // PARALLEL or VALUELESS
2992
2993 if(anchor_to_index == -1 && block_context -> vote_list -> coverage_start[i][j] > 5)
2994 anchor_to_index = 0; // PARALLEL or VALUELESS
2995 }*/
2996
2997 char * next_read_txt = HashTableGet(block_context->read_sequence_table,NULL+test_next_read_key);
2998 assert(next_read_txt);
2999 int next_read_len = strlen(next_read_txt);
3000 int two_read_overlapped = next_read_len - next_read_nonoverlap;
3001 if(next_read_nonoverlap<0)
3002 two_read_overlapped = read_len + next_read_nonoverlap;
3003 if(two_read_overlapped< minimum_overlap_length)
3004 anchor_to_index = 0;
3005
3006 if(search_direction>0)
3007 {
3008 if( next_read_nonoverlap + strlen(next_read_txt) <= read_len)
3009 anchor_to_index = 0;
3010 }
3011
3012 if(anchor_to_index * search_direction > 0)
3013 {
3014
3015 long long int pos_delta = next_read_pos;
3016 pos_delta -= start_read_pos;
3017
3018 int xk3, next_read_scoring = 0, next_read_matching = 0;
3019
3020 int order_reward = (is_next_certain && is_start_certain && search_direction * pos_delta >0)? 25000:0;
3021 if(is_start_certain * is_next_certain == 0) order_reward = 10000;
3022 int wrong_bases=0;
3023
3024 if(anchor_to_index > 0)
3025 {
3026 if(next_read_len+ next_read_nonoverlap< read_len)continue;// no new bases are added by the new read.
3027
3028 for(xk3 = next_read_nonoverlap ; xk3 < read_len ; xk3++)
3029 if(start_read_txt[xk3] == next_read_txt[xk3 - next_read_nonoverlap]) next_read_matching++;
3030 wrong_bases = read_len - next_read_nonoverlap - next_read_matching;
3031 if(wrong_bases>=2)
3032 {
3033 next_read_scoring = -1;
3034 }
3035 else
3036 {
3037 block_context -> total_matched_bases += next_read_matching;
3038
3039 next_read_scoring = two_read_overlapped * 10000 + order_reward
3040 +(unsigned int)(HashTableGet(block_context -> read_quality_table, NULL+test_next_read_key)-NULL)/10 - wrong_bases * 20000;
3041 }
3042 }
3043 else
3044 {
3045 if(0< next_read_nonoverlap) continue; // no new bases are added by the new read.
3046 int should_match = min(next_read_len, read_len - next_read_nonoverlap);
3047 if(next_read_len > read_len - next_read_nonoverlap) continue; // the head of the next read is untested: bad!
3048 for(xk3 = - next_read_nonoverlap ; xk3 < should_match ; xk3++)
3049 if(next_read_txt[xk3] == start_read_txt[xk3 + next_read_nonoverlap]) next_read_matching++;
3050
3051 wrong_bases = should_match + next_read_nonoverlap - next_read_matching;
3052
3053 if(wrong_bases>=2)
3054 {
3055 next_read_scoring = -1;
3056 }
3057 else
3058 {
3059 block_context -> total_matched_bases += next_read_matching;
3060
3061 next_read_scoring = two_read_overlapped * 10000 +order_reward
3062 + (unsigned int)(HashTableGet(block_context -> read_quality_table, NULL+test_next_read_key)-NULL)/10 - wrong_bases * 20000;
3063 }
3064
3065 }
3066
3067
3068 if(wrong_bases)
3069 next_read_allele_quality *= 0.5;
3070
3071 int is_rectifiable = rectify_read_text(global_context, block_context, window_no, &block_context->voting_indexes[window_no], next_read_txt , 0);
3072
3073
3074 /*
3075 if(block_context -> window_start_pos > 1124357402 && block_context -> window_start_pos < 1124359402)
3076 {
3077 SUBREADprintf("LLLX: NREAD #%d HAS QUAL %d ; is_rectifiable=%d ; WB=%d; ANCHOR_TO_I=%d\n", next_read_no, next_read_scoring, is_rectifiable, wrong_bases, anchor_to_index);
3078 SUBREADprintf("LLLX: NREAD OVL=%d, RLEN=%d, NRLEN=%d, NRMATCH=%d\n", next_read_nonoverlap, read_len, next_read_len, next_read_matching);
3079 SUBREADprintf("LLLX: THIS_READ=%s\n", start_read_txt);
3080 SUBREADprintf("LLLX: NEXT_READ=%s\n", next_read_txt);
3081 }
3082 */
3083 if((next_read_scoring > max_read_scoring[ WINDOW_SEARCH_TREE_WIDTH-1]) && is_rectifiable)
3084 {
3085 int is_unique = 1, is_replace = 0;
3086 for(xk3=0; xk3<WINDOW_SEARCH_TREE_WIDTH; xk3++)
3087 {
3088
3089 if(global_context -> config.reassembly_window_alleles>1&&max_read_scoring[xk3]>0)
3090 {
3091 int position_diff = max_read_nonoverlap[xk3] - next_read_nonoverlap;
3092 int xk4;
3093 unsigned long long int test_prev_read_key = (window_no * 1llu +1)<<32 ;
3094 test_prev_read_key |= (max_read_ids[xk3] & 0xfffff)<<12;
3095 char *prev_read_txt = HashTableGet(block_context->read_sequence_table,NULL+test_prev_read_key);
3096 int mismatch=0;
3097
3098 for(xk4 = abs(position_diff) ; ; xk4++)
3099 {
3100 int prev_read_pos = xk4-(position_diff>0?position_diff:0);
3101 int next_read_pos = xk4+(position_diff<0?position_diff:0);
3102 if(prev_read_txt[prev_read_pos]==0 || next_read_txt[next_read_pos]==0) break;
3103 mismatch += (prev_read_txt[prev_read_pos]!=next_read_txt[next_read_pos]);
3104 }
3105
3106
3107 if(mismatch<2)
3108 {
3109 is_unique = 0;
3110 if(position_diff > 0)
3111 is_replace = xk3+1;
3112 }
3113 }
3114
3115 }
3116 if(is_replace)
3117 {
3118 max_read_scoring[is_replace-1] = next_read_scoring;
3119 max_read_ids[is_replace-1] = next_read_no;
3120 max_read_nonoverlap[is_replace-1] = next_read_nonoverlap;
3121 max_allele_quality[is_replace-1] = next_read_allele_quality;
3122 }
3123 else
3124 {
3125 if(is_unique) for(xk3=0; xk3<WINDOW_SEARCH_TREE_WIDTH; xk3++)
3126 {
3127 if(next_read_scoring > max_read_scoring[xk3])
3128 {
3129 int xk4;
3130 for(xk4 = WINDOW_SEARCH_TREE_WIDTH-1; xk4>xk3; xk4--)
3131 {
3132 max_read_scoring[xk4] = max_read_scoring[xk4-1];
3133 max_read_ids[xk4] = max_read_ids[xk4-1];
3134 max_read_nonoverlap[xk4] = max_read_nonoverlap[xk4-1];
3135 max_allele_quality[xk4] = max_allele_quality[xk4-1];
3136 }
3137 max_read_scoring[xk3] = next_read_scoring;
3138 max_read_ids[xk3] = next_read_no;
3139 max_read_nonoverlap[xk3] = next_read_nonoverlap;
3140 max_allele_quality[xk3] = next_read_allele_quality;
3141 //break;
3142 }
3143 }
3144 }
3145 }
3146 }
3147
3148 }
3149 }
3150
3151
3152 for(i = 0; i<WINDOW_SEARCH_TREE_WIDTH; i++)
3153 {
3154 unsigned int next_read_no = max_read_ids[i];
3155 int is_fresh = 1;
3156
3157 // if(block_context -> window_start_pos > 1124357402 && block_context -> window_start_pos < 1124359402)
3158 // SUBREADprintf("LLLX: TEST: %d\n", i);
3159
3160 if(max_read_scoring[i]<=0) break;
3161 is_digged_in = 1;
3162 for(j=0; j< block_context -> used_read_number; j++)
3163 {
3164 if(block_context -> used_read_ids[j] == next_read_no)
3165 {
3166 is_fresh = 0;
3167
3168 FRESH_COUNTER ++;
3169 break;
3170 }
3171 }
3172
3173 if(!is_fresh)continue;
3174
3175 block_context -> used_read_ids[block_context ->used_read_number++] = next_read_no;
3176
3177 float next_allele_quality = max_allele_quality[i];
3178 int next_read_nonoverlap = max_read_nonoverlap[i];
3179
3180
3181 unsigned long long int test_next_read_key = (window_no * 1llu +1)<<32 ;
3182 test_next_read_key |= ( next_read_no& 0xfffff)<<12;
3183
3184
3185 char * next_read_txt = HashTableGet(block_context->read_sequence_table,NULL+test_next_read_key);
3186 assert(next_read_txt);
3187
3188 int new_bases = (search_direction>0)?strlen(next_read_txt)-(read_len - next_read_nonoverlap):-next_read_nonoverlap;
3189 if (new_bases + block_context -> rebuilt_window_size > 1000){
3190 return 1;
3191 }
3192
3193 int basic_length = 1;//max(read_len, strlen(next_read_txt))/3;
3194 int max_length = 99999;
3195
3196 //if(anti_loop) max_length = min(read_len, strlen(next_read_txt))/3;
3197
3198 int current_rebuilt_size = block_context -> rebuilt_window_size;
3199 int current_matched_bases = block_context -> total_matched_bases;
3200 //int current_used_read_number = block_context -> used_read_number;
3201 if(new_bases>=basic_length && new_bases < max_length)
3202 {
3203 if(strlen(block_context -> rebuilt_window) != block_context ->rebuilt_window_size) return 1;
3204
3205 //int is_rectifiable = rectify_read_text(global_context, block_context, window_no, &block_context->voting_indexes[window_no], next_read_txt , 1);
3206 //if(!is_rectifiable) continue;
3207
3208 if(search_direction > 0)
3209 {
3210 int high_quality_offset = 0*min(2, strlen(next_read_txt) - new_bases);
3211 assert(high_quality_offset>=0);
3212 if(block_context ->rebuilt_window_size > high_quality_offset)
3213 {
3214 block_context -> rebuilt_window[ block_context ->rebuilt_window_size-high_quality_offset] =0;
3215 strcat(block_context -> rebuilt_window , next_read_txt + strlen(next_read_txt) - new_bases - high_quality_offset);
3216 }
3217 else
3218 strcat(block_context -> rebuilt_window , next_read_txt + strlen(next_read_txt) - new_bases);
3219 }
3220 else
3221 {
3222 int xk3;
3223 for(xk3= block_context -> rebuilt_window_size ; xk3>=0 ; xk3--)
3224 block_context -> rebuilt_window[new_bases + xk3] = block_context -> rebuilt_window[xk3];
3225
3226 int high_quality_offset = 0*min(2, strlen(next_read_txt) - new_bases);
3227 //high_quality_offset = 0;
3228 assert(high_quality_offset>=0);
3229 memcpy(block_context -> rebuilt_window , next_read_txt , new_bases + high_quality_offset);
3230 block_context -> rebuilt_window [block_context -> rebuilt_window_size +new_bases] = 0;
3231 }
3232
3233 block_context -> rebuilt_window_size += new_bases;
3234 // if(block_context -> window_start_pos > 1124357402 && block_context -> window_start_pos < 1124359402)
3235 // SUBREADprintf("LLLX: DIG IN: %d -> %d (DIR=%d)\n", start_read_no, next_read_no, search_direction);
3236
3237 search_window_once(global_context, block_context, window_no, next_read_no, search_direction, anti_loop, next_allele_quality);
3238 DIGIN_COUNTER ++;
3239 if(block_context -> search_cost > 1000) return 1;
3240
3241 }
3242 if(search_direction > 0)
3243 // if the search is to the 3' end, the stack is simply dumpped.
3244 block_context -> rebuilt_window[current_rebuilt_size] = 0;
3245 else
3246 {
3247 // if the search is to the 5' end, we have to move the tail back to head.
3248 int xk4;
3249 int bytes_to_remove = block_context -> rebuilt_window_size - current_rebuilt_size;
3250 for(xk4 = bytes_to_remove; xk4 < block_context -> rebuilt_window_size; xk4++)
3251 block_context -> rebuilt_window[xk4-bytes_to_remove] = block_context -> rebuilt_window[xk4];
3252 block_context -> rebuilt_window[current_rebuilt_size] = 0;
3253 }
3254
3255 block_context -> rebuilt_window_size = current_rebuilt_size;
3256 block_context -> total_matched_bases = current_matched_bases;
3257 //block_context -> used_read_number = current_used_read_number;
3258 }
3259
3260 // if(block_context -> window_start_pos > 1124357402 && block_context -> window_start_pos < 1124359402)
3261 // SUBREADprintf("LLLX: NOT DIG IN: %d :: %d DIR=%d\n", is_digged_in, block_context -> rebuilt_window_size, search_direction);
3262
3263 if(!is_digged_in && block_context -> rebuilt_window_size > 0)
3264 {
3265 //if(block_context -> total_matched_bases * 10000 /block_context -> rebuilt_window_size > block_context -> max_matched_bases)
3266 int xk1, xk2;
3267 for(xk1 = 0; xk1 < global_context -> config.reassembly_window_alleles; xk1++)
3268 {
3269 //int is_same_allele = 0;
3270
3271 // if(block_context -> window_start_pos > 1124357402 && block_context -> window_start_pos < 1124359402)
3272 // SUBREADprintf("LLLX [%d] CPY: %d<%d\n", xk1, block_context -> final_alleles[xk1].rebuilt_size , block_context -> rebuilt_window_size);
3273
3274 if(block_context -> final_alleles[xk1].rebuilt_size < block_context -> rebuilt_window_size)
3275 {
3276 //int xk3=0;
3277 for(xk2= global_context -> config.reassembly_window_alleles - 1 ; xk2 > xk1; xk2--)
3278 memcpy(&block_context -> final_alleles[xk2], &block_context -> final_alleles[xk2 - 1], sizeof(struct reassmebly_window_allele));
3279
3280 strcpy(block_context -> final_alleles[xk1].rebuilt_window , block_context -> rebuilt_window);
3281 block_context -> max_matched_bases = block_context -> total_matched_bases * 10000 / block_context -> rebuilt_window_size;
3282 block_context -> final_alleles[xk1].rebuilt_size = block_context -> rebuilt_window_size;
3283 block_context -> final_alleles[xk1].allele_quality = allele_quality;
3284 break;
3285 }
3286 }
3287 }
3288
3289 return 0;
3290 }
3291
3292
3293 #define MAX_INDELS_IN_WINDOW 10
3294 #define PROBE_KEY_NUMBER 12
3295 #define INDEL_FULL_ALIGN_MAX_MISMATCH 3
3296 #define SINGLE_PROBE_MAX_MISMATCH 1
3297
full_indel_alignment(global_context_t * global_context,reassembly_by_voting_block_context_t * block_context,char * full_rebuilt_window,int full_rebuilt_window_size,gene_value_index_t * base_index,unsigned int window_start_pos,unsigned int * perfect_segment_start_pos,int * perfect_segment_lengths,int * indels_after_perfect_segments,short * indels_read_positions,float * indel_quality,unsigned int * contig_start_pos,unsigned int * contig_end_pos,int * head_removed_bases,int * tail_removed_bases,int * good_quality_indel)3298 int full_indel_alignment(global_context_t * global_context, reassembly_by_voting_block_context_t * block_context, char * full_rebuilt_window, int full_rebuilt_window_size, gene_value_index_t * base_index, unsigned int window_start_pos, unsigned int * perfect_segment_start_pos, int * perfect_segment_lengths,int * indels_after_perfect_segments, short * indels_read_positions, float * indel_quality, unsigned int * contig_start_pos, unsigned int * contig_end_pos, int * head_removed_bases, int * tail_removed_bases, int * good_quality_indel)
3299 {
3300 int xk1;
3301 int is_unreliable = 0;
3302
3303 unsigned int xk2 = window_start_pos;
3304 unsigned int probe_poses [PROBE_KEY_NUMBER];
3305 int used_probe_in_rebuilt_window[PROBE_KEY_NUMBER];
3306 int used_probes = 0;
3307
3308 if(window_start_pos>300) xk2-=300;
3309 if(full_rebuilt_window_size<101)return 0;
3310 unsigned int last_best_xk2 = xk2;
3311
3312 memset(probe_poses ,0 , sizeof(unsigned int)*PROBE_KEY_NUMBER);
3313 (*contig_start_pos) = 0;
3314 (*contig_end_pos) = 0xffffffff;
3315
3316 for(xk1=0; xk1<PROBE_KEY_NUMBER; xk1++)
3317 {
3318 char * probe_key = full_rebuilt_window + (full_rebuilt_window_size-global_context -> config.reassembly_key_length) * xk1 /(PROBE_KEY_NUMBER-1);
3319
3320 int max_probe_match = -1;
3321 unsigned int next_best_xk2 = 0;
3322
3323
3324 // xk2 is the first base of the location matching this probe.
3325 for(xk2 = last_best_xk2; xk2 < REASSEMBLY_WINDOW_LENGTH + window_start_pos + 300 + MAX_INSERTION_LENGTH; xk2++)
3326 {
3327 int probe_match = match_chro(probe_key, base_index, xk2, global_context -> config.reassembly_key_length, 0, global_context -> config.space_type);
3328 if(probe_match > max_probe_match )
3329 {
3330 next_best_xk2 = xk2;
3331 max_probe_match = probe_match;
3332 }
3333 }
3334
3335 if(max_probe_match >= global_context -> config.reassembly_key_length - SINGLE_PROBE_MAX_MISMATCH)
3336 {
3337 // calculate the contig start;
3338 if((*contig_start_pos)==0)
3339 {
3340 (*contig_start_pos) = next_best_xk2 + 1 - (probe_key-full_rebuilt_window);
3341 (*head_removed_bases) = probe_key-full_rebuilt_window;
3342 }
3343
3344 (*contig_end_pos) = next_best_xk2 + 1 - (probe_key-full_rebuilt_window) + full_rebuilt_window_size;
3345 (*tail_removed_bases) = full_rebuilt_window_size - (probe_key-full_rebuilt_window) - global_context -> config.reassembly_key_length;
3346
3347 probe_poses[used_probes] = next_best_xk2+1;
3348 used_probe_in_rebuilt_window[used_probes] = (full_rebuilt_window_size-global_context -> config.reassembly_key_length) * xk1 /(PROBE_KEY_NUMBER-1);
3349 //if(max_probe_match<global_context -> config.reassembly_key_length - 1)
3350 // is_unreliable += 2;
3351
3352 used_probes +=1;
3353 }
3354 else
3355 {
3356 if(xk1 ==0 || xk1 == PROBE_KEY_NUMBER-1)
3357 {
3358 is_unreliable += 3;
3359 }
3360 }
3361
3362 // if(next_best_xk2>0)
3363 }
3364
3365
3366 for(xk1=0; xk1<used_probes - 1; xk1++)
3367 {
3368 if(probe_poses[xk1]>= probe_poses[xk1+1]) return 0;
3369 }
3370
3371 // find indels between every two mapped probes.
3372 // and we know the indel length already.
3373
3374 int ret = 0, total_unmatched = 0;
3375
3376 //int H0_matched = match_chro(full_rebuilt_window , base_index, probe_poses[0]-1, full_rebuilt_window_size, 0, global_context -> config.space_type);
3377
3378 int total_delta = 0;
3379 for(xk1=0; xk1 < used_probes-1; xk1++)
3380 {
3381
3382 // indels_in_section < 0 : insertion.
3383
3384 // used_probe_in_rebuilt_window is the location of the probe on reads
3385
3386 // probe_poses is the location of the probe on the chromosome.
3387 long long int indels_in_section = probe_poses[xk1+1];
3388 indels_in_section -= probe_poses[xk1];
3389 indels_in_section -= used_probe_in_rebuilt_window[xk1+1];
3390 indels_in_section += used_probe_in_rebuilt_window[xk1];
3391
3392 total_delta += indels_in_section;
3393 if(indels_in_section)
3394 {
3395 // indels_in_section = -5: 5 insertion; indels_in_section = 3: 3 deletion
3396 // find the edge.
3397 int best_edge_score = -1;
3398 unsigned int section_best_edge = 0;
3399
3400 // the smallest possible location of the second half is the location if the first half + deletion length
3401 xk2 = probe_poses[xk1] - 1 + max(0, indels_in_section);
3402
3403
3404 // find the exact indel point; xk2 is the tested point on the chromosome ( first WANTED base on the second half ).
3405 for(; xk2 < probe_poses[xk1+1] -1; xk2++)
3406 {
3407 int left_matched = match_chro(full_rebuilt_window + used_probe_in_rebuilt_window[xk1], base_index, probe_poses[xk1] - 1, (xk2 - probe_poses[xk1]+ 1 - max(0, indels_in_section)), 0, global_context -> config.space_type);
3408 int right_matched = match_chro(full_rebuilt_window + used_probe_in_rebuilt_window[xk1] + (xk2 - probe_poses[xk1] + 1) -indels_in_section , base_index, xk2, probe_poses[xk1+1] - 1 - xk2, 0, global_context -> config.space_type);
3409 if(left_matched+right_matched > best_edge_score)
3410 {
3411 best_edge_score = left_matched+right_matched;
3412 section_best_edge = xk2;
3413 }
3414 }
3415
3416 int section_mismatch = probe_poses[xk1+1]-probe_poses[xk1]-max(0, indels_in_section) - best_edge_score;
3417 is_unreliable += section_mismatch;
3418
3419
3420 perfect_segment_start_pos[ret] = probe_poses[xk1] - 1;
3421 // perfect_segment_lengths is : first WANTED base on the second half - first base pos on the first half - deletions
3422 perfect_segment_lengths[ret] = section_best_edge - probe_poses[xk1] + 1 - max(0, indels_in_section);
3423
3424 good_quality_indel[ret] = section_mismatch < INDEL_FULL_ALIGN_MAX_MISMATCH;
3425 indel_quality[ret] = pow(2,-is_unreliable);
3426 indels_after_perfect_segments[ret] = indels_in_section;
3427 indels_read_positions[ret] = used_probe_in_rebuilt_window[xk1] + perfect_segment_lengths[ret];
3428 ret++;
3429 total_unmatched += section_mismatch;
3430 }
3431 else
3432 {
3433
3434 int matched = match_chro(full_rebuilt_window + used_probe_in_rebuilt_window[xk1], base_index, probe_poses[xk1] - 1, (probe_poses[xk1+1] - probe_poses[xk1]), 0, global_context -> config.space_type);
3435 total_unmatched += ((probe_poses[xk1+1] - probe_poses[xk1]) - matched);
3436 }
3437 }
3438
3439 if(ret>3|| total_unmatched>INDEL_FULL_ALIGN_MAX_MISMATCH || total_delta == 0)
3440 {
3441 return 0;
3442 }
3443
3444
3445 for(xk1=0; xk1<ret; xk1++)
3446 {
3447 indel_quality[xk1] *= pow(2.,-(ret * (1+total_unmatched)));
3448 }
3449
3450 //printf("\nRET=%d\n\n", ret);
3451
3452 return ret;
3453
3454 }
3455
3456
3457
match_str(char * c1,char * c2,int len)3458 int match_str(char * c1, char * c2, int len)
3459 {
3460 int xk1,ret=0;
3461 for(xk1=0; xk1<len; xk1++)
3462 ret += c1[xk1]==c2[xk1];
3463 return ret;
3464 }
3465
3466
find_best_location_for_probe(global_context_t * global_context,gene_value_index_t * base_index,char * probe,unsigned int search_start,unsigned int search_len,unsigned int * ret_location)3467 int find_best_location_for_probe(global_context_t * global_context ,gene_value_index_t * base_index, char * probe, unsigned int search_start, unsigned int search_len, unsigned int * ret_location)
3468 {
3469 unsigned int xk2;
3470 unsigned int best_location = 0;
3471 int best_match = -1;
3472 for(xk2=search_start; xk2<search_start+search_len; xk2++)
3473 {
3474 int test_match = match_chro(probe, base_index, xk2, global_context -> config.reassembly_key_length, 0, global_context->config.space_type);
3475 if(test_match>best_match)
3476 {
3477 best_match = test_match;
3478 best_location = xk2;
3479 }
3480 }
3481
3482 * ret_location = best_location;
3483 return best_match;
3484
3485 }
3486
find_potential_ultralong_indels(global_context_t * global_context,reassembly_by_voting_block_context_t * block_context,int window_no,char * contig_1,char * contig_2,unsigned int * potential_ins_left,unsigned int * potential_ins_len)3487 int find_potential_ultralong_indels(global_context_t * global_context , reassembly_by_voting_block_context_t * block_context , int window_no, char * contig_1, char * contig_2, unsigned int * potential_ins_left, unsigned int * potential_ins_len)
3488 {
3489 int contig_len_1 = strlen(contig_1);
3490 int contig_len_2 = strlen(contig_2);
3491 unsigned int xk1, is_1_left;
3492 for(is_1_left = 0; is_1_left < 2; is_1_left ++)
3493 {
3494 int best_match = -1;
3495
3496 char * test_key = is_1_left?contig_2:contig_1;
3497 char * testee = is_1_left?contig_1:contig_2;
3498 int testee_len = is_1_left?contig_len_1:contig_len_2;
3499
3500
3501 for(xk1=0; xk1< testee_len - global_context -> config.reassembly_key_length; xk1++)
3502 {
3503 int test_match = match_str(test_key, testee + xk1, global_context -> config.reassembly_key_length);
3504 if(test_match > best_match)
3505 best_match = test_match;
3506 }
3507
3508 if(best_match >= global_context -> config.reassembly_key_length-1)
3509 return 0;
3510 }
3511
3512 unsigned int window_left_pos = block_context->block_start_linear_pos + window_no * REASSEMBLY_WINDOW_LENGTH / 2 ;
3513 unsigned int search_start = window_left_pos;
3514 if(search_start>300)search_start-=300;
3515 else search_start=0;
3516 unsigned int search_end = 300 + global_context -> config.max_indel_length + window_left_pos;
3517
3518
3519 gene_value_index_t * base_index = &global_context->all_value_indexes[0] ;
3520 if(search_end > base_index -> length + base_index -> start_base_offset) search_end = base_index -> length + base_index -> start_base_offset;
3521
3522 unsigned int contig_1_left_pos;
3523 int contig_1_left_match = find_best_location_for_probe(global_context, base_index, contig_1, search_start, search_end - search_start , &contig_1_left_pos);
3524
3525 unsigned int contig_1_right_pos;
3526 int contig_1_right_match = find_best_location_for_probe(global_context, base_index, contig_1 + contig_len_1 - global_context -> config.reassembly_key_length, search_start, search_end - search_start , &contig_1_right_pos);
3527
3528
3529
3530 unsigned int contig_2_left_pos;
3531 int contig_2_left_match = find_best_location_for_probe(global_context, base_index, contig_2, search_start, search_end - search_start , &contig_2_left_pos);
3532
3533 unsigned int contig_2_right_pos;
3534 int contig_2_right_match = find_best_location_for_probe(global_context, base_index, contig_2 + contig_len_2 - global_context -> config.reassembly_key_length, search_start, search_end - search_start , &contig_2_right_pos);
3535
3536
3537 if(contig_1_left_match >= global_context -> config.reassembly_key_length -1 && contig_1_right_match >= global_context -> config.reassembly_key_length -1)
3538 return 0;
3539
3540 if(contig_2_left_match >= global_context -> config.reassembly_key_length -1 && contig_2_right_match >= global_context -> config.reassembly_key_length -1)
3541 return 0;
3542
3543 if(contig_1_left_match < global_context -> config.reassembly_key_length -1 && contig_1_right_match < global_context -> config.reassembly_key_length -1)
3544 return 0;
3545
3546 if(contig_2_left_match < global_context -> config.reassembly_key_length -1 && contig_2_right_match < global_context -> config.reassembly_key_length -1)
3547 return 0;
3548
3549 int ret = 0;
3550 unsigned int basic_ins_left = 0, basic_ins_right = 0;
3551 int is_1_at_left;
3552
3553 if(contig_1_left_match >= global_context -> config.reassembly_key_length -1 && contig_2_right_match >= global_context -> config.reassembly_key_length -1 && contig_1_left_pos<contig_2_right_pos)
3554 {
3555 if(contig_len_1 + contig_len_2 - global_context -> config.reassembly_key_length > contig_2_right_pos - contig_1_left_pos)
3556 {
3557 *potential_ins_len = contig_len_1 + contig_len_2 - global_context -> config.reassembly_key_length - (contig_2_right_pos - contig_1_left_pos);
3558 *potential_ins_left = contig_1_left_pos;// + global_context -> config.reassembly_key_length;
3559 basic_ins_left = contig_1_left_pos;
3560 basic_ins_right = contig_2_right_pos;
3561 is_1_at_left = 1;
3562 ret = 1;
3563 }
3564 }
3565
3566 if(contig_1_right_match >= global_context -> config.reassembly_key_length -1 && contig_2_left_match >= global_context -> config.reassembly_key_length -1 && contig_1_right_pos>contig_2_left_pos)
3567 {
3568 if( contig_len_1 + contig_len_2 - global_context -> config.reassembly_key_length> contig_1_right_pos - contig_2_left_pos)
3569 {
3570 *potential_ins_len = contig_len_1 + contig_len_2 - global_context -> config.reassembly_key_length - (contig_1_right_pos - contig_2_left_pos);
3571 *potential_ins_left = contig_2_left_pos;// + global_context -> config.reassembly_key_length;
3572 basic_ins_right = contig_1_right_pos;
3573 basic_ins_left = contig_2_left_pos;
3574 is_1_at_left = 0;
3575 ret = 1;
3576 }
3577 }
3578
3579
3580 if(ret)
3581 {
3582 unsigned int x1=0,left_ins_edge = * potential_ins_left, best_edge = 0;
3583 char * left_contig = is_1_at_left ? contig_1: contig_2;
3584 char * right_contig = is_1_at_left ? contig_2: contig_1;
3585 char probe_window[global_context -> config.reassembly_key_length+1];
3586 probe_window[global_context -> config.reassembly_key_length]=0;
3587
3588 for(; left_ins_edge < (*potential_ins_left) + global_context -> config.reassembly_key_length; left_ins_edge++)
3589 probe_window[x1++]=gvindex_get(base_index, left_ins_edge);
3590
3591 for(; left_ins_edge < basic_ins_right; left_ins_edge++)
3592 {
3593 int delta = left_ins_edge - global_context -> config.reassembly_key_length - (*potential_ins_left);
3594 char * test_key = left_contig + delta;
3595
3596 if(probe_window[global_context -> config.reassembly_key_length-1] == test_key[global_context -> config.reassembly_key_length-1]) best_edge = left_ins_edge ;
3597
3598 int matched = match_str(test_key, probe_window, global_context -> config.reassembly_key_length);
3599 if(matched< global_context -> config.reassembly_key_length-1) break;
3600
3601
3602 for(x1=0; x1<global_context -> config.reassembly_key_length - 1; x1++)
3603 probe_window[x1] = probe_window[x1+1];
3604 probe_window[global_context -> config.reassembly_key_length-1] = gvindex_get(base_index, left_ins_edge);
3605 }
3606
3607 if(best_edge>0)
3608 *potential_ins_left = best_edge;
3609
3610 if( ( basic_ins_right + global_context -> config.reassembly_key_length - best_edge) >strlen(right_contig) ||base_index -> length + base_index -> start_base_offset <= basic_ins_right + global_context -> config.reassembly_key_length || basic_ins_left < base_index -> start_base_offset || best_edge >= base_index -> length + base_index -> start_base_offset) ret = 0;
3611 else
3612 {
3613 int left_match = match_chro(left_contig, base_index, basic_ins_left, best_edge - basic_ins_left, 0, global_context->config.space_type);
3614 int right_match = match_chro(right_contig + strlen(right_contig)- ( basic_ins_right + global_context -> config.reassembly_key_length - best_edge), base_index, best_edge, basic_ins_right + global_context -> config.reassembly_key_length - best_edge, 0, global_context->config.space_type);
3615
3616 if(left_match+right_match< basic_ins_right+global_context -> config.reassembly_key_length-basic_ins_left - 2) ret=0;
3617 }
3618 }
3619
3620 return ret;
3621 }
3622
3623
put_long_indel_event(global_context_t * global_context,unsigned int best_pos,int indels,float indel_quality,char * inserted_bases,int event_type)3624 int put_long_indel_event(global_context_t * global_context, unsigned int best_pos, int indels, float indel_quality, char * inserted_bases, int event_type)
3625 {
3626 indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
3627 HashTable * event_table = indel_context -> event_entry_table;
3628 chromosome_event_t * event_space = indel_context -> event_space_dynamic;
3629
3630 chromosome_event_t * found = NULL;
3631 chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
3632
3633 int pos_delta;
3634 for(pos_delta = -10; pos_delta <= 10; pos_delta++)
3635 {
3636 int found_events = search_event(global_context, event_table, event_space, best_pos - 1 + pos_delta, EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_LONG_INDEL|CHRO_EVENT_TYPE_INDEL, search_return);
3637
3638 if(found_events)
3639 {
3640 int kx1;
3641 for(kx1 = 0; kx1 < found_events ; kx1++)
3642 {
3643 if(search_return[kx1] -> indel_length == indels || event_type == CHRO_EVENT_TYPE_POTENTIAL_INDEL )
3644 {
3645 found = search_return[kx1];
3646 break;
3647 }
3648 }
3649 }
3650 if(found)break;
3651 }
3652
3653 /*
3654 if(best_pos >= 1124357402&& best_pos < 1124359402)
3655 {
3656 fprintf(stderr, "LLLX-BB: %u :: FOUND=%p\n", best_pos, found);
3657 }
3658
3659 */
3660
3661 if(found)
3662 {
3663 found -> supporting_reads ++;
3664 found -> event_quality = max(found ->event_quality, indel_quality);
3665 }
3666 else
3667 {
3668 int event_no;
3669 event_no = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> total_events ++;
3670 event_space = reallocate_event_space(global_context , NULL, event_no);
3671 chromosome_event_t * new_event = event_space+event_no;
3672 memset(new_event,0,sizeof(chromosome_event_t));
3673
3674 if(indels<0 && inserted_bases)
3675 set_insertion_sequence(global_context, NULL , &new_event -> inserted_bases , inserted_bases , -indels);
3676 new_event -> event_small_side = best_pos - 1;
3677 new_event -> event_large_side = best_pos + max(0, indels);
3678 assert(new_event -> event_small_side && new_event -> event_large_side);
3679 new_event -> event_type = event_type;
3680 new_event -> indel_length = indels;
3681 new_event -> supporting_reads = 1;
3682 new_event -> event_quality = indel_quality;
3683
3684 put_new_event(event_table, new_event , event_no);
3685 }
3686
3687 return 0;
3688 }
3689
3690
3691 #define BASE_NEEDED_PER_WINDOW 60
3692
3693 #define PILEUP_BOX_LEN_CERTAIN 12
3694 #define MAX_PILEUP_BOX_COVERAGE_CERTAIN 12
3695
3696 #define PILEUP_BOX_LEN 50
3697 #define MAX_PILEUP_BOX_COVERAGE 50
3698
finalise_pileup_file_by_voting(global_context_t * global_context,char * temp_file_name,char * chro_name,int block_no)3699 int finalise_pileup_file_by_voting(global_context_t * global_context , char * temp_file_name, char * chro_name, int block_no)
3700 {
3701 int global_read_no = 0;
3702 char * read_text, * qual_text;
3703 int xk1;
3704 int total_window_number =2 * (BASE_BLOCK_LENGTH / REASSEMBLY_WINDOW_LENGTH+1), window_adjust;
3705
3706 unsigned int block_start_linear_pos = linear_gene_position(&global_context->chromosome_table, chro_name, block_no*BASE_BLOCK_LENGTH);
3707 //if(block_start_linear_pos> 150000000) return 0;
3708
3709 FILE * tmp_fp = f_subr_open(temp_file_name,"rb");
3710 if(!tmp_fp)
3711 return 0;
3712
3713 print_in_box(80,0,0,"Processing %s\n", temp_file_name);
3714
3715 unsigned int * window_center_read_no = (unsigned int *) malloc(sizeof(unsigned int)* total_window_number * global_context -> config.reassembly_start_read_number);
3716 unsigned int * window_base_number = (unsigned int *) malloc(sizeof(unsigned int)* total_window_number);
3717 unsigned int * window_center_read_qual = (unsigned int *) malloc(sizeof(unsigned int)* total_window_number * global_context -> config.reassembly_start_read_number);
3718 unsigned int * window_center_read_masks = (unsigned int *) malloc(sizeof(unsigned int)* total_window_number);
3719 struct reassmebly_window_allele *first_half_alleles = (struct reassmebly_window_allele *) malloc(sizeof(struct reassmebly_window_allele ) * global_context -> config.reassembly_window_alleles);
3720 struct reassmebly_window_allele *second_half_alleles = (struct reassmebly_window_allele *) malloc(sizeof(struct reassmebly_window_allele ) * global_context -> config.reassembly_window_alleles);
3721
3722 unsigned char * mapquality_lower_bounds_certain = calloc(1, BASE_BLOCK_LENGTH / PILEUP_BOX_LEN_CERTAIN * MAX_PILEUP_BOX_COVERAGE_CERTAIN);
3723 unsigned char * box_read_number_certain = calloc(1, BASE_BLOCK_LENGTH / PILEUP_BOX_LEN_CERTAIN);
3724
3725 unsigned char * mapquality_lower_bounds = calloc(1, BASE_BLOCK_LENGTH / PILEUP_BOX_LEN * MAX_PILEUP_BOX_COVERAGE);
3726 unsigned char * box_read_number = calloc(1, BASE_BLOCK_LENGTH / PILEUP_BOX_LEN);
3727
3728 char * full_rebuilt_window = (char *)malloc(3000);
3729
3730 reassembly_by_voting_block_context_t block_context;
3731
3732 block_context.voting_indexes = (gehash_t*)malloc(sizeof(gehash_t)* total_window_number);
3733 block_context.block_start_linear_pos = block_start_linear_pos;
3734 block_context.start_keys = (unsigned long long int *) calloc(sizeof(unsigned long long int),10 * total_window_number);
3735 block_context.start_offsets = (short *)malloc(sizeof( short) * total_window_number);
3736 block_context.read_no_counter = (unsigned int *) calloc(sizeof(unsigned int),total_window_number);
3737 block_context.read_rectify_space = (short *) malloc(sizeof(unsigned short) * 4 * MAX_READ_LENGTH);
3738 block_context.final_alleles = malloc(sizeof(struct reassmebly_window_allele ) * global_context -> config.reassembly_window_alleles);
3739
3740 read_text = (char *) malloc(MAX_READ_LENGTH);
3741 qual_text = (char *) malloc(MAX_READ_LENGTH);
3742
3743 HashTable *read_sequence_table=HashTableCreate(300000);
3744 HashTable *read_position_table=HashTableCreate(300000);
3745 HashTable *read_quality_table=HashTableCreate(300000);
3746 block_context.read_sequence_table = read_sequence_table;
3747 block_context.read_quality_table = read_quality_table;
3748 block_context.read_position_table = read_position_table;
3749
3750 memset(block_context.final_alleles, 0, sizeof(struct reassmebly_window_allele ) * global_context -> config.reassembly_window_alleles);
3751 memset(window_center_read_no, 0, sizeof(unsigned int)* total_window_number * global_context -> config.reassembly_start_read_number);
3752 memset(window_center_read_qual, 0, sizeof(unsigned int)* total_window_number*global_context -> config.reassembly_start_read_number);
3753 memset(window_center_read_masks, 0, sizeof(unsigned int)* total_window_number);
3754 memset(window_base_number, 0, sizeof(unsigned int)* total_window_number);
3755
3756
3757 if(!tmp_fp) return 1;
3758
3759 while(!feof(tmp_fp))
3760 {
3761 short read_len;
3762 long long int first_base_pos;
3763 //unsigned int qual_of_this_read = 1;
3764 base_block_temp_read_t read_rec;
3765 int rlen = fread(&read_rec, sizeof(read_rec), 1, tmp_fp);
3766 int is_position_certain = read_rec.strand;
3767 if(rlen<1) break;
3768
3769 rlen = fread(&read_len, sizeof(short), 1, tmp_fp);
3770 if(rlen < 1){
3771 SUBREADprintf("ERROR: Unable to parse piles\n");
3772 return -1;
3773 }
3774 rlen = fread(read_text, sizeof(char), read_len, tmp_fp);
3775 if(rlen <read_len){
3776 SUBREADprintf("ERROR: Unable to parse piles\n");
3777 return -1;
3778 }
3779 rlen = fread(qual_text, sizeof(char), read_len, tmp_fp);
3780 if(rlen <read_len){
3781 SUBREADprintf("ERROR: Unable to parse piles\n");
3782 return -1;
3783 }
3784
3785 first_base_pos = read_rec.pos;
3786 first_base_pos -= block_no * BASE_BLOCK_LENGTH;
3787
3788 for(window_adjust = 0; window_adjust < global_context -> config.reassembly_window_multiplex ; window_adjust ++)
3789 {
3790 int window_no = 2*(first_base_pos / REASSEMBLY_WINDOW_LENGTH)+ window_adjust , xk2;
3791 if(window_no >=total_window_number)
3792 break;
3793 window_base_number[window_no]+=read_len;
3794
3795 int qual_of_this_read = 127, read_offset;
3796 for(read_offset = 0; read_offset < read_len ; read_offset++)
3797 {
3798 int nch = qual_text[read_offset] - '!';
3799 if(nch < 14) qual_of_this_read -=4;
3800 else if(nch < 22) qual_of_this_read -= 2;
3801 else if(nch < 33) qual_of_this_read --;
3802 }
3803 qual_of_this_read=max(1,qual_of_this_read);
3804
3805
3806 if(is_position_certain)
3807 {
3808 int box_no = first_base_pos / PILEUP_BOX_LEN_CERTAIN;
3809 for(xk1 = box_no * MAX_PILEUP_BOX_COVERAGE_CERTAIN ; xk1 < (box_no+1) * MAX_PILEUP_BOX_COVERAGE_CERTAIN; xk1++)
3810 {
3811 if(qual_of_this_read > mapquality_lower_bounds_certain[xk1])
3812 {
3813 for(xk2 = (box_no+1) * MAX_PILEUP_BOX_COVERAGE_CERTAIN - 1; xk2 > xk1; xk2--)
3814 mapquality_lower_bounds_certain[xk2] = mapquality_lower_bounds_certain[xk2-1];
3815 mapquality_lower_bounds_certain[xk1] = qual_of_this_read;
3816 break;
3817 }
3818 }
3819 }
3820 else
3821 {
3822 int box_no = first_base_pos / PILEUP_BOX_LEN;
3823 for(xk1 = box_no * MAX_PILEUP_BOX_COVERAGE ; xk1 < (box_no+1) * MAX_PILEUP_BOX_COVERAGE; xk1++)
3824 {
3825 if(qual_of_this_read > mapquality_lower_bounds[xk1])
3826 {
3827 for(xk2 = (box_no+1) * MAX_PILEUP_BOX_COVERAGE - 1; xk2 > xk1; xk2--)
3828 mapquality_lower_bounds[xk2] = mapquality_lower_bounds[xk2-1];
3829 mapquality_lower_bounds[xk1] = qual_of_this_read;
3830 break;
3831 }
3832 }
3833
3834 }
3835 if(is_position_certain && window_adjust == 1) break;
3836 }
3837
3838 }
3839 fclose(tmp_fp);
3840
3841 for(xk1=0; xk1< total_window_number; xk1++)
3842 {
3843 if(window_base_number[xk1] >= BASE_NEEDED_PER_WINDOW)
3844 {
3845 gehash_create(&block_context.voting_indexes[xk1], max(4000,window_base_number[xk1]*4), 1);
3846 block_context . start_offsets[xk1] = 0x7fff;
3847 }
3848 }
3849
3850
3851 tmp_fp = f_subr_open(temp_file_name,"rb");
3852 while(!feof(tmp_fp))
3853 {
3854 short read_len;
3855 long long int first_base_pos;
3856 base_block_temp_read_t read_rec;
3857 int rlen = fread(&read_rec, sizeof(read_rec), 1, tmp_fp);
3858 int is_position_certain = read_rec.strand;
3859
3860 if(rlen<1) break;
3861 rlen = fread(&read_len, sizeof(short), 1, tmp_fp);
3862 if(rlen <1){
3863 SUBREADprintf("ERROR: cannot parse piles.\n");
3864 return -1;
3865 }
3866
3867 rlen = fread(read_text, sizeof(char), read_len, tmp_fp);
3868 if(rlen <read_len){
3869 SUBREADprintf("ERROR: cannot parse piles.\n");
3870 return -1;
3871 }
3872
3873 rlen = fread(qual_text, sizeof(char), read_len, tmp_fp);
3874 if(rlen <read_len){
3875 SUBREADprintf("ERROR: cannot parse piles.\n");
3876 return -1;
3877 }
3878 first_base_pos = read_rec.pos;
3879 first_base_pos -= block_no * BASE_BLOCK_LENGTH;
3880 assert(first_base_pos>=0);
3881 read_text [read_len]=0;
3882 qual_text [read_len]=0;
3883
3884 is_bad_read(read_text, read_len, "LOAD");
3885
3886 int read_offset;
3887
3888 /*
3889 unsigned int qual_of_this_read = 1;
3890 for(read_offset = 0; read_offset < read_len ; read_offset++)
3891 qual_of_this_read += qual_text[read_offset];
3892
3893 qual_of_this_read = qual_of_this_read * 100 / read_len + (is_position_certain?200:0);
3894 if(qual_of_this_read<1)qual_of_this_read=1;
3895 */
3896
3897 int qual_of_this_read = 127;
3898 for(read_offset = 0; read_offset < read_len ; read_offset++)
3899 {
3900 int nch = qual_text[read_offset] - '!';
3901
3902
3903 if(nch < 14) qual_of_this_read -=4;
3904 else if(nch < 22) qual_of_this_read -= 2;
3905 else if(nch < 33) qual_of_this_read --;
3906 }
3907
3908 qual_of_this_read=max(1,qual_of_this_read);
3909
3910 for(window_adjust = 0; window_adjust < global_context -> config.reassembly_window_multiplex; window_adjust++)
3911 {
3912 if(is_position_certain && window_adjust >= 2) break;
3913 int window_no = 2*(first_base_pos / REASSEMBLY_WINDOW_LENGTH) + window_adjust;
3914
3915 if(window_no >= total_window_number) break;
3916 if(window_base_number[window_no]<BASE_NEEDED_PER_WINDOW) continue;
3917
3918
3919 if(is_position_certain)
3920 {
3921 int box_no = first_base_pos / PILEUP_BOX_LEN_CERTAIN;
3922 if(qual_of_this_read < mapquality_lower_bounds_certain[box_no * MAX_PILEUP_BOX_COVERAGE_CERTAIN + MAX_PILEUP_BOX_COVERAGE_CERTAIN - 1])
3923 continue;
3924 if(box_read_number_certain[box_no]>MAX_PILEUP_BOX_COVERAGE_CERTAIN)continue;
3925 box_read_number_certain[box_no]++;
3926 //qual_of_this_read += 200;
3927 }
3928 else
3929 {
3930 int box_no = first_base_pos / PILEUP_BOX_LEN;
3931
3932 if(qual_of_this_read < mapquality_lower_bounds[box_no * MAX_PILEUP_BOX_COVERAGE + MAX_PILEUP_BOX_COVERAGE - 1])
3933 continue;
3934 if(box_read_number[box_no]>MAX_PILEUP_BOX_COVERAGE)continue;
3935 box_read_number[box_no]++;
3936 }
3937
3938 int read_no = block_context.read_no_counter[window_no];
3939 block_context.read_no_counter[window_no]++;
3940
3941
3942 if(window_center_read_qual[window_no * global_context -> config.reassembly_start_read_number + global_context -> config.reassembly_start_read_number-1] < qual_of_this_read)//&& is_position_certain)
3943 {
3944
3945 int xk44,xk3;
3946 for(xk44=0; xk44<global_context -> config.reassembly_start_read_number; xk44++)
3947 if(window_center_read_qual[window_no * global_context -> config.reassembly_start_read_number + xk44] < qual_of_this_read)
3948 break;
3949 for(xk3=global_context -> config.reassembly_start_read_number -2; xk3>=xk44 ; xk3--)
3950 {
3951 window_center_read_no[window_no * global_context -> config.reassembly_start_read_number + xk3 +1] = window_center_read_no[window_no * global_context -> config.reassembly_start_read_number + xk3] ;
3952 window_center_read_qual[window_no * global_context -> config.reassembly_start_read_number + xk3 +1] = window_center_read_qual[window_no * global_context -> config.reassembly_start_read_number + xk3];
3953 }
3954
3955 window_center_read_no[window_no * global_context -> config.reassembly_start_read_number + xk44] = read_no + 1;
3956 window_center_read_qual[window_no * global_context -> config.reassembly_start_read_number + xk44] = qual_of_this_read;
3957
3958 }
3959 for(read_offset = 0; read_offset < read_len - global_context -> config.reassembly_subread_length+1; read_offset += REASSEMBLY_SUBREAD_GAP)
3960 {
3961 unsigned int subread_int = 0;
3962 int xk2;
3963 for(xk2=0; xk2< global_context -> config.reassembly_subread_length; xk2++)
3964 subread_int = (subread_int << 2) | base2int(read_text[read_offset + xk2]);
3965
3966 gehash_insert_limited(&block_context.voting_indexes[window_no], subread_int , read_offset +10000 *read_no, 400, 10000);
3967 }
3968
3969
3970
3971 /*
3972 if(strstr(temp_file_name, "chr1-0005"))
3973 if(window_no == 63763 || window_no == 63762)
3974 {
3975 SUBREADprintf("LLLX-INSERT_WIN: [%d] : %s RN=%d RL=%d ITEMS=%llu\n",window_no, read_text, read_no, read_len, block_context.voting_indexes[window_no].current_items);
3976
3977 }
3978 */
3979
3980 char * sequence_for_hash = (char *) malloc(read_len+1);
3981 assert(sequence_for_hash);
3982 strcpy(sequence_for_hash, read_text);
3983 unsigned long long int read_hashed_key = (window_no * 1llu +1)<<32 ;
3984 read_hashed_key |= (read_no & 0xfffff)<<12;
3985
3986 if(is_position_certain)
3987 HashTablePut(read_position_table, NULL + read_hashed_key, NULL+first_base_pos+1);
3988 HashTablePut(read_sequence_table, NULL + read_hashed_key, sequence_for_hash);
3989 HashTablePut(read_quality_table, NULL + read_hashed_key, NULL + qual_of_this_read);
3990 }
3991
3992 global_read_no ++;
3993 }
3994 fclose(tmp_fp);
3995
3996 for(xk1=0; xk1< total_window_number; xk1++)
3997 {
3998 if(window_base_number[xk1]>=BASE_NEEDED_PER_WINDOW)
3999 gehash_sort(&block_context.voting_indexes[xk1]);
4000 }
4001
4002
4003 gene_vote_t *vote_list;
4004 vote_list = (gene_vote_t *) malloc(sizeof(gene_vote_t));
4005 assert(vote_list);
4006 block_context.vote_list = vote_list;
4007 block_context.vote_list_rectify = (gene_vote_t *) malloc(sizeof(gene_vote_t));
4008
4009 indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
4010
4011 HashTable * event_table = indel_context -> event_entry_table;
4012
4013 for(xk1 = 0; xk1 < total_window_number; xk1++)
4014 {
4015 // xk1 is the number of the current window.
4016
4017 if(window_base_number[xk1]<BASE_NEEDED_PER_WINDOW)
4018 {
4019 //printf("SKIPPED : %d : %d/%d\n", xk1, window_base_number[xk1], BASE_NEEDED_PER_WINDOW);
4020 continue;
4021 }
4022 // full_window = 5'end -> anchor_read -> 3'end
4023
4024
4025
4026 //int IGNORED_BASES_IN_WINDOW = 10;
4027 //for(IGNORED_BASES_IN_WINDOW = 0; IGNORED_BASES_IN_WINDOW <=60; IGNORED_BASES_IN_WINDOW+=30)
4028 {
4029 int start_read_i;
4030 for(start_read_i = 0; start_read_i < global_context -> config.reassembly_start_read_number; start_read_i ++)
4031 {
4032
4033 // from middle read to 5' end
4034
4035 if(!window_center_read_no[xk1 * global_context -> config.reassembly_start_read_number + start_read_i ]) continue;
4036
4037 unsigned int this_start_read_no = window_center_read_no[xk1 * global_context -> config.reassembly_start_read_number + start_read_i ]-1 ;
4038 unsigned int window_start_pos = block_context.block_start_linear_pos + xk1 * REASSEMBLY_WINDOW_LENGTH / 2;
4039 block_context.window_start_pos = window_start_pos;
4040
4041 search_window(global_context, &block_context, xk1, this_start_read_no , -1);
4042 memcpy(first_half_alleles, block_context.final_alleles, global_context -> config.reassembly_window_alleles * sizeof(struct reassmebly_window_allele ));
4043 search_window(global_context, &block_context, xk1, this_start_read_no, 1);
4044 memcpy(second_half_alleles, block_context.final_alleles, global_context -> config.reassembly_window_alleles * sizeof(struct reassmebly_window_allele));
4045
4046 /*
4047 if(window_start_pos >= 1124357402&& window_start_pos < 1124359402)
4048 {
4049 fprintf(stderr, "LLLX: %u [%d] :: START READ = %u ; FN=%s ; BASES=%u\n", window_start_pos, xk1, this_start_read_no, temp_file_name , window_base_number[xk1]);
4050 char fname [100];
4051 sprintf(fname, "DP-%d-%d", getpid(), xk1);
4052 gehash_dump(&block_context.voting_indexes[xk1], fname);
4053 }*/
4054
4055 unsigned long long int starter_read_key = (xk1 * 1llu +1)<<32 ;
4056 starter_read_key |= ( (this_start_read_no)& 0xfffff)<<12;
4057
4058 char * start_read_txt = HashTableGet(block_context.read_sequence_table,NULL+starter_read_key);
4059 assert(start_read_txt);
4060
4061 int first_allele_no, second_allele_no;
4062
4063 for(first_allele_no = 0; first_allele_no < global_context -> config.reassembly_window_alleles && first_half_alleles[first_allele_no].rebuilt_size>0 ; first_allele_no++)
4064 for(second_allele_no = 0; second_allele_no < global_context -> config.reassembly_window_alleles && second_half_alleles[second_allele_no].rebuilt_size>0 ; second_allele_no++)
4065 {
4066 int all_indels_in_window = 0;
4067 strcpy(full_rebuilt_window, first_half_alleles [first_allele_no]. rebuilt_window);
4068 strcat(full_rebuilt_window, start_read_txt);
4069 strcat(full_rebuilt_window, second_half_alleles[second_allele_no].rebuilt_window);
4070
4071
4072 int xk2, full_rebuilt_window_size = strlen(full_rebuilt_window);
4073
4074 gene_value_index_t * base_index = &global_context->all_value_indexes[0] ;
4075
4076 unsigned int perfect_segment_start_pos[MAX_INDELS_IN_WINDOW], contig_start_pos, contig_end_pos, all_fresh = 0;
4077 int perfect_segment_lengths[MAX_INDELS_IN_WINDOW], head_removed_bases=0, tail_removed_bases=0;
4078 int indels_after_perfect_segments[MAX_INDELS_IN_WINDOW];
4079 int good_quality_indels[MAX_INDELS_IN_WINDOW];
4080 float quality_of_indels_aln[MAX_INDELS_IN_WINDOW];
4081 short indels_read_positions[MAX_INDELS_IN_WINDOW];
4082 char contig_CIGAR[200];
4083
4084 memset(good_quality_indels, 0, sizeof(int) * MAX_INDELS_IN_WINDOW);
4085 int indels_in_window = full_indel_alignment(global_context, &block_context, full_rebuilt_window, full_rebuilt_window_size, base_index, window_start_pos, perfect_segment_start_pos, perfect_segment_lengths, indels_after_perfect_segments, indels_read_positions, quality_of_indels_aln, &contig_start_pos, &contig_end_pos, &head_removed_bases, &tail_removed_bases, good_quality_indels);
4086 contig_CIGAR[0]=0;
4087 int read_position_cursor = head_removed_bases;
4088 int is_indel_contig = 0;
4089
4090 for(xk2 = 0; xk2 < indels_in_window; xk2++)
4091 {
4092 unsigned int best_pos = perfect_segment_start_pos [xk2] + perfect_segment_lengths[xk2];
4093 int indels = indels_after_perfect_segments[xk2], is_fresh = 1;
4094 float quality_of_this_indel = quality_of_indels_aln[xk2] * first_half_alleles [first_allele_no].allele_quality * second_half_alleles[second_allele_no].allele_quality;
4095
4096 if(abs(indels) >= global_context -> config.max_indel_length){
4097 //#warning "====================== REMOVE THE debug output ==============="
4098 continue;
4099 }
4100
4101 /*if(abs(indels) <= 16 && 0){
4102 continue;
4103 }*/
4104
4105 is_indel_contig=1;
4106 sprintf(contig_CIGAR+strlen(contig_CIGAR), "%dM%d%c", indels_read_positions[xk2] - read_position_cursor, abs(indels), indels<0?'I':'D');
4107 read_position_cursor = indels_read_positions[xk2];
4108 if(indels<0) read_position_cursor -= indels;
4109
4110 all_indels_in_window++;
4111
4112 if(indels >= -global_context -> config.max_indel_length && good_quality_indels[xk2])
4113 {
4114 int neighbour_delta;
4115 for(neighbour_delta = - 30; neighbour_delta < 30 ; neighbour_delta++)
4116 {
4117 chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
4118 chromosome_event_t * event_space = indel_context -> event_space_dynamic;
4119
4120 int xk3, found_events = search_event(global_context, event_table, event_space, best_pos + neighbour_delta , EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_LONG_INDEL|CHRO_EVENT_TYPE_INDEL, search_return);
4121
4122 for(xk3 = 0; xk3<found_events; xk3++)
4123 {
4124 chromosome_event_t * tested_neighbour = search_return[xk3];
4125
4126 if(tested_neighbour -> event_quality >= quality_of_this_indel)
4127 is_fresh = 0;
4128 else tested_neighbour -> event_type = CHRO_EVENT_TYPE_REMOVED;
4129 }
4130
4131
4132 }
4133
4134
4135 if(is_fresh)
4136 put_long_indel_event(global_context, best_pos, indels, quality_of_this_indel, full_rebuilt_window + indels_read_positions[xk2], CHRO_EVENT_TYPE_LONG_INDEL);
4137 all_fresh += is_fresh;
4138 }
4139 }
4140
4141
4142 //#warning "====================== Make sure ' is_indel_contig ' in the next line is harmless before RELEASE ==============="
4143 if( is_indel_contig || all_fresh)
4144 {
4145 int write_cursor;
4146 char * chro_begin;
4147 int chro_offset_start = 0;
4148 int chro_offset_end = 0;
4149 locate_gene_position_max(contig_start_pos + head_removed_bases ,& global_context -> chromosome_table, &chro_begin, &chro_offset_start, NULL, NULL, 0);
4150 locate_gene_position_max(contig_end_pos - tail_removed_bases ,& global_context -> chromosome_table, &chro_begin, &chro_offset_end, NULL, NULL, 0);
4151 if(full_rebuilt_window_size - read_position_cursor - tail_removed_bases)
4152 fprintf(global_context -> long_insertion_FASTA_fp, ">%s-%u-%u-%s%dM\n", chro_name, chro_offset_start, chro_offset_end - 1, contig_CIGAR, full_rebuilt_window_size - read_position_cursor - tail_removed_bases);
4153 else
4154 fprintf(global_context -> long_insertion_FASTA_fp, ">%s-%u-%u-%s0M\n", chro_name, chro_offset_start, chro_offset_end - 1, contig_CIGAR);
4155 full_rebuilt_window[full_rebuilt_window_size - tail_removed_bases] = 0;
4156 for(write_cursor = head_removed_bases ; write_cursor < full_rebuilt_window_size - tail_removed_bases ; write_cursor += 70)
4157 {
4158 fwrite(full_rebuilt_window + write_cursor , min(70, full_rebuilt_window_size - write_cursor), 1, global_context -> long_insertion_FASTA_fp);
4159 fputs("\n",global_context -> long_insertion_FASTA_fp);
4160 }
4161 }
4162
4163 }
4164 }
4165 }
4166
4167
4168 }
4169
4170 free(vote_list);
4171 free(block_context.vote_list_rectify);
4172
4173 for(xk1=0; xk1< total_window_number; xk1++)
4174 {
4175 if(window_base_number[xk1]>=BASE_NEEDED_PER_WINDOW)
4176 gehash_destory(&block_context.voting_indexes[xk1]);
4177 }
4178
4179
4180 free_values_destroy(read_sequence_table);
4181
4182 free(mapquality_lower_bounds_certain);
4183 free(mapquality_lower_bounds);
4184 free(box_read_number);
4185 free(box_read_number_certain);
4186 free(full_rebuilt_window);
4187 free(block_context.read_no_counter);
4188 free(block_context.voting_indexes);
4189 free(block_context.start_keys);
4190 free(block_context.start_offsets);
4191 free(block_context.read_rectify_space);
4192 free(block_context.final_alleles);
4193 free(first_half_alleles);
4194 free(second_half_alleles);
4195 free(window_base_number);
4196 free(window_center_read_no);
4197 free(window_center_read_qual);
4198 free(window_center_read_masks);
4199 free(read_text);
4200 free(qual_text);
4201
4202
4203 return 0;
4204 }
finalise_pileup_file_by_debrujin(global_context_t * global_context,char * temp_file_name,char * chro_name,int block_no)4205 int finalise_pileup_file_by_debrujin(global_context_t * global_context , char * temp_file_name, char * chro_name, int block_no)
4206 {
4207 FILE * tmp_fp = f_subr_open(temp_file_name,"rb");
4208 if(!tmp_fp)
4209 return 0;
4210 char * read_text, * qual_text;
4211 int xk1;
4212 unsigned int block_start_linear_pos = linear_gene_position(&global_context->chromosome_table, chro_name, block_no*BASE_BLOCK_LENGTH);
4213 int total_window_number =2 * (BASE_BLOCK_LENGTH / REASSEMBLY_WINDOW_LENGTH+1);
4214
4215 reassembly_block_context_t * block_context = (reassembly_block_context_t*)malloc(sizeof(reassembly_block_context_t));
4216 block_context -> block_start_linear_pos = block_start_linear_pos;
4217 block_context -> start_keys = (unsigned long long int *) calloc(sizeof(unsigned long long int),10 * total_window_number);
4218 block_context -> start_offsets = (short *)malloc(sizeof( short) * total_window_number);
4219
4220 assert(block_context -> start_offsets);
4221 assert(block_context -> start_keys);
4222
4223 block_context -> de_bruijn_graphs = ( HashTable ** )malloc(sizeof( HashTable *) * total_window_number);
4224 for(xk1=0; xk1< total_window_number; xk1++)
4225 {
4226 block_context -> de_bruijn_graphs[xk1] = HashTableCreate(10* REASSEMBLY_WINDOW_LENGTH);
4227 block_context -> start_offsets[xk1] = 0x7fff;
4228 }
4229
4230 read_text = (char *) malloc(MAX_READ_LENGTH);
4231 qual_text = (char *) malloc(MAX_READ_LENGTH);
4232 while(!feof(tmp_fp))
4233 {
4234 short read_len;
4235 unsigned int first_base_pos;
4236 base_block_temp_read_t read_rec;
4237 int rlen = fread(&read_rec, sizeof(read_rec), 1, tmp_fp);
4238 if(rlen<1) break;
4239 rlen = fread(&read_len, sizeof(short), 1, tmp_fp);
4240 if(rlen<1) return -1;
4241 rlen = fread(read_text, sizeof(char), read_len, tmp_fp);
4242 if(rlen<read_len) return -1;
4243 rlen = fread(qual_text, sizeof(char), read_len, tmp_fp);
4244 if(rlen<read_len) return -1;
4245 first_base_pos = read_rec.pos - block_no * BASE_BLOCK_LENGTH;
4246
4247 insert_pileup_read(global_context , block_context , first_base_pos , read_text , qual_text , read_len, read_rec.read_pos>0);
4248 }
4249
4250 for(xk1=0; xk1< total_window_number; xk1++)
4251 {
4252 if(block_context -> start_offsets[xk1]<0x7fff && block_context -> de_bruijn_graphs[xk1] -> numOfElements >3)
4253 {
4254 finalise_db_graph(global_context, block_context, xk1, block_context -> de_bruijn_graphs[xk1] , block_context -> start_keys [xk1*10], block_context -> start_offsets [xk1]);
4255 }
4256 HashTableDestroy(block_context -> de_bruijn_graphs[xk1]);
4257 }
4258
4259
4260 free(block_context -> de_bruijn_graphs);
4261 free(block_context -> start_keys);
4262 free(block_context -> start_offsets);
4263 free(block_context);
4264 free(read_text);
4265 free(qual_text);
4266 fclose(tmp_fp);
4267 return 0;
4268 }
4269
finalise_long_insertions_by_hashtable(global_context_t * global_context)4270 int finalise_long_insertions_by_hashtable(global_context_t * global_context)
4271 {
4272 int chro_i;
4273 assert(global_context -> index_block_number == 1);
4274 unsigned int chro_start_pos = 0;
4275 char tmp_fname[MAX_FILE_NAME_LENGTH+40];
4276
4277 sprintf(tmp_fname,"%s.reassembly.fa", global_context->config.output_prefix);
4278 global_context->long_insertion_FASTA_fp = f_subr_open(tmp_fname ,"wb");
4279
4280 for(chro_i=0; chro_i < global_context -> chromosome_table.total_offsets ; chro_i++)
4281 {
4282 unsigned int chro_length = global_context -> chromosome_table.read_offsets[chro_i] - chro_start_pos;
4283 unsigned int chro_current_pos = 0;
4284 for(chro_current_pos = 0; chro_current_pos < chro_length ; chro_current_pos += BASE_BLOCK_LENGTH)
4285 {
4286 char temp_file_name[MAX_FILE_NAME_LENGTH + 50];
4287 int block_no = chro_current_pos / BASE_BLOCK_LENGTH;
4288
4289 sprintf(temp_file_name,"%s@%s-%04u.bin", global_context -> config.temp_file_prefix, global_context -> chromosome_table.read_names+chro_i*MAX_CHROMOSOME_NAME_LEN , block_no );
4290
4291 finalise_pileup_file(global_context , temp_file_name, global_context -> chromosome_table.read_names+MAX_CHROMOSOME_NAME_LEN*chro_i, block_no);
4292
4293 unlink(temp_file_name);
4294 }
4295 chro_start_pos = global_context -> chromosome_table.read_offsets[chro_i];
4296 }
4297
4298 fclose(global_context -> long_insertion_FASTA_fp);
4299
4300 /*
4301 for(xk1=0; xk1 < global_context -> config.long_indel_iterations ; xk1++)
4302 {
4303 explorer_insertions_in_unmapped(global_context);
4304 }
4305 */
4306
4307 return 0;
4308
4309 }
destroy_pileup_table(HashTable * local_reassembly_pileup_files)4310 void destroy_pileup_table(HashTable* local_reassembly_pileup_files)
4311 {
4312 int bucket;
4313 KeyValuePair * cursor;
4314
4315 for(bucket=0; bucket< local_reassembly_pileup_files -> numOfBuckets; bucket++)
4316 {
4317 cursor = local_reassembly_pileup_files -> bucketArray[bucket];
4318 while (1)
4319 {
4320 if (!cursor) break;
4321 FILE * fp = (FILE *)cursor->value;
4322 if(fp != NULL+1)
4323 fclose(fp);
4324 free((void *)cursor->key);
4325 cursor = cursor->next;
4326 }
4327 }
4328
4329 HashTableDestroy(local_reassembly_pileup_files);
4330
4331
4332 }
4333
4334 char * _COREMAIN_delete_temp_prefix = NULL;
COREMAIN_SIGINT_hook(int param)4335 void COREMAIN_SIGINT_hook(int param)
4336 {
4337 #ifdef MAKE_STANDALONE
4338 int xk1, last_slash = -1;
4339 if(_COREMAIN_delete_temp_prefix != NULL)
4340 {
4341 char del2[MAX_FILE_NAME_LENGTH], del_suffix[MAX_FILE_NAME_LENGTH], del_name[MAX_FILE_NAME_LENGTH];
4342 SUBREADprintf("\n\nReceived a terminal signal. The temporary files were removed.\n");
4343 for(xk1=0; _COREMAIN_delete_temp_prefix[xk1]; xk1++)
4344 {
4345 if(_COREMAIN_delete_temp_prefix[xk1]=='/') last_slash = xk1;
4346 else if(_COREMAIN_delete_temp_prefix[xk1]=='\\')
4347 {
4348 SUBREADprintf("The file name is unknown.\n");
4349 return;
4350 }
4351 }
4352 if(last_slash>=0)
4353 {
4354 memcpy(del2, _COREMAIN_delete_temp_prefix, last_slash);
4355 del2[last_slash] = 0;
4356 strcpy(del_suffix , _COREMAIN_delete_temp_prefix + last_slash + 1);
4357 }
4358 else
4359 {
4360 strcpy(del2,".");
4361 strcpy(del_suffix , _COREMAIN_delete_temp_prefix);
4362 }
4363
4364 if(strlen(del_suffix)>8)
4365 {
4366 DIR *d;
4367 struct dirent *dir;
4368
4369 d = opendir(del2);
4370 if (d)
4371 {
4372 while ((dir = readdir(d)) != NULL)
4373 {
4374 if(strstr(dir->d_name, del_suffix))
4375 {
4376 //printf("%s\n", dir->d_name);
4377 strcpy(del_name, del2);
4378 strcat(del_name, "/");
4379 strcat(del_name, dir->d_name);
4380 unlink(del_name);
4381 }
4382 }
4383 closedir(d);
4384 }
4385 }
4386
4387 }
4388
4389 exit(param);
4390 #endif
4391 }
4392
4393
4394
finalise_long_insertions(global_context_t * global_context)4395 int finalise_long_insertions(global_context_t * global_context)
4396 {
4397 indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
4398 HashTable * local_reassembly_pileup_files = indel_context -> local_reassembly_pileup_files;
4399
4400 destroy_pileup_table(local_reassembly_pileup_files);
4401
4402 return finalise_long_insertions_by_hashtable(global_context);
4403 }
4404
init_global_context(global_context_t * context)4405 void init_global_context(global_context_t * context)
4406 {
4407 memset(context->module_contexts, 0, 5*sizeof(void *));
4408 memset(&context->config, 0, sizeof(configuration_t));
4409
4410 context->config.fast_run = 0;
4411 context->config.memory_use_multiplex = 1;
4412 context->config.report_sam_file = 1;
4413 context->config.do_breakpoint_detection = 0;
4414 context->config.do_fusion_detection = 0;
4415 context->config.do_long_del_detection = 0;
4416 context->config.do_structural_variance_detection = 0;
4417 context->config.more_accurate_fusions = 1;
4418 context->config.report_multi_mapping_reads = 0;
4419
4420 //#warning "============= best values for the SVs application: 8; 5; 32 ==============="
4421 context->config.top_scores = 8 - 5;
4422 context->config.max_vote_combinations = 5 - 3;
4423 context->config.max_vote_simples = 5;
4424 context->config.max_vote_number_cutoff = 1;
4425
4426 context->config.experiment_type = 0;
4427 context->config.prefer_donor_receptor_junctions = 1;
4428 context->config.maximum_translocation_length = 10000;
4429 context->config.maximum_colocating_distance = 500;
4430 context->config.do_big_margin_filtering_for_reads = 0;
4431 context->config.do_big_margin_filtering_for_junctions = 0;
4432 context->config.maximum_intron_length = 500000;
4433 context->config.use_hamming_distance_in_exon = 0;
4434 context->config.is_third_iteration_running = 0;
4435 context->input_reads.is_paired_end_reads = 0;
4436 context->will_remove_input_file = 0;
4437 context->config.ignore_unmapped_reads = 0;
4438 context->config.report_unmapped_using_mate_pos = 1;
4439 context->config.downscale_mapping_quality=0;
4440 context->config.ambiguous_mapping_tolerance = 39;
4441 context->config.use_hamming_distance_break_ties = 0;
4442 context->config.use_quality_score_break_ties = 0;
4443 context->config.extending_search_indels = 0;
4444 context->config.PE_predominant_weight = 0;
4445
4446 //#warning "============= best values for the SVs application: 3 ====================="
4447 context->config.multi_best_reads = 1;
4448 context->config.reported_multi_best_reads = 1;
4449 context->config.is_SAM_file_input=0;
4450 context->config.use_dynamic_programming_indel=0;
4451 context->config.use_bitmap_event_table = 1;
4452 context->config.convert_color_to_base = 0;
4453 context->config.is_gzip_fastq = 0;
4454
4455 context->config.is_BAM_output = 1;
4456 context->config.is_BAM_input = 0;
4457 context->config.is_input_read_order_required = 0;
4458 context->config.read_trim_5 = 0;
4459 context->config.read_trim_3 = 0;
4460 context->config.minimum_exonic_subread_fraction = -1.0;
4461 context->config.is_first_read_reversed = 0;
4462 context->config.is_second_read_reversed = 1;
4463 context->config.space_type = GENE_SPACE_BASE;
4464 context->config.minimum_pair_distance = 50;
4465 context->config.maximum_pair_distance = 600;
4466 context->config.expected_pair_distance = 300;
4467 context->config.restrected_read_order = 1;
4468 context->config.k_mer_length = 28;
4469
4470 context->config.reassembly_start_read_number = 2;
4471 context->config.reassembly_key_length = 28;
4472 context->config.reassembly_subread_length = 12;
4473 context->config.reassembly_window_multiplex = 3;
4474 context->config.reassembly_tolerable_voting = 1;
4475 context->config.reassembly_window_alleles = 1;
4476 context->config.do_superlong_indel_detection = 0;
4477 context->config.flanking_subread_indel_mismatch = 1;
4478
4479 context->config.total_subreads = 10;
4480 context->config.minimum_subread_for_first_read = 3;
4481 context->config.minimum_subread_for_second_read = 1;
4482 context->config.min_mapped_fraction = 0;
4483 context->config.max_mismatch_exonic_reads = 5;
4484 context->config.max_mismatch_junction_reads = 2;
4485 context->config.all_threads = 1;
4486 context->config.is_first_iteration_running = 1;
4487 context->config.is_second_iteration_running = 1;
4488
4489 context->config.reads_per_chunk = 20*1024*1024;
4490
4491 //#warning "=========== 2*1024*1024 IS FOR TESTING BLOCKING AND SHOULD BE COMMENTED ==============="
4492 // context->config.reads_per_chunk = 2*1024*1024;
4493
4494 context->config.use_memory_buffer = 1;
4495 context->config.is_methylation_reads = 0;
4496 context->config.report_no_unpaired_reads = 0;
4497 context->config.limited_tree_scan = 0;
4498 context->config.high_quality_base_threshold = 500000;
4499 context->config.report_multiple_best_in_pairs = 0;
4500 context->config.realignment_minimum_variant_distance = 16;
4501
4502 context->config.init_max_event_number = 70000;
4503 context->config.show_soft_cliping = 1 ;
4504 context->config.big_margin_record_size = 9;
4505
4506 context->config.read_group_id[0] = 0;
4507 context->config.read_group_txt[0] = 0;
4508 context->config.first_read_file[0] = 0;
4509 context->config.second_read_file[0] = 0;
4510 context->config.index_prefix[0] = 0;
4511 context->config.output_prefix[0] = 0;
4512 context->config.exon_annotation_file[0] = 0;
4513 context->config.SAM_extra_columns = 0;
4514 context->config.exon_annotation_file_type = FILE_TYPE_GTF;
4515 strcpy(context->config.exon_annotation_gene_id_column, "gene_id");
4516 strcpy(context->config.exon_annotation_feature_name_column, "exon");
4517
4518 context->config.DP_penalty_create_gap = -1;
4519 context->config.DP_penalty_extend_gap = 0;
4520 context->config.DP_match_score = 2;
4521 context->config.DP_mismatch_penalty = 0;
4522
4523 context->config.max_insertion_at_junctions=0;
4524 context->config.check_donor_at_junctions=1;
4525 memset(&context -> input_reads, 0, sizeof(read_input_t));
4526
4527 signal (SIGTERM, COREMAIN_SIGINT_hook);
4528 signal (SIGINT, COREMAIN_SIGINT_hook);
4529
4530
4531 int seed_rand[2];
4532 double double_time = miltime();
4533 memcpy(seed_rand, &double_time, 2*sizeof(int));
4534 myrand_srand(seed_rand[0]^seed_rand[1]); // the seed is NOT used in R because myrand_srand will always takes four random numbers from R's RNG
4535
4536 context->config.max_indel_length = 5;
4537 context->config.phred_score_format = FASTQ_PHRED33;
4538 context->start_time = miltime();
4539
4540 context->timecost_load_index = 0;
4541 context->timecost_voting = 0;
4542 context->timecost_before_realign = 0;
4543 context->timecost_for_realign = 0;
4544 }
4545
init_core_temp_path(global_context_t * context)4546 void init_core_temp_path(global_context_t * context){
4547 int x1;
4548 char mac_rand[13];
4549 mac_or_rand_str(mac_rand);
4550
4551 context->config.temp_file_prefix[0] = 0;
4552 if(context->config.output_prefix[0]){
4553 for(x1 = strlen(context->config.output_prefix); x1 >=0; x1--){
4554 if(context->config.output_prefix[x1]=='/'){
4555 memcpy(context->config.temp_file_prefix, context->config.output_prefix, x1);
4556 context->config.temp_file_prefix[x1]=0;
4557 break;
4558 }
4559 }
4560 }
4561
4562 if(context->config.temp_file_prefix[0] == 0)strcpy(context->config.temp_file_prefix, "./");
4563 sprintf(context->config.temp_file_prefix+strlen(context->config.temp_file_prefix), "/core-temp-sum-%06u-%s", getpid(), mac_rand );
4564 _COREMAIN_delete_temp_prefix = context->config.temp_file_prefix;
4565 }
4566
4567 #define INDEL_MASK_BY_INSERTION 1
4568 #define INDEL_MASK_BY_DELETION 2
4569 #define INDEL_MASK_BY_MATCH 0
4570 #define INDEL_MASK_BY_MISMATCH 3
4571
4572
4573 int CORE_DPALIGN_CREATEGAP_PENALTY = -1;
4574 int CORE_DPALIGN_EXTENDGAP_PENALTY = 0;
4575 int CORE_DPALIGN_MATCH_SCORE = 2;
4576 int CORE_DPALIGN_MISMATCH_PENALTY = 0;
4577
4578
core_dynamic_align(global_context_t * global_context,thread_context_t * thread_context,char * read,int read_len,unsigned int begin_position,char * movement_buffer,int expected_offset,char * read_name)4579 int core_dynamic_align(global_context_t * global_context, thread_context_t * thread_context, char * read, int read_len, unsigned int begin_position, char * movement_buffer, int expected_offset, char * read_name)
4580 // read must be converted to the positive strand.
4581 // movement buffer: 0:match, 1: read-insert, 2: gene-insert, 3:mismatch
4582 // the size of the movement buffer must be equal to the length of the read plus max_indel * 3.
4583 {
4584 int max_indel = min(16 , global_context->config.max_indel_length);
4585 int i,j;
4586
4587 CORE_DPALIGN_CREATEGAP_PENALTY = global_context -> config.DP_penalty_create_gap;
4588 CORE_DPALIGN_EXTENDGAP_PENALTY = global_context -> config.DP_penalty_extend_gap;
4589 CORE_DPALIGN_MATCH_SCORE = global_context -> config.DP_match_score;
4590 CORE_DPALIGN_MISMATCH_PENALTY = global_context -> config.DP_mismatch_penalty;
4591
4592 if(read_len < 3 || abs(expected_offset) > max_indel)
4593 return 0;
4594 if(expected_offset < 0 && read_len < (3-expected_offset))
4595 return 0;
4596
4597 gene_value_index_t * current_value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
4598
4599 indel_context_t * indel_context = (indel_context_t*)global_context -> module_contexts[MODULE_INDEL_ID];
4600
4601 //unsigned long long table_ptr = (unsigned long long) indel_context -> dynamic_align_table;
4602
4603 short ** table = indel_context -> dynamic_align_table;
4604 char ** table_mask = indel_context -> dynamic_align_table_mask;
4605 if(thread_context)
4606 {
4607 indel_thread_context_t * indel_thread_context = (indel_thread_context_t*)thread_context -> module_thread_contexts[MODULE_INDEL_ID];
4608 table = indel_thread_context -> dynamic_align_table;
4609 table_mask = indel_thread_context -> dynamic_align_table_mask;
4610 }
4611
4612
4613 if(0 && strcmp(read_name, "MISEQ:13:000000000-A1H1M:1:1112:12194:5511") == 0)
4614 {
4615 int ii;
4616 for(ii = 0; ii<read_len - expected_offset; ii++)
4617 SUBREADprintf("%c",gvindex_get(current_value_index, begin_position + ii));
4618
4619 SUBREADprintf ("\n%s\n", read);
4620 }
4621
4622
4623 // vertical move: deletion (1)
4624 // horizontal move: insertion (2)
4625 // cross move: match (0) or mismatch (3)
4626 // i: vertical move; j: horizontal move
4627
4628 //SUBREADprintf("DM[%d]: %p %d,%d\n", thread_context -> thread_id, table_mask, read_len + expected_offset, read_len);
4629 for (i=0; i<read_len + expected_offset; i++)
4630 {
4631 for(j=0; j<read_len; j++)
4632 {
4633 if(0&&(i >= read_len + expected_offset || j >= read_len))
4634 SUBREADprintf("XXDM[%d]: %p %d,%d\n", thread_context -> thread_id, table_mask, read_len + expected_offset, read_len);
4635 table_mask[i][j]=0;
4636
4637 if(0&&(i >= read_len + expected_offset || j >= read_len))
4638 SUBREADprintf("YYDM[%d]: %p %d,%d\n", thread_context -> thread_id, table_mask, read_len + expected_offset, read_len);
4639
4640 if (j < i - max_indel || j > max_indel + i)
4641 {
4642 table[i][j]=-9999;
4643 if(0 && strcmp(read_name, "MISEQ:13:000000000-A1H1M:1:1112:12194:5511") == 0)
4644 {
4645 putchar('\t');
4646 }
4647 continue;
4648 }
4649
4650 short from_upper;
4651
4652 if (i>0) from_upper = table[i-1][j] + (table_mask[i-1][j] == INDEL_MASK_BY_DELETION?CORE_DPALIGN_EXTENDGAP_PENALTY:CORE_DPALIGN_CREATEGAP_PENALTY);
4653 else from_upper = -9999;
4654
4655 short from_left;
4656
4657 if (j>0) from_left = table[i][j-1] + (table_mask[i][j-1] == INDEL_MASK_BY_INSERTION?CORE_DPALIGN_EXTENDGAP_PENALTY:CORE_DPALIGN_CREATEGAP_PENALTY);
4658 else from_left = -9999;
4659
4660 char chromo_ch = gvindex_get(current_value_index, begin_position + i);
4661 char is_matched_ij = (chromo_ch == read[j])?CORE_DPALIGN_MATCH_SCORE:CORE_DPALIGN_MISMATCH_PENALTY;
4662
4663 short from_upperleft;
4664
4665 if (i>0 && j>0) from_upperleft = table[i-1][j-1] + is_matched_ij;
4666 else if(i==0 && j==0) from_upperleft = is_matched_ij;
4667 else from_upperleft = -9999;
4668
4669 if (from_upperleft == from_upper && from_upperleft > from_left)
4670 {
4671 table_mask[i][j]= INDEL_MASK_BY_DELETION;
4672 table[i][j] = from_upper;
4673 }
4674 else if(from_upperleft == from_left && from_upperleft > from_upper)
4675 {
4676 table_mask[i][j]= INDEL_MASK_BY_INSERTION;
4677 table[i][j] = from_left;
4678 }
4679 else if(from_upperleft > from_left && from_upperleft > from_upper)
4680 {
4681 table_mask[i][j]= (chromo_ch == read[j])?INDEL_MASK_BY_MATCH:INDEL_MASK_BY_MISMATCH;
4682 table[i][j] = from_upperleft;
4683 }
4684 else if(from_upperleft == from_left && from_upperleft == from_upper)
4685 {
4686 table_mask[i][j]= (chromo_ch == read[j])?INDEL_MASK_BY_MATCH:INDEL_MASK_BY_MISMATCH;
4687 table[i][j] = from_upperleft;
4688 }
4689 else if(from_left > from_upper)
4690 {
4691 table_mask[i][j]= INDEL_MASK_BY_INSERTION;
4692 table[i][j] = from_left;
4693 }
4694 else if(from_left <= from_upper)
4695 {
4696 table_mask[i][j]= INDEL_MASK_BY_DELETION;
4697 table[i][j] = from_upper;
4698 }
4699
4700 if(0 && strcmp(read_name, "MISEQ:13:000000000-A1H1M:1:1112:12194:5511") == 0)
4701 SUBREADprintf("%c%c\t", chromo_ch, read[j]);
4702 }
4703 if(0 && strcmp(read_name, "MISEQ:13:000000000-A1H1M:1:1112:12194:5511") == 0)
4704 SUBREADputs("");
4705
4706 }
4707 #ifdef indel_debug
4708 //SUBREADputs("");
4709 //SUBREADputs("");
4710
4711 #endif
4712
4713 short path_i = read_len + expected_offset - 1;
4714 int out_pos = 0, delta=0;
4715 j = read_len - 1;
4716
4717 if(0 && strcmp(read_name, "MISEQ:13:000000000-A1H1M:1:1112:12194:5511") == 0)
4718 {
4719 int ii,jj;
4720 for(ii=0;ii< path_i+1; ii++)
4721 {
4722 SUBREADprintf("%d\t", ii);
4723 for(jj=0; jj<j+1; jj++)
4724 SUBREADprintf("% 6d",table[ii][jj]);
4725 SUBREADputs("");
4726 }
4727 SUBREADprintf(" \t");
4728 for(jj=0; jj<j+1; jj++)
4729 SUBREADprintf("#%4d ",jj);
4730 SUBREADputs("");
4731 SUBREADputs("");
4732
4733 for(ii=0;ii< path_i+1; ii++)
4734 {
4735 for(jj=0; jj<j+1; jj++)
4736 SUBREADprintf("% 6d",table_mask[ii][jj]);
4737 SUBREADputs("");
4738 }
4739 }
4740
4741 while(1)
4742 {
4743 if(table_mask[path_i][j] == INDEL_MASK_BY_INSERTION)
4744 {
4745 j--;
4746 delta --;
4747 movement_buffer[out_pos++] = 2;
4748 }
4749 else if(table_mask[path_i][j] == INDEL_MASK_BY_DELETION)
4750 {
4751 path_i--;
4752 delta ++;
4753 movement_buffer[out_pos++] = 1;
4754 }
4755 else if(table_mask[path_i][j] == INDEL_MASK_BY_MATCH || table_mask[path_i][j] == INDEL_MASK_BY_MISMATCH)
4756 {
4757 movement_buffer[out_pos++] = table_mask[path_i][j] == INDEL_MASK_BY_MATCH?0:3;
4758 path_i--;
4759 j--;
4760 }
4761
4762 if(path_i == -1 && j == -1) break;
4763 if(j<0 || path_i<0) return 0;
4764 }
4765 //out_pos++;
4766 //movement_buffer[out_pos]= 0;
4767
4768 if(expected_offset!=delta)return 0;
4769 for(i=0; i<out_pos/2; i++)
4770 {
4771 char tmp;
4772 tmp = movement_buffer[out_pos-1-i];
4773 movement_buffer[out_pos-1-i] = movement_buffer[i];
4774 movement_buffer[i] = tmp;
4775 }
4776
4777 if(0 && strcmp(read_name, "MISEQ:13:000000000-A1H1M:1:1112:12194:5511") == 0)
4778 {
4779 for(i=0; i<out_pos; i++)
4780 {
4781 char tmp = movement_buffer[i];
4782 switch(tmp){
4783 case 0: putchar('=');break;
4784 case 1: putchar('D');break;
4785 case 2: putchar('I');break;
4786 case 3: putchar('X');break;
4787 }
4788 }
4789 putchar('\n');
4790 }
4791 return out_pos;
4792
4793 }
4794
4795
4796