1 /***************************************************************
2
3 The Subread software package is free software package:
4 you can redistribute it and/or modify it under the terms
5 of the GNU General Public License as published by the
6 Free Software Foundation, either version 3 of the License,
7 or (at your option) any later version.
8
9 Subread is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty
11 of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12
13 See the GNU General Public License for more details.
14
15 Authors: Drs Yang Liao and Wei Shi
16
17 ***************************************************************/
18
19
20 #include <stdlib.h>
21 #include <stdio.h>
22 #include <string.h>
23 #include <ctype.h>
24 #include <assert.h>
25 #include <unistd.h>
26 #include "subread.h"
27 #include "sublog.h"
28 #include "gene-value-index.h"
29 #include "gene-algorithms.h"
30 #include "input-files.h"
31 #include "core.h"
32 #include "core-indel.h"
33 #include "core-junction.h"
34 #include "core-bigtable.h"
35
// A fixed read name; it looks like a leftover used to single out one read while
// debugging. It is not referenced in the visible part of this file.
36 #define TTTSNAME "V0112_0155:7:1308:1308:136442"
37
// The three values below are distinct powers of two, so they can be OR-ed
// together; presumably bit flags describing a donor's mapping status
// (TODO confirm -- they are not used in the visible part of this file).
38 #define CLUSTER_ALIGNMENT_DONOR_R1_MAPPED 2
39 #define CLUSTER_ALIGNMENT_DONOR_R2_MAPPED 4
40 #define CLUSTER_ALIGNMENT_DONOR_NEGATIVE_STRAND 1
41
42
/*
 * abs32uint: treat x as a 32-bit two's-complement value and return its
 * magnitude.
 *
 * Values up to 0x7fffffff are returned unchanged; larger values are replaced
 * by (0xffffffff - x) + 1, so 0xffffffff yields 1 and 0x80000000 yields
 * 0x80000000.
 */
unsigned int abs32uint(unsigned int x){
	const unsigned int largest_non_negative = 0x7fffffff;

	if(x <= largest_non_negative)
		return x;

	return (0xffffffff - x) + 1;
}
47
// select_junction_record_t: one candidate record pairing a "main" piece with a
// "minor" piece of a read (offsets, coverage, votes, match quality) together
// with the properties of the split between them.
// NOTE(review): the type is not used in the visible part of this file; the
// per-field notes below are read off the field names only -- confirm against
// the code that fills the record.
48 typedef struct{
// Absolute offsets of the two pieces (presumably linear genome offsets).
49 unsigned int piece_main_abs_offset;
50 unsigned int piece_minor_abs_offset;
51 int piece_main_masks;
// Range covered by the main piece (presumably read coordinates).
52 short piece_main_coverage_start;
53 short piece_main_coverage_end;
54
// Per-piece match / quality figures and the score of the minor piece.
55 short piece_main_hamming_match;
56 short piece_main_read_quality;
57 short piece_minor_hamming_match;
58 short piece_minor_read_quality;
59 int piece_minor_score;
// NOTE(review): a short holds at most 32767 -- confirm this cannot overflow
// for the intron sizes stored here.
60 short intron_length;
61
// Indel information and vote counts for the pieces.
62 gene_vote_number_t *piece_main_indel_record;
63 unsigned short piece_main_indels;
64 unsigned short piece_minor_indel_offset;
65 gene_vote_number_t piece_main_votes;
66 gene_vote_number_t piece_minor_votes;
67
// Range covered by the minor piece and the position of the split.
68 short piece_minor_coverage_start;
69 short piece_minor_coverage_end;
70 short split_point;
// Small flags describing the split itself.
71 char inserted_bases;
72 char is_GT_AG_donors;
73 char is_donor_found_or_annotation;
74 char is_strand_jumped;
75 char is_break_even;
76
77 //unsigned long long int Score_H;
78 //unsigned long long int Score_L;
79 } select_junction_record_t;
80
81
/*
 * debug_show_event: debugging aid that prints both sides of an event.
 *
 * The small and the large side of `event` are converted to position strings
 * with absoffset_to_posstr() and written through SUBREADprintf().
 */
void debug_show_event(global_context_t* global_context, chromosome_event_t * event){
	char small_side_text[100];
	char large_side_text[100];

	absoffset_to_posstr(global_context, event -> event_small_side, small_side_text);
	absoffset_to_posstr(global_context, event -> event_large_side, large_side_text);

	SUBREADprintf("Event between %s and %s\n", small_side_text, large_side_text);
}
88
/*
 * get_offset_maximum_chro_pos: look up the entry of the chromosome table that
 * contains the linear offset `linear` and return the size of its unpadded
 * part.
 *
 * The table's read_offsets[] array is searched for the first entry that is
 * greater than `linear`: a bisection narrows the range first, then a forward
 * scan (restarted two entries earlier, as a safety margin) finds the entry.
 *
 * Returns
 *   the span of the entry minus twice the padding plus 16, when `linear`
 *       lies inside [padding, padding + span) relative to the entry start;
 *   -1  when `linear` falls outside that window (i.e. in the padding);
 *   -2  when no entry of read_offsets[] is greater than `linear`.
 *
 * thread_context is not used.
 */
int get_offset_maximum_chro_pos(global_context_t * global_context, thread_context_t * thread_context, unsigned int linear){
	gene_offset_t * chro_table = & global_context -> chromosome_table;
	int table_size = chro_table -> total_offsets;
	int lo = 0, hi = table_size - 1;
	int idx;

	// Bisection: stop when the window is at most two entries wide or on an exact hit.
	while(lo < hi - 1){
		int mid = (lo + hi) / 2;

		if(linear > chro_table -> read_offsets[mid])
			lo = mid;
		else if(linear < chro_table -> read_offsets[mid])
			hi = mid;
		else
			break;
	}

	for(idx = max(0, lo - 2); idx < chro_table -> total_offsets; idx++){
		int usable_span;
		unsigned int entry_start = 0;
		unsigned int pos_in_entry;

		if(!(chro_table -> read_offsets[idx] > linear))
			continue;

		if(idx == 0){
			usable_span = chro_table -> read_offsets[0] - chro_table -> padding * 2 + 16;
		}else{
			usable_span = ( chro_table -> read_offsets[idx] - chro_table -> read_offsets[idx - 1] ) - chro_table -> padding * 2 + 16;
			entry_start = chro_table -> read_offsets[idx - 1];
		}

		pos_in_entry = linear - entry_start;
		if(pos_in_entry < chro_table -> padding || pos_in_entry >= chro_table -> padding + usable_span)
			return -1;

		return usable_span;
	}

	return -2;
}
123
124
// search_events_to_front: recursively explain the part of a read that starts
// at read_text[0] by walking forward through known events (indels, junctions,
// fusions) stored in the indel module's event table.
//
// For every candidate read offset the function looks up events at the
// matching chromosome position; for every usable event it records a section
// boundary in explain_context -> tmp_search_junctions, recurses on the rest of
// the read behind the event, and then undoes its changes to explain_context.
// Independently of any events, the whole stretch is finally scored as one
// ungapped section and offered to new_explain_try_replace().
//
//   read_text / qual_text  : read bases / qualities from the current position
//                            (qual_text is only forwarded to the recursion).
//   read_head_abs_offset   : chromosome offset of the first base to explain.
//   remainder_len          : number of read bases still to explain.
//   sofar_matched          : matched bases accumulated by the callers.
//   suggested_movement     : if non-zero, the scan starts at this value - 1.
//   do_not_jump            : if non-zero (and no suggested_movement), the scan
//                            starts at read offset 0.
125 // read_head_abs_offset is the offset of the FIRST WANTED base.
search_events_to_front(global_context_t * global_context,thread_context_t * thread_context,explain_context_t * explain_context,char * read_text,char * qual_text,unsigned int read_head_abs_offset,short remainder_len,short sofar_matched,int suggested_movement,int do_not_jump)126 void search_events_to_front(global_context_t * global_context, thread_context_t * thread_context, explain_context_t * explain_context, char * read_text , char * qual_text, unsigned int read_head_abs_offset, short remainder_len, short sofar_matched, int suggested_movement, int do_not_jump)
127 {
128 short tested_read_pos;
129 // #warning "SUBREAD_151 REMOVE THIS ASSERTION! "
130 // if(remainder_len >= 102)SUBREADprintf("FATAL:%d\n", remainder_len );
131 // assert(remainder_len < 102);
132
133 HashTable * event_table = NULL;
134 chromosome_event_t * event_space = NULL;
135
// Use the per-thread event table when a thread context is supplied,
// otherwise the one stored in the global context.
136 if(thread_context) {
137 event_table = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_entry_table;
138 event_space = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
139 } else {
140 event_table = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_entry_table;
141 event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
142 }
143
144 gene_value_index_t * value_index = thread_context?thread_context->current_value_index:global_context->current_value_index ;
145
// Scan for events only if fusion / long-deletion detection is enabled or
// there_are_events_in_range() reports events for this stretch.
146 if((global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection)|| there_are_events_in_range(event_table->appendix1, read_head_abs_offset, remainder_len)) {
147
// With fusion / long-deletion detection an event may be entered from either
// side; otherwise only from its small side.
148 int event_search_method;
149 if((global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection))
150 event_search_method = EVENT_SEARCH_BY_BOTH_SIDES;
151 else
152 event_search_method = EVENT_SEARCH_BY_SMALL_SIDE;
153
154 // tested_read_pos is the index of the first base unwanted!
155
156
// First read offset to test: 0 when do_not_jump is set, otherwise
// realignment_minimum_variant_distance; a non-zero suggested_movement
// overrides both with suggested_movement - 1.
157 int move_start = do_not_jump?0:global_context -> config.realignment_minimum_variant_distance;
158 if(suggested_movement) move_start = suggested_movement-1;
159 int is_junction_scanned = 0;
160
// Only scan while tmp_search_junctions can still take one more section.
161 if(MAX_EVENTS_IN_READ - 1 > explain_context -> tmp_search_sections)
162 for(tested_read_pos = move_start ; tested_read_pos <= remainder_len; tested_read_pos++)
163 {
164 int xk1, matched_bases_to_site;
165 chromosome_event_t *site_events[MAX_EVENT_ENTRIES_PER_SITE+1];
166
167 int jump_penalty = 0;
168
// Chromosome position of the last base in front of tested_read_pos; the
// position moves downwards when the current section is strand-jumped.
169 unsigned potential_event_pos;
170 if(explain_context -> current_is_strand_jumped)
171 potential_event_pos = read_head_abs_offset - tested_read_pos +1;
172 else
173 potential_event_pos = read_head_abs_offset + tested_read_pos -1;
// Cheap bitmap pre-check before the real table lookup.
174 if(!check_event_bitmap( event_table->appendix1, potential_event_pos )) continue;
175
176 int search_types = CHRO_EVENT_TYPE_INDEL | CHRO_EVENT_TYPE_JUNCTION | CHRO_EVENT_TYPE_FUSION;
177 int site_events_no = search_event(global_context, event_table , event_space , potential_event_pos, event_search_method , search_types , site_events);
178
179
180 if(0 && FIXLENstrcmp("R010442852", explain_context -> read_name) == 0)
181 {
182 SUBREADprintf("FOUND THE EVENT FRONT:%d at %u\n", site_events_no, potential_event_pos);
183 if(site_events_no)
184 SUBREADprintf("EVENT0_type = %d\n", site_events[0]->event_type);
185 }
186
187 if(!site_events_no)continue;
188
189 unsigned int tested_chro_begin;
190 if(explain_context -> current_is_strand_jumped)
191 tested_chro_begin = read_head_abs_offset - tested_read_pos + 1;
192 else
193 tested_chro_begin = read_head_abs_offset;
194
// Matched bases of the first tested_read_pos read bases against the reference.
195 matched_bases_to_site = match_chro(read_text, value_index, tested_chro_begin, tested_read_pos, explain_context -> current_is_strand_jumped, global_context -> config.space_type);
196
197 /*
198 #warning "========= COMMENT TWO LINES ===================="
199 SUBREADprintf("MBASETOSITE=%d, tested_read_pos=%d\n", matched_bases_to_site, tested_read_pos);
200 SUBREADprintf("TXT=%s, tested_read_pos=%d\n", read_text, tested_chro_begin);
201 */
202
203 int this_round_junction_scanned = 0;
204
205 if(0 && FIXLENstrcmp("R010442852", explain_context -> read_name) == 0)
206 SUBREADprintf("F_JUMP? match=%d / tested=%d\n", matched_bases_to_site , tested_read_pos);
207
// Follow the events only if the try budget (REALIGN_TOTAL_TRIES) is not used
// up, tested_read_pos is positive, and the prefix matches in more than
// 7000 of 10000 bases (9000 - 2000) -- or maximise_sensitivity_indel is set.
208 //#warning "========= remove - 2000 from next line ============="
209 if(explain_context -> total_tries < REALIGN_TOTAL_TRIES && tested_read_pos >0 && ( matched_bases_to_site*10000/tested_read_pos > 9000 - 2000 || global_context->config.maximise_sensitivity_indel) )
210 for(xk1 = 0; xk1 < site_events_no ; xk1++)
211 {
212 chromosome_event_t * tested_event = site_events[xk1];
213
// For fully covered reads, skip fusions spanning more than MAX_DELETION_LENGTH.
214 if(explain_context -> is_fully_covered && tested_event -> event_type == CHRO_EVENT_TYPE_FUSION && tested_event -> event_large_side - tested_event -> event_small_side > MAX_DELETION_LENGTH){
215 continue;
216 }
217 //if(explain_context -> pair_number == 23)
218 if(0 && FIXLENstrcmp("R010442852", explain_context -> read_name) == 0){
219 SUBREADprintf("F_JUMP?%d > %d %s (%u) ; SEARCH_TAG=%u\n", (1+matched_bases_to_site)*10000 / tested_read_pos , 9000, read_text, tested_chro_begin, potential_event_pos);
220 debug_show_event(global_context, tested_event);
221
222 }
223
224 // note that these two values are the index of the first wanted base.
225 unsigned int new_read_head_abs_offset;
226
// In both-sides search mode an indel is skipped when it was reached from its
// small side on a strand-jumped section, or from its large side otherwise.
227 if((global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection) && tested_event -> event_type == CHRO_EVENT_TYPE_INDEL)
228 {
229 if(explain_context ->current_is_strand_jumped){
230 if(potential_event_pos == tested_event-> event_small_side) continue;
231 }else{
232 if(potential_event_pos == tested_event-> event_large_side) continue;
233 }
234 }
235 if( tested_event -> event_type != CHRO_EVENT_TYPE_INDEL){
236 this_round_junction_scanned = 1;
237 }
238
// The next section starts on the side of the event opposite to the one we
// arrived at (always the large side when searching by small side only).
239 if((global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection))// && tested_event->event_type == CHRO_EVENT_TYPE_FUSION)
240 new_read_head_abs_offset = (potential_event_pos == tested_event -> event_large_side)?tested_event -> event_small_side:tested_event -> event_large_side;
241 else
242 new_read_head_abs_offset = tested_event -> event_large_side;
243
244
// Read bases left behind the event: a negative indel_length and
// indel_at_junction both consume read bases.
245 short new_remainder_len = remainder_len - tested_read_pos + min(0, tested_event->indel_length) - tested_event -> indel_at_junction;
246
247 // #warning "SUBREAD_151 REMOVE THIS ASSERTION! "
248 // assert(new_remainder_len < 102);
249
250 if(new_remainder_len>0)
251 {
252 //if(explain_context -> pair_number==2074) printf("JUMPPED IN.\n");
253
// Close the current section at the event and prepare the start of the next one.
254 explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].read_pos_end = explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].read_pos_start + tested_read_pos;
255 explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].event_after_section = tested_event;
256 explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].is_connected_to_large_side = (potential_event_pos == tested_event -> event_large_side);
257 explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections + 1].read_pos_start = tested_read_pos - min(0, tested_event -> indel_length) + tested_event -> indel_at_junction;
258 explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections + 1].abs_offset_for_start = new_read_head_abs_offset;
259
260
// Taking a fusion costs two matched bases in the score passed down.
261 if(tested_event->event_type == CHRO_EVENT_TYPE_FUSION) jump_penalty = 2;
262
// Save the state that is overwritten for the recursion below.
263 int current_is_jumped = explain_context -> current_is_strand_jumped;
264 int current_sup_as_complex = explain_context -> tmp_min_support_as_complex;
265 int current_sup_as_simple = explain_context -> tmp_support_as_simple;
266 //int current_unsup_as_simple = explain_context -> tmp_min_unsupport;
267 int current_pure_donor_found = explain_context -> tmp_is_pure_donor_found_explain;
268
269 explain_context -> tmp_support_as_simple = tested_event -> supporting_reads;
// Events with bit 64 set in is_donor_found_or_annotation never lower the
// minimum support (they count as 0x7fffffff).
270 explain_context -> tmp_min_support_as_complex = min((tested_event -> is_donor_found_or_annotation & 64)?0x7fffffff:tested_event -> supporting_reads,explain_context -> tmp_min_support_as_complex);
271 explain_context -> tmp_min_unsupport = min(tested_event -> anti_supporting_reads,explain_context -> tmp_min_unsupport);
272 explain_context -> tmp_is_pure_donor_found_explain = explain_context -> tmp_is_pure_donor_found_explain && tested_event -> is_donor_found_or_annotation;
273 explain_context -> tmp_indel_penalty += ( tested_event -> event_type == CHRO_EVENT_TYPE_INDEL );
274
// A strand-jumping fusion flips the strand of everything behind it.
275 if(tested_event -> event_type == CHRO_EVENT_TYPE_FUSION && tested_event -> is_strand_jumped)
276 explain_context -> current_is_strand_jumped = !explain_context -> current_is_strand_jumped;
277
278 explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections + 1].is_strand_jumped = explain_context -> current_is_strand_jumped;
279 explain_context -> tmp_search_sections ++;
280
281 explain_context -> total_tries ++;
// Recurse on the rest of the read behind the event.
// NOTE(review): read_text is advanced by indel_at_junction as well, qual_text
// is not -- confirm this asymmetry is intended (qual_text is not read here).
282 search_events_to_front(global_context, thread_context, explain_context, read_text + tested_event -> indel_at_junction + tested_read_pos - min(0, tested_event->indel_length), qual_text + tested_read_pos - min(0, tested_event->indel_length), new_read_head_abs_offset, new_remainder_len, sofar_matched + matched_bases_to_site - jump_penalty, tested_event -> connected_next_event_distance, 0);
283 explain_context -> tmp_search_sections --;
284
// Undo the changes made for the recursion.
// NOTE(review): tmp_min_unsupport is NOT restored (its save/restore lines are
// commented out) -- confirm this is intended.
285 explain_context -> current_is_strand_jumped = current_is_jumped;
286 explain_context -> tmp_indel_penalty -= ( tested_event -> event_type == CHRO_EVENT_TYPE_INDEL );
287 explain_context -> tmp_min_support_as_complex = current_sup_as_complex;
288 explain_context -> tmp_support_as_simple = current_sup_as_simple;
289 //explain_context -> tmp_min_unsupport = current_unsup_as_simple;
290 explain_context -> tmp_is_pure_donor_found_explain = current_pure_donor_found;
291 }
292 //if(global_context ->config.limited_tree_scan) break;
293 }
// With limited_tree_scan and a read no longer than EXON_LONG_READ_LENGTH, stop
// after the first read position at which events were found.
294 if( (global_context ->config.limited_tree_scan) && explain_context -> full_read_len <= EXON_LONG_READ_LENGTH) break;
// NOTE(review): is_junction_scanned is updated here but never read in this function.
295 is_junction_scanned = max(is_junction_scanned, this_round_junction_scanned);
296 }
297 }
298 //#warning "SUBREAD_151 REMOVE THE ASSERT! "
299 //assert( remainder_len< 102 );
// Whether or not events were followed, also score the whole remaining stretch
// as a single ungapped section and offer it as a candidate explanation.
300 int whole_section_matched = match_chro(read_text , value_index, explain_context -> current_is_strand_jumped?read_head_abs_offset - remainder_len +1:read_head_abs_offset, remainder_len , explain_context -> current_is_strand_jumped, global_context -> config.space_type);
301
302 explain_context -> tmp_total_matched_bases = whole_section_matched + sofar_matched ;
303
304 new_explain_try_replace(global_context, thread_context, explain_context, remainder_len, 0);
305 }
306
// new_explain_try_replace: offer the explanation currently held in the tmp_*
// fields of explain_context (tmp_search_junctions, tmp_search_sections,
// tmp_total_matched_bases, ...) as a candidate for the best explanation.
//
// Candidates are ranked by "matched bases minus indel penalty":
//   - strictly higher than the best so far: the candidate becomes the only
//     stored result (slot 0);
//   - equal: the tie is broken on support counts; the candidate may replace
//     the best, be appended as an additional equally good result (up to
//     MAX_ALIGNMENT_PER_ANCHOR), or be ignored;
//   - lower: the function returns without changing anything.
//
//   remainder_len  : length of the last section; used to close that section
//                    when searching to the front.
//   search_to_back : non-zero stores into result_back_*, zero into
//                    result_front_*.
new_explain_try_replace(global_context_t * global_context,thread_context_t * thread_context,explain_context_t * explain_context,int remainder_len,int search_to_back)307 void new_explain_try_replace(global_context_t* global_context, thread_context_t * thread_context, explain_context_t * explain_context, int remainder_len, int search_to_back)
308 {
309 int is_better_result = 0, is_same_best = 0;
310
311 if(0 && FIXLENstrcmp("simulated.11420793", explain_context->read_name)==0){
312 SUBREADprintf("TRY_REPLACE : %s has best=%d, b_evn=%d, tscore=%d, t_evn=%d\n", explain_context->read_name, explain_context -> best_matching_bases , explain_context -> best_is_complex , explain_context-> tmp_total_matched_bases, explain_context -> tmp_search_sections );
313 }
314
// Case 1: the candidate scores strictly higher -- take over all best_* fields.
315 if(explain_context -> best_matching_bases - explain_context -> best_indel_penalty < explain_context-> tmp_total_matched_bases - explain_context -> tmp_indel_penalty)
316 {
317 is_better_result = 1;
318 explain_context -> best_is_complex = explain_context -> tmp_search_sections ;
319 explain_context -> is_currently_tie = 0;
320 explain_context -> best_support_as_simple = explain_context -> tmp_support_as_simple;
321 explain_context -> best_min_unsupport_as_simple = explain_context -> tmp_min_unsupport;
322 explain_context -> best_min_support_as_complex = explain_context -> tmp_min_support_as_complex;
323 explain_context -> best_is_pure_donor_found_explain = explain_context -> tmp_is_pure_donor_found_explain;
// The previous best becomes the second best unless a higher one is already recorded.
324 explain_context -> second_best_matching_bases = max(explain_context -> second_best_matching_bases, explain_context -> best_matching_bases);
325 explain_context -> best_matching_bases = explain_context-> tmp_total_matched_bases ;
326 explain_context -> best_indel_penalty = explain_context -> tmp_indel_penalty;
327 }
// Case 2: same score -- accumulate the section count and break the tie below.
328 else if(explain_context -> best_matching_bases - explain_context -> best_indel_penalty == explain_context-> tmp_total_matched_bases - explain_context -> tmp_indel_penalty)
329 {
330 // only gapped explanations are complex counted.
331 explain_context -> best_is_complex += explain_context -> tmp_search_sections;
332 explain_context -> second_best_matching_bases = explain_context -> best_matching_bases;
333 explain_context -> best_indel_penalty = explain_context -> tmp_indel_penalty;
334
335 if(0 && FIXLENstrcmp("R010442852", explain_context -> read_name) == 0){
336 SUBREADprintf("complexity: curr=%d, new=%d ; sections=%d\n", explain_context->best_min_support_as_complex, explain_context -> tmp_min_support_as_complex, explain_context -> tmp_search_sections );
337 }
338 if(explain_context -> best_is_complex > 1)
339 {
340 // is complex now!
// An ungapped candidate (no sections) competes with its tmp_min_unsupport,
// a gapped one with its tmp_min_support_as_complex; in both cases the value
// is compared against best_min_support_as_complex.
341 if(explain_context -> tmp_search_sections == 0)
342 {
343 if(explain_context -> tmp_min_unsupport >explain_context->best_min_support_as_complex){
344 is_better_result = 1;
345 explain_context->best_min_support_as_complex =explain_context -> tmp_min_unsupport;
346 explain_context -> best_is_pure_donor_found_explain = explain_context -> tmp_is_pure_donor_found_explain;
347 explain_context -> is_currently_tie = 0;
348 }
349 else if(explain_context -> tmp_min_unsupport == explain_context->best_min_support_as_complex)
350 {
351 explain_context -> is_currently_tie = 1;
352 is_same_best = 1;
353 }
354 }
355 else{
356 if(explain_context -> tmp_min_support_as_complex >explain_context->best_min_support_as_complex){
357 is_better_result = 1;
358 explain_context -> best_min_support_as_complex =explain_context -> tmp_min_support_as_complex;
359 explain_context -> best_is_pure_donor_found_explain = explain_context -> tmp_is_pure_donor_found_explain;
360 explain_context -> is_currently_tie = 0;
361 }
362 else if(explain_context -> tmp_min_support_as_complex == explain_context->best_min_support_as_complex){
363 explain_context -> is_currently_tie = 1;
364 is_same_best = 1;
365 }
366 }
367
368 }
369 else
370 {
371 // this branch is reached ONLY if the last best is ONE-gapped (50M3D50M) and the current best is ungapped (100M)!
// The ungapped candidate wins only if the stored best has the pure-donor flag
// set and its unsupporting count exceeds its supporting count by at least 2.
372 if(explain_context -> best_is_pure_donor_found_explain)
373 {
374 if(explain_context -> best_min_unsupport_as_simple >= explain_context -> best_support_as_simple+2)
375 {
376 is_better_result = 1;
377 explain_context -> best_min_support_as_complex = explain_context -> best_min_unsupport_as_simple;
378 explain_context -> best_is_pure_donor_found_explain = explain_context -> tmp_is_pure_donor_found_explain;
379 explain_context -> is_currently_tie = 0;
380 }
381 }
// The "else if(0)" below disables the following if-block entirely (dead code).
382 //#warning "======= MAKE if(0) IS CORRECT BEFORE RELEASE ======"
383 else if(0)
384 if(explain_context -> best_min_unsupport_as_simple >= explain_context -> best_support_as_simple)
385 {
386 is_better_result = 1;
387 explain_context -> best_min_support_as_complex = explain_context -> best_min_unsupport_as_simple;
388 explain_context -> best_is_pure_donor_found_explain = explain_context -> tmp_is_pure_donor_found_explain;
389 explain_context -> is_currently_tie = 0;
390 }
391 }
392 }
// Case 3: the candidate scores lower -- nothing to do.
393 else return;
394
// Close the last section of the candidate before it is copied: a backward
// search ends at read position 0, a forward search ends remainder_len bases
// after the section start and has no event behind it.
395 if(is_better_result || is_same_best){
396 if(search_to_back){
397 explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].read_pos_start = 0;
398 }else{
399 explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].read_pos_end = explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].read_pos_start + remainder_len;
400 explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].event_after_section = NULL;
401 }
402 }
403
404 if(0 && FIXLENstrcmp("R010442852", explain_context -> read_name) == 0){
405 SUBREADprintf("TRY_REPLACE_DESICION TO %s: BETTER=%d, SAME=%d ; CURRENT : %d secs ; NEWBEST : %d secs\n", search_to_back?"BACK":"FRONT", is_better_result, is_same_best, search_to_back? explain_context -> result_back_junction_numbers[0]:explain_context -> result_front_junction_numbers[0] ,explain_context -> tmp_search_sections);
406 int xx1;
407 for(xx1 = 0; xx1 < explain_context -> tmp_search_sections;xx1++){
408 SUBREADprintf(" Event : %d ~ %d in read\n", explain_context -> tmp_search_junctions[xx1].read_pos_start, explain_context -> tmp_search_junctions[xx1].read_pos_end);
409 if(explain_context -> tmp_search_junctions[xx1].event_after_section){
410 SUBREADprintf(" ");
411 debug_show_event(global_context, explain_context -> tmp_search_junctions[xx1].event_after_section);
412 }
413 }
414 }
415
// A better candidate replaces everything stored so far: it goes to slot 0 and
// the number of stored alignments is reset to 1.
416 if(is_better_result)
417 {
418 if(search_to_back){
419 explain_context -> all_back_alignments = 1;
420 explain_context -> result_back_junction_numbers[0] = explain_context -> tmp_search_sections +1;
421 // checked: memory boundary
422 memcpy(explain_context -> result_back_junctions[0], explain_context -> tmp_search_junctions , sizeof(perfect_section_in_read_t) * (explain_context -> tmp_search_sections +1));
423
424 }else{
425 explain_context -> all_front_alignments = 1;
426 explain_context -> result_front_junction_numbers[0] = explain_context -> tmp_search_sections +1;
427 // checked: memory boundary
428 memcpy(explain_context -> result_front_junctions[0], explain_context -> tmp_search_junctions , sizeof(perfect_section_in_read_t) * (explain_context -> tmp_search_sections +1));
429 }
430
// An equally good candidate is appended behind the stored ones, as long as
// fewer than MAX_ALIGNMENT_PER_ANCHOR are stored; otherwise it is dropped.
431 }else if(is_same_best){
432 if(search_to_back && explain_context -> all_back_alignments < MAX_ALIGNMENT_PER_ANCHOR){
433 explain_context -> result_back_junction_numbers[explain_context -> all_back_alignments] = explain_context -> tmp_search_sections +1;
434
435 // checked: memory boundary
436 memcpy(explain_context -> result_back_junctions[explain_context -> all_back_alignments], explain_context -> tmp_search_junctions , sizeof(perfect_section_in_read_t) * (explain_context -> tmp_search_sections +1));
437 explain_context -> all_back_alignments ++;
438 }else if((!search_to_back) && explain_context -> all_front_alignments < MAX_ALIGNMENT_PER_ANCHOR){
439 explain_context -> result_front_junction_numbers[explain_context -> all_front_alignments] = explain_context -> tmp_search_sections +1;
440
441 // checked: memory boundary
442 memcpy(explain_context -> result_front_junctions[explain_context -> all_front_alignments], explain_context -> tmp_search_junctions , sizeof(perfect_section_in_read_t) * (explain_context -> tmp_search_sections +1));
443 explain_context -> all_front_alignments ++;
444 }
445 }
446 }
447
448
449
/*
 * new_explain_try_replace_xe: offer the explanation currently held in the
 * tmp_* fields of explain_context as a candidate for the best explanation.
 *
 * Candidates are ranked by matched bases only (tmp_indel_penalty is not
 * considered here):
 *   - more matched bases than the best so far        -> replaces the best;
 *   - same matched bases, but fewer sections than the
 *     stored best                                    -> replaces the best;
 *   - anything else                                  -> dropped, nothing changes.
 * Equally good candidates are never appended: only slot 0 of the result
 * arrays is written and the alignment count is always reset to 1.
 *
 * Parameters
 *   global_context, thread_context : not used; kept for signature
 *                                    compatibility with callers.
 *   explain_context : search state; its best_* and result_* fields are updated
 *                     when the candidate wins.
 *   remainder_len   : length of the last section; used to close that section
 *                     when searching to the front.
 *   search_to_back  : non-zero stores into result_back_*, zero into
 *                     result_front_*.
 *
 * Fix: the tie test used to read
 *     sections < search_to_back ? back[0] : front[0] - 1 && best == tmp
 * which C parses as
 *     (sections < search_to_back) ? back[0] : ((front[0] - 1) && (best == tmp))
 * so a backward search with an ungapped candidate replaced any stored result,
 * even one with more matched bases. The comparison is now made against the
 * section count of the stored best for the requested direction.
 */
void new_explain_try_replace_xe(global_context_t* global_context, thread_context_t * thread_context, explain_context_t * explain_context, int remainder_len, int search_to_back)
{
	// result_*_junction_numbers[0] is written below as "sections + 1", hence the -1.
	int best_search_sections = (search_to_back ? explain_context -> result_back_junction_numbers[0] : explain_context -> result_front_junction_numbers[0]) - 1;

	int has_more_matches = explain_context -> best_matching_bases < explain_context -> tmp_total_matched_bases;
	int ties_with_fewer_sections = explain_context -> best_matching_bases == explain_context -> tmp_total_matched_bases
		&& explain_context -> tmp_search_sections < best_search_sections;

	if(!has_more_matches && !ties_with_fewer_sections)
		return;

	// The candidate wins: take over all "best" bookkeeping.
	explain_context -> best_is_complex = explain_context -> tmp_search_sections;
	explain_context -> is_currently_tie = 0;
	explain_context -> best_support_as_simple = explain_context -> tmp_support_as_simple;
	explain_context -> best_min_unsupport_as_simple = explain_context -> tmp_min_unsupport;
	explain_context -> best_min_support_as_complex = explain_context -> tmp_min_support_as_complex;
	explain_context -> best_is_pure_donor_found_explain = explain_context -> tmp_is_pure_donor_found_explain;
	explain_context -> second_best_matching_bases = max(explain_context -> second_best_matching_bases, explain_context -> best_matching_bases);
	explain_context -> best_matching_bases = explain_context -> tmp_total_matched_bases;

	if(search_to_back){
		// A backward search ends at the first base of the read.
		explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].read_pos_start = 0;

		explain_context -> all_back_alignments = 1;
		explain_context -> result_back_junction_numbers[0] = explain_context -> tmp_search_sections + 1;
		// checked: memory boundary
		memcpy(explain_context -> result_back_junctions[0], explain_context -> tmp_search_junctions, sizeof(perfect_section_in_read_t) * (explain_context -> tmp_search_sections + 1));
	}else{
		// A forward search ends remainder_len bases after the start of the
		// last section, with no event behind it.
		explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].read_pos_end = explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].read_pos_start + remainder_len;
		explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].event_after_section = NULL;

		explain_context -> all_front_alignments = 1;
		explain_context -> result_front_junction_numbers[0] = explain_context -> tmp_search_sections + 1;
		// checked: memory boundary
		memcpy(explain_context -> result_front_junctions[0], explain_context -> tmp_search_junctions, sizeof(perfect_section_in_read_t) * (explain_context -> tmp_search_sections + 1));
	}
}
583
584
585 // read_tail_abs_offset is actually the offset of the base next to the last base in read tail.
586 // read_tail_pos is the FIRST UNWANTED BASE, after the read.
// Recursively explores how the read prefix [0, read_tail_pos) can be explained when walking
// from the tail of the section towards the start of the read.
//
// For every read position from move_start down to 0 whose chromosome location is flagged in the
// event bitmap, the events stored at that location are fetched with search_event(). When the bases
// between the tested position and read_tail_pos match the reference well enough, each event is
// appended to explain_context -> tmp_search_junctions and the function calls itself for the part of
// the read in front of that event. The per-branch fields of explain_context are restored after
// each recursive call.
//
// Regardless of whether any event was followed, the whole section [0, read_tail_pos) is finally
// matched against the reference without any event and the candidate is handed to
// new_explain_try_replace().
//
// Parameters:
//   read_text              bases of the read; indexed by read position.
//   qual_text              only forwarded to the recursive call; not otherwise read here.
//   read_tail_abs_offset   chromosome offset belonging to read_tail_pos (see the notes above the function).
//   read_tail_pos          first read position that is NOT part of the section being explained.
//   sofar_matched          matched bases accumulated by the callers (already reduced by jump penalties).
//   suggested_movement     if non-zero, the scan starts at read_tail_pos - suggested_movement + 1.
//   do_not_jump            if non-zero (and suggested_movement is zero), the scan starts at read_tail_pos
//                          instead of skipping config.realignment_minimum_variant_distance bases.
void search_events_to_back(global_context_t * global_context, thread_context_t * thread_context, explain_context_t * explain_context, char * read_text , char * qual_text, unsigned int read_tail_abs_offset, short read_tail_pos, short sofar_matched, int suggested_movement, int do_not_jump)
{
	short tested_read_pos;

	HashTable * event_table = NULL;
	chromosome_event_t * event_space = NULL;

	// Use the event table of the thread when a thread context is given, otherwise the global one.
	if(thread_context) {
		event_table = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_entry_table;
		event_space = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
	} else {
		event_table = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_entry_table;
		event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
	}

	gene_value_index_t * value_index = thread_context?thread_context->current_value_index:global_context->current_value_index ;
	// The per-position scan is skipped when no event is flagged for the chromosome range of this
	// section, unless fusion or long-deletion detection is enabled.
	if( there_are_events_in_range(event_table -> appendix2, read_tail_abs_offset - read_tail_pos, read_tail_pos) || global_context -> config.do_fusion_detection ||global_context -> config.do_long_del_detection ){
		int event_search_method;
		if((global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection))
			event_search_method = EVENT_SEARCH_BY_BOTH_SIDES;
		else
			event_search_method = EVENT_SEARCH_BY_LARGE_SIDE;


		// NOTE(review): is_junction_scanned is never set to anything but 0 in this function.
		int is_junction_scanned = 0;
		// minimum perfect section length is 1
		// tested_read_pos is the first WANTED BASE in section.
		int move_start = read_tail_pos - (do_not_jump?0:global_context -> config.realignment_minimum_variant_distance);
		if(suggested_movement) move_start = read_tail_pos - suggested_movement + 1;

		//#warning ">>>>>>>>>>>>>> COMMENT THIS <<<<<<<<<<<<<<<<<<<<<"
		//printf("OCT27-STEP-BKIN : %s , STT=%d, %u, %d\n", explain_context -> read_name, move_start, read_tail_abs_offset, read_tail_pos);

		// A new section is written at index tmp_search_sections + 1 below, so the scan only runs
		// while that index stays below MAX_EVENTS_IN_READ (presumably the capacity of
		// tmp_search_junctions -- confirm against the declaration of explain_context_t).
		if(MAX_EVENTS_IN_READ - 1> explain_context -> tmp_search_sections)
		for(tested_read_pos = move_start; tested_read_pos >=0;tested_read_pos --)
		{
			int xk1, matched_bases_to_site;
			// NOTE(review): jump_penalty is reset once per tested position, not per event; after a
			// fusion event sets it to 2, later events at the same position also pass the penalty on.
			int jump_penalty = 0;
			chromosome_event_t *site_events[MAX_EVENT_ENTRIES_PER_SITE];

			int potential_event_pos;

			// Chromosome location that corresponds to tested_read_pos; the direction of the walk on
			// the chromosome is reversed while the current section is strand-jumped.
			if(explain_context -> current_is_strand_jumped)
				potential_event_pos = read_tail_abs_offset + ( read_tail_pos - tested_read_pos);
			else
				potential_event_pos = read_tail_abs_offset - ( read_tail_pos - tested_read_pos);


			// Cheap bitmap test first; the event table is only searched for flagged locations.
			if(!check_event_bitmap( event_table->appendix2, potential_event_pos )) continue;
			int search_types = CHRO_EVENT_TYPE_INDEL | CHRO_EVENT_TYPE_JUNCTION | CHRO_EVENT_TYPE_FUSION;
			int site_events_no = search_event(global_context, event_table , event_space , potential_event_pos, event_search_method , search_types, site_events);
			//#warning ">>>>>>>>>>>>>> COMMENT THIS <<<<<<<<<<<<<<<<<<<<<"
			//printf("OCT27-STEP-BKIN-SR: %s at %u, FOUND=%d\n" , explain_context -> read_name,potential_event_pos,site_events_no);

			if(!site_events_no)continue;

			unsigned int tested_chro_begin;
			if(explain_context -> current_is_strand_jumped)
				tested_chro_begin = read_tail_abs_offset + 1;
			else
				tested_chro_begin = read_tail_abs_offset - (read_tail_pos - tested_read_pos);

			// Number of bases of read[tested_read_pos, read_tail_pos) that agree with the reference.
			matched_bases_to_site = match_chro(read_text + tested_read_pos, value_index, tested_chro_begin , read_tail_pos - tested_read_pos, explain_context -> current_is_strand_jumped, global_context -> config.space_type);

			// NOTE(review): this flag is written below but not read afterwards in this function.
			int this_round_junction_scanned = 0;

			//#warning ">>>>>>>>>>>>>>>> REMOVE IT <<<<<<<<<<<<<<<<<<<<<<"
			//printf("OCT27-STEPSB-JB-%s: test %u = %d events; TEST=%d > 7000 : MA=%d; %s ; %u = %u - (%d - %d) ; LEV=%d\n", explain_context -> read_name, potential_event_pos, site_events_no, (read_tail_pos<=tested_read_pos)?(-1234):( matched_bases_to_site*10000/(read_tail_pos - tested_read_pos)) , matched_bases_to_site, read_text + tested_read_pos, potential_event_pos, read_tail_abs_offset, read_tail_pos, tested_read_pos, explain_context -> tmp_search_sections);
			//#warning "========= remove - 2000 from next line ============="
			// Events are only followed when the try budget is not exhausted, the section is not
			// empty, and more than 70% (9000 - 2000 out of 10000) of its bases match -- or when
			// config.maximise_sensitivity_indel is set.
			if(explain_context -> total_tries < REALIGN_TOTAL_TRIES && (read_tail_pos>tested_read_pos) && ( matched_bases_to_site*10000/(read_tail_pos - tested_read_pos) > 9000 - 2000 || global_context->config.maximise_sensitivity_indel) )
			for(xk1 = 0; xk1 < site_events_no ; xk1++)
			{
				chromosome_event_t * tested_event = site_events[xk1];

				// For a fully covered read, fusion events spanning more than MAX_DELETION_LENGTH are ignored.
				if(explain_context -> is_fully_covered && tested_event -> event_type == CHRO_EVENT_TYPE_FUSION && tested_event -> event_large_side - tested_event -> event_small_side > MAX_DELETION_LENGTH){
					continue;
				}

				// Events are searched by both sides in fusion / long-deletion mode; an indel is only
				// followed from one of its sides (which one depends on the current strand state).
				if((global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection) && tested_event -> event_type == CHRO_EVENT_TYPE_INDEL)
				{
					if(explain_context->current_is_strand_jumped){
						if(potential_event_pos == tested_event-> event_large_side) continue;
					}else{
						if(potential_event_pos == tested_event-> event_small_side) continue;
					}
				}
				if( tested_event -> event_type != CHRO_EVENT_TYPE_INDEL){
					this_round_junction_scanned = 1;
				}


				if(0 && strcmp("S_chr901_565784_72M8D28M", explain_context -> read_name) == 0)
					SUBREADprintf("B_JUMP?%d > %d TLEN=%d \n", (1+matched_bases_to_site)*10000 / (read_tail_pos - tested_read_pos) , 9000, read_tail_pos - tested_read_pos);
				// note that read_tail_pos is the first unwanted base.
				int new_read_tail_pos = tested_read_pos;
				// Only a negative indel_length moves the tail (presumably an insertion that consumes
				// read bases -- confirm against the definition of chromosome_event_t).
				if(tested_event->event_type == CHRO_EVENT_TYPE_INDEL) new_read_tail_pos += min(0, tested_event -> indel_length);
				// note that read_tail_abs_offset is the first unwanted base.
				unsigned int new_read_tail_abs_offset;

				if((global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection))// && tested_event->event_type == CHRO_EVENT_TYPE_FUSION)
				{
					// Continue from the side of the event opposite to the one that was hit, stepping one
					// base further; the step direction flips when exactly one of "event is strand-jumped"
					// and "current section is strand-jumped" holds.
					new_read_tail_abs_offset = (potential_event_pos == tested_event -> event_small_side)? tested_event -> event_large_side : tested_event -> event_small_side;
					if(tested_event->is_strand_jumped + explain_context -> current_is_strand_jumped == 1)
						new_read_tail_abs_offset--;
					else
						new_read_tail_abs_offset++;
				}
				else
					new_read_tail_abs_offset = tested_event -> event_small_side + 1;

				new_read_tail_pos -= tested_event -> indel_at_junction;

				// Recurse only if at least one read base is left in front of the event.
				if(new_read_tail_pos>0)
				{
					// Close the current section and describe the next one (index + 1) that starts behind the event.
					explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].read_pos_start = tested_read_pos;
					explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections + 1].event_after_section = tested_event;
					explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections + 1].is_connected_to_large_side = (potential_event_pos == tested_event -> event_small_side);
					explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections + 1].read_pos_end = tested_read_pos + min(0, tested_event->indel_length) - tested_event -> indel_at_junction;
					explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections + 1].abs_offset_for_start = new_read_tail_abs_offset;

					if(tested_event->event_type == CHRO_EVENT_TYPE_FUSION) jump_penalty = 2;

					// Snapshot of the state that is restored after the recursive call.
					int current_is_jumped = explain_context -> current_is_strand_jumped ;
					int current_sup_as_complex = explain_context -> tmp_min_support_as_complex;
					int current_sup_as_simple = explain_context -> tmp_support_as_simple;
					int current_pure_donor_found = explain_context -> tmp_is_pure_donor_found_explain;

					explain_context -> tmp_support_as_simple = tested_event -> supporting_reads;
					// Events with bit 64 set in is_donor_found_or_annotation never lower the minimum support
					// (meaning of that bit is not visible here -- TODO confirm).
					explain_context -> tmp_min_support_as_complex = min((tested_event -> is_donor_found_or_annotation & 64)?0x7fffffff:tested_event -> supporting_reads,explain_context -> tmp_min_support_as_complex);
					// NOTE(review): tmp_min_unsupport is updated here but is not among the fields restored below.
					explain_context -> tmp_min_unsupport = min(tested_event -> anti_supporting_reads,explain_context -> tmp_min_unsupport);
					explain_context -> tmp_is_pure_donor_found_explain = explain_context -> tmp_is_pure_donor_found_explain && tested_event -> is_donor_found_or_annotation;
					explain_context -> tmp_indel_penalty += ( tested_event -> event_type == CHRO_EVENT_TYPE_INDEL );

					// A strand-jumping fusion toggles the strand state for the sections that follow.
					if(tested_event -> event_type == CHRO_EVENT_TYPE_FUSION && tested_event -> is_strand_jumped)
						explain_context -> current_is_strand_jumped = !explain_context -> current_is_strand_jumped;
					explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections + 1].is_strand_jumped = explain_context -> current_is_strand_jumped;
					explain_context -> tmp_search_sections ++;
					explain_context -> total_tries ++;

					search_events_to_back(global_context, thread_context, explain_context, read_text , qual_text, new_read_tail_abs_offset , new_read_tail_pos, sofar_matched + matched_bases_to_site - jump_penalty, tested_event -> connected_previous_event_distance, 0);

					// Undo the changes made for this branch (total_tries is deliberately a running total).
					explain_context -> tmp_search_sections --;
					explain_context -> tmp_indel_penalty -= ( tested_event -> event_type == CHRO_EVENT_TYPE_INDEL );
					explain_context -> current_is_strand_jumped = current_is_jumped;
					explain_context -> tmp_min_support_as_complex = current_sup_as_complex;
					explain_context -> tmp_support_as_simple = current_sup_as_simple;
					explain_context -> tmp_is_pure_donor_found_explain = current_pure_donor_found;
				}
			}
			// With limited_tree_scan, reads not longer than EXON_LONG_READ_LENGTH stop after the first
			// position at which events were found.
			if(( global_context ->config.limited_tree_scan) && explain_context -> full_read_len <= EXON_LONG_READ_LENGTH) break;
			this_round_junction_scanned = max(this_round_junction_scanned, is_junction_scanned);
		}
	}
	// Event-free candidate: match the whole section [0, read_tail_pos) against the reference. For a
	// strand-jumped section the reference start is read_tail_abs_offset + 1 instead of
	// read_tail_abs_offset - read_tail_pos.
	int whole_section_matched = match_chro(read_text , value_index, read_tail_abs_offset - (explain_context -> current_is_strand_jumped?-1:read_tail_pos), read_tail_pos , explain_context -> current_is_strand_jumped, global_context -> config.space_type);

	explain_context -> tmp_total_matched_bases = whole_section_matched + sofar_matched ;

	// Offer the current section list as a candidate result (the trailing arguments 0 and 1 are
	// presumably "remaining length" and "search_to_back" -- confirm against new_explain_try_replace).
	new_explain_try_replace(global_context, thread_context, explain_context, 0, 1);
}
746
// Initialises the fragment lists and bucketed tables of `context` that are released again by
// destroy_junction_tables(). Always returns 0.
//
// NOTE(review): the second and third arguments of bktable_init() are presumably a bucket width and
// an initial capacity -- confirm against its declaration.
int init_junction_tables(global_context_t * context)
{
	// fragment lists
	fraglist_init(&context->funky_list_A);
	fraglist_init(&context->funky_list_DE);

	// "funky" tables, sized from FUNKY_COLOCATION_TOLERANCE
	bktable_init(&context->funky_table_BC, FUNKY_COLOCATION_TOLERANCE * 2, 10000000);
	bktable_init(&context->funky_table_DE, FUNKY_COLOCATION_TOLERANCE * 2, 10000000);

	// break-point tables: P and YZ are sized from the configured maximum pair distance,
	// QR from BREAK_POINT_MAXIMUM_TOLERANCE
	bktable_init(&context->breakpoint_table_P, 2 * context->config.maximum_pair_distance, 1000000);
	bktable_init(&context->breakpoint_table_QR, 2 * BREAK_POINT_MAXIMUM_TOLERANCE, 1000000);
	bktable_init(&context->breakpoint_table_YZ, 2 * context->config.maximum_pair_distance, 1000000);

	// result tables
	bktable_init(&context->translocation_result_table, 2*BREAK_POINT_MAXIMUM_TOLERANCE, 1000000);
	bktable_init(&context->inversion_result_table, 2*BREAK_POINT_MAXIMUM_TOLERANCE, 1000000);

	return 0;
}
763
// Releases the fragment lists and bucketed tables that init_junction_tables() set up.
// For the two result tables, bktable_free_ptrs is first applied to every entry of the
// underlying entry_table before the table itself is destroyed. Always returns 0.
int destroy_junction_tables(global_context_t * context)
{
	// fragment lists
	fraglist_destroy(&context->funky_list_A);
	fraglist_destroy(&context->funky_list_DE);

	// tables that are destroyed without a per-entry pass
	bktable_destroy(&context->funky_table_BC);
	bktable_destroy(&context->funky_table_DE);
	bktable_destroy(&context->breakpoint_table_P);
	bktable_destroy(&context->breakpoint_table_QR);
	bktable_destroy(&context->breakpoint_table_YZ);

	// result tables: per-entry pass with bktable_free_ptrs, then destroy
	HashTableIteration(context->inversion_result_table.entry_table, bktable_free_ptrs);
	bktable_destroy(&context->inversion_result_table);

	HashTableIteration(context->translocation_result_table.entry_table, bktable_free_ptrs);
	bktable_destroy(&context->translocation_result_table);

	return 0;
}
// Per-thread initialisation hook of this module. There is nothing to set up:
// none of the arguments is used and the function always returns 0.
int init_junction_thread_contexts(global_context_t * global_context, thread_context_t * thread_context, int task)
{
	(void) global_context;
	(void) thread_context;
	(void) task;

	return 0;
}
787
// Inserts (votes, start, end) into big_margin_record, an array of
// config.big_margin_record_size unsigned shorts treated as consecutive triples
// {votes, read_pos_start, read_pos_end}.
//
// The new triple is placed in front of the first triple whose vote number is not larger than
// `votes`; the triples from there on are shifted back by one triple and the last one drops out.
// Nothing happens if the record has room for less than one triple or if every stored triple has
// more votes. For a negative-strand mapping the interval is mirrored on the read
// (start' = read_len - end, end' = read_len - start).
void insert_big_margin_record(global_context_t * global_context , unsigned short * big_margin_record, unsigned char votes, short read_pos_start, short read_pos_end, int read_len, int is_negative)
{
	if(global_context->config.big_margin_record_size < 3)
		return;

	unsigned short stored_start, stored_end;
	if(is_negative){
		stored_start = read_len - read_pos_end;
		stored_end = read_len - read_pos_start;
	}else{
		stored_start = read_pos_start;
		stored_end = read_pos_end;
	}

	// Find the first triple that does not have more votes than the new one.
	int slot = 0;
	while(slot < global_context->config.big_margin_record_size / 3 && votes < big_margin_record[slot * 3])
		slot++;

	if(slot >= global_context->config.big_margin_record_size / 3)
		return;

	// Shift elements [slot*3, size-4] three places towards the end; memmove copes with the overlap.
	unsigned short * slot_ptr = big_margin_record + slot * 3;
	memmove(slot_ptr + 3, slot_ptr, sizeof(unsigned short) * (global_context->config.big_margin_record_size - 3 - slot * 3));

	slot_ptr[0] = votes;
	slot_ptr[1] = stored_start;
	slot_ptr[2] = stored_end;
}
812
// Tells whether two mapping locations look like a proper pair.
//
// pos1 / pos2 are the mapping locations of the two reads, rlen1 / rlen2 their lengths and
// is_negative_R1 / is_negative_R2 their strand flags (any value > 0 means negative).
//
// Returns 1 when all of the following hold, otherwise 0:
//   - the outer distance (difference of the locations, extended by the length of the read that
//     lies further along the chromosome) is within
//     [config.minimum_pair_distance, config.maximum_pair_distance];
//   - both strand flags agree (presumably one mate's flag has already been flipped by the
//     caller -- confirm);
//   - the read with the smaller location is on the positive strand.
int is_PE_distance(global_context_t * global_context, unsigned int pos1, unsigned int pos2, int rlen1, int rlen2, int is_negative_R1, int is_negative_R2)
{
	long long int dist = pos2;
	dist -= pos1;

	is_negative_R1 = (is_negative_R1>0)?1:0;
	is_negative_R2 = (is_negative_R2>0)?1:0;

	if(pos1 > pos2) dist -= rlen1;
	else if(pos1 < pos2) dist += rlen2;
	else dist += max(rlen2, rlen1);

	// The magnitude is taken explicitly in 64 bits: dist can exceed the range of int, and the
	// abs() function of <stdlib.h> only accepts an int, which would silently truncate it.
	long long int dist_magnitude = (dist < 0) ? -dist : dist;

	if(dist_magnitude > global_context->config.maximum_pair_distance || dist_magnitude < global_context->config.minimum_pair_distance) return 0;

	if(is_negative_R1 != is_negative_R2) return 0;
	if(pos1 > pos2 && !is_negative_R1) return 0;
	if(pos1 < pos2 && is_negative_R1) return 0;
	return 1;
}
832
833
834 #define MAX_VOTE_TOLERANCE 1
835 //returns 1 if the vote number is not significantly higher than the vote numbers in the vote list.
// Checks whether the minor candidate votes[minor_i][minor_j] is ambiguous.
//
// The test only applies when the minor and the major candidate (votes[major_i][major_j]) are
// further apart than config.maximum_intron_length. In that case the function returns 1 if some
// other vote-table entry has at least as many votes as the minor candidate and covers nearly the
// same part of the read (both coverage ends differ by less than 7 bases after bringing
// negative-strand coverages into the same read orientation). Otherwise it returns 0.
int test_small_minor_votes(global_context_t * global_context, int minor_i, int minor_j, int major_i, int major_j , gene_vote_t * votes, int read_len)
{
	long long dist = votes -> pos[minor_i][minor_j];
	dist -= votes -> pos[major_i][major_j];

	// Taken explicitly in 64 bits; the abs() function of <stdlib.h> only accepts an int and would
	// truncate the difference of two unsigned 32-bit locations.
	long long dist_magnitude = (dist < 0) ? -dist : dist;
	if(dist_magnitude <= global_context->config.maximum_intron_length) return 0;

	// Coverage of the minor candidate, mirrored if it was mapped to the negative strand.
	int minor_coverage_start = votes -> coverage_start[minor_i][minor_j];
	int minor_coverage_end = votes -> coverage_end[minor_i][minor_j];
	if(votes -> masks[minor_i][minor_j] & IS_NEGATIVE_STRAND) {
		int mirrored_start = read_len - minor_coverage_end;
		minor_coverage_end = read_len - minor_coverage_start;
		minor_coverage_start = mirrored_start;
	}

	int iii, jjj;
	for(iii=0; iii<GENE_VOTE_TABLE_SIZE; iii++)
	{
		for(jjj = 0; jjj < votes->items[iii]; jjj++)
		{
			if(iii == minor_i && jjj == minor_j) continue;
			// Entries that the minor candidate beats by at least one vote cannot make it ambiguous.
			if(votes -> votes[minor_i][minor_j] - votes -> votes[iii][jjj] >=1) continue;

			int other_coverage_start = votes -> coverage_start[iii][jjj];
			int other_coverage_end = votes -> coverage_end[iii][jjj];
			if(votes -> masks[iii][jjj] & IS_NEGATIVE_STRAND){
				int mirrored_start = read_len - other_coverage_end;
				other_coverage_end = read_len - other_coverage_start;
				other_coverage_start = mirrored_start;
			}

			if(abs(minor_coverage_end - other_coverage_end) < 7 && abs(minor_coverage_start - other_coverage_start)<7)
				return 1;
		}
	}
	return 0;
}
884
885
// function test_junction_minor returns a non-zero code if the current anchor and current_vote[i][j] are not good mates in terms of junction reads:
// for example, if the distance is too far, if the covered regions overlap, or if the two mapped parts in the read are reversely arranged (except in fusion detection)
// Decides whether votes[i][j] may be paired with the anchor votes[vote_i][vote_j] as the two
// halves of a junction read. `dist` is the signed difference of their chromosome locations as
// computed by the caller. thread_context is not used.
//
// Returns 0 if the pair is acceptable, otherwise the reason for rejecting it:
//   1  |dist| is larger than config.maximum_intron_length
//   2  both halves start at the same read position
//   3  both halves end at the same read position
//   4  the anchor is the later half in the read but has the smaller chromosome location
//   5  the anchor is the earlier half in the read but has the larger chromosome location
int test_junction_minor(global_context_t * global_context, thread_context_t * thread_context, gene_vote_t * votes, int vote_i, int vote_j, int i, int j, long long int dist)
{
	// Taken explicitly in 64 bits; the abs() function of <stdlib.h> only accepts an int and would
	// truncate a long long argument.
	long long int dist_magnitude = (dist < 0) ? -dist : dist;

	if(dist_magnitude > global_context->config.maximum_intron_length) return 1;
	if(votes -> coverage_start[vote_i][vote_j] == votes -> coverage_start[i][j])return 2;
	if(votes -> coverage_end[vote_i][vote_j] == votes -> coverage_end[i][j])return 3;

	//SUBREADprintf( " COV_IN_READ: %d ~ %d CHRO_POS: %u ~ %u \n", votes -> coverage_start[vote_i][vote_j] , votes -> coverage_start[i][j] , votes -> pos[vote_i][vote_j] , votes -> pos[i][j] );
	// The order of the two halves in the read must agree with their order on the chromosome.
	if(votes -> coverage_start[vote_i][vote_j] > votes -> coverage_start[i][j])
	{
		if(votes -> pos[vote_i][vote_j] < votes -> pos[i][j])return 4;
	}
	else
	{
		if(votes -> pos[vote_i][vote_j] > votes -> pos[i][j])return 5;
	}

	return 0;
}
906
// Inserts new_value into top_buffer_3i, an array of config.top_scores values kept in descending
// order without duplicates. The value is ignored if it is not larger than the last element or if
// it is already stored; otherwise the smaller values are shifted back by one place and the last
// one drops out.
void update_top_three(global_context_t * global_context, int * top_buffer_3i, int new_value){
	// Not larger than the smallest stored score: nothing to do.
	if(new_value <= top_buffer_3i[global_context -> config.top_scores - 1])
		return;

	// Skip the scores that are larger than the new one.
	int slot = 0;
	while(slot < global_context -> config.top_scores && new_value < top_buffer_3i[slot])
		slot++;

	// Ran off the end, or the value is already present.
	if(slot >= global_context -> config.top_scores || new_value == top_buffer_3i[slot])
		return;

	// Make room at `slot` and store the new score.
	int mover = global_context -> config.top_scores - 1;
	while(mover > slot){
		top_buffer_3i[mover] = top_buffer_3i[mover - 1];
		mover--;
	}
	top_buffer_3i[slot] = new_value;
}
922
923
924
comb_sort_compare(void * Vcomb_buffer,int i,int j)925 int comb_sort_compare(void * Vcomb_buffer, int i, int j){
926 vote_combination_t * comb_buffer = (vote_combination_t *)Vcomb_buffer;
927 return comb_buffer[i].score_adj - comb_buffer[j].score_adj;
928 }
929
comb_sort_exchange(void * Vcomb_buffer,int i,int j)930 void comb_sort_exchange(void * Vcomb_buffer, int i, int j){
931 vote_combination_t * comb_buffer = (vote_combination_t *)Vcomb_buffer;
932 vote_combination_t tmpv;
933 memcpy(&tmpv, comb_buffer + i, sizeof(vote_combination_t));
934 memcpy(comb_buffer + i, comb_buffer + j, sizeof(vote_combination_t));
935 memcpy(comb_buffer + j, &tmpv, sizeof(vote_combination_t));
936 }
937
comb_sort_merge(void * Vcomb_buffer,int start,int items,int items2)938 void comb_sort_merge(void * Vcomb_buffer, int start, int items, int items2){
939 vote_combination_t * comb_buffer = (vote_combination_t *)Vcomb_buffer;
940 vote_combination_t * merge_target = malloc(sizeof(vote_combination_t) * (items + items2));
941
942 int items1_cursor = start, items2_cursor = start + items, x1;
943
944 for(x1=0; x1 < items+items2; x1++){
945 int select_items_1 = (items1_cursor < items + start && comb_sort_compare(comb_buffer, items1_cursor, items2_cursor) <=0) || (items2_cursor == start + items + items2);
946 if(select_items_1){
947 memcpy(merge_target+x1, comb_buffer+items1_cursor, sizeof(vote_combination_t));
948 items1_cursor++;
949 }else{
950 memcpy(merge_target+x1, comb_buffer+items2_cursor, sizeof(vote_combination_t));
951 items2_cursor++;
952 }
953
954 }
955
956 memcpy(comb_buffer + start, merge_target, (items+items2) * sizeof(vote_combination_t));
957 free(merge_target);
958
959 }
960
// Tells whether a new minor-half candidate beats the one currently stored in junc_res.
// The candidate wins (return 1) if, in this order of priority, it has
//   1. more votes than junc_res -> minor_votes, or
//   2. the same votes and a longer coverage than the stored minor coverage, or
//   3. the same votes, the same coverage length and a shorter intron than old_intron_length.
// Otherwise 0 is returned. global_context and thread_context are not used.
int is_better_inner(global_context_t * global_context, thread_context_t * thread_context, subjunc_result_t * junc_res, int old_intron_length, gene_vote_number_t Vote_minor, int coverage_minor_length, int intron)
{
	if(Vote_minor > junc_res -> minor_votes)
		return 1;
	if(Vote_minor != junc_res -> minor_votes)
		return 0;

	// Equal votes: fall back to the coverage length ...
	if(coverage_minor_length > junc_res -> minor_coverage_end - junc_res -> minor_coverage_start)
		return 1;

	// ... and, if that is equal as well, to the intron length.
	if(coverage_minor_length == junc_res -> minor_coverage_end - junc_res -> minor_coverage_start && intron < old_intron_length)
		return 1;

	return 0;
}
969
970
971 #define COVERAGE_STAB_NUMBER 100
test_fully_covered(global_context_t * global_context,gene_vote_t * vote,int read_length)972 int test_fully_covered(global_context_t * global_context, gene_vote_t * vote, int read_length){
973 int i,j,xk1,xk2;
974 char local_strands[COVERAGE_STAB_NUMBER];
975 unsigned int local_locations[COVERAGE_STAB_NUMBER];
976 unsigned long long local_coverage[COVERAGE_STAB_NUMBER];
977 int used_stabs = 0;
978
979 for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
980 {
981 for (j=0; j< vote->items[i]; j++)
982 {
983 if(vote -> votes[i][j]>2 && used_stabs < COVERAGE_STAB_NUMBER)
984 {
985 int is_fresh = 1;
986 int is_negative = (vote -> masks[i][j] & IS_NEGATIVE_STRAND)?1:0;
987 for(xk1=0; xk1<used_stabs; xk1++){
988 if(local_strands[xk1] == is_negative){
989 long long dist = vote -> pos[i][j];
990 dist -= local_locations[xk1];
991 if(abs(dist) < MAX_DELETION_LENGTH)
992 {
993 is_fresh=0;
994 break;
995 }
996 }
997 }
998
999 if(is_fresh){
1000 local_strands[used_stabs]=is_negative;
1001 local_locations[used_stabs]= vote -> pos[i][j];
1002 local_coverage[used_stabs] = 0;
1003 used_stabs++;
1004 }
1005 }
1006 }
1007 }
1008 if(!used_stabs) return 0;
1009
1010 for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
1011 {
1012 for (j=0; j< vote->items[i]; j++)
1013 {
1014 if(vote -> votes[i][j]>=1)
1015 {
1016 int is_negative = (vote -> masks[i][j] & IS_NEGATIVE_STRAND)?1:0;
1017 for(xk1=0; xk1<used_stabs; xk1++){
1018 if(local_strands[xk1] == is_negative){
1019 long long dist = vote -> pos[i][j];
1020 dist -= local_locations[xk1];
1021 if(abs(dist) < MAX_DELETION_LENGTH)
1022 {
1023 for(xk2 = vote -> coverage_start[i][j] * 64 / read_length; xk2 <=
1024 vote -> coverage_end[i][j] * 64 / read_length; xk2++){
1025 local_coverage[xk1] |= 1llu<<xk2;
1026 }
1027 }
1028 }
1029 }
1030 }
1031 }
1032 }
1033
1034 for(xk1=0; xk1<used_stabs; xk1++){
1035 int covered = 0;
1036 for(xk2 = 0; xk2<64; xk2++){
1037 covered += ( local_coverage[xk1] & (1llu<<xk2) )?1:0;
1038 }
1039 //SUBREADprintf("COVERAGE LEVEL=%d\n", covered);
1040
1041 if(covered > 54){
1042 return 1;
1043 }
1044 }
1045
1046 return 0;
1047 }
1048
// Quality filter for a long deletion that is supported by two pieces of one read.
// Returns 1 only if both pieces have at least 3 votes, the earlier piece starts within the first
// 10 bases of the read, and the later piece ends within the last 10 bases; otherwise 0.
// global_context and thread_context are not used.
int is_long_del_high_quality(global_context_t * global_context, thread_context_t * thread_context, int p1_start, int p1_end, int p2_start, int p2_end, int read_len, int p1_votes, int p2_votes){
	int both_well_voted = (p1_votes >= 3) && (p2_votes >= 3);
	int uncovered_head = min( p1_start, p2_start );
	int uncovered_tail = read_len - max(p1_end, p2_end);

	return (both_well_voted && uncovered_head <= 10 && uncovered_tail <= 10) ? 1 : 0;
}
1055 #define SE_READ_IN_KNOWN_EXON_REWARD 1
1056
copy_vote_to_alignment_res(global_context_t * global_context,thread_context_t * thread_context,mapping_result_t * align_res,subjunc_result_t * junc_res,gene_vote_t * current_vote,int vote_i,int vote_j,int curr_read_len,char * read_name,char * curr_read_text,int used_subreads_in_vote,int noninformative_subreads_in_vote,subread_read_number_t pair_number,int is_second_read,int * is_fully_covered)1057 void copy_vote_to_alignment_res(global_context_t * global_context, thread_context_t * thread_context, mapping_result_t * align_res, subjunc_result_t * junc_res, gene_vote_t * current_vote, int vote_i, int vote_j, int curr_read_len, char * read_name, char * curr_read_text, int used_subreads_in_vote, int noninformative_subreads_in_vote, subread_read_number_t pair_number, int is_second_read, int * is_fully_covered)
1058 {
1059 int vv = current_vote -> votes[vote_i][vote_j];
1060 if(global_context->config.scRNA_input_mode && !global_context -> input_reads.is_paired_end_reads) vv += SE_READ_IN_KNOWN_EXON_REWARD *is_pos_in_annotated_exon_regions(global_context, current_vote -> pos[vote_i][vote_j]);
1061 align_res -> selected_position = current_vote -> pos[vote_i][vote_j];
1062 align_res -> selected_votes = vv;
1063 align_res -> indels_in_confident_coverage = indel_recorder_copy(align_res -> selected_indel_record, current_vote -> indel_recorder[vote_i][vote_j]);
1064 align_res -> confident_coverage_end = current_vote -> coverage_end[vote_i][vote_j];
1065 align_res -> confident_coverage_start = current_vote -> coverage_start[vote_i][vote_j];
1066 align_res -> result_flags = (current_vote -> masks[vote_i][vote_j] & IS_NEGATIVE_STRAND)?(CORE_IS_NEGATIVE_STRAND):0;
1067 align_res -> used_subreads_in_vote = used_subreads_in_vote;
1068 align_res -> noninformative_subreads_in_vote = noninformative_subreads_in_vote;
1069 align_res -> is_fully_covered = *is_fully_covered ;
1070
1071 if(global_context -> config.do_breakpoint_detection)
1072 {
1073 int i,j;
1074
1075 // iterate all the anchors we have found in step 1:
1076 for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
1077 {
1078 for (j=0; j< current_vote->items[i]; j++)
1079 {
1080 if(i == vote_i && j == vote_j) continue;
1081 if(align_res -> selected_votes < current_vote -> votes[i][j]) continue; // major half must be the anchor
1082
1083 long long int dist = current_vote -> pos[vote_i][vote_j];
1084 dist -= current_vote -> pos[i][j];
1085
1086 int is_strand_jumpped = (current_vote -> masks[vote_i][vote_j] & IS_NEGATIVE_STRAND)!=(current_vote -> masks[i][j] & IS_NEGATIVE_STRAND);
1087 if((global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection) && (*is_fully_covered) && (dist > MAX_DELETION_LENGTH || is_strand_jumpped)) continue;
1088
1089 if((global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection)){
1090 // function test_small_minor_votes returns 1 if the vote number is not significantly
1091 // higher than the vote numbers in the vote list.
1092 //#warning "SUBREAD_151 =========== THE TWO LINES SHOULD BE UNCOMMENTED IN RELEASED VERSION ==== WE COMMENT IT FOR A BETTER FUSION SENSITIVITY BUT ONLY FOR TEST ==================="
1093 if(1){
1094 int small_minor_bigmargin = test_small_minor_votes(global_context , i, j, vote_i, vote_j, current_vote, curr_read_len);
1095 if(small_minor_bigmargin) continue;
1096 }
1097 }else{
1098 // function test_junction_minor returns 1 if the current anchor and current_vote[i][j]
1099 // are not good mates in terms of junction reads:
1100 //
1101 // for example, if the distance is too far, if the coverered region overlapped or
1102 // if the covered region has a wrong arrangement to their relative positions.
1103 int test_minor_res = test_junction_minor(global_context, thread_context, current_vote, vote_i, vote_j, i, j, dist);
1104 //#warning " ============ DEBUG 1 ==================== "
1105 if(0 && FIXLENstrcmp("R002403247", read_name) == 0) {
1106 char posout2[100];
1107 char posout1[100];
1108 absoffset_to_posstr(global_context, current_vote -> pos[vote_i][vote_j], posout1);
1109 absoffset_to_posstr(global_context, current_vote -> pos[i][j], posout2);
1110 SUBREADprintf("SMALL_MARGIN=%d at %s ~ %s\n", test_minor_res, posout1, posout2);
1111 }
1112 // SUBREADprintf("TMR=%d (V=%d)\n", test_minor_res, current_vote -> votes[i][j]);
1113 if(test_minor_res)continue;
1114 }
1115
1116 int is_better = is_better_inner(global_context, thread_context,
1117 junc_res, abs32uint(current_vote -> pos[vote_i][vote_j] - junc_res -> minor_position), current_vote -> votes[i][j], current_vote -> coverage_end[i][j] - current_vote -> coverage_start[i][j],
1118 abs32uint(current_vote -> pos[vote_i][vote_j] - current_vote -> pos[i][j]));
1119
1120 int replace_minor = 0, minor_indel_offset = 0, inserted_bases = 0, is_GT_AG_donors = 0, is_donor_found_or_annotation = 0, final_split_point = 0, major_indels = 0, small_side_increasing_coordinate = 0, large_side_increasing_coordinate = 0;
1121
1122 if(0 && FIXLENstrcmp("R002403247", read_name) == 0)
1123 {
1124 char posout[100];
1125 absoffset_to_posstr(global_context, current_vote -> pos[i][j], posout);
1126 SUBREADprintf("IBT=%d (V=%d , OV=%d) at %s\n", is_better, current_vote -> votes[i][j], junc_res -> minor_votes, posout);
1127 SUBREADprintf("IBT OLD_INTRON=%d, INTRON=%d\n", abs32uint(current_vote -> pos[vote_i][vote_j] - junc_res -> minor_position),
1128 abs32uint(current_vote -> pos[vote_i][vote_j] - current_vote -> pos[i][j])
1129 );
1130 }
1131
1132 if(is_better){
1133 // Determine the splicing point of the fusion or the junction
1134 // If the splicing point is determined, then set replace_minor = 1
1135 if(is_strand_jumpped){
1136 if(!global_context -> config.do_fusion_detection) continue;
1137
1138 int minor_cover_end_as_reversed = (current_vote -> masks[i][j] & IS_NEGATIVE_STRAND)? current_vote -> coverage_end[i][j]:(curr_read_len - current_vote -> coverage_start[i][j]);
1139 int minor_cover_start_as_reversed = (current_vote -> masks[i][j] & IS_NEGATIVE_STRAND)? current_vote -> coverage_start[i][j]:(curr_read_len - current_vote -> coverage_end[i][j]);
1140 int main_cover_end_as_reversed = (current_vote -> masks[vote_i][vote_j] & IS_NEGATIVE_STRAND)?current_vote -> coverage_end[vote_i][vote_j]:(curr_read_len - current_vote -> coverage_start[vote_i][vote_j]);
1141 int main_cover_start_as_reversed = (current_vote -> masks[vote_i][vote_j] & IS_NEGATIVE_STRAND)?current_vote -> coverage_start[vote_i][vote_j]:(curr_read_len - current_vote -> coverage_end[vote_i][vote_j]);
1142
1143
1144 int overlapped ;
1145 if(main_cover_start_as_reversed > minor_cover_start_as_reversed)
1146 overlapped = minor_cover_end_as_reversed - main_cover_start_as_reversed;
1147 else
1148 overlapped = main_cover_end_as_reversed - minor_cover_start_as_reversed;
1149
1150 if(overlapped > 14) continue;
1151
1152
1153 int guess_start_as_reversed = (main_cover_start_as_reversed > minor_cover_start_as_reversed)?
1154 (minor_cover_end_as_reversed - 15): (main_cover_end_as_reversed - 15);
1155
1156 int guess_end_as_reversed = (main_cover_start_as_reversed > minor_cover_start_as_reversed)?
1157 (main_cover_start_as_reversed + 15): (minor_cover_start_as_reversed + 15);
1158
1159 int is_small_half_negative = 0 != ((current_vote -> pos[vote_i][vote_j]>current_vote -> pos[i][j]?current_vote -> masks[i][j]:current_vote -> masks[vote_i][vote_j])&IS_NEGATIVE_STRAND);
1160 int is_large_half_negative = !is_small_half_negative;
1161
1162 int is_small_half_on_left_as_reversed = (main_cover_start_as_reversed > minor_cover_start_as_reversed) + (current_vote -> pos[vote_i][vote_j]> current_vote -> pos[i][j]) !=1;
1163 // small half on left(as reversed) === small half on right (as 'forward' form of the read, i.e., the raw FASTQ form for read_A and reversed FASTQ form for read_B)
1164
1165 unsigned int small_half_abs_offset = min(current_vote -> pos[i][j], current_vote -> pos[vote_i][vote_j]);
1166 unsigned int large_half_abs_offset = max(current_vote -> pos[i][j], current_vote -> pos[vote_i][vote_j]);
1167
1168 // curr_read_text is the 'reversed' form of the read. I.e., the reversed FASTQ form for read_A and the raw FASTQ form for read_B.
1169 replace_minor = donor_jumped_score(global_context, thread_context, small_half_abs_offset, large_half_abs_offset,
1170 max(0, guess_start_as_reversed) , min( guess_end_as_reversed, curr_read_len), curr_read_text,
1171 curr_read_len, is_small_half_negative, is_large_half_negative, is_small_half_on_left_as_reversed,
1172 & final_split_point, & is_GT_AG_donors, & is_donor_found_or_annotation, &small_side_increasing_coordinate, &large_side_increasing_coordinate);
1173
1174 if( 0 && 1018082 == pair_number)
1175 {
1176 print_votes(current_vote, global_context -> config.index_prefix);
1177 SUBREADprintf("JUMP_001018082 NORMAL=%d SMALL_NEG=%d LARGE_NEG=%d, SMALL_ABS=%u LARGE_ABS=%u, REPLACE=%d, INCS=%d %d\n" , is_small_half_on_left_as_reversed, is_small_half_negative, is_large_half_negative, small_half_abs_offset, large_half_abs_offset, replace_minor, small_side_increasing_coordinate, large_side_increasing_coordinate);
1178 }
1179
1180
1181 // Now "final_split_point" is the read offset on the 'reversed' form of the read. It needs to be changed to (read_len - final_split_point) if the major half is on negative strand.
1182
1183 if(replace_minor>0) replace_minor += current_vote -> votes[i][j] * 100000;
1184
1185 }
1186 else
1187 {
1188
1189 int overlapped ;
1190 if(current_vote -> coverage_start[vote_i][vote_j] > current_vote -> coverage_start[i][j])
1191 overlapped = current_vote -> coverage_end[i][j] - current_vote -> coverage_start[vote_i][vote_j];
1192 else
1193 overlapped = current_vote -> coverage_end[vote_i][vote_j] - current_vote -> coverage_start[i][j];
1194
1195 if(0 && FIXLENstrcmp("R000404427", read_name) == 0)
1196 {
1197 SUBREADprintf("OVL=%d, DIST=%u\n", overlapped, (unsigned int)abs(dist));
1198 }
1199
1200 if(overlapped > 14) continue;
1201 if(abs(dist)<6) continue;
1202
1203 int guess_start = (current_vote -> coverage_start[vote_i][vote_j] > current_vote -> coverage_start[i][j])?
1204 (current_vote -> coverage_end[i][j] - 8): (current_vote -> coverage_end[vote_i][vote_j] - 8);
1205
1206 int guess_end = (current_vote -> coverage_start[vote_i][vote_j] < current_vote -> coverage_start[i][j])?
1207 (current_vote -> coverage_start[i][j] + 8): (current_vote -> coverage_start[vote_i][vote_j] + 8);
1208
1209 if((global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection) && !(current_vote -> masks[vote_i][vote_j] & IS_NEGATIVE_STRAND))
1210 // if for fusion, the current read must have been reversed.
1211 // hence, it is now changed to "main half" view.
1212 reverse_read(curr_read_text, curr_read_len, global_context -> config.space_type);
1213
1214 int left_indel_offset=0, right_indel_offset=0;
1215 int kx2;
1216
1217 int normally_arranged = 1!=(current_vote -> coverage_start[vote_i][vote_j] > current_vote -> coverage_start[i][j]) + (current_vote -> pos[vote_i][vote_j] > current_vote -> pos[i][j]);
1218
1219 if(curr_read_len > EXON_LONG_READ_LENGTH){
1220
1221 int kx1;
1222 gene_vote_number_t * indel_recorder = current_vote -> indel_recorder[vote_i][vote_j];
1223 for(kx1=0; kx1<MAX_INDEL_SECTIONS; kx1++)
1224 {
1225 if(!indel_recorder[kx1*3]) break;
1226 major_indels += indel_recorder[kx1*3+2];
1227 }
1228
1229 for(kx2=0; kx2<MAX_INDEL_SECTIONS; kx2++)
1230 {
1231 if(!current_vote -> indel_recorder[i][j][kx2*3]) break;
1232 minor_indel_offset += (current_vote -> indel_recorder[i][j][kx2*3+2]);
1233 }
1234
1235 if(current_vote -> pos[vote_i][vote_j] < current_vote -> pos[i][j])
1236 {
1237 left_indel_offset=major_indels;
1238 right_indel_offset=minor_indel_offset;
1239 }
1240 else
1241 {
1242 right_indel_offset=major_indels;
1243 left_indel_offset=minor_indel_offset;
1244
1245 }
1246
1247 // the section having a smaller coordinate will have indel_offset !=0
1248 // the section having a larger coordiname MUST HAVE indel_offset == 0
1249 right_indel_offset=0;
1250 }
1251
1252 if(is_long_del_high_quality( global_context, thread_context, current_vote -> coverage_start[i][j], current_vote -> coverage_end[i][j], current_vote -> coverage_start[vote_i][vote_j], current_vote -> coverage_end[vote_i][vote_j], curr_read_len, current_vote -> votes[i][j], current_vote -> votes[vote_i][vote_j])|| ! global_context -> config.do_long_del_detection)
1253 replace_minor = donor_score(global_context, thread_context, min(current_vote -> pos[vote_i][vote_j],
1254 current_vote -> pos[i][j]),max(current_vote -> pos[vote_i][vote_j] ,
1255 current_vote -> pos[i][j]), left_indel_offset, right_indel_offset, normally_arranged,
1256 max(0, guess_start), min( guess_end, curr_read_len), curr_read_text, curr_read_len,
1257 & final_split_point, & is_GT_AG_donors, & is_donor_found_or_annotation, & inserted_bases, &small_side_increasing_coordinate, &large_side_increasing_coordinate, read_name);
1258 else replace_minor = 0;
1259
1260 // Now "final_split_point" is the read offset on the 'reversed' form of the read (I.e., the reversed FASTQ form for read_A and the raw FASTQ form for read_B.) if do_fusion_detection AND if the main half is on negative strand.
1261 // However, because the final_split_point is ALWAYS on the form where the major half can be mapped, final_split_point will never be changed.
1262
1263 if(replace_minor>0) replace_minor += current_vote -> votes[i][j] * 100000;
1264
1265 if(0 && ( FIXLENstrcmp("R006232475", read_name) == 0 ) )
1266 SUBREADprintf("NOJUMP_DONORs=%d LOC=%u\n", replace_minor , current_vote -> pos[i][j]);
1267 if((global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection) && !(current_vote -> masks[vote_i][vote_j] & IS_NEGATIVE_STRAND))
1268 // changed back.
1269 reverse_read(curr_read_text, curr_read_len, global_context -> config.space_type);
1270 }
1271 }
1272
1273 if(0 && ( FIXLENstrcmp("R006232475", read_name) == 0 ) )
1274 {
1275 char posout[100];
1276 absoffset_to_posstr(global_context, current_vote -> pos[i][j], posout);
1277 SUBREADprintf("TEST MINOR: POS=%s, REPLACE=%d\n", posout, replace_minor);
1278 }
1279
1280 if(replace_minor){// && (replace_minor > current_piece_minor_score)){
1281 junc_res -> minor_position = current_vote -> pos[i][j];
1282 junc_res -> minor_votes = current_vote -> votes[i][j];
1283
1284 junc_res -> minor_coverage_start = current_vote -> coverage_start[i][j];
1285 junc_res -> minor_coverage_end = current_vote -> coverage_end [i][j];
1286
1287 junc_res -> double_indel_offset = (minor_indel_offset & 0xf)|((major_indels & 0xf)<<4);
1288 junc_res -> split_point = final_split_point;
1289
1290
1291 if(0 && 1018082 == pair_number)
1292 {
1293 SUBREADprintf("REPLACED: LOC %u, INCS=%d %d\n", junc_res -> minor_position, small_side_increasing_coordinate, large_side_increasing_coordinate);
1294 }
1295
1296 junc_res -> small_side_increasing_coordinate = small_side_increasing_coordinate;
1297 junc_res -> large_side_increasing_coordinate = large_side_increasing_coordinate;
1298 junc_res -> indel_at_junction = inserted_bases;
1299
1300 align_res -> result_flags &=~0x3;
1301 if( (!is_donor_found_or_annotation) || is_GT_AG_donors > 2) align_res -> result_flags |= 3;
1302 else align_res -> result_flags = is_GT_AG_donors? (align_res -> result_flags|CORE_IS_GT_AG_DONORS):(align_res -> result_flags &~CORE_IS_GT_AG_DONORS);
1303
1304 align_res -> result_flags = is_strand_jumpped? (align_res -> result_flags|CORE_IS_STRAND_JUMPED):(align_res -> result_flags &~CORE_IS_STRAND_JUMPED);
1305 }
1306 }
1307 }
1308
1309 if(0 && memcmp("V0112_0155:7:1101:1173:2204", read_name, 26) == 0)
1310 {
1311 char leftpos[100], rightpos[100];
1312 absoffset_to_posstr(global_context, current_vote -> pos[vote_i][vote_j] , leftpos);
1313 absoffset_to_posstr(global_context, junc_res -> minor_position, rightpos);
1314 SUBREADprintf("READ=%s, MAJOR=%s, MINOR=%s\n", read_name, leftpos, rightpos);
1315 }
1316
1317
1318 // This block runs after the minor half of this anchor is fully determined.
1319 // If the minor half is a fusion and there is a strand jump, move the minor half coverage to the major half strand.
1320 if(align_res -> result_flags & CORE_IS_STRAND_JUMPED)
1321 {
1322 // If "is_strand_jumped" is true, all coordinates so far are on the best voted strands (must be differnet strands, namely they're very likely to be overlapped).
1323 int tmpv = junc_res -> minor_coverage_start;
1324 junc_res -> minor_coverage_start = curr_read_len - junc_res -> minor_coverage_end;
1325 junc_res -> minor_coverage_end = curr_read_len - tmpv;
1326
1327 // Split_point is now the "negative strand read" view. It has to be changed to "main piece" view
1328 junc_res -> split_point = (align_res -> result_flags & CORE_IS_NEGATIVE_STRAND)?
1329 junc_res -> split_point :
1330 (curr_read_len - junc_res -> split_point);
1331 }
1332 }
1333 }
1334
1335
/*
 * Convenience wrapper around test_PE_and_same_chro() for two simple_mapping_t
 * records: the mapping_position of r1 and of r2 are forwarded together with
 * the two read lengths, and the answers are delivered through
 * is_PE_distance and is_same_chromosome exactly as test_PE_and_same_chro()
 * writes them.
 */
void simple_PE_and_same_chro(global_context_t * global_context , simple_mapping_t * r1, simple_mapping_t * r2 , int * is_PE_distance, int * is_same_chromosome , int rlen1, int rlen2){
	test_PE_and_same_chro(
		global_context,
		r1 -> mapping_position,
		r2 -> mapping_position,
		is_PE_distance,
		is_same_chromosome,
		rlen1,
		rlen2
	);
}
1339
1340
/* Capacity of the per-cluster member arrays in struct cluster_element. */
#define MAX_CLUSTER_ELEMENTS 7

/*
 * A group of vote-table entries that were clustered together.
 * Member k is described by (i_list[k], j_list[k]), the two indices into a
 * gene_vote_t table, and by from_second_read[k], which tells whether the
 * entry was taken from the second read's vote table.
 */
struct cluster_element{
	/* Position of the entry that created the cluster; later candidates are compared against it. */
	unsigned int initial_position;
	/* Number of valid members currently stored (at most MAX_CLUSTER_ELEMENTS). */
	char cluster_members;
	/* Per member: non-zero when the entry belongs to the second read. */
	char from_second_read[MAX_CLUSTER_ELEMENTS];
	/* Per member: first index into the vote table. */
	int i_list[MAX_CLUSTER_ELEMENTS];
	/* Per member: second index into the vote table. */
	int j_list[MAX_CLUSTER_ELEMENTS];
};
1349
add_cluster_member(struct cluster_element * cl,int i,int j,int is_second_read)1350 int add_cluster_member(struct cluster_element * cl , int i, int j, int is_second_read){
1351 if(cl->cluster_members < MAX_CLUSTER_ELEMENTS){
1352 cl->i_list[(int)cl->cluster_members] = i;
1353 cl->j_list[(int)cl->cluster_members] = j;
1354 cl->from_second_read[(int)cl->cluster_members] = is_second_read;
1355 cl->cluster_members++;
1356 }
1357 return cl->cluster_members;
1358 }
1359
/*
 * Tells whether `pos` belongs to the cluster `cl`.
 *
 * Returns 1 when the absolute distance between `pos` and the cluster's
 * initial_position is smaller than config.maximum_intron_length, 0 otherwise.
 *
 * The distance is computed and negated in 64-bit arithmetic. Two 32-bit
 * offsets can be almost 2^32 apart, and the C library's abs() takes an int:
 * narrowing such a distance to int before taking its magnitude can turn a
 * huge distance into a tiny one and wrongly report "same cluster". Doing the
 * negation by hand is correct whether or not the project redefines abs().
 */
int is_same_cluster( global_context_t * global_context, struct cluster_element * cl , unsigned int pos){
	long long int distance = pos;
	distance -= cl -> initial_position;

	/* |distance| without going through abs(); the value range is about +/- 2^32, so negation cannot overflow. */
	if(distance < 0)
		distance = -distance;

	if(distance < global_context -> config.maximum_intron_length)
		return 1;
	return 0;
}
1367
/*
 * Forward declarations for two routines that are called further down in this
 * file before their definitions appear.
 *
 * process_voting_junction_PE_topK has the same parameter list as
 * process_voting_junction_PE_juncs.
 *
 * align_cluster receives one cluster plus both reads and both vote tables;
 * its last six parameters are pointers it is given to write through
 * (NOTE(review): presumably the score, the chosen path as (ii, jj) index
 * pairs with per-element masks, the path length and the R1/R2 mapped flags
 * -- confirm against the definition).
 */
int process_voting_junction_PE_topK(global_context_t * global_context, thread_context_t * thread_context, subread_read_number_t pair_number, gene_vote_t * vote_1, gene_vote_t * vote_2, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, int read_len_1, int read_len_2, int is_negative_strand, gene_vote_number_t v1_all_subreads, gene_vote_number_t v2_all_subreads);
int align_cluster(global_context_t * global_context, thread_context_t * thread_context, struct cluster_element * this_cluster, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, int read_len_1, int read_len_2, int is_negative_strand, gene_vote_t * vote_1, gene_vote_t * vote_2, int * this_score, int * ii_path, int * jj_path, int * masks, int * path_len, int * R1R2_mapped);
1370
/*
 * Fills `align_res` from the vote-table entry (vote_i, vote_j) of
 * `current_vote`.
 *
 * Copied from the entry: position, coverage start/end, the indel record
 * (through indel_recorder_copy(), whose return value becomes
 * indels_in_confident_coverage) and the strand.
 * Taken from the arguments: `score` is stored as selected_votes, and the two
 * subread counters are stored unchanged.
 * result_flags is overwritten: it ends up as CORE_IS_NEGATIVE_STRAND when the
 * entry's mask has IS_NEGATIVE_STRAND set, and as 0 otherwise.
 */
void simple_copy_vote_to_result( mapping_result_t * align_res, gene_vote_t * current_vote, int vote_i, int vote_j, int used_subreads_in_vote, int noninformative_subreads_in_vote, int score){
	/* Strand: every other flag bit is cleared. */
	if(current_vote -> masks[vote_i][vote_j] & IS_NEGATIVE_STRAND)
		align_res -> result_flags = (CORE_IS_NEGATIVE_STRAND);
	else
		align_res -> result_flags = 0;

	/* Location and covered part of the read. */
	align_res -> selected_position = current_vote -> pos[vote_i][vote_j];
	align_res -> confident_coverage_start = current_vote -> coverage_start[vote_i][vote_j];
	align_res -> confident_coverage_end = current_vote -> coverage_end[vote_i][vote_j];

	/* Indel record of the entry. */
	align_res -> indels_in_confident_coverage = indel_recorder_copy(align_res -> selected_indel_record, current_vote -> indel_recorder[vote_i][vote_j]);

	/* Values supplied by the caller. */
	align_res -> selected_votes = score;
	align_res -> used_subreads_in_vote = used_subreads_in_vote;
	align_res -> noninformative_subreads_in_vote = noninformative_subreads_in_vote;
}
1381
/*
 * Makes room at `first_slot` in the stored alignment list of one mate
 * (is_second_read selects R1 or R2): every stored result from `first_slot`
 * up to the second-last slot is copied one slot further down, so the result
 * in the last slot is overwritten.
 */
static void juncs_shift_results_down(global_context_t * global_context, subread_read_number_t pair_number, int is_second_read, int first_slot){
	int slot;
	for(slot = global_context -> config.multi_best_reads - 2; slot >= first_slot; slot--){
		mapping_result_t * from_result = _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, slot);
		mapping_result_t * to_result = _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, slot + 1);
		memcpy(to_result, from_result, sizeof(mapping_result_t));
	}
}

/*
 * Groups the vote-table entries of a read (or read pair) into position
 * clusters, lets align_cluster() score each cluster, and inserts every
 * positively scored cluster into the stored alignment list of the read(s).
 *
 * Step 1 - clustering. Entries are visited from the highest vote number down
 * to 1, first in vote_1 and, for paired-end input, then in vote_2. An entry
 * joins the first existing cluster accepted by is_same_cluster(); otherwise it
 * opens a new cluster while fewer than 2 * config.max_vote_simples exist.
 *
 * Step 2 - scoring and insertion. For each cluster with this_score > 0:
 *   - if align_cluster() reports both mates as mapped, the first slot whose
 *     stored R1 or R2 score is lower than this_score is located, the lists of
 *     both mates are shifted down from that slot, and the slot is filled from
 *     the highest-voted path element of each mate;
 *   - if only one mate is reported as mapped, the same is done for that mate
 *     alone.
 *
 * Returns 0 normally, and -1 if the cluster buffer cannot be allocated.
 * (The branch delegating to process_voting_junction_PE_topK() is currently
 * unreachable because its condition starts with `1 ||`; it is kept, and now
 * releases the buffer before delegating.)
 */
int process_voting_junction_PE_juncs( global_context_t * global_context, thread_context_t * thread_context, subread_read_number_t pair_number, gene_vote_t * vote_1, gene_vote_t * vote_2, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, int read_len_1, int read_len_2, int is_negative_strand, gene_vote_number_t v1_all_subreads, gene_vote_number_t v2_all_subreads ){
	int current_cluster_number = 0, max_clusters = global_context -> config.max_vote_simples * 2;
	int i, j, is_second_read, tested_votes, x1;
	int max_cluster_size_r1 = 0, max_cluster_size_r2 = 0;
	struct cluster_element * cluster_buffer;

	/* Without room for a single cluster nothing can be created or aligned. */
	if(max_clusters <= 0) return 0;

	cluster_buffer = malloc((size_t)max_clusters * sizeof *cluster_buffer);
	if(!cluster_buffer){
		SUBREADprintf("ERROR: cannot allocate memory for the cluster buffer.\n");
		return -1;
	}

	/* Step 1: build the clusters, visiting the best-voted entries first. */
	for( tested_votes = max(vote_1 -> max_vote, vote_2 -> max_vote); tested_votes > 0; tested_votes--) {
		for(is_second_read = 0 ; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read ++) {
			gene_vote_t * current_vote = is_second_read?vote_2:vote_1;
			int * max_cluster_size = is_second_read?(&max_cluster_size_r2):(&max_cluster_size_r1);

			for (i=0; i<GENE_VOTE_TABLE_SIZE; i++) {
				for (j=0; j< current_vote->items[i]; j++) {
					if(current_vote->votes[i][j]!=tested_votes) continue;
					int is_added = 0;

					/* Join the first cluster that accepts this position. */
					for(x1 = 0; x1 < current_cluster_number ; x1++){
						if(is_same_cluster(global_context, cluster_buffer+x1, current_vote->pos[i][j])){
							int new_size = add_cluster_member(cluster_buffer+x1, i, j, is_second_read);
							(*max_cluster_size) = max(*max_cluster_size, new_size);
							is_added = 1;
							break;
						}
					}

					/* Otherwise open a new cluster, if the buffer still has room. */
					if(current_cluster_number < max_clusters && !is_added){
						struct cluster_element * new_cluster = cluster_buffer + current_cluster_number;
						new_cluster -> initial_position = current_vote->pos[i][j];
						new_cluster -> i_list[0] = i;
						new_cluster -> j_list[0] = j;
						new_cluster -> from_second_read[0] = is_second_read;
						new_cluster -> cluster_members = 1;
						current_cluster_number++;
					}
				}
			}
		}
	}

	if(1 || max_cluster_size_r1 == 3 || max_cluster_size_r2 == 3 ) // if there are 3-section clusters then parse it, else go to the regular procedure. There is a upper-limit to the sections to avoid fragile mapping.
	{
		/* Step 2: score every cluster and insert the good ones into the best-alignment lists. */
		for(x1 = 0 ; x1 < current_cluster_number ; x1++){
			int this_score = -1, path_len = -1, R1R2_mapped = 0;
			int this_ii_path[ MAX_CLUSTER_ELEMENTS ], this_jj_path[ MAX_CLUSTER_ELEMENTS ], this_masks [ MAX_CLUSTER_ELEMENTS ];
			align_cluster(global_context, thread_context, cluster_buffer + x1, read_name_1, read_name_2, read_text_1, read_text_2, read_len_1, read_len_2, is_negative_strand, vote_1, vote_2, &this_score, this_ii_path, this_jj_path, this_masks, &path_len, &R1R2_mapped);

			if(this_score <= 0) continue;

			if(( R1R2_mapped & CLUSTER_ALIGNMENT_DONOR_R1_MAPPED) && ( R1R2_mapped & CLUSTER_ALIGNMENT_DONOR_R2_MAPPED)){
				/* Both mates mapped: pick, per mate, the path element with the most votes. */
				int best_R1_i = -1, best_R1_j = -1, highest_vote_R1 = -1;
				int best_R2_i = -1, best_R2_j = -1, highest_vote_R2 = -1;

				for(j = 0; j < path_len ; j++){
					int path_i = this_ii_path[j], path_j = this_jj_path[j];
					if( this_masks[j] & CLUSTER_ALIGNMENT_DONOR_R1_MAPPED ){
						if( highest_vote_R1 < vote_1 -> votes[path_i][path_j] ){
							best_R1_i = path_i;
							best_R1_j = path_j;
							highest_vote_R1 = vote_1 -> votes[path_i][path_j];
						}
					}else{
						if( highest_vote_R2 < vote_2 -> votes[path_i][path_j] ){
							best_R2_i = path_i;
							best_R2_j = path_j;
							highest_vote_R2 = vote_2 -> votes[path_i][path_j];
						}
					}
				}

				/* A mate without any path element cannot be copied: never index the vote table with -1. */
				if(best_R1_i < 0 || best_R2_i < 0) continue;

				for(i = 0; i < global_context -> config.multi_best_reads; i++) {
					mapping_result_t * old_result_R1 = _global_retrieve_alignment_ptr(global_context, pair_number, 0, i);
					mapping_result_t * old_result_R2 = _global_retrieve_alignment_ptr(global_context, pair_number, 1, i);
					short old_score_R1 = old_result_R1 -> selected_votes;
					short old_score_R2 = old_result_R2 -> selected_votes;

					if( old_score_R1 < this_score || old_score_R2 < this_score ){
						juncs_shift_results_down(global_context, pair_number, 0, i);
						juncs_shift_results_down(global_context, pair_number, 1, i);

						simple_copy_vote_to_result( old_result_R1, vote_1, best_R1_i, best_R1_j , v1_all_subreads, vote_1 -> noninformative_subreads, this_score);
						simple_copy_vote_to_result( old_result_R2, vote_2, best_R2_i, best_R2_j , v2_all_subreads, vote_2 -> noninformative_subreads, this_score);
						break;
					}
				}
			} else if( R1R2_mapped & ( CLUSTER_ALIGNMENT_DONOR_R1_MAPPED | CLUSTER_ALIGNMENT_DONOR_R2_MAPPED ) ) {
				/* Only one mate mapped: everything below works on that mate's vote table and result list. */
				int is_R2_mapped = ( R1R2_mapped & CLUSTER_ALIGNMENT_DONOR_R2_MAPPED)?1:0;
				gene_vote_t * this_vote = is_R2_mapped?vote_2:vote_1;
				/* The subread total must belong to the same mate as the vote table. */
				gene_vote_number_t this_all_subreads = is_R2_mapped?v2_all_subreads:v1_all_subreads;
				int best_R_i = -1, best_R_j = -1, highest_vote_R = -1;

				for(j = 0; j < path_len ; j++){
					int path_i = this_ii_path[j], path_j = this_jj_path[j];
					if( highest_vote_R < this_vote -> votes[path_i][path_j] ){
						best_R_i = path_i;
						best_R_j = path_j;
						highest_vote_R = this_vote -> votes[path_i][path_j];
					}
				}

				/* Empty path: nothing to copy. */
				if(best_R_i < 0) continue;

				for(i = 0; i < global_context -> config.multi_best_reads; i++) {
					mapping_result_t * old_result_R = _global_retrieve_alignment_ptr(global_context, pair_number, is_R2_mapped, i);
					short old_score_R = old_result_R -> selected_votes;

					if( old_score_R < this_score ){
						juncs_shift_results_down(global_context, pair_number, is_R2_mapped, i);

						simple_copy_vote_to_result( old_result_R, this_vote, best_R_i, best_R_j , this_all_subreads, this_vote -> noninformative_subreads, this_score);
						break;
					}
				}
			}
		}
	}else{
		/* Regular procedure; the cluster buffer is not needed any more. */
		free(cluster_buffer);
		return process_voting_junction_PE_topK(global_context, thread_context, pair_number, vote_1, vote_2, read_name_1, read_name_2, read_text_1, read_text_2, read_len_1, read_len_2, is_negative_strand, v1_all_subreads, v2_all_subreads);
	}

	free(cluster_buffer);
	return 0;
}
1540
1541
compare_cluster_elements(void * arr,int l,int r)1542 int compare_cluster_elements (void * arr, int l, int r){
1543 int * ii_array = ((void **)arr)[0];
1544 int * jj_array = ((void **)arr)[1];
1545 int * second_vote = ((void **)arr)[2];
1546
1547 if(second_vote[l] != second_vote[r])
1548 return second_vote[l] - second_vote[r];
1549
1550 gene_vote_t * vote_1 = ((void **)arr)[3];
1551 gene_vote_t * vote_2 = ((void **)arr)[4];
1552
1553
1554 gene_vote_t * this_vote_L = second_vote[l]?vote_2:vote_1;
1555 gene_vote_t * this_vote_R = second_vote[r]?vote_2:vote_1;
1556
1557 return this_vote_L->coverage_start[ii_array[l]][jj_array[l]] - this_vote_R -> coverage_start[ii_array[r]][jj_array[r]];
1558 }
1559
/*
 * Swap callback for cluster elements.
 *
 * `arr` is an array of pointers whose first three entries are int arrays
 * (first vote-table indices, second vote-table indices, second-read flags).
 * Positions l and r are exchanged in each of the three arrays, so the
 * element as a whole moves. Further entries of `arr` are not touched.
 */
void exchange_cluster_elements (void * arr, int l, int r){
	void ** columns = arr;
	int column_no;

	for(column_no = 0; column_no < 3; column_no++){
		int * column = columns[column_no];
		int kept = column[l];
		column[l] = column[r];
		column[r] = kept;
	}
}
1578
/* Number of reference positions that extend_uncovered_region_juncs() walks away from its start position. */
#define NEW_EXTEND_SCAN_INTRON_LONGEST 6000
/* extend_uncovered_region_juncs() asserts that at least this many read bases lie on the scanned side of scan_read_start. */
#define NEW_EXTEND_SCAN_EXON_SHORTEST 14

/* Forward declarations of routines that are used in this file before they are defined. */
int find_path(global_context_t * global_context, thread_context_t * thread_context, int start_element_i, int target_element_i, int * ii_array, int * jj_array, int * is_second_vote_array, gene_vote_t * vote_1, gene_vote_t * vote_2, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, int read_len_1, int read_len_2, int is_negative_strand, int * this_mask , int * exon_last_base);
int find_donor_receptor(global_context_t * global_context, thread_context_t * thread_context, char * rname, char * rtext, int rlen, int start_coverage, int end_coverage, unsigned int start_pos, unsigned int end_pos, int indels_in_start, int v1, int v2, int * misma_bases, int * matched_bases, int * is_negative_donor);
/*
 * Looks for a second exon that explains the uncovered end of a read.
 *
 * An 8-base tag is taken from the uncovered end of the read (read bases
 * rlen-9 .. rlen-2 when scan_to_tail is set, bases 3 .. 10 otherwise). The
 * reference is then walked base by base for NEW_EXTEND_SCAN_INTRON_LONGEST
 * positions, upwards from scan_chro_start when scanning to the tail and
 * downwards otherwise. Wherever the last eight reference bases equal the tag,
 * find_donor_receptor() is asked for a split point between the two
 * placements of the read; a candidate is kept only if
 *   - a split point > 0 was returned,
 *   - fewer than 2 mismatched bases were reported, and
 *   - match_chro() on the part of the read beyond the split, evaluated at the
 *     ORIGINAL placement, stays below (length of that part - 4)
 *     (NOTE(review): presumably this rejects reads that would align just as
 *     well without a junction -- confirm match_chro's return semantics).
 *
 * A candidate scores (rlen - scan_read_start - mismatches) when scanning to
 * the tail and (scan_read_start - mismatches) otherwise. Each time a new best
 * score is seen, *mismatched_bases_after_start, *first_exon_last_base,
 * *first_exon_first_base and *ret_mismatched_bases are overwritten;
 * *is_negative_donor is only handed on to find_donor_receptor().
 *
 * Returns the best score when exactly one candidate reached it, and -1 when
 * no candidate was found or the best score is tied. The output parameters may
 * have been written even when -1 is returned.
 *
 * `expect_donor` is not used.
 */
int extend_uncovered_region_juncs(global_context_t * global_context, thread_context_t * thread_context, char * rname, char * rtext, int rlen, int scan_to_tail, unsigned int scan_chro_start, int scan_read_start, unsigned short expect_donor, int * mismatched_bases_after_start, int * first_exon_last_base, unsigned int * first_exon_first_base, int * ret_mismatched_bases, int * is_negative_donor){

	if( scan_to_tail ) assert( scan_read_start < rlen - NEW_EXTEND_SCAN_EXON_SHORTEST );
	else assert( scan_read_start >= NEW_EXTEND_SCAN_EXON_SHORTEST);

	gene_value_index_t * value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
	int x1, best_best_score = -1, best_best_occurance = 0;

	unsigned long long matching_target = 0, rolling_bases = 0;

	/* Pack the 8-base tag, one byte per base. Each base is reduced to 8 bits first:
	 * a plain char may be signed, and a sign-extended byte would wipe out the
	 * bases that were already packed. */
	for(x1 = 0 ; x1 < 8 ; x1++){
		unsigned char tag_base = (unsigned char)( scan_to_tail? rtext[ rlen - 2 - x1 ] : rtext[ 10 - x1 ] );
		matching_target = ( matching_target << 8 ) | tag_base;
	}

	for(x1 = 0; x1 < NEW_EXTEND_SCAN_INTRON_LONGEST ; x1++){
		int best_last_exon_base = -1, matched_in_the_uncovered_gap = -1, mismatched_bases = -1, extended_should_mismatch = -1;
		unsigned int scan_cursor = scan_chro_start ;
		if(scan_to_tail) scan_cursor+=x1;else scan_cursor-=x1;

		/* Roll the newest reference base into the 8-base window, on the side that
		 * keeps the window in the same byte order as the tag. */
		unsigned long long nch = ((unsigned long long) gvindex_get( value_index, scan_cursor )) & 0xffULL;
		if(scan_to_tail)
			rolling_bases = (rolling_bases >> 8) | (nch << 56);
		else
			rolling_bases = nch | ( rolling_bases << 8 );

		if(rolling_bases != matching_target) continue;

		/* The tag matches here: scan_cursor is aligned with read base rlen-2 (tail)
		 * or read base 3 (head), which fixes where the read would start. */
		if(scan_to_tail) {
			best_last_exon_base = find_donor_receptor(global_context, thread_context, rname, rtext, rlen, scan_read_start, rlen - 2 - 7, scan_chro_start, scan_cursor - (rlen - 2) , 0, 0,0, &mismatched_bases , &matched_in_the_uncovered_gap, is_negative_donor);
			if(best_last_exon_base>0)
				extended_should_mismatch = match_chro( rtext + best_last_exon_base , value_index, scan_chro_start + best_last_exon_base, rlen - best_last_exon_base, 0, global_context->config.space_type);
		} else {
			best_last_exon_base = find_donor_receptor(global_context, thread_context, rname, rtext, rlen, 10, scan_read_start, scan_cursor - 3 , scan_chro_start, 0, 0,0, &mismatched_bases , &matched_in_the_uncovered_gap,is_negative_donor);
			if(best_last_exon_base>0)
				extended_should_mismatch = match_chro( rtext, value_index, scan_chro_start, best_last_exon_base , 0, global_context->config.space_type);
		}

		if(best_last_exon_base <= 0) continue;

		int extension_limit = scan_to_tail?( rlen - best_last_exon_base - 4 ):( best_last_exon_base - 4 );
		if(extended_should_mismatch >= extension_limit || mismatched_bases >= 2) continue;

		int this_score;
		if(scan_to_tail) this_score = rlen - scan_read_start - mismatched_bases;
		else this_score = scan_read_start - mismatched_bases;

		if(best_best_score < this_score){
			best_best_score = this_score;
			(*mismatched_bases_after_start) = mismatched_bases;
			(*first_exon_last_base) = best_last_exon_base;
			(*first_exon_first_base) = scan_to_tail?( scan_cursor - (rlen - 2) ) : ( scan_cursor - 3 );
			(*ret_mismatched_bases) = mismatched_bases;
			best_best_occurance = 1;
		}else if( best_best_score == this_score ) best_best_occurance ++;
	}

	/* Only an unambiguous best candidate is reported. */
	if(best_best_occurance == 1) return best_best_score;
	return -1;
}
1718
/*
 * Records one read's support for the junction whose small side is
 * left_edge_wanted and whose large side is right_edge_wanted.
 *
 * The events are kept in the indel module context of the thread when
 * thread_context is given, and in the global indel module context otherwise.
 * If an event with the same two edges is already stored, its
 * supporting_reads counter is increased. Otherwise a new
 * CHRO_EVENT_TYPE_JUNCTION event with one supporting read is appended to the
 * event space and registered through put_new_event().
 *
 * Unless fusion detection or long-deletion detection is switched on, nothing
 * is recorded when the two edges are reported with different chromosome name
 * pointers by locate_gene_position().
 *
 * NOTE(review): the new event is written at event_space + total_events
 * without any capacity check visible in this function -- confirm that the
 * event space is guaranteed to be large enough by the callers.
 */
void simple_add_junction( global_context_t * global_context, thread_context_t * thread_context, unsigned int left_edge_wanted, unsigned int right_edge_wanted, int indel_at_junction, int is_negative_donors ){
	char * chro_name_left, *chro_name_right;
	int chro_pos_left,chro_pos_right;

	locate_gene_position( left_edge_wanted , &global_context -> chromosome_table, &chro_name_left, &chro_pos_left);
	locate_gene_position( right_edge_wanted , &global_context -> chromosome_table, &chro_name_right, &chro_pos_right);

	/* The chromosome names are compared by address, not by content. */
	int may_span_chromosomes = global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection;
	if(!may_span_chromosomes && chro_name_right!=chro_name_left) return;

	/* Pick the per-thread event store when there is one, the global store otherwise. */
	indel_thread_context_t * thread_indel = NULL;
	indel_context_t * global_indel = NULL;
	HashTable * event_table = NULL;
	chromosome_event_t * event_space = NULL;
	if(thread_context){
		thread_indel = (indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID];
		event_table = thread_indel -> event_entry_table;
		event_space = thread_indel -> event_space_dynamic;
	}else{
		global_indel = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
		event_table = global_indel -> event_entry_table;
		event_space = global_indel -> event_space_dynamic;
	}

	/* Look for an already known event with exactly these two edges. */
	chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
	chromosome_event_t * known_event = NULL;
	int found_events = search_event(global_context, event_table, event_space, left_edge_wanted , EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_INDEL | CHRO_EVENT_TYPE_JUNCTION | CHRO_EVENT_TYPE_FUSION, search_return);
	int kx1;
	for(kx1 = 0; kx1 < found_events ; kx1++){
		if(search_return[kx1] -> event_large_side == right_edge_wanted){
			known_event = search_return[kx1];
			break;
		}
	}

	if(known_event){
		known_event -> supporting_reads ++;
		return;
	}

	/* Unknown junction: take the next free event number from the same store. */
	int event_no;
	if(thread_context)
		event_no = thread_indel -> total_events ++;
	else
		event_no = global_indel -> total_events ++;

	chromosome_event_t * new_event = event_space+event_no;
	memset(new_event,0,sizeof(chromosome_event_t));
	new_event -> event_type = CHRO_EVENT_TYPE_JUNCTION;
	new_event -> event_small_side = left_edge_wanted;
	new_event -> event_large_side = right_edge_wanted;
	new_event -> small_side_increasing_coordinate = 0;
	new_event -> large_side_increasing_coordinate = 1;
	new_event -> is_negative_strand= is_negative_donors;
	new_event -> is_donor_found_or_annotation = 1;
	new_event -> indel_length = 0;
	new_event -> indel_at_junction = indel_at_junction;
	new_event -> supporting_reads = 1;
	put_new_event(event_table, new_event , event_no);
}
1784
// Chain the members of one vote cluster into the best-scoring path and report
// the junctions implied by that path.
//
// Each cluster member is a (i, j) cell of vote_1 or vote_2. Members are sorted
// with compare_cluster_elements (defined elsewhere), then a dynamic program
// links a later member to an earlier one whenever find_path() returns a
// non-negative score increment. The chain with the highest total score is
// traced back from its last member to its first.
//
// Outputs (written only when a chain with total score > 1 exists):
//   best_ii_path / best_jj_path : (i, j) of the chain members, last member first.
//   best_masks                  : per-member mask; bit CLUSTER_ALIGNMENT_DONOR_NEGATIVE_STRAND
//                                 comes from find_path(), and a DONOR_R1/R2_MAPPED bit is
//                                 added when the link into the member had a donor site.
//   best_path_length            : number of members on the chain.
//   R1R2_mapped                 : OR-ed with CLUSTER_ALIGNMENT_DONOR_R1_MAPPED and/or
//                                 CLUSTER_ALIGNMENT_DONOR_R2_MAPPED; it is never reset here,
//                                 so the caller has to initialise it.
//   this_score                  : total score of the chain.
// If the score reaches the read-length-dependent cut-off, every donor-supported
// link on the chain is recorded through simple_add_junction().
//
// Always returns 0.
int align_cluster(global_context_t * global_context, thread_context_t * thread_context, struct cluster_element * this_cluster, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, int read_len_1, int read_len_2, int is_negative_strand, gene_vote_t * vote_1, gene_vote_t * vote_2, int * this_score, int * best_ii_path, int * best_jj_path, int * best_masks, int * best_path_length, int * R1R2_mapped){
	int ii_array[MAX_CLUSTER_ELEMENTS], jj_array[MAX_CLUSTER_ELEMENTS], is_second_vote_array[MAX_CLUSTER_ELEMENTS], dynamic_highest_mask[MAX_CLUSTER_ELEMENTS], x1;
	void * sort_pointers [5];

	for(x1 = 0 ; x1 < this_cluster->cluster_members; x1++){
		ii_array[x1] = this_cluster -> i_list[x1];
		jj_array[x1] = this_cluster -> j_list[x1];
		is_second_vote_array[x1] = this_cluster -> from_second_read[x1];
	}

	// The three index arrays are permuted together; the vote tables are
	// handed to the comparator through the same pointer array.
	sort_pointers[0] = ii_array;
	sort_pointers[1] = jj_array;
	sort_pointers[2] = is_second_vote_array;
	sort_pointers[3] = vote_1;
	sort_pointers[4] = vote_2;

	basic_sort(sort_pointers, this_cluster->cluster_members, compare_cluster_elements, exchange_cluster_elements);

	int dynamic_highest_scores[MAX_CLUSTER_ELEMENTS], dynamic_last_exon[MAX_CLUSTER_ELEMENTS];
	// Predecessor index of each member on its best chain, -1 for "none".
	// Kept as int: a plain char may be unsigned, in which case the -1
	// sentinel would never compare equal and the traceback would not stop.
	int dynamic_highest_path[MAX_CLUSTER_ELEMENTS];
	memset(dynamic_highest_scores,0,sizeof(int)*MAX_CLUSTER_ELEMENTS);

	int target_element_i;

	// Every member starts as a chain of its own, scored by its covered length.
	for(target_element_i = 0; target_element_i < this_cluster->cluster_members; target_element_i++){
		gene_vote_t * this_vote = is_second_vote_array[target_element_i]?vote_2:vote_1;
		int ii = ii_array[target_element_i];
		int jj = jj_array[target_element_i];
		dynamic_highest_scores[target_element_i] = this_vote->coverage_end[ii][jj] - this_vote->coverage_start[ii][jj];
		dynamic_highest_path[ target_element_i ] = -1;
		// A member without a predecessor has no link mask and no donor site;
		// these defaults are what the traceback reads for the chain head.
		dynamic_highest_mask[ target_element_i ] = 0;
		dynamic_last_exon[ target_element_i ] = -1;
	}

	int highest_score = -1;
	int highest_target_end = -1;
	for(target_element_i = 0; target_element_i < this_cluster->cluster_members; target_element_i++){
		int start_element_i;
		// Only earlier members (in sorted order) can precede the target.
		for(start_element_i = 0; start_element_i < target_element_i; start_element_i++){
			int this_mask = -1, breakpount_last_exon = -1;
			int increasing_score = find_path(global_context, thread_context, start_element_i, target_element_i, ii_array, jj_array, is_second_vote_array, vote_1, vote_2, read_name_1, read_name_2, read_text_1, read_text_2, read_len_1, read_len_2, is_negative_strand, &this_mask, &breakpount_last_exon);
			if(increasing_score >= 0 && increasing_score + dynamic_highest_scores[start_element_i] > dynamic_highest_scores[target_element_i]){
				dynamic_highest_path[ target_element_i ] = start_element_i;
				dynamic_highest_scores[target_element_i] = increasing_score + dynamic_highest_scores[start_element_i];
				dynamic_highest_mask[ target_element_i ] = this_mask;
				dynamic_last_exon[ target_element_i ] = breakpount_last_exon;
				if( dynamic_highest_scores[target_element_i] > highest_score ){
					highest_score = dynamic_highest_scores[target_element_i] ;
					highest_target_end = target_element_i;
				}
			}
		}
	}


	// The threshold "160 - 159" evaluates to 1 and is kept as written upstream.
	if(highest_target_end >=0 && highest_score > 160 - 159){
		int is_on_path [MAX_CLUSTER_ELEMENTS];
		memset(is_on_path,0,sizeof(int)*MAX_CLUSTER_ELEMENTS);

		gene_vote_t * last_vote = is_second_vote_array[ highest_target_end ]?vote_2:vote_1;
		int last_section_read_end = last_vote -> coverage_end[ ii_array [highest_target_end] ] [ jj_array [highest_target_end] ];
		int this_rlen = is_second_vote_array[ highest_target_end ]?read_len_2 : read_len_1;
		int this_votes = last_vote -> votes [ ii_array [highest_target_end] ] [ jj_array [highest_target_end] ];
		int tail_first_exon_last_base_in_read=-1, tail_mismatched_bases=-1;
		unsigned int tail_first_exon_first_base_on_chro = 0, tail_mapped_section_pos = 0;
		int front_first_exon_last_base_in_read=-1, front_mismatched_bases=-1;
		unsigned int front_first_exon_first_base_on_chro = 0, front_mapped_section_pos = 0;
		int front_score = 0, tail_score = 0, front_negative_donor = 0, tail_negative_donor = 0;

		// Disabled upstream ("0 &&"): extension of the uncovered read tail.
		if(0 && last_section_read_end < this_rlen - NEW_EXTEND_SCAN_EXON_SHORTEST && this_votes > 4)
		{
			char * this_rname = is_second_vote_array[ highest_target_end ]?read_name_2:read_name_1;
			char * this_rtext = is_second_vote_array[ highest_target_end ]?read_text_2:read_text_1;
			int scan_to_tail = 1, mismatched_bases_after_start;
			tail_mapped_section_pos = last_vote -> pos[ ii_array [highest_target_end] ] [ jj_array [highest_target_end] ] +
				 last_vote -> current_indel_cursor[ ii_array [highest_target_end] ] [ jj_array [highest_target_end] ] ;

			tail_score = extend_uncovered_region_juncs(global_context, thread_context, this_rname, this_rtext , this_rlen, scan_to_tail, tail_mapped_section_pos, last_section_read_end , -1, & mismatched_bases_after_start, &tail_first_exon_last_base_in_read, &tail_first_exon_first_base_on_chro, &tail_mismatched_bases, &tail_negative_donor);
		}

		// Trace the chain back from its last member to its first one.
		(*best_path_length) = 0;
		while(1){
			best_ii_path[(*best_path_length)] = ii_array[highest_target_end];
			best_jj_path[(*best_path_length)] = jj_array[highest_target_end];
			best_masks[ (*best_path_length) ] = dynamic_highest_mask[highest_target_end];

			if( dynamic_last_exon [ highest_target_end ] > 0 ) best_masks[ (*best_path_length) ] |= ( is_second_vote_array[ highest_target_end ]?CLUSTER_ALIGNMENT_DONOR_R2_MAPPED:CLUSTER_ALIGNMENT_DONOR_R1_MAPPED);

			if( is_second_vote_array[ highest_target_end ] ) (*R1R2_mapped) |= CLUSTER_ALIGNMENT_DONOR_R2_MAPPED;
			else (*R1R2_mapped) |= CLUSTER_ALIGNMENT_DONOR_R1_MAPPED;

			(*best_path_length)++;

			is_on_path[highest_target_end] = 1;
			if( dynamic_highest_path[highest_target_end] == -1 ) break;
			highest_target_end = dynamic_highest_path[highest_target_end];
		}

		// highest_target_end now indexes the first member of the chain.
		gene_vote_t * first_vote = is_second_vote_array[ highest_target_end ]?vote_2:vote_1;
		int first_section_read_start = first_vote -> coverage_start [ ii_array [highest_target_end] ] [ jj_array [highest_target_end] ] ;
		this_votes = first_vote -> votes [ ii_array [highest_target_end] ] [ jj_array [highest_target_end] ];

		// Disabled upstream ("0 &&"): extension of the uncovered read front.
		if(0 && first_section_read_start > NEW_EXTEND_SCAN_EXON_SHORTEST && this_votes > 4){
			char * this_rname = is_second_vote_array[ highest_target_end ]?read_name_2:read_name_1;
			char * this_rtext = is_second_vote_array[ highest_target_end ]?read_text_2:read_text_1;
			int scan_to_tail = 0, mismatched_bases_after_start;
			front_mapped_section_pos = first_vote -> pos[ ii_array [highest_target_end] ] [ jj_array [highest_target_end] ];

			front_score = extend_uncovered_region_juncs(global_context, thread_context, this_rname, this_rtext , this_rlen, scan_to_tail, front_mapped_section_pos, first_section_read_start , -1, & mismatched_bases_after_start, &front_first_exon_last_base_in_read, &front_first_exon_first_base_on_chro, &front_mismatched_bases, &front_negative_donor);
		}

		(*this_score) = highest_score + max(0, front_score) + max(0, tail_score);

		// Minimum score required before junctions are reported; it depends
		// on which mates contributed to the chain.
		int applied_score_cut=0;
		if(((*R1R2_mapped) & CLUSTER_ALIGNMENT_DONOR_R1_MAPPED)&&( (*R1R2_mapped) & CLUSTER_ALIGNMENT_DONOR_R2_MAPPED ) )
			applied_score_cut = read_len_2 + read_len_1 - 70;
		else if((*R1R2_mapped) & CLUSTER_ALIGNMENT_DONOR_R1_MAPPED)
			applied_score_cut = read_len_1 - 30;
		else if((*R1R2_mapped) & CLUSTER_ALIGNMENT_DONOR_R2_MAPPED)
			// This branch used to test the R1 flag a second time and was
			// therefore unreachable.
			applied_score_cut = read_len_2 - 30;

		if( (*this_score) >= applied_score_cut){
			// Walk consecutive on-path members (x1, second_end) and report
			// the junction between them when the link had a donor site.
			for( x1 = 0; x1 < MAX_CLUSTER_ELEMENTS; x1++){
				if(!is_on_path[x1]) continue;

				int x2, second_end = -1;
				for(x2 = x1 + 1; x2 < MAX_CLUSTER_ELEMENTS; x2++){
					if(is_on_path[x2]){
						second_end = x2;
						break;
					}
				}

				if(second_end > 0){
					if( dynamic_last_exon[second_end] >0 ){
						gene_vote_t * this_vote = is_second_vote_array[ x1 ]?vote_2:vote_1;
						unsigned int junction_small_side = this_vote -> pos[ ii_array[ x1 ]][ jj_array[ x1 ]] +
							this_vote -> current_indel_cursor[ ii_array[ x1 ]][ jj_array[ x1 ]] + dynamic_last_exon[second_end];

						unsigned int junction_large_side = this_vote -> pos[ ii_array[ second_end ]][ jj_array[ second_end ]] + dynamic_last_exon[second_end] + 1;

						//#warning "SUBREAD_151 ============= MAKE SURE: CHANGE '0' TO INSERTED BASES ================="
						simple_add_junction(global_context, thread_context, junction_small_side, junction_large_side, 0, (dynamic_highest_mask[ second_end ] & CLUSTER_ALIGNMENT_DONOR_NEGATIVE_STRAND)?1:0);
					}
				}
			}

			// Disabled upstream ("0 &&"): junction found by the front extension.
			if(0 && front_mismatched_bases <1 && front_score >14){
				unsigned int junction_small_side = front_first_exon_first_base_on_chro + front_first_exon_last_base_in_read;
				unsigned int junction_large_side = front_mapped_section_pos + front_first_exon_last_base_in_read + 1;

				simple_add_junction(global_context, thread_context, junction_small_side, junction_large_side, 0, front_negative_donor);
			}

			// Disabled upstream ("0 &&"): junction found by the tail extension.
			if(0 && tail_mismatched_bases <1 && tail_score >14){
				unsigned int junction_small_side = tail_mapped_section_pos + tail_first_exon_last_base_in_read;
				unsigned int junction_large_side = tail_first_exon_first_base_on_chro + tail_first_exon_last_base_in_read;

				simple_add_junction(global_context, thread_context, junction_small_side, junction_large_side, 0, tail_negative_donor);
			}
		}
	}
	return 0;
}
2014
// True when the two bases starting at s are c1 followed by c2. Every argument
// is parenthesised so that pointer expressions such as (buf + k) expand safely.
#define paired_donor_receptor_m2(s, c1, c2 ) ( (s)[0] == (c1) && (s)[1] == (c2) )

// Classify the dinucleotides found at the two sides of a candidate intron.
//
//   small_bases : at least two bases at the small-coordinate side.
//   large_bases : at least two bases at the large-coordinate side.
//
// Returns 0 when the pair is not a recognised splice-site motif, otherwise:
//   1  GT-AG        2  CT-AC   (canonical, and its reverse complement)
//   3  GC-AG        4  CT-GC   (non-canonical)
//   5  AT-AC        6  GT-AT   (non-canonical)
// Odd codes are the forward-strand motif, even codes the reverse-strand one.
//
// http://www.ncbi.nlm.nih.gov/pmc/articles/PMC113136/
// 99.24% of splice site pairs are GT-AG, 0.69% GC-AG, 0.05% AT-AC, and only
// 0.02% consist of other types of non-canonical splice sites.
int is_paired_donor_receptor( char * small_bases, char * large_bases ){
	// One row per return code (row index + 1): { small side, large side }.
	static const char splice_motifs[6][2][2] = {
		{ {'G','T'}, {'A','G'} },	// 1: GT-AG (+)
		{ {'C','T'}, {'A','C'} },	// 2: CT-AC (-)
		{ {'G','C'}, {'A','G'} },	// 3: GC-AG (+)
		{ {'C','T'}, {'G','C'} },	// 4: CT-GC (-)
		{ {'A','T'}, {'A','C'} },	// 5: AT-AC (+)
		{ {'G','T'}, {'A','T'} }	// 6: GT-AT (-)
	};
	int motif_i;

	for(motif_i = 0; motif_i < 6; motif_i++){
		if( paired_donor_receptor_m2( small_bases, splice_motifs[motif_i][0][0], splice_motifs[motif_i][0][1] ) &&
		    paired_donor_receptor_m2( large_bases, splice_motifs[motif_i][1][0], splice_motifs[motif_i][1][1] ) )
			return motif_i + 1;
	}

	return 0;
}
2057
// Find the exon boundary between two sections of the same read that are
// mapped to start_pos (earlier part of the read) and end_pos (later part).
//
// A window of the read running from 8 bases before start_coverage to 8 bases
// after end_coverage (clamped to [0, rlen)) is compared against the reference
// at both locations. For every number of inserted bases between the two exons
// (0 .. config.max_insertion_at_junctions) and every split point inside the
// window, the number of mismatches is counted with the bases up to the split
// taken from the start location and the bases after split + insertion taken
// from the end location. The split with the fewest mismatches (ties broken by
// fewer inserted bases) whose flanking reference bases form a motif accepted
// by is_paired_donor_receptor() wins.
//
//   rname, v1, v2     : not used by the search.
//   indels_in_start   : added to start_pos when reading the start-side reference.
//   misma_bases       : out, mismatches of the winning split.
//   matched_bases     : out, end_coverage - start_coverage minus the inserted
//                       and mismatched bases of the winning split.
//   is_negative_donor : out (may be NULL), 0 for an odd motif code and 1 for
//                       an even motif code of is_paired_donor_receptor().
// The three outputs are written only when a split is found.
//
// Returns the read offset of the last base of the start-side exon, or -1 when
// no acceptable split exists.
int find_donor_receptor(global_context_t * global_context, thread_context_t * thread_context, char * rname, char * rtext, int rlen, int start_coverage, int end_coverage, unsigned int start_pos, unsigned int end_pos, int indels_in_start, int v1, int v2, int * misma_bases, int * matched_bases, int * is_negative_donor){

	gene_value_index_t * value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
	int search_in_read_start = max(0, start_coverage - 8);
	int search_in_read_end = min( rlen, end_coverage + 8 );
	int window_len = search_in_read_end - search_in_read_start;

	// The arrays below are sized by window_len; a non-positive length would
	// be undefined behaviour and cannot contain a split point anyway.
	if(window_len <= 0) return -1;

	unsigned int search_in_chro_start = start_pos + indels_in_start + search_in_read_start;

	char chro_bases_startside[ window_len ], chro_bases_endside[ window_len ];
	int start_site_match[ window_len ], end_site_match[ window_len ];
	int x1;

	// Reference bases at both locations and their agreement with the read.
	// None of this depends on the insertion length, so it is done once.
	for(x1 = 0; x1 < window_len; x1++){
		chro_bases_startside[x1] = gvindex_get( value_index, search_in_chro_start + x1 );
		chro_bases_endside[x1] = gvindex_get( value_index , end_pos + search_in_read_start + x1);
		start_site_match[x1] = ( rtext[ search_in_read_start + x1 ] == chro_bases_startside[x1] );
		end_site_match[x1] = ( rtext[ search_in_read_start + x1 ] == chro_bases_endside[x1] );
	}

	int insertion_in_between_i, best_testing_score = 500 * 1000;
	int best_last_exon_base_in_start = -1;
	int applied_insertion_limit = global_context->config.max_insertion_at_junctions;

	for(insertion_in_between_i = 0; insertion_in_between_i <= applied_insertion_limit; insertion_in_between_i ++){
		int usable_len = window_len - insertion_in_between_i;

		// A split needs two bases on each side of it; longer insertions
		// only shrink the usable window further.
		if(usable_len <= 4) break;

		int start_last_exon_base, end_site_mismatches = 0, start_site_mismatches = 0;

		// Start with every base after the insertion assigned to the end side.
		for(x1 = insertion_in_between_i; x1 < window_len; x1++)
			end_site_mismatches += !end_site_match[x1];

		for(start_last_exon_base = 0 ; start_last_exon_base < usable_len ; start_last_exon_base++){
			// Move one more base from the end side to the start side.
			end_site_mismatches -= (! end_site_match[start_last_exon_base + insertion_in_between_i] );
			start_site_mismatches += (! start_site_match[start_last_exon_base] );

			if(start_last_exon_base < 2 || start_last_exon_base >= usable_len - 2) continue;

			int testing_score = (end_site_mismatches + start_site_mismatches) * 500 + insertion_in_between_i;
			if(testing_score >= best_testing_score) continue;

			// Two bases right after the start exon and the two bases just
			// before the end exon must form a known splice-site pair.
			int donor_paired_ret = is_paired_donor_receptor( chro_bases_startside + start_last_exon_base + 1, chro_bases_endside + insertion_in_between_i + start_last_exon_base - 1 );
			if(!donor_paired_ret) continue;

			best_last_exon_base_in_start = start_last_exon_base;
			best_testing_score = testing_score;
			(*misma_bases) = (end_site_mismatches + start_site_mismatches);
			if(is_negative_donor) (*is_negative_donor) =(donor_paired_ret -1)%2;
			(*matched_bases) = end_coverage - start_coverage - insertion_in_between_i - (end_site_mismatches + start_site_mismatches);
		}
	}

	if(best_last_exon_base_in_start>=0)
		return best_last_exon_base_in_start + search_in_read_start ;
	else	return -1;
}
2148
// Score the link from cluster member start_element_i to target_element_i.
//
// The two members must be mapped less than 50000 bases apart.
//  - Members from different reads: the link is always accepted and scored by
//    the covered length of the target member.
//  - Members from the same read: the start section must end no later than 8
//    bases after the target section begins, the start must map before the
//    target, and find_donor_receptor() must find a splice site at a positive
//    read offset with no mismatch. The score is then the matched bases in the
//    uncovered gap plus the target's coverage from end_coverage onwards.
//
//   this_mask      : out, always written; CLUSTER_ALIGNMENT_DONOR_NEGATIVE_STRAND
//                    when the donor site found is on the negative strand, else 0.
//   exon_last_base : out, written only for an accepted same-read link; read
//                    offset of the last base of the start-side exon.
//
// Returns the score increment (>= 0) or -1 when the members cannot be linked.
int find_path(global_context_t * global_context, thread_context_t * thread_context, int start_element_i, int target_element_i, int * ii_array, int * jj_array, int * is_second_vote_array, gene_vote_t * vote_1, gene_vote_t * vote_2, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, int read_len_1, int read_len_2, int is_negative_strand, int * this_mask , int * exon_last_base){
	gene_vote_t * start_vote = is_second_vote_array[start_element_i]?vote_2:vote_1;
	gene_vote_t * end_vote = is_second_vote_array[target_element_i]?vote_2:vote_1;
	int start_ii = ii_array[start_element_i], start_jj = jj_array[start_element_i];
	int target_ii = ii_array[target_element_i], target_jj = jj_array[target_element_i];

	int start_coverage = start_vote->coverage_end[ start_ii ][ start_jj ];
	int end_coverage = end_vote->coverage_start[ target_ii ][ target_jj ];
	unsigned int start_pos = start_vote->pos[ start_ii ][ start_jj ];
	unsigned int end_pos = end_vote->pos[ target_ii ][ target_jj ];

	long long dist = start_pos;
	dist -= end_pos;
	(*this_mask)=0;

	// llabs, not abs: dist spans the full unsigned 32-bit range and abs()
	// would truncate it to int, letting far-apart positions pass the test.
	if( llabs(dist) >= 50000 ) return -1;

	if(start_vote != end_vote){
		// The two sections are on different reads.
		// if the two sections are on two reads, check the first base of the second read is after the first base of the first read.
		return end_vote->coverage_end[ target_ii ][ target_jj ] - end_vote->coverage_start[ target_ii ][ target_jj ];
	}

	// Same read from here on.
	if(start_coverage >= end_coverage + 9) return -1;
	if(start_pos >= end_pos) return -1;

	char * this_read_name = is_second_vote_array[start_element_i]?read_name_2:read_name_1;
	char * this_read_text = is_second_vote_array[start_element_i]?read_text_2:read_text_1;
	int this_read_len = is_second_vote_array[start_element_i]?read_len_2:read_len_1;
	int mismatched_bases = 0, matched_in_the_uncovered_gap = 0, donor_receptor_neg_strand = -1;
	int indels_in_start = start_vote -> current_indel_cursor [ start_ii ] [ start_jj ];

	int best_last_base_in_start_exon = find_donor_receptor(global_context, thread_context, this_read_name, this_read_text, this_read_len, start_coverage, end_coverage, start_pos, end_pos, indels_in_start, start_vote -> votes[ start_ii ] [ start_jj ], start_vote -> votes[ target_ii ] [ target_jj ], &mismatched_bases , &matched_in_the_uncovered_gap, &donor_receptor_neg_strand);

	if(best_last_base_in_start_exon <= 0 || mismatched_bases >= 1) return -1;

	(*this_mask) = donor_receptor_neg_strand? CLUSTER_ALIGNMENT_DONOR_NEGATIVE_STRAND : 0 ;
	*exon_last_base = best_last_base_in_start_exon;

	// # of matched bases, from the end of the "start" section to the end of the end section.
	return matched_in_the_uncovered_gap + end_vote->coverage_end[ target_ii ][ target_jj ] - end_coverage;
}
2195
2196
process_voting_junction_PE_topK(global_context_t * global_context,thread_context_t * thread_context,subread_read_number_t pair_number,gene_vote_t * vote_1,gene_vote_t * vote_2,char * read_name_1,char * read_name_2,char * read_text_1,char * read_text_2,int read_len_1,int read_len_2,int is_negative_strand,gene_vote_number_t v1_all_subreads,gene_vote_number_t v2_all_subreads)2197 int process_voting_junction_PE_topK(global_context_t * global_context, thread_context_t * thread_context, subread_read_number_t pair_number, gene_vote_t * vote_1, gene_vote_t * vote_2, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, int read_len_1, int read_len_2, int is_negative_strand, gene_vote_number_t v1_all_subreads, gene_vote_number_t v2_all_subreads)
2198 {
2199 topK_buffer_t * topbuf = thread_context?&thread_context->topKbuff:&global_context ->topKbuff ;
2200
2201 vote_combination_t * comb_buffer = (vote_combination_t *) topbuf -> comb_buffer;
2202 simple_mapping_t * vote_simple_1_buffer, * vote_simple_2_buffer;
2203 vote_simple_1_buffer =(simple_mapping_t *) topbuf -> vote_simple_1_buffer;
2204 vote_simple_2_buffer =(simple_mapping_t *) topbuf -> vote_simple_2_buffer;
2205 //memset(comb_buffer, 0 , sizeof(vote_combination_t) * global_context -> config.max_vote_combinations);
2206
2207 int is_second_read,i,j;
2208 int third_highest_votes[2][9];
2209 int is_fully_covered_1 = 0;
2210 int is_fully_covered_2 = 0;
2211
2212 for(is_second_read = 0 ; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read ++)
2213 {
2214 gene_vote_t * current_vote = is_second_read?vote_2:vote_1;
2215 int *top_three_buff = third_highest_votes[is_second_read], i , j;
2216 int * is_fully_covered = is_second_read?&is_fully_covered_2:&is_fully_covered_1;
2217 int current_read_len = is_second_read?read_len_2:read_len_1;
2218
2219 memset(top_three_buff, 0 , global_context -> config.top_scores * sizeof(int));
2220
2221 if((global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection)){
2222 *is_fully_covered = test_fully_covered(global_context , current_vote, current_read_len);
2223 }
2224
2225
2226
2227 for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
2228 {
2229 for (j=0; j< current_vote->items[i]; j++){
2230 int vv = current_vote -> votes[i][j];
2231 if(global_context->config.scRNA_input_mode && !global_context -> input_reads.is_paired_end_reads)vv += SE_READ_IN_KNOWN_EXON_REWARD*is_pos_in_annotated_exon_regions(global_context, current_vote -> pos[i][j]);
2232 update_top_three(global_context, top_three_buff, vv);
2233 }
2234 }
2235
2236 if(0 && FIXLENstrcmp("R00000003493",read_name_1)==0)SUBREADprintf("3N [R %d] =%d,%d,%d\n", 1+is_second_read, top_three_buff[0], top_three_buff[1], top_three_buff[2]);
2237
2238 for(i = 0; i < global_context -> config.multi_best_reads; i++)
2239 {
2240 mapping_result_t * old_result = _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, i);
2241 if(old_result -> selected_votes>0)
2242 {
2243 update_top_three(global_context, top_three_buff, old_result -> selected_votes);
2244 }
2245 }
2246 if(0 && FIXLENstrcmp("R00000003493",read_name_1)==0)SUBREADprintf("3Q [R %d] =%d,%d,%d\n", 1+is_second_read, top_three_buff[0], top_three_buff[1], top_three_buff[2]);
2247 }
2248
2249
2250 int simple_record_numbers[2], third_k;
2251
2252 for(is_second_read = 0 ; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read ++)
2253 {
2254 int current_simple_number = 0;
2255 int current_read_len = is_second_read?read_len_2:read_len_1;
2256 // populate the two simple read lists
2257 for(third_k = 0 ; third_k < global_context -> config.top_scores; third_k ++)
2258 {
2259 if(current_simple_number >= global_context -> config.max_vote_simples)break;
2260 int this_vote_N = third_highest_votes [is_second_read][third_k];
2261 // only consider max_votes and max_votes - 1
2262 if(this_vote_N<1 || (third_highest_votes[is_second_read][0] - this_vote_N > global_context -> config.max_vote_number_cutoff )) break;
2263
2264 simple_mapping_t * current_simple = is_second_read ? vote_simple_2_buffer: vote_simple_1_buffer;
2265 gene_vote_t * current_vote = is_second_read?vote_2:vote_1;
2266 for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
2267 {
2268 if(current_simple_number >= global_context -> config.max_vote_simples)break;
2269 for (j=0; j< current_vote->items[i]; j++)
2270 {
2271 if(current_simple_number >= global_context -> config.max_vote_simples)break;
2272 if(global_context->config.do_big_margin_filtering_for_junctions && third_k == 0 && current_vote->votes[i][j] >= third_highest_votes [is_second_read][global_context -> config.top_scores - 1])
2273 insert_big_margin_record(global_context , _global_retrieve_big_margin_ptr(global_context,pair_number, is_second_read), current_vote -> votes[i][j], current_vote -> coverage_start[i][j], current_vote -> coverage_end[i][j] , current_read_len, (current_vote -> masks[i][j] & IS_NEGATIVE_STRAND)?1:0);
2274
2275 int vv = current_vote->votes[i][j];
2276 if(global_context->config.scRNA_input_mode && !global_context -> input_reads.is_paired_end_reads)vv += SE_READ_IN_KNOWN_EXON_REWARD*is_pos_in_annotated_exon_regions(global_context, current_vote -> pos[i][j]);
2277 if(vv == this_vote_N && current_vote->votes[i][j] >= global_context->config.minimum_subread_for_second_read)
2278 {
2279 current_simple[current_simple_number].is_vote_t_item = 1;
2280 current_simple[current_simple_number].item_index_i = i;
2281 current_simple[current_simple_number].item_index_j = j;
2282 current_simple[current_simple_number].read_start_base = current_vote -> coverage_start[i][j];
2283 current_simple[current_simple_number].mapping_position = current_vote -> pos[i][j];
2284 current_simple[current_simple_number].major_half_votes = vv;
2285
2286 current_simple_number ++;
2287
2288 }
2289 }
2290 }
2291
2292 for(i = 0; i < global_context -> config.multi_best_reads; i++)
2293 {
2294 mapping_result_t * old_result = _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, i);
2295 if(current_simple_number >= global_context -> config.max_vote_simples)break;
2296 if(old_result -> selected_votes == this_vote_N)
2297 {
2298 current_simple[current_simple_number].is_vote_t_item = 0;
2299 current_simple[current_simple_number].item_index_i = i;
2300 current_simple[current_simple_number].mapping_position = old_result -> selected_position;
2301 current_simple[current_simple_number].major_half_votes = old_result -> selected_votes;
2302 current_simple[current_simple_number].read_start_base = old_result -> confident_coverage_start;
2303
2304 current_simple_number ++;
2305 }
2306 }
2307
2308 }
2309 simple_record_numbers[is_second_read] = current_simple_number;
2310 }
2311
2312 int used_comb_buffer = 0;
2313 //calculate all combinations
2314
2315 if(global_context -> input_reads.is_paired_end_reads){
2316 for(i = 0; i < simple_record_numbers[0]; i++){
2317 for(j = 0; j < simple_record_numbers[1]; j++){
2318 int target_index;
2319 int is_PE_distance = 0, is_same_chromosome = 0, is_both_exonic_regions = 0;
2320
2321 if(max(vote_simple_1_buffer[i].major_half_votes, vote_simple_2_buffer[j].major_half_votes) < global_context->config.minimum_subread_for_first_read)continue;
2322
2323 simple_PE_and_same_chro(global_context , vote_simple_1_buffer+i, vote_simple_2_buffer+j , &is_PE_distance, &is_same_chromosome , read_len_1, read_len_2);
2324
2325 if((!is_PE_distance) && min(vote_simple_1_buffer[i].major_half_votes, vote_simple_2_buffer[j].major_half_votes) < global_context->config.minimum_subread_for_first_read)continue;
2326 if( global_context -> exonic_region_bitmap && is_same_chromosome)is_both_exonic_regions = is_pos_in_annotated_exon_regions(global_context, vote_simple_1_buffer[i].mapping_position + vote_simple_1_buffer[i].read_start_base ) && is_pos_in_annotated_exon_regions(global_context, vote_simple_2_buffer[j].mapping_position + vote_simple_2_buffer[j].read_start_base ) ;
2327
2328 int adjusted_weight;
2329
2330 if(1){
2331 if (is_both_exonic_regions && is_PE_distance) adjusted_weight = 1800;
2332 else if(is_both_exonic_regions) adjusted_weight = 1300;
2333 else if(is_PE_distance) adjusted_weight = 1300;
2334 else if(is_same_chromosome) adjusted_weight = 1000;
2335 else adjusted_weight = 800;
2336 }else{
2337 if (is_both_exonic_regions) adjusted_weight = 1300;
2338 else if(is_PE_distance) adjusted_weight = 1300;
2339 else if(is_same_chromosome) adjusted_weight = 1000;
2340 else adjusted_weight = 800;
2341 }
2342 //int adjusted_weight = is_PE_distance?1600:(is_same_chromosome?1000:500);
2343 int adjusted_votes = (vote_simple_1_buffer[i].major_half_votes + vote_simple_2_buffer[j].major_half_votes) * adjusted_weight;
2344
2345 for(target_index=0; target_index<used_comb_buffer; target_index++){
2346 if(comb_buffer[target_index].score_adj < adjusted_votes) break;
2347 }
2348
2349
2350 if(target_index < global_context -> config.max_vote_combinations){
2351 int move_i;
2352
2353 for(move_i = min(used_comb_buffer, global_context -> config.max_vote_combinations - 1) ; move_i > target_index ; move_i --)
2354 //checked: memory boundary
2355 memcpy(comb_buffer + move_i, comb_buffer + move_i - 1 , sizeof(vote_combination_t) );
2356
2357 comb_buffer[target_index].r1_loc = vote_simple_1_buffer+i;
2358 comb_buffer[target_index].r2_loc = vote_simple_2_buffer+j;
2359 comb_buffer[target_index].score_adj = adjusted_votes;
2360
2361 if(used_comb_buffer < global_context -> config.max_vote_combinations)
2362 used_comb_buffer ++;
2363 }
2364
2365 }
2366 }
2367 }
2368
2369 mapping_result_t * alignment_tmp_r1, * alignment_tmp_r2;
2370 alignment_tmp_r1 = (mapping_result_t *) topbuf -> alignment_tmp_r1;
2371 alignment_tmp_r2 = (mapping_result_t *) topbuf -> alignment_tmp_r2;
2372
2373 subjunc_result_t * junction_tmp_r2 , * junction_tmp_r1;
2374 junction_tmp_r1 = (subjunc_result_t *) topbuf -> junction_tmp_r1;
2375 junction_tmp_r2 = (subjunc_result_t *) topbuf -> junction_tmp_r2;
2376
2377 memset(junction_tmp_r1, 0, sizeof(subjunc_result_t) * global_context->config.multi_best_reads);
2378 memset(junction_tmp_r2, 0, sizeof(subjunc_result_t) * global_context->config.multi_best_reads);
2379
2380 memset(alignment_tmp_r1, 0, sizeof(mapping_result_t) * global_context->config.multi_best_reads);
2381 memset(alignment_tmp_r2, 0, sizeof(mapping_result_t) * global_context->config.multi_best_reads);
2382
2383 int alignment_res_r1_cursor = 0, alignment_res_r2_cursor = 0;
2384
2385 if(used_comb_buffer > 0){
2386 merge_sort(comb_buffer, used_comb_buffer, comb_sort_compare, comb_sort_exchange, comb_sort_merge);
2387 for(is_second_read = 0; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read++){
2388 int current_read_len = is_second_read ? read_len_2:read_len_1;
2389 char * current_read_text = is_second_read ? read_text_2:read_text_1;
2390 int current_all_subreads = is_second_read ? v2_all_subreads:v1_all_subreads;
2391 mapping_result_t * current_alignment_tmp = is_second_read?alignment_tmp_r2:alignment_tmp_r1;
2392 int * current_r_cursor = is_second_read ? &alignment_res_r2_cursor:&alignment_res_r1_cursor;
2393 int * is_fully_covered = is_second_read?&is_fully_covered_2:&is_fully_covered_1;
2394 gene_vote_t * current_vote = is_second_read?vote_2:vote_1;
2395
2396 subjunc_result_t * current_junction_tmp = NULL;
2397 if(global_context -> config.do_breakpoint_detection) current_junction_tmp = is_second_read?junction_tmp_r2:junction_tmp_r1;
2398
2399 for(i = used_comb_buffer - 1; i >=0; i--){
2400 if((* current_r_cursor) >= global_context->config.multi_best_reads)break;
2401
2402 // add the combination of comb_buffer[i] into the two mapping_result_t arrays
2403 simple_mapping_t * current_loc = is_second_read?comb_buffer[i].r2_loc:comb_buffer[i].r1_loc;
2404 assert(current_loc);
2405 unsigned int current_pos = current_loc->mapping_position;
2406
2407 int is_exist = 0;
2408 for(j = 0; j < *current_r_cursor; j++)
2409 {
2410 if(current_alignment_tmp[j].selected_position == current_pos){
2411 is_exist = 1;
2412 break;
2413 }
2414 }
2415 //SUBREADprintf("CLLL BUF %d R_%d : %u ; EXIST %d. Written into the %d-th best location\n", i, 1+is_second_read, current_loc->mapping_position, is_exist, *current_r_cursor);
2416
2417 if(!is_exist){
2418 if(current_loc -> is_vote_t_item)
2419 copy_vote_to_alignment_res(global_context, thread_context, current_alignment_tmp + (*current_r_cursor), current_junction_tmp ? current_junction_tmp + (*current_r_cursor) : NULL, current_vote, current_loc -> item_index_i, current_loc -> item_index_j, current_read_len, read_name_1, current_read_text, current_all_subreads , current_vote -> noninformative_subreads, pair_number, is_second_read, is_fully_covered);
2420 else{
2421 //checked: memory boundary
2422 memcpy(current_alignment_tmp + (*current_r_cursor), _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, current_loc -> item_index_i), sizeof(mapping_result_t));
2423 if(current_junction_tmp)
2424 //checked: memory boundary
2425 memcpy(current_junction_tmp + (*current_r_cursor), _global_retrieve_subjunc_ptr(global_context, pair_number, is_second_read, current_loc -> item_index_i), sizeof(subjunc_result_t));
2426 }
2427 (*current_r_cursor)++;
2428 }
2429 }
2430 }
2431 }else{// if the one end is not mapped at all
2432
2433 if(0 == simple_record_numbers[0])
2434 _global_retrieve_alignment_ptr(global_context, pair_number, 0, 0) -> noninformative_subreads_in_vote = vote_1 -> noninformative_subreads;
2435 if(global_context -> input_reads.is_paired_end_reads && 0 == simple_record_numbers[1])
2436 _global_retrieve_alignment_ptr(global_context, pair_number, 1, 0) -> noninformative_subreads_in_vote = vote_2 -> noninformative_subreads;
2437
2438 if(simple_record_numbers[0]>0 || simple_record_numbers[1]>0)
2439 {
2440 // copy all the simple into the mapping_result_t
2441
2442 for(is_second_read = 0; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read++)
2443 {
2444 int * current_r_cursor = is_second_read ? &alignment_res_r2_cursor:&alignment_res_r1_cursor;
2445
2446 int current_read_len = is_second_read ? read_len_2:read_len_1;
2447 char * current_read_text = is_second_read ? read_text_2:read_text_1;
2448 int current_all_subreads = is_second_read ? v2_all_subreads:v1_all_subreads;
2449 mapping_result_t * current_alignment_tmp = is_second_read?alignment_tmp_r2:alignment_tmp_r1;
2450 gene_vote_t * current_vote = is_second_read?vote_2:vote_1;
2451 int * is_fully_covered = is_second_read?&is_fully_covered_2:&is_fully_covered_1;
2452
2453 subjunc_result_t * current_junction_tmp = NULL;
2454 if(global_context -> config.do_breakpoint_detection) current_junction_tmp = is_second_read?junction_tmp_r2:junction_tmp_r1;
2455
2456 for(i = 0; i < simple_record_numbers[is_second_read]; i++){
2457
2458 if((*current_r_cursor) >= global_context->config.multi_best_reads)break;
2459
2460 simple_mapping_t * current_loc = is_second_read?vote_simple_2_buffer+i:vote_simple_1_buffer+i;
2461
2462 if(current_loc -> major_half_votes < global_context->config.minimum_subread_for_first_read) continue;
2463 unsigned int current_pos = current_loc->mapping_position;
2464
2465 int is_exist = 0;
2466 for(j = 0; j < *current_r_cursor; j++)
2467 {
2468 if(current_alignment_tmp[j].selected_position == current_pos){
2469 is_exist = 1;
2470 break;
2471 }
2472 }
2473 if(!is_exist){
2474 if(current_loc -> is_vote_t_item)
2475 copy_vote_to_alignment_res(global_context, thread_context, current_alignment_tmp + (*current_r_cursor), current_junction_tmp ? current_junction_tmp + (*current_r_cursor): NULL, current_vote, current_loc -> item_index_i, current_loc -> item_index_j, current_read_len, read_name_1, current_read_text, current_all_subreads , current_vote -> noninformative_subreads, pair_number, is_second_read, is_fully_covered);
2476 else{
2477 //checked:boundary
2478 memcpy(current_alignment_tmp + (*current_r_cursor), _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, current_loc -> item_index_i), sizeof(mapping_result_t));
2479 if(current_junction_tmp)
2480 //checked:boundary
2481 memcpy(current_junction_tmp + (*current_r_cursor), _global_retrieve_subjunc_ptr(global_context, pair_number, is_second_read, current_loc -> item_index_i), sizeof(subjunc_result_t));
2482 }
2483
2484 (*current_r_cursor)++;
2485 }
2486 }
2487 }
2488 }
2489 }
2490
2491 for(is_second_read = 0; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read++){
2492 int * current_r_cursor = is_second_read ? &alignment_res_r2_cursor:&alignment_res_r1_cursor;
2493 if((*current_r_cursor) > global_context->config.multi_best_reads){
2494 SUBREADprintf("ERROR: multi_best_locations excessed the boundary: %d > %d\n", (*current_r_cursor), global_context->config.multi_best_reads);
2495 return -1;
2496 }
2497 }
2498
2499 for(is_second_read = 0; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read++)
2500 {
2501 int * current_r_cursor = is_second_read ? &alignment_res_r2_cursor:&alignment_res_r1_cursor;
2502 mapping_result_t * current_alignment_tmp = is_second_read?alignment_tmp_r2:alignment_tmp_r1;
2503 subjunc_result_t * current_junction_tmp = NULL;
2504
2505 if(global_context -> config.do_breakpoint_detection) current_junction_tmp = is_second_read?junction_tmp_r2:junction_tmp_r1;
2506
2507 for(i = 0; i < global_context->config.multi_best_reads ; i++){
2508 mapping_result_t * cur_res = _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, i);
2509 if( i < (*current_r_cursor))
2510 memcpy(cur_res, current_alignment_tmp + i, sizeof(mapping_result_t));
2511 else cur_res -> selected_votes = 0;
2512
2513 if(global_context -> config.do_breakpoint_detection) {
2514 subjunc_result_t * cur_junc = _global_retrieve_subjunc_ptr(global_context, pair_number, is_second_read, i);
2515 if(i < (*current_r_cursor))
2516 memcpy(cur_junc, current_junction_tmp + i , sizeof(subjunc_result_t));
2517 else cur_junc -> minor_votes = 0;
2518
2519 }
2520 }
2521 }
2522
2523 return 0;
2524 }
2525
2526
2527 // seq1 and seq2 must be on the same strand!
2528 // (seq2 is reversed)
2529 // The second half of seq1 MUST BE the same as the first half of seq2 if the two reads have an overlapping part.
is_gapped_as_funky(global_context_t * global_context,char * rname1,char * chr1,unsigned int pos1,int rlen1,int is_1_negative,char * cigar1,char * seq1,char * rname2,char * chr2,unsigned int pos2,int rlen2,int is_2_negative,char * cigar2,char * seq2,int tlen_removed_intron)2530 int is_gapped_as_funky(global_context_t * global_context, char * rname1, char * chr1, unsigned int pos1, int rlen1, int is_1_negative, char * cigar1, char * seq1, char * rname2, char * chr2, unsigned int pos2, int rlen2, int is_2_negative, char * cigar2, char * seq2, int tlen_removed_intron)
2531 {
2532 /*
2533 if(tlen_removed_intron >= rlen1 + rlen2) return 1; // may be gapped.
2534 int try_overlapping;
2535
2536 int best_matched_bases = 0;
2537 int best_overlapping_len = -1;
2538
2539 int assumed_overlapping = rlen1+rlen2-tlen_removed_intron;
2540 for(try_overlapping = 0; try_overlapping < min(rlen1, rlen2); try_overlapping++)
2541 {
2542 int r1_start = rlen1 - try_overlapping;
2543 int r2_end = try_overlapping;
2544 int xk1;
2545 int all_matched = 0, all_mismatched = 0;
2546 for(xk1 = 0; xk1 < r2_end; xk1++){
2547 char r1ch = seq1[r1_start + xk1];
2548 char r2ch = seq2[xk1];
2549 if(r1ch==r2ch) all_matched++;
2550 else all_mismatched++;
2551 }
2552
2553 if(all_mismatched <= 1 && try_overlapping == assumed_overlapping){
2554 // the assumed overlapping length is good enough.
2555 return 0;
2556 }
2557 if(all_mismatched <= 1 && all_matched > best_matched_bases){
2558 best_overlapping_len = try_overlapping;
2559 best_matched_bases = all_matched;
2560 }
2561 }
2562
2563 if(best_overlapping_len <= 0)return 0;
2564 return assumed_overlapping
2565 */
2566 return tlen_removed_intron > 600;
2567 }
2568
2569 // the positions are not offset by adding the first soft clipping length. I.e., pos1 and pos2 may be smaller than those in the SAM files.
2570 // seq1 and seq2 must be on the same strand!
2571 // (seq2 is reversed)
is_funky_fragment(global_context_t * global_context,char * rname1,char * chr1,unsigned int pos1,int rlen1,int is_1_negative,char * cigar1,char * seq1,char * rname2,char * chr2,unsigned int pos2,int rlen2,int is_2_negative,char * cigar2,char * seq2,int tlen_removed_intron)2572 int is_funky_fragment(global_context_t * global_context, char * rname1, char * chr1, unsigned int pos1, int rlen1, int is_1_negative, char * cigar1, char * seq1, char * rname2, char * chr2, unsigned int pos2, int rlen2, int is_2_negative, char * cigar2, char * seq2, int tlen_removed_intron)
2573 {
2574 long long llraw_tlen = pos1;
2575 llraw_tlen -= pos2;
2576 if(llraw_tlen <0)
2577 llraw_tlen = -llraw_tlen;
2578 unsigned int raw_tlen = llraw_tlen;
2579 raw_tlen += max(rlen2, rlen1);
2580
2581 //SUBREADprintf("CHRS=%p,%p, POS=%u,%u, RTLEN=%u\n", chr1, chr2, pos1, pos2, raw_tlen);
2582
2583 if(chr1 != chr2) raw_tlen = 0;
2584
2585 // note: the two pointers can be compared because they should be derived from the offset table.
2586 // Each chromosome name should have one and only one distinct char * pointer.
2587 if(chr1 == chr2 && raw_tlen <= global_context -> config.maximum_translocation_length && is_2_negative == is_1_negative)
2588 {
2589 if(is_gapped_as_funky(global_context, rname1, chr1, pos1, rlen1, is_1_negative, cigar1, seq1, rname2, chr2, pos2, rlen2, is_2_negative, cigar2, seq2, tlen_removed_intron))
2590 return FUNKY_FRAGMENT_A;
2591 else return NOT_FUNKY;
2592 }
2593 else if( chr1 == chr2 && raw_tlen <= global_context -> config.maximum_translocation_length && is_2_negative != is_1_negative )
2594 return FUNKY_FRAGMENT_DE;
2595 else if( chr1 != chr2 || raw_tlen > global_context -> config.maximum_translocation_length)
2596 return FUNKY_FRAGMENT_BC;
2597
2598 return NOT_FUNKY;
2599 }
2600
// Entry point for turning the voting tables of a read (or read pair) into
// candidate alignments. It forwards every argument unchanged to the top-K
// implementation and hands back its status.
//
// A cluster-based variant (process_voting_junction_PE_juncs) takes the same
// arguments; it was for testing cluster-based junction detection only and is
// not called here.
int process_voting_junction(global_context_t * global_context, thread_context_t * thread_context, subread_read_number_t pair_number, gene_vote_t * vote_1, gene_vote_t * vote_2, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, int read_len_1, int read_len_2, int is_negative_strand, gene_vote_number_t v1_all_subreads, gene_vote_number_t v2_all_subreads)
{
	int topk_status = process_voting_junction_PE_topK(global_context, thread_context, pair_number, vote_1, vote_2, read_name_1, read_name_2, read_text_1, read_text_2, read_len_1, read_len_2, is_negative_strand, v1_all_subreads, v2_all_subreads);

	return topk_status;
}
2609
2610
explain_read(global_context_t * global_context,thread_context_t * thread_context,realignment_result_t * final_realignments,subread_read_number_t pair_number,int read_len,char * read_name,char * read_text,char * qual_text,int is_second_read,int best_read_id,int is_negative_strand)2611 unsigned int explain_read(global_context_t * global_context, thread_context_t * thread_context, realignment_result_t * final_realignments, subread_read_number_t pair_number, int read_len, char * read_name , char *read_text, char *qual_text, int is_second_read, int best_read_id, int is_negative_strand)
2612 {
2613 explain_context_t explain_context;
2614 mapping_result_t *current_result = _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, best_read_id);
2615
2616 if(global_context -> config.do_big_margin_filtering_for_reads)
2617 {
2618 int current_repeated_times = is_ambiguous_voting(global_context, pair_number, is_second_read, current_result->selected_votes, current_result->confident_coverage_start, current_result->confident_coverage_end, read_len, (current_result->result_flags & CORE_IS_NEGATIVE_STRAND)?1:0);
2619 if( global_context -> config.do_big_margin_filtering_for_reads && current_repeated_times>1) return 0;
2620 }
2621
2622 memset(&explain_context,0, sizeof(explain_context_t));
2623 explain_context.full_read_len = read_len;
2624 explain_context.is_fully_covered = current_result -> is_fully_covered ;
2625 explain_context.full_read_text = read_text;
2626 explain_context.full_qual_text = qual_text;
2627 explain_context.read_name = read_name;
2628 explain_context.is_confirmed_section_negative_strand = is_negative_strand ;
2629 explain_context.pair_number = pair_number;
2630 explain_context.is_second_read = is_second_read ;
2631 explain_context.best_read_id = best_read_id;
2632 explain_context.total_tries = 0;
2633
2634 if(0 && FIXLENstrcmp("simulated.24700032", explain_context.read_name)==0)SUBREADprintf("BBFINAL %s SEL_POS=%u COV=%d - %d\n", explain_context.read_name, current_result -> selected_position, current_result -> confident_coverage_start, current_result -> confident_coverage_end);
2635
2636 unsigned int back_search_tail_position,front_search_start_position;
2637 unsigned short back_search_read_tail, front_search_read_start;
2638
2639
2640 back_search_read_tail = min(explain_context.full_read_len , current_result -> confident_coverage_end );//- 5;
2641 back_search_tail_position = current_result -> selected_position + back_search_read_tail + current_result -> indels_in_confident_coverage;
2642
2643 //if( back_search_read_tail > 102)
2644 //SUBREADprintf("MAX back_search_read_tail : MIN %d , %d\n", explain_context.full_read_len , current_result -> confident_coverage_end);
2645
2646 explain_context.tmp_search_junctions[0].read_pos_end = back_search_read_tail;
2647 explain_context.tmp_search_junctions[0].abs_offset_for_start = back_search_tail_position;
2648
2649 explain_context.all_back_alignments = 0;
2650 explain_context.tmp_search_sections = 0;
2651 explain_context.best_indel_penalty =0;
2652 explain_context.best_matching_bases = -9999;
2653 explain_context.second_best_matching_bases = -9999;
2654 explain_context.tmp_indel_penalty = 0;
2655 explain_context.tmp_total_matched_bases = 0;
2656 explain_context.is_currently_tie = 0;
2657 explain_context.best_is_complex = 0;
2658 explain_context.best_support_as_simple = 0;
2659 explain_context.best_min_unsupport_as_simple = 0;
2660 explain_context.tmp_support_as_simple = 0;
2661 explain_context.tmp_min_support_as_complex = 999999;
2662 explain_context.tmp_min_unsupport = 999999;
2663 explain_context.tmp_is_pure_donor_found_explain = 1;
2664 explain_context.best_is_pure_donor_found_explain = 0;
2665
2666 if(1) {
2667 front_search_read_start = back_search_read_tail > 8? back_search_read_tail - 8:0;
2668 front_search_start_position = back_search_tail_position>8?back_search_tail_position - 8:0;
2669 } else {
2670 //front_search_read_start = current_result -> confident_coverage_start + 5;
2671 front_search_read_start = min(explain_context.full_read_len , current_result -> confident_coverage_end);
2672 if(front_search_read_start > 2*global_context -> config.realignment_minimum_variant_distance) front_search_read_start -= 2*global_context -> config.realignment_minimum_variant_distance;
2673 else front_search_read_start = 0;
2674 front_search_start_position = current_result -> selected_position + front_search_read_start;
2675 }
2676
2677 search_events_to_back(global_context, thread_context, &explain_context, read_text , qual_text, back_search_tail_position , back_search_read_tail, 0, 0, 1);
2678 int back_penalty = explain_context.best_indel_penalty;
2679
2680 //int is_backsearch_tie = explain_context.is_currently_tie;
2681 int back_search_matches_diff = -9999;
2682
2683 /*
2684
2685
2686 if(explain_context.back_search_confirmed_sections>0)
2687 {
2688
2689 short last_section_length = explain_context.back_search_junctions[0].read_pos_end - explain_context.back_search_junctions[0].read_pos_start;
2690
2691 front_search_read_start = explain_context.back_search_junctions[0].read_pos_start;
2692 front_search_start_position = explain_context.back_search_junctions[0].abs_offset_for_start - last_section_length;
2693
2694 int last_sec = explain_context.back_search_confirmed_sections-1;
2695
2696 current_result -> selected_position = explain_context.back_search_junctions[last_sec].abs_offset_for_start - explain_context.back_search_junctions[last_sec].read_pos_end + explain_context.back_search_junctions[last_sec].read_pos_start;
2697 back_search_matches_diff = explain_context.best_matching_bases - explain_context.second_best_matching_bases;
2698
2699 if(0 && memcmp(explain_context.read_name, TTTSNAME, 26)==0)
2700 {
2701 int xk1;
2702 for(xk1 = 0; xk1 < explain_context.back_search_confirmed_sections; xk1++)
2703 {
2704 short pr_section_length = explain_context.back_search_junctions[xk1].read_pos_end - explain_context.back_search_junctions[xk1].read_pos_start;
2705 if(explain_context.back_search_junctions[xk1].event_after_section)
2706 SUBREADprintf("BACK_SECTIONS [%d], START IS %u; RPSS=%d ; RPED=%d ; LEN=%d ; EVENT is %u %u INDEL=%d\n", xk1, explain_context.back_search_junctions[xk1].abs_offset_for_start, explain_context.back_search_junctions[xk1].read_pos_start, explain_context.back_search_junctions[last_sec].read_pos_end, pr_section_length, explain_context.back_search_junctions[xk1].event_after_section->event_small_side, explain_context.back_search_junctions[xk1].event_after_section->event_large_side, explain_context.back_search_junctions[xk1].event_after_section->indel_length);
2707 else SUBREADprintf("BACK_SECTIONS [%d], START IS %u; RPSS=%d ; RPED=%d ; LEN=%d\n", xk1, explain_context.back_search_junctions[xk1].abs_offset_for_start, explain_context.back_search_junctions[xk1].read_pos_start, explain_context.back_search_junctions[last_sec].read_pos_end, pr_section_length);
2708 }
2709 }
2710
2711 //SUBREADprintf("DBI:%d - %d;\n", explain_context.best_matching_bases , explain_context.second_best_matching_bases);
2712 }
2713 else
2714 */
2715 explain_context.all_front_alignments = 0;
2716 explain_context.tmp_search_sections = 0;
2717 explain_context.best_indel_penalty = 0;
2718 explain_context.best_matching_bases = -9999;
2719 explain_context.second_best_matching_bases = -9999;
2720 explain_context.tmp_total_matched_bases = 0;
2721 explain_context.tmp_indel_penalty = 0;
2722
2723 explain_context.is_currently_tie = 0;
2724 explain_context.best_is_complex = 0;
2725 explain_context.best_support_as_simple = 0;
2726 explain_context.best_min_unsupport_as_simple = 0;
2727 explain_context.tmp_support_as_simple = 0;
2728 explain_context.tmp_min_support_as_complex = 999999;
2729 explain_context.tmp_min_unsupport = 999999;
2730 explain_context.tmp_is_pure_donor_found_explain = 1;
2731 explain_context.best_is_pure_donor_found_explain = 0;
2732
2733 memset(explain_context.tmp_search_junctions, 0, sizeof(perfect_section_in_read_t ) * MAX_EVENTS_IN_READ);
2734
2735 explain_context.tmp_search_junctions[0].read_pos_start = front_search_read_start;
2736 explain_context.tmp_search_junctions[0].abs_offset_for_start = front_search_start_position;
2737
2738 if(0 && FIXLENstrcmp("R000002689",explain_context.read_name ) == 0)
2739 SUBREADprintf("Enter F_SEARCH: start=%u read_pos=%d REMAIN=%d\n", front_search_start_position, front_search_read_start, read_len - front_search_read_start );
2740
2741
2742 short search_remain = read_len - front_search_read_start;
2743 //#warning "SUBREAD_151 REMOVE THE ASSERT! "
2744 //if(search_remain >= 102)SUBREADprintf("FATAL: RLEN=%d, SEARCH=%d\n", read_len, front_search_read_start);
2745 //assert( search_remain < 102 );
2746
2747 search_events_to_front(global_context, thread_context, &explain_context, read_text + front_search_read_start, qual_text + front_search_read_start, front_search_start_position, search_remain , 0, 0, 1);
2748 if(0 && FIXLENstrcmp("R_chr901_932716_91M1D9M",explain_context.read_name ) == 0)
2749 SUBREADprintf("F_SEARCH has found %d result sets\n", explain_context.all_front_alignments);
2750
2751 explain_context.best_indel_penalty += back_penalty;
2752 //int is_frontsearch_tie = explain_context.is_currently_tie;
2753
2754 //SUBREADprintf("DFI:%d - %d;\n", explain_context.best_matching_bases , explain_context.second_best_matching_bases);
2755 int front_search_matches_diff = explain_context.best_matching_bases - explain_context.second_best_matching_bases;
2756 explain_context.best_second_match_diff = front_search_matches_diff + back_search_matches_diff;
2757
2758 int realignment_number = finalise_explain_CIGAR(global_context, thread_context, &explain_context, final_realignments);
2759
2760 if(0 && FIXLENstrcmp("SRR3439488.572382", explain_context.read_name)==0)
2761 SUBREADprintf("TRYING_REALIGN:%s:%u\n", explain_context.read_name, explain_context.total_tries);
2762
2763 return realignment_number;
2764 }
2765
2766
// Debugging aid: prints a header line and three rows, each test_len characters
// wide, describing one soft-clipping decision.
//   row 1: '-' where the read base equals the reference base at
//          mapped_pos + index, '#' where it differs;
//   row 2: '>' (tail search) or '<' (head search) under index search_center;
//   row 3: 'R' under the last number_of_clipped bases for a tail search,
//          'L' under the first number_of_clipped bases for a head search.
void debug_clipping(global_context_t * global_context, thread_context_t * thread_context, gene_value_index_t * current_value_index, char * read_text, unsigned int mapped_pos, int test_len, int search_to_tail, int search_center, int number_of_clipped, char * read_name){
	int col;

	SUBREADprintf("\n %s CENTER=%d, CLIPPED=%d, TLEN=%d %s\n", read_name, search_center, number_of_clipped, test_len, search_to_tail?">>>>":"<<<<");

	// Row 1: match / mismatch map against the reference.
	col = 0;
	while(col < test_len){
		char reference_base = gvindex_get(current_value_index, col + mapped_pos);
		if(reference_base == read_text[col]){
			SUBREADprintf("%c", '-');
		}else{
			SUBREADprintf("%c", '#');
		}
		col++;
	}
	SUBREADprintf("\n");

	// Row 2: arrow under the search centre.
	for(col = 0; col < test_len; col++){
		if(col != search_center){
			SUBREADprintf(" ");
			continue;
		}
		SUBREADprintf("%c", search_to_tail?'>':'<');
	}
	SUBREADprintf("\n");

	// Row 3: which bases were clipped.
	for(col = 0; col < test_len; col++){
		if(search_to_tail){
			if(col >= test_len - number_of_clipped){
				SUBREADprintf("R");
			}else{
				SUBREADprintf(" ");
			}
		}else{
			if(col <= number_of_clipped - 1){
				SUBREADprintf("L");
			}else{
				SUBREADprintf(" ");
			}
		}
	}
	SUBREADprintf("\n");
}
2802
2803 #define SOFT_CLIPPING_WINDOW_SIZE 5
2804 #define SOFT_CLIPPING_MAX_ERROR 1
2805
2806 // it returns the number of bases to be clipped off.
find_soft_clipping(global_context_t * global_context,thread_context_t * thread_context,gene_value_index_t * current_value_index,char * read_text,unsigned int mapped_pos,int test_len,int search_to_tail,int search_center)2807 int find_soft_clipping(global_context_t * global_context, thread_context_t * thread_context, gene_value_index_t * current_value_index, char * read_text, unsigned int mapped_pos, int test_len, int search_to_tail, int search_center)
2808 {
2809 int base_in_window = 0;
2810 int added_base_index = 0, removed_base_index = 0;
2811 int search_start = 0;
2812 int matched_in_window = SOFT_CLIPPING_WINDOW_SIZE;
2813 int last_matched_base_index = -1, delta;
2814
2815 if(search_to_tail)
2816 {
2817 if(search_center < 0)
2818 search_start = 0;
2819 else if(search_center >= test_len)
2820 // SHOULD NOT HAPPEN!!!
2821 search_start = test_len - 1;
2822 else search_start = search_center - 1;
2823
2824 delta = 1;
2825 }else{
2826 if(search_center < 0)
2827 // SHOULD NOT HAPPEN!!!
2828 search_start = 0;
2829 else if(search_center >= test_len)
2830 search_start = test_len - 1;
2831 else search_start = search_center + 1;
2832
2833 delta = -1;
2834 }
2835
2836 for(added_base_index = search_start; added_base_index >= 0 && added_base_index < test_len; added_base_index += delta)
2837 {
2838 // add the new base
2839 char reference_base = gvindex_get(current_value_index, added_base_index + mapped_pos);
2840
2841 if(0){
2842 char outpos1[100];
2843 absoffset_to_posstr(global_context, added_base_index + mapped_pos, outpos1);
2844 SUBREADprintf("CHMAT [%s] %s (%u) ref:read = %c:%c\n", search_to_tail?"T":"H" ,outpos1, added_base_index + mapped_pos, reference_base, read_text[added_base_index]);
2845 }
2846 int added_is_matched = (reference_base == read_text[added_base_index]);
2847 matched_in_window += added_is_matched;
2848 if(added_is_matched)
2849 last_matched_base_index = added_base_index;
2850
2851 base_in_window ++;
2852
2853 if(base_in_window > SOFT_CLIPPING_WINDOW_SIZE){
2854 removed_base_index = added_base_index - delta * SOFT_CLIPPING_WINDOW_SIZE;
2855 char removing_ref_base = gvindex_get(current_value_index, removed_base_index + mapped_pos);
2856 matched_in_window -= (removing_ref_base == read_text[removed_base_index]);
2857 }else{
2858 matched_in_window --;
2859 }
2860
2861 if(matched_in_window < SOFT_CLIPPING_WINDOW_SIZE - SOFT_CLIPPING_MAX_ERROR){
2862 // clip, bondary is the last matched base.
2863 if(search_to_tail){
2864 if(last_matched_base_index < 0) return test_len - search_start;
2865 else return test_len - last_matched_base_index - 1;
2866 }else{
2867 if(last_matched_base_index >= 0) return last_matched_base_index;
2868 else return search_start - 1;
2869 }
2870 }
2871 }
2872
2873 if(last_matched_base_index < 0) return test_len;
2874
2875 if(search_to_tail){
2876 if(last_matched_base_index < 0) return test_len - search_start;
2877 else return test_len - last_matched_base_index - 1;
2878 }else{
2879 if(last_matched_base_index >= 0) return last_matched_base_index;
2880 else return search_start - 1;
2881 }
2882 }
2883
2884 // read_head_abs_offset is the first WANTED base in read.
2885 // If the first section in read is reversed, read_head_abs_offset is the LAST WANTED bases in this section. (the abs offset of the first base in the section is actually larger than read_head_abs_offset)
final_CIGAR_quality(global_context_t * global_context,thread_context_t * thread_context,char * read_text,char * qual_text,int read_len,char * cigar_string,unsigned long read_head_abs_offset,int is_read_head_reversed,int * mismatched_bases,int covered_start,int covered_end,char * read_name,int * non_clipped_length,int * total_indel_length,int * matched_bases,int * chromosomal_length,int * full_section_clipped)2886 int final_CIGAR_quality(global_context_t * global_context, thread_context_t * thread_context, char * read_text, char * qual_text, int read_len, char * cigar_string, unsigned long read_head_abs_offset, int is_read_head_reversed, int * mismatched_bases, int covered_start, int covered_end, char * read_name, int * non_clipped_length, int *total_indel_length, int * matched_bases, int * chromosomal_length, int * full_section_clipped)
2887 {
2888 int cigar_cursor = 0;
2889 int read_cursor = 0;
2890 unsigned int current_perfect_section_abs = read_head_abs_offset;
2891 int rebuilt_read_len = 0, total_insertion_length = 0;
2892 float all_matched_bases = 0;
2893 gene_value_index_t * current_value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
2894 int current_reversed = is_read_head_reversed;
2895 int all_mismatched = 0;
2896 int is_First_M = 1, is_wrong_cigar = 0;
2897 int head_soft_clipped = -1, tail_soft_clipped = -1;
2898 unsigned int tmp_int = 0;
2899
2900 //SUBREADprintf("Coverage : %d ~ %d\n", covered_start, covered_end);
2901
2902 if(0){
2903 char posout1[100];
2904 int chro_max = get_offset_maximum_chro_pos(global_context,thread_context,read_head_abs_offset);
2905 absoffset_to_posstr(global_context, read_head_abs_offset, posout1);
2906 SUBREADprintf("READ %s : mapped to %s ; max_pos=%d\n", read_name, posout1, chro_max);
2907 }
2908
2909 while(1)
2910 {
2911 char nch = cigar_string[cigar_cursor++];
2912 if(!nch)break;
2913 if(isdigit(nch))
2914 tmp_int = tmp_int*10+(nch-'0');
2915 else{
2916 if(tmp_int == 0)is_wrong_cigar = 1;
2917 if(is_wrong_cigar) break;
2918 if(nch == 'M' || nch == 'S')
2919 {
2920 char *qual_text_cur;
2921 if(qual_text[0])qual_text_cur = qual_text+read_cursor;
2922 else qual_text_cur = NULL;
2923
2924 float section_qual;
2925
2926 int is_Last_M = (cigar_string[cigar_cursor]==0);
2927 int has_clipping_this_section_head = 0, has_clipping_this_section_tail = 0;
2928 char * reversed_first_section_text = NULL;
2929
2930 if(0){
2931 int is_head_in_chro = get_offset_maximum_chro_pos(global_context,thread_context, current_perfect_section_abs );
2932 int is_end_in_chro = get_offset_maximum_chro_pos(global_context,thread_context, current_perfect_section_abs + tmp_int );
2933 char posout1[100];
2934 char posout2[100];
2935 int chro_max = get_offset_maximum_chro_pos(global_context,thread_context, current_perfect_section_abs );
2936 absoffset_to_posstr(global_context, current_perfect_section_abs, posout1);
2937 absoffset_to_posstr(global_context, current_perfect_section_abs + tmp_int, posout2);
2938 SUBREADprintf(" %dM SECTION : mapped to %s ~ %s ; max_pos=%d ; Hin=%d, Ein=%d\n", tmp_int, posout1, posout2, chro_max, is_head_in_chro, is_end_in_chro);
2939 SUBREADprintf(" %dM SECTION : Hin=%d, Ein=%d\n", tmp_int, is_head_in_chro, is_end_in_chro);
2940 }
2941
2942 // find "J" sections if it is the first M
2943 if(is_First_M && global_context -> config.show_soft_cliping)
2944 {
2945 int adj_coverage_start = covered_start - read_cursor;
2946
2947 if(current_reversed)
2948 {
2949 reversed_first_section_text = malloc(MAX_READ_LENGTH);
2950 memcpy(reversed_first_section_text, read_text, tmp_int);
2951 reverse_read(reversed_first_section_text, tmp_int, global_context->config.space_type);
2952
2953 head_soft_clipped = find_soft_clipping(global_context, thread_context, current_value_index, reversed_first_section_text, current_perfect_section_abs, tmp_int, 1, 0);
2954 }
2955 else
2956 head_soft_clipped = find_soft_clipping(global_context, thread_context, current_value_index, read_text, current_perfect_section_abs, tmp_int, 0, adj_coverage_start);
2957 //SUBREADprintf("SSHEAD:%d\n", head_soft_clipped);
2958
2959 if(head_soft_clipped == tmp_int){
2960 (*full_section_clipped) = 1;
2961 head_soft_clipped = 0;
2962 }
2963 else has_clipping_this_section_head = 1;
2964
2965 if(has_clipping_this_section_head){
2966 if( tmp_int - head_soft_clipped < 3 && head_soft_clipped > 1 ) (*full_section_clipped) = 1;
2967 }
2968
2969 if(reversed_first_section_text)
2970 free(reversed_first_section_text);
2971 reversed_first_section_text = NULL;
2972 }
2973 if(is_Last_M && global_context -> config.show_soft_cliping)
2974 {
2975 int adj_coverage_end = covered_end - read_cursor;
2976
2977 if(current_reversed)
2978 {
2979 reversed_first_section_text = malloc(MAX_READ_LENGTH);
2980 // checked: boundary
2981 memcpy(reversed_first_section_text, read_text + read_cursor, tmp_int);
2982 reverse_read(reversed_first_section_text, tmp_int, global_context->config.space_type);
2983 tail_soft_clipped = find_soft_clipping(global_context, thread_context, current_value_index, reversed_first_section_text, current_perfect_section_abs, tmp_int, 0, tmp_int);
2984 }
2985 else
2986 tail_soft_clipped = find_soft_clipping(global_context, thread_context, current_value_index, read_text + read_cursor, current_perfect_section_abs, tmp_int, 1, adj_coverage_end);
2987
2988 if(1 && FIXLENstrcmp("NS500643:556:HGTMTBGXB:4:13403:18179:8012", read_name)==0)
2989 SUBREADprintf("SSTAIL:%d\n", tail_soft_clipped);
2990
2991 if(1 && tail_soft_clipped == tmp_int){
2992 tail_soft_clipped = 0;
2993 if(full_section_clipped)(*full_section_clipped) = 1;
2994 } else has_clipping_this_section_tail = 1;
2995
2996 if( has_clipping_this_section_tail ){
2997 if(tmp_int - tail_soft_clipped < 3 && tail_soft_clipped > 1) (*full_section_clipped) = 1;
2998 }
2999
3000 if(reversed_first_section_text)
3001 free(reversed_first_section_text);
3002 }
3003
3004 if(is_Last_M && is_First_M && tail_soft_clipped+head_soft_clipped >= tmp_int-1)
3005 {
3006 head_soft_clipped=0;
3007 tail_soft_clipped=0;
3008 }
3009
3010 int mismatch_calculation_start = has_clipping_this_section_head?head_soft_clipped:0;
3011 int mismatch_calculation_end = has_clipping_this_section_tail?tail_soft_clipped:0;
3012
3013 if(global_context -> config.space_type == GENE_SPACE_COLOR)
3014 section_qual = match_base_quality_cs(current_value_index, read_text+read_cursor, current_perfect_section_abs, qual_text_cur, tmp_int, global_context->config.phred_score_format , mismatched_bases, &all_mismatched, global_context -> config.high_quality_base_threshold, mismatch_calculation_start, mismatch_calculation_end);
3015 else
3016 section_qual = match_base_quality(current_value_index, read_text+read_cursor, current_perfect_section_abs, qual_text_cur, tmp_int, current_reversed, global_context->config.phred_score_format , mismatched_bases, &all_mismatched, global_context -> config.high_quality_base_threshold, mismatch_calculation_start, mismatch_calculation_end);
3017 all_matched_bases += section_qual;
3018 rebuilt_read_len += tmp_int;
3019 is_First_M=0;
3020
3021 read_cursor += tmp_int;
3022
3023 //move to the NEXT UNWANTED ABS OFFSET.
3024 if(current_reversed)
3025 current_perfect_section_abs --;
3026 else
3027 current_perfect_section_abs += tmp_int;
3028
3029
3030 }
3031 else if(nch == 'I')
3032 {
3033 rebuilt_read_len += tmp_int;
3034 read_cursor += tmp_int;
3035
3036 all_matched_bases += tmp_int;
3037 total_indel_length += tmp_int;
3038 total_insertion_length += tmp_int;
3039 }
3040 else if(nch == 'D')
3041 {
3042 total_indel_length ++;
3043 if(!current_reversed)
3044 current_perfect_section_abs += tmp_int;
3045 }
3046 else if(tolower(nch) == 'n')
3047 {
3048 total_indel_length ++;
3049 current_perfect_section_abs += tmp_int;
3050 if(nch == 'n') current_reversed = !current_reversed;
3051 }
3052 else if(tolower(nch) == 'b')
3053 {
3054 total_indel_length ++;
3055 current_perfect_section_abs -= tmp_int;
3056 if(nch == 'b') current_reversed = !current_reversed;
3057 }
3058
3059 if(read_cursor>MAX_READ_LENGTH){
3060 SUBREADprintf("ERROR: Cigar section longer than read length: %d >= %d, '%s'\n", tmp_int , MAX_READ_LENGTH, cigar_string);
3061 is_wrong_cigar = 1;
3062 }
3063
3064 tmp_int = 0;
3065 }
3066 }
3067
3068 int my_non_clipped_length = read_len;
3069 my_non_clipped_length -= max(0,tail_soft_clipped);
3070 my_non_clipped_length -= max(0,head_soft_clipped);
3071
3072 //#warning " ========== COMMENT THIS LINE !! ========="
3073 //printf("QCR ALL MM=%d, RBLEN=%d, MAPPED_LEN=%d ; CIGAR=%s\n", all_mismatched, rebuilt_read_len , my_non_clipped_length, cigar_string);
3074
3075 if(is_wrong_cigar || rebuilt_read_len != read_len || my_non_clipped_length < global_context->config.min_mapped_fraction){
3076 (*mismatched_bases)=99999;
3077 all_matched_bases = 0;
3078 sprintf(cigar_string, "%dM", read_len);
3079 }
3080 else if((head_soft_clipped>0 || tail_soft_clipped>0))
3081 {
3082 char new_cigar_tmp[120];
3083 is_First_M=1;
3084 new_cigar_tmp[0]=0;
3085 cigar_cursor = 0;
3086 while(1)
3087 {
3088 char nch = cigar_string[cigar_cursor++];
3089
3090 if(!nch)break;
3091 if(isdigit(nch))
3092 tmp_int = tmp_int*10+(nch-'0');
3093 else{
3094 char cigar_piece [30];
3095 cigar_piece[0]=0;
3096
3097 if(nch == 'M')
3098 {
3099 char cigar_tiny [12];
3100 int is_Last_M = (cigar_string[cigar_cursor]==0);
3101 if(is_First_M && head_soft_clipped>0)
3102 {
3103 tmp_int -= head_soft_clipped;
3104 sprintf(cigar_tiny,"%dS",head_soft_clipped);
3105 strcat(cigar_piece, cigar_tiny);
3106 }
3107 if(is_Last_M && tail_soft_clipped>0)
3108 {
3109 tmp_int -= tail_soft_clipped;
3110 }
3111 sprintf(cigar_tiny,"%dM",tmp_int);
3112 strcat(cigar_piece, cigar_tiny);
3113 if(is_Last_M && tail_soft_clipped>0)
3114 {
3115 sprintf(cigar_tiny,"%dS",tail_soft_clipped);
3116 strcat(cigar_piece, cigar_tiny);
3117 }
3118 is_First_M = 0;
3119 }
3120 else
3121 {
3122 sprintf(cigar_piece, "%u%c", tmp_int, nch);
3123 }
3124
3125 strcat(new_cigar_tmp, cigar_piece);
3126 tmp_int = 0;
3127 }
3128 }
3129
3130 if(1 && FIXLENstrcmp("NS500643:556:HGTMTBGXB:4:13403:18179:8012", read_name)==0)
3131 SUBREADprintf("NEW_CIGAR_2 : %s\n", new_cigar_tmp);
3132 strcpy(cigar_string, new_cigar_tmp);
3133 }
3134
3135 if((*mismatched_bases) != 99999)
3136 (*mismatched_bases) = all_mismatched;
3137
3138 (*non_clipped_length) = my_non_clipped_length;
3139 (*matched_bases) = my_non_clipped_length - all_mismatched - total_insertion_length;
3140 (*chromosomal_length) = current_perfect_section_abs - read_head_abs_offset + total_insertion_length;
3141
3142 return max(0, (int)(all_matched_bases*60/my_non_clipped_length));
3143 }
3144
3145 // this function also adds final_counting_reads in chromosome_events.
finalise_explain_CIGAR(global_context_t * global_context,thread_context_t * thread_context,explain_context_t * explain_context,realignment_result_t * final_realignments)3146 unsigned int finalise_explain_CIGAR(global_context_t * global_context, thread_context_t * thread_context, explain_context_t * explain_context, realignment_result_t * final_realignments)
3147 {
3148 int xk1, front_i, back_i;
3149 char tmp_cigar[120];
3150 chromosome_event_t * to_be_supported [20];
3151 short flanking_size_left[20], flanking_size_right[20];
3152 int to_be_supported_count = 0;
3153 int is_junction_read = 0;
3154 int total_perfect_matched_sections = 0;
3155
3156 mapping_result_t * result = _global_retrieve_alignment_ptr(global_context, explain_context->pair_number, explain_context->is_second_read, explain_context-> best_read_id);
3157 result -> result_flags &= ~CORE_IS_FULLY_EXPLAINED;
3158 result -> result_flags &= ~CORE_IS_PAIRED_END;
3159
3160 //SUBREADprintf("FINAL_CIGAR R1 %d[%d] = %p, FLAGS=%d\n", explain_context -> pair_number , explain_context-> best_read_id , result , result -> result_flags);
3161 // reverse the back_search result for every equally best alignment
3162 //
3163
3164 for(back_i = 0; back_i < explain_context -> all_back_alignments; back_i++){
3165 if( explain_context -> result_back_junction_numbers[back_i] > MAX_EVENTS_IN_READ ){
3166 SUBREADprintf("ERROR: Too many cigar sections: %d > %d\n", explain_context -> result_back_junction_numbers[back_i] , MAX_EVENTS_IN_READ);
3167 return 0;
3168 }
3169 for(xk1=0; xk1<explain_context -> result_back_junction_numbers[back_i]/2; xk1++)
3170 {
3171 perfect_section_in_read_t tmp_exp;
3172 // checked: boundary
3173 memcpy(&tmp_exp, &explain_context -> result_back_junctions[back_i][xk1], sizeof(perfect_section_in_read_t));
3174 memcpy(&explain_context -> result_back_junctions[back_i][xk1], &explain_context -> result_back_junctions[back_i][explain_context -> result_back_junction_numbers[back_i] - xk1 - 1] , sizeof(perfect_section_in_read_t));
3175 memcpy(&explain_context -> result_back_junctions[back_i][explain_context -> result_back_junction_numbers[back_i] - xk1 - 1] , &tmp_exp , sizeof(perfect_section_in_read_t));
3176 }
3177 }
3178
3179 // adding indel lengths in read lengths and relocate sections
3180 // note that the last section in back results has the same strand of the main piece.
3181
3182 int is_cigar_overflow = 0, fusions_in_read = 0, final_alignment_number = 0;
3183 for(back_i = 0; back_i < explain_context -> all_back_alignments; back_i++){
3184 if(final_alignment_number >= MAX_ALIGNMENT_PER_ANCHOR)break;
3185
3186 int is_first_section_negative = (result ->result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
3187 for(xk1=0; xk1<explain_context -> result_back_junction_numbers[back_i]; xk1++)
3188 {
3189 int section_length = explain_context -> result_back_junctions[back_i][xk1].read_pos_end - explain_context -> result_back_junctions[back_i][xk1].read_pos_start;
3190 if(0 && FIXLENstrcmp("simulated.11420793", explain_context->read_name)==0)SUBREADprintf("FINAL_EXPLAIN %s BACK_%d SEC_%d OLD_START=%d SEC_LENG=%d\n", explain_context->read_name, back_i, xk1, explain_context -> result_back_junctions[back_i][xk1].abs_offset_for_start, section_length);
3191 unsigned int new_start_pos;
3192
3193 if(explain_context -> result_back_junctions[back_i][xk1].is_strand_jumped)
3194 // the "strand_jumped" section do not need to move
3195 // however, the "abs_offset_for_start" is actually for the last base in this section.
3196 // this does not metter if we compare the reversed read to the chromosome.
3197 // "abs_offset_for_start" is the first UNWANTED base (smaller than the first WANTED base)
3198 new_start_pos = explain_context -> result_back_junctions[back_i][xk1].abs_offset_for_start +1;
3199 else
3200 // "abs_offset_for_start" is the first UNWANTED base. By subtracting the length, it becomes the first WANTED base.
3201 new_start_pos = explain_context -> result_back_junctions[back_i][xk1].abs_offset_for_start - section_length;
3202
3203 explain_context -> result_back_junctions[back_i][xk1].abs_offset_for_start = new_start_pos;
3204 if(explain_context -> result_back_junctions[back_i][xk1].event_after_section
3205 && explain_context -> result_back_junctions[back_i][xk1].event_after_section->is_strand_jumped) is_first_section_negative=!is_first_section_negative;
3206 }
3207
3208 // build CIGAR
3209 for(front_i = 0; front_i < explain_context -> all_front_alignments; front_i++){
3210 if(final_alignment_number >= MAX_ALIGNMENT_PER_ANCHOR)break;
3211
3212 to_be_supported_count = 0;
3213 tmp_cigar[0]=0;
3214 int known_junction_supp = 0;
3215
3216 for(xk1 = 0; xk1 < explain_context -> result_back_junction_numbers[back_i] + explain_context -> result_front_junction_numbers[front_i] -1; xk1++)
3217 {
3218 char piece_cigar[25];
3219 int read_pos_start, read_pos_end;
3220 perfect_section_in_read_t * current_section, *next_section = NULL;
3221
3222 int is_front_search = 0;
3223 if(xk1 >= explain_context -> result_back_junction_numbers[back_i] - 1) {
3224 current_section = &explain_context -> result_front_junctions[front_i][xk1 - explain_context -> result_back_junction_numbers[back_i] +1];
3225 if(xk1 - explain_context -> result_back_junction_numbers[back_i] +2 < explain_context -> result_front_junction_numbers[front_i])
3226 next_section = &explain_context -> result_front_junctions[front_i][xk1 - explain_context -> result_back_junction_numbers[back_i] +2];
3227 is_front_search = 1;
3228 } else {
3229 current_section = &explain_context -> result_back_junctions[back_i][xk1];
3230 if(xk1+1 < explain_context -> result_back_junction_numbers[back_i])
3231 next_section = &explain_context -> result_back_junctions[back_i][xk1+1];
3232 }
3233
3234
3235 if(xk1 == explain_context -> result_back_junction_numbers[back_i] - 1)
3236 read_pos_start = explain_context -> result_back_junctions[back_i][xk1].read_pos_start;
3237 else read_pos_start = current_section -> read_pos_start;
3238
3239 read_pos_end = current_section -> read_pos_end;
3240 chromosome_event_t *event_after = current_section -> event_after_section;
3241
3242 sprintf(piece_cigar, "%dM", (read_pos_end - read_pos_start));
3243 total_perfect_matched_sections += (read_pos_end - read_pos_start);
3244 flanking_size_left[xk1] = (read_pos_end - read_pos_start);
3245
3246 if(xk1<explain_context -> result_back_junction_numbers[back_i] + explain_context -> result_front_junction_numbers[front_i] -2)
3247 assert(event_after);
3248
3249 if(xk1>0)
3250 flanking_size_right[xk1-1] = (read_pos_end - read_pos_start);
3251
3252 if(event_after)
3253 {
3254 if(event_after -> event_type == CHRO_EVENT_TYPE_INDEL)
3255 sprintf(piece_cigar+strlen(piece_cigar), "%d%c", abs(event_after->indel_length), event_after->indel_length>0?'D':'I');
3256 else if(event_after -> event_type == CHRO_EVENT_TYPE_JUNCTION||event_after -> event_type == CHRO_EVENT_TYPE_FUSION) {
3257 // the distance in CIGAR is the NEXT UNWANTED BASE of piece#1 to the FIRST WANTED BASE in piece#2
3258 int delta_one ;
3259 if(current_section -> is_strand_jumped + current_section -> is_connected_to_large_side == 1) delta_one = 1;
3260 else delta_one = -1;
3261
3262 // if it is from front_search, the event side points to the first WANTED base of the next section; it should be moved to the last WANTED base the next section if the next section is jumped.
3263 if(next_section && (event_after -> is_strand_jumped + current_section -> is_strand_jumped==1))
3264 {
3265 if(is_front_search)
3266 {
3267 if(current_section -> is_connected_to_large_side)
3268 delta_one += (next_section->read_pos_end - next_section-> read_pos_start - 1);
3269 else
3270 delta_one -= (next_section->read_pos_end - next_section-> read_pos_start - 1);
3271 }
3272 else
3273 {
3274 if(current_section -> is_connected_to_large_side)
3275 delta_one += (next_section->read_pos_end - next_section-> read_pos_start - 1);
3276 else
3277 delta_one -= (next_section->read_pos_end - next_section-> read_pos_start - 1);
3278 }
3279 }
3280
3281 char jump_mode = current_section -> is_connected_to_large_side?'B':'N';
3282 long long int movement = event_after -> event_large_side;
3283 movement -= event_after -> event_small_side - delta_one;
3284 if(1){
3285 if(jump_mode == 'B' && movement < 0){
3286 movement = - movement;
3287 jump_mode = 'N';
3288 }else if(jump_mode == 'N' && movement < 0){
3289 movement = - movement;
3290 jump_mode = 'B';
3291 }
3292 }
3293
3294 if(event_after -> is_strand_jumped) jump_mode = tolower(jump_mode);
3295 fusions_in_read += (event_after -> event_type == CHRO_EVENT_TYPE_FUSION);
3296 sprintf(piece_cigar+strlen(piece_cigar), "%u%c", (int)movement, jump_mode);
3297
3298 if(event_after -> indel_at_junction) sprintf(piece_cigar+strlen(piece_cigar), "%dI", event_after -> indel_at_junction);
3299 is_junction_read ++;
3300 if(event_after -> is_donor_found_or_annotation & 64 ) known_junction_supp ++;
3301 }
3302 to_be_supported[to_be_supported_count++] = event_after;
3303 }
3304 strcat(tmp_cigar, piece_cigar);
3305 if(strlen(tmp_cigar) > CORE_MAX_CIGAR_STR_LEN - 14){
3306 is_cigar_overflow=1;
3307 break;
3308 }
3309 }
3310
3311 int mismatch_bases = 0;
3312
3313 //#warning ">>>>>>>>>>>>>>>> COMMENT NEXT LINE <<<<<<<<<<<<<<<<<<<<<<<"
3314 //SUBREADprintf("ReadDebug:%s\t%s\n", explain_context -> read_name , tmp_cigar);
3315 if(is_cigar_overflow) sprintf(tmp_cigar, "%dM", explain_context -> full_read_len);
3316
3317 unsigned int final_position;
3318
3319 // #warning "'0 &&' is because there could be indels in the high-confident region but this indel is finally disused."
3320 if( 0 && explain_context -> result_back_junction_numbers[back_i] + explain_context -> result_front_junction_numbers[front_i] <= 2) final_position = result -> selected_position;
3321 else final_position = explain_context -> result_back_junctions[back_i][0].abs_offset_for_start;
3322
3323 if(0 && FIXLENstrcmp("simulated.11420793", explain_context->read_name)==0)SUBREADprintf("FFFINAL %s : POS=%u, ABS=%u\n", explain_context->read_name, final_position, explain_context -> result_back_junctions[back_i][0].abs_offset_for_start);
3324
3325 int is_exonic_read_fraction_OK = 1;
3326
3327 if( global_context -> config.minimum_exonic_subread_fraction > 0.0000001 && (!is_junction_read) && result -> used_subreads_in_vote>0)
3328 {
3329 int min_subreads = global_context -> config.minimum_exonic_subread_fraction * result-> used_subreads_in_vote;
3330 if( result -> selected_votes < min_subreads )
3331 is_exonic_read_fraction_OK = 0 ;
3332 }
3333
3334
3335
3336 int final_qual = 0, applied_mismatch = 0, non_clipped_length = 0, total_indel_length = 0, final_MATCH = 0, chromosomal_length = 0, full_section_clipped = 0;
3337
3338 if(is_exonic_read_fraction_OK)
3339 {
3340 final_qual = final_CIGAR_quality(global_context, thread_context, explain_context -> full_read_text, explain_context -> full_qual_text, explain_context -> full_read_len , tmp_cigar, final_position, is_first_section_negative != ((result->result_flags & CORE_IS_NEGATIVE_STRAND)?1:0), &mismatch_bases, result -> confident_coverage_start, result -> confident_coverage_end, explain_context -> read_name, &non_clipped_length, &total_indel_length, & final_MATCH, & chromosomal_length, & full_section_clipped);
3341 //#warning ">>>>>>> COMMENT THIS <<<<<<<"
3342 //printf("OCT27-STEP2-%s:%d-POS%u-VOT%d-CIG-%s [ %d ]-INDELs=%llu; M/MM=%d,%d\n", explain_context -> read_name, explain_context -> is_second_read + 1, result -> selected_position, result -> selected_votes, tmp_cigar, is_cigar_overflow, ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_entry_table -> numOfElements, final_MATCH, mismatch_bases);
3343
3344
3345 applied_mismatch = is_junction_read? global_context->config.max_mismatch_junction_reads:global_context->config.max_mismatch_exonic_reads ;
3346 if(explain_context->full_read_len > EXON_LONG_READ_LENGTH)
3347 applied_mismatch = ((((explain_context->full_read_len+1)<<16) / 100) * applied_mismatch)>>16;
3348
3349 if(global_context -> config.space_type == GENE_SPACE_COLOR) applied_mismatch += to_be_supported_count*2;
3350 }
3351
3352
3353 //#warning " ========== COMMENT THIS LINE !! ========="
3354 if(0 && FIXLENstrcmp("HWI-ST945:119:D0J2JACXX:1:1303:17374:199067", explain_context -> read_name) ==0){
3355 char outpos1[100];
3356 absoffset_to_posstr(global_context, final_position, outpos1);
3357 SUBREADprintf("FINALQUAL %s : FINAL_POS=%s ( %u )\tCIGAR=%s\tMM=%d / MAPLEN=%d > %d?\tVOTE=%d > %0.2f x %d ? MASK=%d\tQUAL=%d\tBRNO=%d\nKNOWN_JUNCS=%d PENALTY=%d\n\n", explain_context -> read_name, outpos1 , final_position , tmp_cigar, mismatch_bases, non_clipped_length, applied_mismatch, result -> selected_votes, global_context -> config.minimum_exonic_subread_fraction,result-> used_subreads_in_vote, result->result_flags, final_qual, explain_context -> best_read_id, known_junction_supp, explain_context -> best_indel_penalty);
3358 //exit(0);
3359 }
3360
3361
3362 if(mismatch_bases <= applied_mismatch && is_exonic_read_fraction_OK && fusions_in_read < 2 ){// && (0 == full_section_clipped || 0 == global_context -> config.do_breakpoint_detection)) {
3363 realignment_result_t * realign_res = final_realignments+final_alignment_number;
3364 final_alignment_number ++;
3365
3366 realign_res -> realign_flags = result->result_flags;
3367 realign_res -> first_base_is_jumpped = 0;
3368 realign_res -> mapping_result = result;
3369 realign_res -> chromosomal_length = chromosomal_length;
3370 realign_res -> known_junction_supp = known_junction_supp;
3371 realign_res -> final_penalty = explain_context -> best_indel_penalty;
3372
3373 if(mismatch_bases > applied_mismatch ) realign_res -> realign_flags |= CORE_TOO_MANY_MISMATCHES;
3374 else realign_res -> realign_flags &= ~CORE_TOO_MANY_MISMATCHES;
3375
3376 if(((result -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0) != is_first_section_negative)
3377 {
3378 assert((global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection));
3379 realign_res -> first_base_is_jumpped = 1;
3380 }
3381 strcpy(realign_res -> cigar_string, tmp_cigar);
3382
3383 if(1)
3384 {
3385 int is_RNA_from_positive = -1;
3386 unsigned long long read_id = 2llu * explain_context -> pair_number + explain_context->is_second_read;
3387
3388 for(xk1= 0; xk1 < to_be_supported_count; xk1++)
3389 {
3390 if(xk1 >= MAX_EVENTS_IN_READ) break;
3391
3392 if(to_be_supported [xk1] -> event_type !=CHRO_EVENT_TYPE_INDEL && is_junction_read){
3393 if(to_be_supported [xk1] -> event_type == CHRO_EVENT_TYPE_JUNCTION && to_be_supported [xk1] -> is_donor_found_or_annotation && is_RNA_from_positive == -1)
3394 is_RNA_from_positive = !(to_be_supported [xk1] -> is_negative_strand);
3395 }
3396
3397 //final counts are added in function "add_realignment_event_support" in core.c
3398
3399 realign_res -> supporting_chromosome_events[xk1] = to_be_supported[xk1];
3400 realign_res -> flanking_size_left[xk1] = flanking_size_left[xk1];
3401 realign_res -> flanking_size_right[xk1] = flanking_size_right[xk1];
3402 realign_res -> crirical_support[xk1] += (read_id == to_be_supported [xk1] -> critical_read_id);
3403 }
3404 if(to_be_supported_count < MAX_EVENTS_IN_READ )
3405 realign_res -> supporting_chromosome_events[to_be_supported_count] = NULL;
3406
3407 result -> result_flags |= CORE_IS_FULLY_EXPLAINED;
3408 result -> read_length = explain_context->full_read_len;
3409
3410 if(is_RNA_from_positive == -1)
3411 {
3412 realign_res -> realign_flags |= CORE_NOTFOUND_DONORS ;
3413 realign_res -> realign_flags &= ~(CORE_IS_GT_AG_DONORS);
3414 }
3415 else
3416 {
3417 realign_res -> realign_flags &= ~ (CORE_NOTFOUND_DONORS | CORE_IS_GT_AG_DONORS);
3418
3419 if(is_RNA_from_positive)
3420 realign_res -> realign_flags |= CORE_IS_GT_AG_DONORS;
3421 }
3422 }
3423
3424 realign_res -> first_base_position = final_position;
3425 realign_res -> final_quality = final_qual;
3426 realign_res -> final_mismatched_bases = mismatch_bases;
3427 realign_res -> final_matched_bases = (unsigned short)final_MATCH;
3428 realign_res -> best_second_diff_bases = (9<explain_context -> best_second_match_diff)?-1:explain_context -> best_second_match_diff;
3429
3430 }
3431 }
3432 }
3433
3434 return final_alignment_number;
3435 }
3436
3437
3438
3439
// ceq:  the two-character sequence `seq` equals the two-character motif `motif`.
// c2eq: {seq1, seq2} equals {motif1, motif2} in either order.
#define ceq(seq, motif) (((seq)[0] == (motif)[0]) && ((seq)[1] == (motif)[1]))
#define c2eq(seq1, seq2, motif1, motif2) ((ceq(seq1, motif1) && ceq(seq2, motif2)) || (ceq(seq1, motif2) && ceq(seq2, motif1)))

// Classifies a pair of dinucleotides found at the two sides of a split.
// Returns 2 for the GT/AG and CT/AC pairs, 1 for the GC/AG, GC/CT, AT/AC and GT/AT pairs,
// each only when ch1 is one of the dinucleotides accepted for the given orientation
// (is_reverse); returns 0 otherwise.
int paired_chars_full_core(char * ch1, char * ch2, int is_reverse)
{
	if (c2eq(ch1, ch2, "GT", "AG") || c2eq(ch1, ch2, "CT", "AC"))
	{
		if (is_reverse)
			return (ceq(ch1, "AG") || ceq(ch1, "AC")) ? 2 : 0;
		return (ceq(ch1, "CT") || ceq(ch1, "GT")) ? 2 : 0;
	}

	if (c2eq(ch1, ch2, "GC", "AG") || c2eq(ch1, ch2, "GC", "CT") || c2eq(ch1, ch2, "AT", "AC") || c2eq(ch1, ch2, "GT", "AT"))
	{
		if (is_reverse)
			return (ceq(ch1, "GC") || ceq(ch1, "AT") || ceq(ch1, "AG") || ceq(ch1, "AC")) ? 1 : 0;
		return (ceq(ch1, "GC") || ceq(ch1, "AT") || ceq(ch1, "GT") || ceq(ch1, "CT")) ? 1 : 0;
	}

	return 0;
}
3457
// Returns 1 when (ch1, ch2) is one of the two accepted dinucleotide pairs in the given orientation:
//   is_reverse == 0 : GT + AG  or  CT + AC
//   is_reverse != 0 : AG + GT  or  AC + CT
// Returns 0 otherwise.
int paired_chars_part_core(char * ch1, char * ch2, int is_reverse)
{
	// each row lists a pair in the order expected when is_reverse is 0.
	static const char forward_pairs[2][2][3] = { {"GT", "AG"}, {"CT", "AC"} };
	const int first_slot = is_reverse ? 1 : 0;
	int pair_i;

	for (pair_i = 0; pair_i < 2; pair_i++) {
		const char * want_first = forward_pairs[pair_i][first_slot];
		const char * want_second = forward_pairs[pair_i][1 - first_slot];

		if (ch1[0] == want_first[0] && ch1[1] == want_first[1] &&
		    ch2[0] == want_second[0] && ch2[1] == want_second[1])
			return 1;
	}
	return 0;
}
3469
// True when the two characters at cc are one of GT, GC, AG, AC, AT or CT.
#define is_donor_chars_full(cc) (((cc)[0]=='G' && ((cc)[1]=='T' || (cc)[1]=='C')) || \
			    ((cc)[0]=='A' && ((cc)[1]=='G' || (cc)[1]=='C' || (cc)[1]=='T')) || \
			    ((cc)[0]=='C' && (cc)[1]=='T'))
3477
3478
// True when the two characters at cc are one of GT, AG, AC or CT.
#define is_donor_chars_part(cc) (((cc)[0]=='G' && (cc)[1]=='T') || \
			    ((cc)[0]=='A' && ((cc)[1]=='G' || (cc)[1]=='C')) || \
			    ((cc)[0]=='C' && (cc)[1]=='T'))
3483
3484 //#warning "=============== NO DONOR-RECEPTOR NEEDED =============="
3485 //#define is_donor_chars(x) 1
3486 //#define paired_chars(x,y,z) 1
3487
// The generic names used by the code below resolve to the "part" variants:
// the four-dinucleotide test (is_donor_chars_part) and the pairing check that
// only accepts the GT/AG and CT/AC pairs (paired_chars_part_core).
#define is_donor_chars is_donor_chars_part
#define paired_chars paired_chars_part_core
3490
3491
3492
3493
// Debug helper: prints the big-margin record of one read as a list of
// "<first> <second>~<third>" triples, preceded by the read number and the record address.
void print_big_margin(global_context_t * global_context, subread_read_number_t pair_number, int is_second_read){
	unsigned short * margin_triples = _global_retrieve_big_margin_ptr(global_context,pair_number, is_second_read);
	int triple_i = 0;

	SUBREADprintf("\n >>> READ_NO=%u, SECOND=%d, MEM=%p <<< \n", (unsigned int)pair_number, is_second_read, margin_triples);

	// the record holds big_margin_record_size/3 triples of unsigned shorts.
	while(triple_i < global_context->config.big_margin_record_size/3)
	{
		unsigned short * triple = margin_triples + triple_i*3;
		SUBREADprintf("%d %d~%d ", triple[0] , triple[1] , triple[2]);
		triple_i++;
	}
	SUBREADputs("");
}
3505
3506 #define ABGIGUOUS_TOLERANCE 3
3507
is_ambiguous_voting(global_context_t * global_context,subread_read_number_t pair_number,int is_second_read,int selected_vote,int max_start,int max_end,int read_len,int is_negative)3508 int is_ambiguous_voting(global_context_t * global_context, subread_read_number_t pair_number, int is_second_read, int selected_vote, int max_start,int max_end, int read_len, int is_negative)
3509 {
3510 // #warning "=========== THE NEXT LINE IS ONLY FOR COMPARING WITH STAR!! ============== "
3511 // return 0;
3512 if( global_context->config.big_margin_record_size<3) return 0;
3513 int xk1;
3514 int encounter = 0;
3515
3516 if(is_negative)
3517 {
3518 int tmp = max_start;
3519 max_start = read_len - max_end;
3520 max_end = read_len - tmp;
3521 }
3522
3523 unsigned short * big_margin_record = _global_retrieve_big_margin_ptr(global_context,pair_number, is_second_read);
3524
3525 for(xk1 = 0; xk1 < global_context->config.big_margin_record_size/3 ; xk1++)
3526 {
3527 if(!big_margin_record[xk1*3])break;
3528
3529 if(big_margin_record[xk1*3] >= selected_vote - 1) // actually, max-1
3530 {
3531 if(0) {
3532 if ( max_start >= big_margin_record[xk1*3+1] - ABGIGUOUS_TOLERANCE && max_end <= big_margin_record[xk1*3+2] + ABGIGUOUS_TOLERANCE )
3533 encounter++;
3534 else if ( big_margin_record[xk1*3+1] >= max_start - ABGIGUOUS_TOLERANCE && big_margin_record[xk1*3+2] <= max_end + ABGIGUOUS_TOLERANCE )
3535 encounter++;
3536
3537 } else {
3538 // 4 and 4 are the best setting for indel and fusion simulation.
3539 if(selected_vote >= big_margin_record[xk1*3]) {
3540 if(big_margin_record[xk1*3+1] >= max_start - 4 && big_margin_record[xk1*3+2] <= max_end + 4)
3541 encounter++;
3542 } else {
3543 if(big_margin_record[xk1*3+1] <= max_start + 4 && big_margin_record[xk1*3+2] >= max_end - 4)
3544 encounter++;
3545 }
3546 }
3547 }
3548
3549 }
3550
3551 if(encounter>1) return encounter;
3552 return 0;
3553 }
3554
// Window length (in read bases) used by donor_jumped_score(): candidate split points closer than
// this to either end of the read are skipped there.
// NOTE(review): it also appears to be the number of bases compared next to the split point - confirm in donor_jumped_score().
#define JUNCTION_CONFIRM_WINDOW 17
3556 // This function implements the same function of donor_score, except that the two halves are from different strands.
3557 // Both halves are forced to positive strand and the split point is found.
3558 // Note that the donor/receptor sides are still expected for distinguishing between Fusion Breaks and Fusion Junctions.
3559
3560 // Note that the read_text is on reversed mode. The guess points are on reversed mode too.
3561 // "Left" and "Right" means the left/right half in the "reversed" read.
donor_jumped_score(global_context_t * global_context,thread_context_t * thread_context,unsigned int small_virtualHead_abs_offset,unsigned int large_virtualHead_abs_offset,int guess_start,int guess_end,char * read_text,int read_len,int is_small_half_negative,int is_large_half_negative,int small_half_on_left_reversed,int * final_split_point,int * is_GT_AG_strand,int * is_donor_found_or_annotation,int * small_side_increasing_coordinate,int * large_side_increasing_coordinate)3562 int donor_jumped_score(global_context_t * global_context, thread_context_t * thread_context, unsigned int small_virtualHead_abs_offset, unsigned int large_virtualHead_abs_offset, int guess_start, int guess_end, char * read_text, int read_len, int is_small_half_negative, int is_large_half_negative, int small_half_on_left_reversed, int * final_split_point, int * is_GT_AG_strand, int * is_donor_found_or_annotation, int * small_side_increasing_coordinate, int * large_side_increasing_coordinate)
3563 {
3564 gene_value_index_t * value_index = thread_context?thread_context->current_value_index:global_context->current_value_index ;
3565 // guess_end is the index of the first UNWANTED BASE.
3566 int most_likely_point_as_reversed = (guess_start+guess_end)/2;
3567
3568 int selected_real_split_point = -1, selected_junction_strand = -1;
3569 //char donor_left[2], donor_right[2];
3570
3571 int best_score = -111111;
3572
3573 int real_split_point_i;
3574 int real_split_point_numbers = guess_end - guess_start;
3575
3576 char positive_read[MAX_READ_LENGTH+1];
3577 strcpy(positive_read, read_text) ;
3578 reverse_read(positive_read, read_len, global_context->config.space_type);
3579
3580 //printf("TEST_JUMPED: %u - %u\n", small_virtualHead_abs_offset, large_virtualHead_abs_offset);
3581
3582
3583 (*small_side_increasing_coordinate) = (small_half_on_left_reversed != is_small_half_negative);
3584 (*large_side_increasing_coordinate) = (small_half_on_left_reversed == is_large_half_negative);
3585
3586
3587 for(real_split_point_i = 0 ; real_split_point_i < real_split_point_numbers; real_split_point_i++)
3588 {
3589 int left_should_match, right_should_match;
3590 int left_should_not_match, right_should_not_match;
3591 int real_split_point_as_reversed = (real_split_point_i % 2)?-((real_split_point_i+1)/2):((1+real_split_point_i)/2);
3592 real_split_point_as_reversed += most_likely_point_as_reversed;
3593
3594 if(real_split_point_as_reversed > read_len-JUNCTION_CONFIRM_WINDOW)continue;
3595 if(real_split_point_as_reversed < JUNCTION_CONFIRM_WINDOW)continue;
3596
3597 int is_donor_test_ok=0;
3598
3599 if(small_half_on_left_reversed)
3600 {
3601 unsigned int small_pos_test_begin = small_virtualHead_abs_offset + (is_small_half_negative?real_split_point_as_reversed - JUNCTION_CONFIRM_WINDOW:(read_len - real_split_point_as_reversed));
3602 char * small_pos_read_begin = (is_small_half_negative?read_text:positive_read) + (is_small_half_negative?
3603 (real_split_point_as_reversed - JUNCTION_CONFIRM_WINDOW) :
3604 (read_len - real_split_point_as_reversed)
3605 );
3606
3607 unsigned int large_pos_test_begin = large_virtualHead_abs_offset + (is_large_half_negative?real_split_point_as_reversed:(read_len - real_split_point_as_reversed - JUNCTION_CONFIRM_WINDOW));
3608 char * large_pos_read_begin = (is_large_half_negative?read_text:positive_read) + (is_large_half_negative?
3609 (real_split_point_as_reversed) :
3610 (read_len - real_split_point_as_reversed - JUNCTION_CONFIRM_WINDOW));
3611
3612 left_should_match = match_chro(small_pos_read_begin , value_index , small_pos_test_begin , JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
3613 right_should_match = match_chro(large_pos_read_begin , value_index , large_pos_test_begin , JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
3614 left_should_not_match = right_should_not_match = 0;
3615 //match_chro(read_text + real_split_point - JUNCTION_CONFIRM_WINDOW, value_index, small_virtualHead_abs_offset + real_split_point - JUNCTION_CONFIRM_WINDOW , JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
3616
3617 }
3618 else
3619 {
3620 unsigned int small_pos_test_begin = small_virtualHead_abs_offset + (is_small_half_negative?real_split_point_as_reversed:(read_len - real_split_point_as_reversed - JUNCTION_CONFIRM_WINDOW));
3621 char * small_pos_read_begin = (is_small_half_negative?read_text:positive_read) + (is_small_half_negative?
3622 (real_split_point_as_reversed):(read_len - real_split_point_as_reversed - JUNCTION_CONFIRM_WINDOW));
3623
3624 unsigned int large_pos_test_begin = large_virtualHead_abs_offset + (is_large_half_negative?(real_split_point_as_reversed - JUNCTION_CONFIRM_WINDOW):(read_len - real_split_point_as_reversed));
3625 char * large_pos_read_begin = (is_large_half_negative?read_text:positive_read) + (is_large_half_negative?
3626 (real_split_point_as_reversed - JUNCTION_CONFIRM_WINDOW):(read_len - real_split_point_as_reversed));
3627
3628 left_should_match = match_chro(small_pos_read_begin , value_index , small_pos_test_begin , JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
3629 right_should_match = match_chro(large_pos_read_begin , value_index , large_pos_test_begin , JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
3630 left_should_not_match = right_should_not_match = 0;
3631
3632 }
3633
3634 //#warning "============ REMOVE THE TWO '+ 1' FROM THE NEXT LINE ================="
3635 //#warning "============ ADD THE TWO '+ 1's IN THE BLANKETS FOR SVs GRANT APP ================="
3636 int mismatch_in_between_allowd = (global_context -> config.more_accurate_fusions)?(0):(1);
3637 if(left_should_match + right_should_match >= JUNCTION_CONFIRM_WINDOW*2 - mismatch_in_between_allowd &&
3638 left_should_not_match <= JUNCTION_CONFIRM_WINDOW -3 && right_should_not_match <= JUNCTION_CONFIRM_WINDOW -3)
3639 {
3640 int test_score = is_donor_test_ok*500+left_should_match + right_should_match - left_should_not_match - right_should_not_match;
3641 if(test_score > best_score)
3642 {
3643 selected_real_split_point = real_split_point_as_reversed;
3644 best_score = test_score;
3645 }
3646 }
3647 }
3648
3649 if(best_score>0)
3650 {
3651 //printf("TEST_JUMPED: BSCORE=%d SPLT=%d\n", best_score , selected_real_split_point);
3652 *final_split_point = selected_real_split_point;
3653 *is_donor_found_or_annotation = best_score>=500;
3654 *is_GT_AG_strand = selected_junction_strand;
3655 return best_score;
3656 }
3657 return 0;
3658 }
3659
3660
/*
 * donor_score: pick the best split point between the two halves of a read
 * that map to left_virtualHead_abs_offset and right_virtualHead_abs_offset.
 *
 * Candidate split points inside [guess_start, guess_end) are visited from the
 * centre of that window outwards (guess_end is the first UNWANTED base). A
 * "split point" is the first base NOT in piece 1, i.e. the first base in
 * piece 2. Candidates closer than JUNCTION_CONFIRM_WINDOW bases to either end
 * of the read are skipped.
 *
 * For each candidate:
 *   - when config.prefer_donor_receptor_junctions is set, two reference
 *     dinucleotides around the split are fetched and tested with
 *     is_donor_chars()/paired_chars(); the outcome is is_donor_test_ok;
 *   - when the donor test passes, or is not required (need_donor_test == 0),
 *     JUNCTION_CONFIRM_WINDOW-base windows on both sides of the split are
 *     compared against the reference with match_chro(). The "should match"
 *     counts must reach 2*JUNCTION_CONFIRM_WINDOW minus the tolerated
 *     mismatches, and the "should not match" counts (each window compared
 *     against the other locus) must not exceed JUNCTION_CONFIRM_WINDOW - 5;
 *   - in the normally arranged case, 0..config.max_insertion_at_junctions
 *     inserted bases are additionally tried after the split.
 *
 * Outputs:
 *   small_side_increasing_coordinate / large_side_increasing_coordinate are
 *     always written (!normally_arranged / normally_arranged).
 *   final_split_point, is_donor_found_or_annotation, is_GT_AG_strand and
 *     final_inserted_bases are written only when a split is accepted.
 *
 * Returns (1 + best_score) / 100 when a split is accepted, otherwise 0. A
 * split is rejected when it needs inserted bases although the zero-insertion
 * alignment was already flagged as good enough (non_insertion_preferred).
 */
int donor_score(global_context_t * global_context, thread_context_t * thread_context, unsigned int left_virtualHead_abs_offset, unsigned int right_virtualHead_abs_offset, int left_indel_offset, int right_indel_offset, int normally_arranged, int guess_start, int guess_end, char * read_text, int read_len, int * final_split_point, int * is_GT_AG_strand, int * is_donor_found_or_annotation, int * final_inserted_bases, int * small_side_increasing_coordinate, int * large_side_increasing_coordinate, char * read_name)
{
	gene_value_index_t * value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
	// The donor test is mandatory only for plain junction detection (no fusion / long deletion mode).
	int need_donor_test = global_context->config.do_breakpoint_detection && global_context -> config.check_donor_at_junctions && (!( global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection));

	(*small_side_increasing_coordinate)=!normally_arranged;
	(*large_side_increasing_coordinate)= normally_arranged;

	// guess_end is the index of the first UNWANTED BASE.
	int most_likely_point = (guess_start+guess_end)/2;

	// "split_point" is the first base NOT IN piece 1; it is also the first base IN piece 2.
	int selected_real_split_point = -1, selected_junction_strand = -1 , selected_inserted_bases = 0;

	// Zero-initialised on purpose: the strand is derived from these buffers below even on
	// paths that never fill them (prefer_donor_receptor_junctions off, or the first
	// is_donor_chars() test failing), which used to read indeterminate bytes.
	char donor_left[3] = {0}, donor_right[3] = {0};

	int best_score = -111111;
	int non_insertion_preferred = 0;

	int real_split_point_i;
	int real_split_point_numbers = guess_end - guess_start;

	if(0 && FIXLENstrcmp("R006856515", read_name) == 0)
		SUBREADprintf("TESTDON: LR=%d; RR=%d\n", left_indel_offset, right_indel_offset);

	for(real_split_point_i = 0 ; real_split_point_i < real_split_point_numbers; real_split_point_i++)
	{
		int left_should_match, right_should_match = 0;
		int left_should_not_match = 0, right_should_not_match = 0;
		// Offsets relative to the centre run 0, -1, +1, -2, +2, ...
		int real_split_point = (real_split_point_i % 2)?-((real_split_point_i+1)/2):((1+real_split_point_i)/2);
		real_split_point += most_likely_point;
		int is_donor_test_ok = 0;

		// Both confirmation windows must fit inside the read.
		if(real_split_point > read_len-JUNCTION_CONFIRM_WINDOW)continue;
		if(real_split_point < JUNCTION_CONFIRM_WINDOW)continue;

		if(global_context->config.prefer_donor_receptor_junctions)
		{
			if(normally_arranged)
			{
				gvindex_get_string (donor_left, value_index, left_virtualHead_abs_offset + real_split_point + left_indel_offset, 2, 0);
				if(is_donor_chars(donor_left))
				{
					gvindex_get_string (donor_right, value_index, right_virtualHead_abs_offset + real_split_point + right_indel_offset - 2, 2, 0);
					if(is_donor_chars(donor_right))
					{
						is_donor_test_ok = paired_chars(donor_left, donor_right,0);
					}
				}
			}
			else
			{
				gvindex_get_string (donor_left, value_index, right_virtualHead_abs_offset + real_split_point + left_indel_offset, 2, 0);
				gvindex_get_string (donor_right, value_index, left_virtualHead_abs_offset + real_split_point + right_indel_offset - 2, 2, 0);
				is_donor_test_ok = is_donor_chars(donor_left) && is_donor_chars(donor_right) && paired_chars(donor_left, donor_right,0);
			}
		}

		if(0 && FIXLENstrcmp("R006856515", read_name) == 0)
		{
			donor_left[2]=0;
			donor_right[2]=0;
			SUBREADprintf("TESTDON: %s %s; OFFSET=%d; DON_OK=%d; NORMAL=%d; LEFT_OFF=%d; RIGHT_OFF=%d\n", donor_left, donor_right, real_split_point_i, is_donor_test_ok, normally_arranged, left_indel_offset, right_indel_offset);
		}

		// One mismatch over the two windows is tolerated unless more_accurate_fusions is set.
		int mismatch_in_between_allowd = (global_context -> config.more_accurate_fusions)?(0) : (1);
		if(is_donor_test_ok || !need_donor_test)
		{
			if(normally_arranged)
			{
				int inserted_bases=0;

				// Window just before the split against the left locus.
				left_should_match = match_chro(read_text + real_split_point - JUNCTION_CONFIRM_WINDOW, value_index, left_virtualHead_abs_offset + real_split_point - JUNCTION_CONFIRM_WINDOW + left_indel_offset , JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
				if(left_should_match > JUNCTION_CONFIRM_WINDOW- (global_context->config.max_insertion_at_junctions?5:2))
				{
					for(inserted_bases = 0; inserted_bases <= global_context->config.max_insertion_at_junctions; inserted_bases++)
					{
						// Window after the split (skipping the inserted bases) against the right locus.
						right_should_match = match_chro(read_text + real_split_point + inserted_bases, value_index, right_virtualHead_abs_offset + real_split_point + right_indel_offset + inserted_bases, JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
						if(right_should_match >= 2*JUNCTION_CONFIRM_WINDOW - left_should_match - mismatch_in_between_allowd)
						{
							// Cross-check: each window against the *other* locus should match poorly.
							left_should_not_match = match_chro(read_text + real_split_point + inserted_bases, value_index, left_virtualHead_abs_offset + real_split_point + left_indel_offset, JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
							right_should_not_match = match_chro(read_text + real_split_point - JUNCTION_CONFIRM_WINDOW, value_index, right_virtualHead_abs_offset + real_split_point + right_indel_offset - JUNCTION_CONFIRM_WINDOW + inserted_bases, JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);

							if(left_should_not_match <= JUNCTION_CONFIRM_WINDOW -5 && right_should_not_match <= JUNCTION_CONFIRM_WINDOW -5)
							{
								int test_score ;
								if(global_context->config.max_insertion_at_junctions)
									test_score = 100*(is_donor_test_ok*3000+left_should_match + right_should_match) - (left_should_not_match + right_should_not_match) - 20*inserted_bases;
								else
									test_score = 100*(is_donor_test_ok*3000+left_should_match + right_should_match - left_should_not_match - right_should_not_match);

								if(test_score > best_score)
								{
									selected_junction_strand = (donor_left[0]=='G' || donor_right[1]=='G');
									selected_inserted_bases = inserted_bases;
									selected_real_split_point = real_split_point;
									best_score = test_score;
								}
							}

						}
						// The zero-insertion alignment is already nearly good enough: remember that,
						// so that a winner requiring inserted bases is rejected at the end.
						if(global_context->config.max_insertion_at_junctions && 0 == inserted_bases && right_should_match >= 2*JUNCTION_CONFIRM_WINDOW - left_should_match - 5)
							non_insertion_preferred = 1;

					}
				}
			}
			else
			{
				// Abnormal arrangement: the window before the split belongs to the right locus
				// and the window after the split to the left locus.
				right_should_match = match_chro(read_text + real_split_point - JUNCTION_CONFIRM_WINDOW, value_index, right_virtualHead_abs_offset + right_indel_offset + real_split_point - JUNCTION_CONFIRM_WINDOW , JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
				left_should_match = match_chro(read_text + real_split_point, value_index, left_virtualHead_abs_offset + real_split_point + left_indel_offset, JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);

				right_should_not_match = match_chro(read_text + real_split_point, value_index, right_virtualHead_abs_offset + real_split_point + right_indel_offset, JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
				left_should_not_match = match_chro(read_text + real_split_point - JUNCTION_CONFIRM_WINDOW, value_index, left_virtualHead_abs_offset + left_indel_offset + real_split_point - JUNCTION_CONFIRM_WINDOW, JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);

				if(left_should_match +right_should_match >= 2*JUNCTION_CONFIRM_WINDOW - mismatch_in_between_allowd &&
					left_should_not_match <= JUNCTION_CONFIRM_WINDOW -5 && right_should_not_match <= JUNCTION_CONFIRM_WINDOW -5)
				{

					int test_score;

					test_score = 100*(is_donor_test_ok*3000+left_should_match + right_should_match - left_should_not_match - right_should_not_match);
					if(test_score > best_score)
					{
						selected_junction_strand = (donor_left[0]=='G' || donor_right[1]=='G');
						selected_real_split_point = real_split_point;
						best_score = test_score;
					}
				}
			}
		}
	}
	if(best_score>0 && (0==non_insertion_preferred || 0==selected_inserted_bases))
	{
		*final_split_point = selected_real_split_point;
		// A passed donor test contributes 100*3000 to the score, hence this threshold.
		*is_donor_found_or_annotation = best_score>=290000;
		*is_GT_AG_strand = selected_junction_strand;
		*final_inserted_bases = selected_inserted_bases;

		if(0 && FIXLENstrcmp("R006856515", read_name)==0)
			SUBREADprintf("FINAL_INS_LEN=%d; BEST_SCORE=%d %s\n", selected_inserted_bases, best_score, read_name);
		return (1+best_score)/100;
	}
	return 0;

}
3821
/*
 * find_new_junctions: record the junction / fusion event implied by one
 * alignment of a read.
 *
 * Steps, in order:
 *   1. fetch the alignment record and its subjunc record for
 *      (pair_number, is_second_read, best_read_id);
 *   2. for reads longer than EXON_LONG_READ_LENGTH, call core_search_short_exons();
 *   3. return when the minor half has no votes, or (optionally) when
 *      is_ambiguous_voting() reports ambiguity;
 *   4. if the stored split point is positive, compute the two edges of the
 *      event (left_edge_wanted / right_edge_wanted) and resolve them to chromosomes;
 *   5. mark the read as gapped, then either increment supporting_reads of an
 *      already known event with the same two sides, or allocate a new event,
 *      decide its type (JUNCTION / FUSION / REMOVED) and insert it into the
 *      event table when it is a junction, or a fusion with fusion / long
 *      deletion detection enabled.
 *
 * The event table and event space are taken from the thread context when one
 * is given, otherwise from the global context.
 */
void find_new_junctions(global_context_t * global_context, thread_context_t * thread_context, subread_read_number_t pair_number, char * read_name, char * read_text, char * qual_text, int read_len, int is_second_read, int best_read_id)
{
	mapping_result_t * result =_global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, best_read_id);
	subjunc_result_t * subjunc_result =_global_retrieve_subjunc_ptr(global_context, pair_number, is_second_read, best_read_id);

	if(read_len > EXON_LONG_READ_LENGTH)
	{
		assert(result -> selected_position <= 0xffff0000);
		// When there is no minor half, the major position is passed for both positions.
		core_search_short_exons(global_context, thread_context, read_text, qual_text, read_len, result -> selected_position, (subjunc_result -> minor_votes < 1)? result -> selected_position:subjunc_result -> minor_position, result -> confident_coverage_start, result -> confident_coverage_end);
	}

	int selected_real_split_point = subjunc_result->split_point;

	//#warning " =============== remove "+ 2" FROM THE NEXT LINE (FOR A HIGHER ACCURACY FROM SubFusion on 19 JAN 2015) =================="
	// The two tests below use the same threshold, so together they return whenever minor_votes < 1.
	if((global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection) && subjunc_result -> minor_votes < 1)return;
	if((!(global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection)) && subjunc_result -> minor_votes < 1)return;

	//if(result -> selected_votes < global_context->config.minimum_subread_for_first_read)return;

	if(global_context->config.do_big_margin_filtering_for_junctions)
		if(is_ambiguous_voting(global_context, pair_number, is_second_read, result->selected_votes, result -> confident_coverage_start, result -> confident_coverage_end, read_len, (result->result_flags & CORE_IS_NEGATIVE_STRAND)?1:0))return;

	// "left" is the half with the smaller position, "right" the one with the larger position.
	unsigned int left_virtualHead_abs_offset = min(result -> selected_position, subjunc_result -> minor_position);
	unsigned int right_virtualHead_abs_offset = max(result -> selected_position, subjunc_result -> minor_position);

	// The two lowest bits of result_flags carry the donor information; the value 3 means "no donor found".
	int is_GT_AG_donors = result->result_flags & 0x3;
	int is_donor_found_or_annotation = is_GT_AG_donors<3;
	int is_strand_jumped = (result->result_flags & CORE_IS_STRAND_JUMPED)?1:0;

	if(selected_real_split_point>0)
	{
		unsigned int left_edge_wanted, right_edge_wanted;

		if(is_strand_jumped)
		{
			// The first branch is disabled (if(0)); only the else branch below is executed.
			if(0){

				// note that splicing point and the coverage coordinates are "major negative" view.
				// recover the "negative view" splicing point location
				int S = (result->result_flags & CORE_IS_NEGATIVE_STRAND) ? selected_real_split_point : (read_len - selected_real_split_point);
				int Sbar = read_len - S;

				int is_abnormal_as_reversed = (subjunc_result->minor_coverage_start > result->confident_coverage_start) + (subjunc_result -> minor_position > result -> selected_position) == 1;
				if(!(result->result_flags & CORE_IS_NEGATIVE_STRAND)) is_abnormal_as_reversed = !is_abnormal_as_reversed;
				int is_small_half_negative = ((result->result_flags & CORE_IS_NEGATIVE_STRAND)?1:0) + (subjunc_result->minor_position < result->selected_position) ==1;

				if(is_abnormal_as_reversed && is_small_half_negative)
				{
					left_edge_wanted = left_virtualHead_abs_offset + S;
					right_edge_wanted = right_virtualHead_abs_offset + Sbar;
				}
				else if(is_abnormal_as_reversed && !is_small_half_negative)
				{
					left_edge_wanted = left_virtualHead_abs_offset + Sbar - 1;
					right_edge_wanted = right_virtualHead_abs_offset + S - 1;
				}
				else if(!is_abnormal_as_reversed && is_small_half_negative)
				{
					left_edge_wanted = left_virtualHead_abs_offset + S - 1;
					right_edge_wanted = right_virtualHead_abs_offset + Sbar - 1;
				}
				else // if(!is_abnormal_as_reversed && !is_small_half_negative)
				{
					left_edge_wanted = left_virtualHead_abs_offset + Sbar;
					right_edge_wanted = right_virtualHead_abs_offset + S;
				}

				if(left_edge_wanted >= right_edge_wanted){
					SUBREADprintf("REVERSED NEW JUNC: %u ~ %u : ABN_REV=%d , SMALL_NEG=%d, LEFT_VH=%u, RIGHT_VH=%u, S/~S=%d/%d\n", left_edge_wanted, right_edge_wanted, is_abnormal_as_reversed, is_small_half_negative, left_virtualHead_abs_offset, right_virtualHead_abs_offset, S, Sbar);
				}

			}else{
				// Strand-jumped case: the major half is offset by the split point and the minor
				// half by (read_len - split point); the smaller / larger of the two become the edges.
				unsigned int major_half_smallest_coordinate, minor_half_smallest_coordinate;
				major_half_smallest_coordinate = result -> selected_position + selected_real_split_point;
				minor_half_smallest_coordinate = subjunc_result->minor_position + read_len - selected_real_split_point;
				left_edge_wanted = min(major_half_smallest_coordinate, minor_half_smallest_coordinate);
				right_edge_wanted = max(major_half_smallest_coordinate, minor_half_smallest_coordinate);
				// Each "(a) + (b) == 1" expression is true when exactly one of the two comparisons holds.
				int is_abnormal_as_reversed = (subjunc_result->minor_coverage_start > result->confident_coverage_start) + (minor_half_smallest_coordinate > major_half_smallest_coordinate) == 1;
				int is_small_half_negative = ((result->result_flags & CORE_IS_NEGATIVE_STRAND)?1:0) + (minor_half_smallest_coordinate < major_half_smallest_coordinate) ==1;
				if(!(result->result_flags & CORE_IS_NEGATIVE_STRAND)) is_abnormal_as_reversed = !is_abnormal_as_reversed;
				// Both edges are shifted back by one base when the two flags disagree.
				if(is_small_half_negative != is_abnormal_as_reversed)
				{
					left_edge_wanted -=1;
					right_edge_wanted -=1;
				}
			}
		}
		else
		{
			int selected_real_split_point_for_left = selected_real_split_point;
			int selected_real_split_point_for_right = selected_real_split_point;
			if((subjunc_result->minor_coverage_start > result->confident_coverage_start) + (subjunc_result -> minor_position > result -> selected_position) == 1) //abnormally arranged halves
				selected_real_split_point_for_right --;
			else // normally arranged halves
				selected_real_split_point_for_left --;



			// double_indel_offset packs two 4-bit values: the low nibble for the minor half and the
			// next nibble for the major half. Only the major value is sign-extended (8..15 -> -8..-1).
			// NOTE(review): the minor value is used unsigned - confirm that this is intended.
			int minor_indel_offset = (subjunc_result->double_indel_offset & 0xf);
			int major_indel_offset = (subjunc_result->double_indel_offset >> 4) & 0xf;
			if(major_indel_offset>=8)major_indel_offset=-(16-major_indel_offset);
			//assert(minor_indel_offset==0);
			//assert(major_indel_offset==0);

			// The left edge is corrected by the indel offset of whichever half is the left one.
			left_edge_wanted = left_virtualHead_abs_offset + selected_real_split_point_for_left + ((result -> selected_position > subjunc_result -> minor_position)?minor_indel_offset: major_indel_offset);
			right_edge_wanted = right_virtualHead_abs_offset + selected_real_split_point_for_right;
		}

		char * chro_name_left, *chro_name_right;
		int chro_pos_left,chro_pos_right;

		locate_gene_position( left_edge_wanted , &global_context -> chromosome_table, &chro_name_left, &chro_pos_left);
		locate_gene_position( right_edge_wanted , &global_context -> chromosome_table, &chro_name_right, &chro_pos_right);
		// Without fusion / long deletion detection, events whose two edges resolve to different chromosome names are ignored.
		if((!( global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection) ) && chro_name_right!=chro_name_left) return;

		//insert event
		HashTable * event_table = NULL;
		chromosome_event_t * event_space = NULL;
		if(thread_context)
		{
			event_table = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_entry_table;
			event_space = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
		}
		else
		{
			event_table = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_entry_table;
			event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
		}

		// note that selected_real_split_point is the first UNWANTED base after left half.

		//if(abs(left_edge_wanted-27286396) < 250 || abs(right_edge_wanted - 27286396)<250)
		if(0 && FIXLENstrcmp("R003738400", read_name) == 0)
		{
			char leftpos[100], rightpos[100];
			absoffset_to_posstr(global_context, left_edge_wanted, leftpos);
			absoffset_to_posstr(global_context, right_edge_wanted, rightpos);
			SUBREADprintf("READ=%s, LEFT=%s, RIGHT=%s\n", read_name, leftpos, rightpos);
		}

		// Look for an already recorded event with the same small side ...
		chromosome_event_t * found = NULL;
		chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
		int found_events = search_event(global_context, event_table, event_space, left_edge_wanted , EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_INDEL | CHRO_EVENT_TYPE_JUNCTION | CHRO_EVENT_TYPE_FUSION, search_return);

		mark_gapped_read(result);
		if(found_events)
		{
			// ... and, among those, the first one whose large side equals right_edge_wanted.
			// NOTE(review): new events below store right_edge_wanted + indel_at_junction as their
			// large side, while this comparison uses right_edge_wanted alone - confirm this is intended.
			int kx1;
			for(kx1 = 0; kx1 < found_events ; kx1++)
			{
				if(search_return[kx1] -> event_large_side == right_edge_wanted)
				{
					found = search_return[kx1];
					break;
				}
			}
		}

		//if( 1018082 == pair_number)
		//	SUBREADprintf("NEW_CHIMERISM_HERE [%u:%d: R_%d] : %s , %s , %u , %u, %c ; INC=%d %d\n", pair_number, best_read_id, is_second_read+1, chro_name_left, chro_name_right, chro_pos_left, chro_pos_right, is_strand_jumped?'X':'=', subjunc_result -> small_side_increasing_coordinate, subjunc_result -> large_side_increasing_coordinate);


		// Debug hook: the "if(0)" keeps the test disabled, so is_key_fusion stays 0.
		int is_key_fusion = 0;
		if(0)if(
			( 9566365 + 1210 - 200 <= left_edge_wanted && 9566365 + 1210 + 200 >= left_edge_wanted) &&
			( 36859887 + 1210 - 200 <= right_edge_wanted && 36859887 + 1210 + 200 >= right_edge_wanted)
		){
			SUBREADprintf("Read = %s, FOUND = %p in %d , %s:%u , %s:%u, INCs= %d, %d, JUMP=%d\n", read_name, found, found_events, chro_name_left, chro_pos_left, chro_name_right, chro_pos_right, subjunc_result -> small_side_increasing_coordinate, subjunc_result -> large_side_increasing_coordinate, is_strand_jumped);
			is_key_fusion = 1;
		}

		if(found) found -> supporting_reads ++;
		else
		{
			int event_no;


			// Reserve the next event number (the counter is incremented even if the event is
			// later classified as REMOVED and never inserted into the table).
			if(thread_context)
				event_no = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> total_events ++;
			else
				event_no = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> total_events ++;


			// The event space pointer is refreshed here; the returned pointer is used from now on.
			event_space = reallocate_event_space(global_context, thread_context, event_no);

			chromosome_event_t * new_event = event_space+event_no;
			memset(new_event,0,sizeof(chromosome_event_t));
			new_event -> event_small_side = left_edge_wanted;
			new_event -> event_large_side = right_edge_wanted + subjunc_result->indel_at_junction;
			new_event -> critical_read_id = 2llu * pair_number + is_second_read;

			// Initial type: FUSION only when subjunc runs in fusion / long deletion mode and donor/receptor
			// junctions are not preferred; JUNCTION otherwise.
			int new_event_type =(((global_context -> config.entry_program_name == CORE_PROGRAM_SUBJUNC && global_context -> config.do_fusion_detection)||(global_context -> config.entry_program_name == CORE_PROGRAM_SUBJUNC && global_context -> config.do_long_del_detection))&& !global_context -> config.prefer_donor_receptor_junctions)?CHRO_EVENT_TYPE_FUSION:CHRO_EVENT_TYPE_JUNCTION;

			//#warning "=========================== DELETE NEXT LINE !!! =================================="
			//new_event_type = CHRO_EVENT_TYPE_REMOVED;

			// Strand jumps, abnormally arranged halves and different chromosomes all force FUSION.
			if(is_strand_jumped) new_event_type = CHRO_EVENT_TYPE_FUSION;
			if((subjunc_result->minor_coverage_start > result->confident_coverage_start) + (subjunc_result -> minor_position > result -> selected_position) ==1)
				new_event_type = CHRO_EVENT_TYPE_FUSION;
			if(chro_name_right!=chro_name_left)
				new_event_type = CHRO_EVENT_TYPE_FUSION;
			// Over-long events are dropped unless fusion / long deletion detection is enabled.
			if(right_edge_wanted - left_edge_wanted > global_context -> config.maximum_intron_length)
				if(!(global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection))
					new_event_type = CHRO_EVENT_TYPE_REMOVED;


			// Extra filters applied to fusion candidates.
			if(1)
			{
				unsigned int dist = new_event -> event_large_side - new_event -> event_small_side;
				int origin_type = new_event_type;
				int fusion_cover_len = -1;

				if(dist > MAX_INSERTION_LENGTH && new_event_type == CHRO_EVENT_TYPE_FUSION)
				{
					int cov_end, cover_start, major_cov;
					cov_end = max(subjunc_result->minor_coverage_end, result->confident_coverage_end );
					cover_start = min(subjunc_result->minor_coverage_start, result->confident_coverage_start);

					major_cov = result->confident_coverage_end - result->confident_coverage_start;

					fusion_cover_len = cov_end - cover_start ;

					// Drop the fusion when the two halves together leave at least 15 bases of the read
					// uncovered, or when the major half alone covers almost the whole read.
					if(fusion_cover_len < read_len - 15 || major_cov > read_len - 15)
						new_event_type = CHRO_EVENT_TYPE_REMOVED;
				}

				// Minimum votes on the minor half: 2 for distant fusions, 1 for any other fusion.
				if(dist > MAX_INSERTION_LENGTH && new_event_type == CHRO_EVENT_TYPE_FUSION && subjunc_result -> minor_votes < 2)
					new_event_type = CHRO_EVENT_TYPE_REMOVED;
				else if(new_event_type == CHRO_EVENT_TYPE_FUSION && subjunc_result -> minor_votes < 1)
					new_event_type = CHRO_EVENT_TYPE_REMOVED;


				// The same thresholds are applied to the votes of the major half.
				if(dist > MAX_INSERTION_LENGTH && new_event_type == CHRO_EVENT_TYPE_FUSION && result -> selected_votes < 2)
					new_event_type = CHRO_EVENT_TYPE_REMOVED;
				else if(new_event_type == CHRO_EVENT_TYPE_FUSION && result -> selected_votes < 1)
					new_event_type = CHRO_EVENT_TYPE_REMOVED;

				if(0 && origin_type == CHRO_EVENT_TYPE_FUSION)
				{
					char leftpos[100], rightpos[100];
					absoffset_to_posstr(global_context, new_event -> event_small_side, leftpos);
					absoffset_to_posstr(global_context, new_event -> event_large_side, rightpos);

					if(new_event_type == CHRO_EVENT_TYPE_REMOVED)
						SUBREADprintf("NEW_FUSION REMOVED %s SUGGEST %s ~ %s MAJOR COV=%d ~ %d, MINOR COV=%d ~ %d, RLEN=%d, COVED=%d, VOTES=%d, %d, %s, SPLIT=%d\n", read_name, leftpos, rightpos, result->confident_coverage_start, result->confident_coverage_end, subjunc_result->minor_coverage_start, subjunc_result->minor_coverage_end, read_len, fusion_cover_len, result -> selected_votes, subjunc_result -> minor_votes, is_strand_jumped?"JUMPED":"======", selected_real_split_point);
					else
						SUBREADprintf("NEW_FUSION WANTED %s SUGGEST %s ~ %s MAJOR COV=%d ~ %d, MINOR COV=%d ~ %d, RLEN=%d, COVED=%d, VOTES=%d, %d, %s, SPLIT=%d\n", read_name, leftpos, rightpos, result->confident_coverage_start, result->confident_coverage_end, subjunc_result->minor_coverage_start, subjunc_result->minor_coverage_end, read_len, fusion_cover_len, result -> selected_votes, subjunc_result -> minor_votes, is_strand_jumped?"JUMPED":"======", selected_real_split_point);
				}

				// Distant fusions must split the read within its central part (20% .. 80% of the length).
				if(dist > MAX_INSERTION_LENGTH && new_event_type == CHRO_EVENT_TYPE_FUSION && (selected_real_split_point < read_len * 0.2 || selected_real_split_point >= read_len *0.8000) )
					new_event_type = CHRO_EVENT_TYPE_REMOVED;
			}
			//if(pair_number == 13)
			//printf("MMMMX   %d    %u -- %u : TYPE %d\n" , event_no, left_edge_wanted, right_edge_wanted, new_event_type);


		//	if((is_donor_found_or_annotation || !global_context -> config.check_donor_at_junctions) &&(!is_strand_jumped) && right_edge_wanted - left_edge_wanted <= global_context -> config.maximum_intron_length
		//		&& (subjunc_result->minor_coverage_start > result->confident_coverage_start) + (subjunc_result -> minor_position >  result -> selected_position) !=1)


			if(is_key_fusion) SUBREADprintf(" INSERTED AS %d ( in %d or %d )\n", new_event_type , CHRO_EVENT_TYPE_JUNCTION, CHRO_EVENT_TYPE_FUSION);

			if(new_event_type == CHRO_EVENT_TYPE_JUNCTION)
			{
				new_event -> is_negative_strand= !is_GT_AG_donors;
				new_event -> event_type = CHRO_EVENT_TYPE_JUNCTION;

				new_event -> supporting_reads = 1;
				new_event -> indel_length = 0;
				new_event -> indel_at_junction = subjunc_result->indel_at_junction;
				new_event -> is_donor_found_or_annotation = is_donor_found_or_annotation;

				new_event -> small_side_increasing_coordinate = subjunc_result -> small_side_increasing_coordinate;
				new_event -> large_side_increasing_coordinate = subjunc_result -> large_side_increasing_coordinate;

				put_new_event(event_table, new_event , event_no);

				if(0 && FIXLENstrcmp("R000000052", read_name) == 0)
					SUBREADprintf("NEW_JUNCTION_HERE : %s , %u , %u  (%u, %u)\n", chro_name_right, chro_pos_left, chro_pos_right, new_event -> event_small_side, new_event -> event_large_side);
			}
			else if(new_event_type == CHRO_EVENT_TYPE_FUSION)
			{
				// Fusions are stored only when fusion / long deletion detection is enabled.
				if((global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection))
				{
					new_event -> event_type = CHRO_EVENT_TYPE_FUSION;
					new_event -> is_strand_jumped = is_strand_jumped;


					new_event -> supporting_reads = 1;
					new_event -> indel_length = 0;

					new_event -> small_side_increasing_coordinate = subjunc_result -> small_side_increasing_coordinate;
					new_event -> large_side_increasing_coordinate = subjunc_result -> large_side_increasing_coordinate;

					put_new_event(event_table, new_event , event_no);
					//if( 1018082 == pair_number)
					//	SUBREADprintf("NEW_CHIMERISM_HERE_FULL [%u:%d: R_%d] : %s , %s , %u , %u, %c ; INC=%d %d\n", pair_number, best_read_id, is_second_read+1, chro_name_left, chro_name_right, chro_pos_left, chro_pos_right, is_strand_jumped?'X':'=', new_event -> small_side_increasing_coordinate, new_event -> large_side_increasing_coordinate);
				}
			}
		}
	}
}
4124
4125 void write_translocation_results_final(void * key, void * buckv, HashTable * tab);
4126 void write_inversion_results_final(void * key, void * buckv, HashTable * tab);
4127
write_fusion_final_results(global_context_t * global_context)4128 int write_fusion_final_results(global_context_t * global_context)
4129 {
4130 indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
4131 char fn2 [MAX_FILE_NAME_LENGTH+30];
4132
4133 sprintf(fn2,"%s.breakpoints.vcf", global_context->config.output_prefix);
4134 FILE * ofp = f_subr_open(fn2, "wb");
4135 fprintf(ofp,"##fileformat=VCFv4.1\n");
4136 fprintf(ofp,"##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n");
4137 fprintf(ofp,"##INFO=<ID=MATEID,Number=1,Type=String,Description=\"Paired breakend id\">\n");
4138 fprintf(ofp,"##INFO=<ID=SR,Number=1,Type=Integer,Description=\"Supporting read number\">\n");
4139 fprintf(ofp,"#CHROM POS ID REF ALT QUAL FILTER INFO\n");
4140
4141 int xk1, disk_is_full = 0;
4142 unsigned int all_junctions = 0;
4143 int no_sup_juncs = 0;
4144 int all_juncs = 0;
4145
4146 for(xk1 = 0; xk1 < indel_context -> total_events ; xk1++)
4147 {
4148 char * chro_name_left,* chro_name_right;
4149 int chro_pos_left, chro_pos_right;
4150 chromosome_event_t * event_body = indel_context -> event_space_dynamic +xk1;
4151 if(event_body -> event_type != CHRO_EVENT_TYPE_FUSION && (global_context->config.entry_program_name != CORE_PROGRAM_SUBREAD || event_body -> event_type != CHRO_EVENT_TYPE_JUNCTION))
4152 continue;
4153
4154 all_juncs++;
4155
4156 if(event_body->final_counted_reads<1|| event_body->critical_supporting_reads < 1 - 1)
4157 {
4158 no_sup_juncs++;
4159 continue;
4160 }
4161 locate_gene_position( event_body -> event_small_side , &global_context -> chromosome_table, &chro_name_left, &chro_pos_left);
4162 locate_gene_position( event_body -> event_large_side , &global_context -> chromosome_table, &chro_name_right, &chro_pos_right);
4163
4164 chro_pos_left+=1;
4165 chro_pos_right+=1;
4166 all_junctions ++;
4167
4168 int wlen;
4169 char alt_base[500];
4170 char ref_base;
4171 char bkt = event_body -> large_side_increasing_coordinate?'[':']';
4172
4173 gene_value_index_t * current_index = find_current_value_index(global_context , event_body -> event_small_side , 1);
4174 ref_base = gvindex_get( current_index, event_body -> event_small_side);
4175 if(event_body -> small_side_increasing_coordinate)
4176 sprintf(alt_base,"%c%s:%u%c%c", bkt, chro_name_right, chro_pos_right, bkt, ref_base);
4177 else
4178 sprintf(alt_base,"%c%c%s:%u%c", ref_base, bkt, chro_name_right, chro_pos_right, bkt);
4179
4180 wlen = fprintf(ofp,"%s\t%u\tbnd_%d\t%c\t%s\t.\tPASS\tSVTYPE=BND;MATEID=bnd_%d;SR=%d\n", chro_name_left, chro_pos_left, all_junctions *2 -1, ref_base, alt_base, all_junctions*2, event_body -> final_counted_reads);
4181
4182 current_index = find_current_value_index(global_context , event_body -> event_large_side , 1);
4183 ref_base = gvindex_get( current_index, event_body -> event_large_side );
4184 bkt = event_body -> small_side_increasing_coordinate?'[':']';
4185 if(event_body -> large_side_increasing_coordinate)
4186 sprintf(alt_base,"%c%s:%u%c%c", bkt, chro_name_left, chro_pos_left, bkt, ref_base);
4187 else
4188 sprintf(alt_base,"%c%c%s:%u%c", ref_base, bkt, chro_name_left, chro_pos_left, bkt);
4189
4190 wlen += fprintf(ofp,"%s\t%u\tbnd_%d\t%c\t%s\t.\tPASS\tSVTYPE=BND;MATEID=bnd_%d;SR=%d\n", chro_name_right, chro_pos_right, all_junctions *2, ref_base, alt_base, all_junctions*2 -1, event_body -> final_counted_reads);
4191 if(wlen <18) disk_is_full = 1;
4192 // fprintf(ofp, "%s\t%u\t%s\t%u\t%s\t%d\t%s\t%s\n", chro_name_left, chro_pos_left, chro_name_right, chro_pos_right+1, event_body -> is_strand_jumped?"No":"Yes", event_body -> final_counted_reads, event_body -> small_side_increasing_coordinate?"Yes":"No", event_body -> large_side_increasing_coordinate?"Yes":"No");
4193 }
4194
4195 global_context -> all_fusions = all_junctions;
4196
4197 if(global_context->config.do_structural_variance_detection){
4198 global_context -> translocation_result_table.entry_table -> appendix1 = ofp;
4199 global_context -> translocation_result_table.entry_table -> appendix2 = global_context;
4200 HashTableIteration(global_context -> translocation_result_table.entry_table, write_translocation_results_final);
4201 global_context -> inversion_result_table.entry_table -> appendix1 = ofp;
4202 global_context -> inversion_result_table.entry_table -> appendix2 = global_context;
4203 HashTableIteration(global_context -> inversion_result_table.entry_table, write_inversion_results_final);
4204 }
4205
4206 fclose(ofp);
4207
4208 if(disk_is_full){
4209 unlink(fn2);
4210 SUBREADprintf("ERROR: disk is full. No fusion table is generated.\n");
4211 }
4212 return 0;
4213 }
4214
/*
 * Hash-table iteration callback that prints the inversions held in one bucket.
 *
 * bukey : unused.
 * buckv : the bucket (bucketed_table_bucket_t *).
 * tab   : the table; appendix1 carries the output FILE *, appendix2 the
 *         global context.
 *
 * Two "INV" lines are written for each item whose position falls into the
 * interval this bucket is keyed by; other items in the bucket are ignored.
 */
void write_inversion_results_final(void * bukey, void * buckv, HashTable * tab){
	bucketed_table_bucket_t * bucket = buckv;
	FILE * out_fp = (FILE *)tab -> appendix1;
	global_context_t * global_context = (global_context_t * )tab -> appendix2;
	int item_no;

	for(item_no = 0; item_no < bucket -> items; item_no++)
	{
		// Only handle the item in the bucket whose key equals the item's own interval start.
		if(bucket->positions[item_no] - bucket->positions[item_no] % bucket -> maximum_interval_length != bucket -> keyed_bucket)
			continue;

		inversion_result_t * inv_res = bucket -> details[item_no];
		const char * precision = inv_res -> is_precisely_called ? "PRECISE":"IMPRECISE";
		char * src_chr;
		int src_pos;

		locate_gene_position(inv_res -> small_side, &global_context -> chromosome_table, &src_chr , &src_pos);

		fprintf(out_fp, "INV\t%s\t%d\t%s\t%u\t%s\n", src_chr, src_pos + 1, src_chr, src_pos + 1 + inv_res -> length, precision);
		fprintf(out_fp, "INV\t%s\t%d\t%s\t%u\t%s\n", src_chr, src_pos + 2, src_chr, src_pos + inv_res -> length, precision);
	}
}
4239
/*
 * Hash-table iteration callback that prints the translocations held in one
 * bucket.
 *
 * bukey : unused.
 * buckv : the bucket (bucketed_table_bucket_t *).
 * tab   : the table; appendix1 carries the output FILE *, appendix2 the
 *         global context.
 *
 * For each item whose position falls into the interval this bucket is keyed
 * by, three lines are written: two "ITX"/"CTX" lines (ITX when source and
 * target resolve to the same chromosome name pointer) and one "DEL" line.
 */
void write_translocation_results_final(void * bukey, void * buckv, HashTable * tab){
	bucketed_table_bucket_t * bucket = buckv;
	FILE * out_fp = (FILE *)tab -> appendix1;
	global_context_t * global_context = (global_context_t * )tab -> appendix2;
	int item_no;

	for(item_no = 0; item_no < bucket -> items; item_no++)
	{
		// Only handle the item in the bucket whose key equals the item's own interval start.
		if(bucket->positions[item_no] - bucket->positions[item_no] % bucket -> maximum_interval_length != bucket -> keyed_bucket)
			continue;

		translocation_result_t * trans_res = bucket -> details[item_no];
		char * src_chr, *targ_chr;
		int src_pos, targ_pos;

		locate_gene_position(trans_res -> source_left_side, &global_context -> chromosome_table, &src_chr , &src_pos);
		locate_gene_position(trans_res -> target_left_side, &global_context -> chromosome_table, &targ_chr , &targ_pos);

		const char * sv_label = (src_chr == targ_chr) ? "ITX":"CTX";
		const char * strand_label = trans_res -> is_inv ? "X":"=";
		const char * precision = trans_res -> is_precisely_called ? "PRECISE":"IMPRECISE";

		// Left and right ends of the moved segment, both pointing at the target.
		fprintf(out_fp, "%s\t%s\t%u\t%s\t%d\t%s\t%s\n", sv_label, src_chr, src_pos + 1, targ_chr, targ_pos + 1, strand_label, precision);
		fprintf(out_fp, "%s\t%s\t%u\t%s\t%d\t%s\t%s\n", sv_label, src_chr, src_pos + trans_res -> length + 1, targ_chr, targ_pos + 1, strand_label, precision);
		// The source segment itself.
		fprintf(out_fp, "DEL\t%s\t%d\t%u\t%s\n", src_chr, src_pos + 1, trans_res -> length , precision);
	}
}
4271
/*
 * Writes "<output_prefix>.junction.bed".
 *
 * One line is written for every junction event in the indel module's event
 * space that has at least one finally-counted read.  The junction name is
 * "JUNC" plus a running number, followed by "INS<n>" when the event has an
 * indel at the junction and by "ANNO" when bit 64 of
 * is_donor_found_or_annotation is set.
 *
 * Side effects: junction_flanking_left of an event is shortened when it
 * would reach before position 1 of the chromosome, and
 * global_context->all_junctions is set to the number of lines written.
 *
 * Returns 0 after the table was processed (a write failure removes the file
 * and prints an error but still returns 0, as before), or -1 if the output
 * file could not be opened.
 */
int write_junction_final_results(global_context_t * global_context)
{
	int disk_is_full = 0;

	indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
	char fn2 [MAX_FILE_NAME_LENGTH+30];

	snprintf(fn2, sizeof fn2, "%s.junction.bed", global_context->config.output_prefix);
	FILE * ofp = f_subr_open(fn2, "wb");
	if(!ofp)
	{
		SUBREADprintf("ERROR: unable to open '%s' for writing; no junction table is created.\n", fn2);
		return -1;
	}

	fprintf(ofp, "#Chr, StartLeftBlock, EndRightBlock, Junction_Name, nSupport, Strand, StartLeftBlock, EndRightBlock, Color, nBlocks, BlockSizes, BlockStarts\n");

	int xk1;
	unsigned int all_junctions = 0;

	for(xk1 = 0; xk1 < indel_context -> total_events ; xk1++)
	{
		// Large enough for "INS" + any int + "ANNO" + NUL; the former
		// 10-byte buffer overflowed with e.g. "INS100ANNO".
		char * chro_name_left,* chro_name_right, indel_sect[32];
		int chro_pos_left, chro_pos_right;
		chromosome_event_t * event_body = indel_context -> event_space_dynamic +xk1;
		if(event_body -> event_type != CHRO_EVENT_TYPE_JUNCTION)
			continue;

		// The second condition is "critical_supporting_reads < 0" for events
		// with an indel at the junction ("1 - 1" is kept exactly as written).
		if(event_body->final_counted_reads < 1 || ( event_body->critical_supporting_reads < 1 - 1&& event_body->indel_at_junction))
			continue;

		locate_gene_position( event_body -> event_small_side , &global_context -> chromosome_table, &chro_name_left, &chro_pos_left);
		locate_gene_position( event_body -> event_large_side , &global_context -> chromosome_table, &chro_name_right, &chro_pos_right);

		// Only the left position is shifted by one; the right one is used as located.
		chro_pos_left++;

		// The left block must not start before position 1: clip the flanking length.
		unsigned int feature_start = chro_pos_left - event_body -> junction_flanking_left;
		if(chro_pos_left <= event_body -> junction_flanking_left){
			feature_start = 1;
			event_body -> junction_flanking_left = chro_pos_left - 1;
		}

		unsigned int feature_end = chro_pos_right + event_body -> junction_flanking_right;

		all_junctions ++;

		// Build the optional name suffix with bounded writes.
		int sect_len = 0;
		indel_sect[0]=0;
		if(event_body->indel_at_junction)
			sect_len = snprintf(indel_sect, sizeof indel_sect, "INS%d", event_body->indel_at_junction);
		if(sect_len < 0 || sect_len >= (int)sizeof indel_sect)
		{
			sect_len = 0;
			indel_sect[0]=0;
		}
		if(event_body-> is_donor_found_or_annotation &64)
			snprintf(indel_sect + sect_len, sizeof indel_sect - sect_len, "%s", "ANNO");

		// Colour is blue (0,0,255 -> printed as 255,0,0 reversed per strand test below)
		// for '+' events and 0,255,255 for '-' events, exactly as before.
		int wlen = fprintf(ofp,"%s\t%u\t%u\tJUNC%08u%s\t%d\t%c\t%u\t%u\t%d,%d,%d\t2\t%d,%d\t0,%u\n", chro_name_left, feature_start, feature_end,
			all_junctions, indel_sect, event_body -> final_counted_reads, event_body->is_negative_strand?'-':'+',
			feature_start, feature_end, event_body->is_negative_strand?0:255, event_body->is_negative_strand?255:0, event_body->is_negative_strand?255:0,
			event_body -> junction_flanking_left, event_body -> junction_flanking_right, feature_end-feature_start-event_body -> junction_flanking_right);
		if(wlen < 10) disk_is_full = 1;
	}

	// Output is buffered, so a write error may only become visible here.
	if(ferror(ofp)) disk_is_full = 1;
	if(fclose(ofp) != 0) disk_is_full = 1;

	if(disk_is_full){
		unlink(fn2);
		SUBREADprintf("ERROR: disk is full; no junction table is created.\n");
	}
	global_context -> all_junctions = all_junctions;
	return 0;
}
4343
4344
4345
/*
 * Fetches two bases from `index`, starting at `pos`, into `buf` by
 * delegating to gvindex_get_string().  `is_negative_strand` is passed through
 * unchanged.  The callers in this file provide a 3-byte buffer whose last
 * byte they set to NUL themselves.
 */
void get_chro_2base(char *buf, gene_value_index_t * index, unsigned int pos, int is_negative_strand)
{
	const int bases_wanted = 2;

	gvindex_get_string(buf, index, pos, bases_wanted, is_negative_strand);
}
4350
4351
/*
 * Tests whether the two dinucleotides form one of the accepted splice-site
 * pairs, GT/AG or CT/AC (as decided by c2eq), with `ch1` on the expected side.
 *
 * Returns 1 when the pair is accepted and
 *   - is_reverse is non-zero and ch1 is "AG" or "AC", or
 *   - is_reverse is zero     and ch1 is "CT" or "GT";
 * returns 0 otherwise.
 */
int paired_chars_part(char * ch1, char * ch2, int is_reverse)
{
	int is_known_pair = c2eq(ch1, ch2, "GT", "AG") || c2eq(ch1, ch2, "CT", "AC");

	if (!is_known_pair)
		return 0;

	if (is_reverse)
		return (ceq(ch1, "AG") || ceq(ch1, "AC")) ? 1 : 0;

	return (ceq(ch1, "CT") || ceq(ch1, "GT")) ? 1 : 0;
}
/* Non-zero when the first two characters of cc are one of GT, AG, AC or CT.
 * The argument is evaluated more than once. */
#define is_donar_chars_part(cc) ( \
	((cc)[0]=='A' && ((cc)[1]=='G' || (cc)[1]=='C')) || \
	((cc)[1]=='T' && ((cc)[0]=='G' || (cc)[0]=='C')) )
4365
4366
4367 #define SHORT_EXON_MIN_LENGTH 18
4368 #define EXON_EXTENDING_SCAN 0
4369 #define SHORT_EXON_WINDOW 6
4370 #define SHORT_EXON_EXTEND 5000
4371
core_search_short_exons(global_context_t * global_context,thread_context_t * thread_context,char * read_text,char * qualityb0,int rl,unsigned int P1_Pos,unsigned int P2_Pos,short read_coverage_start,short read_coverage_end)4372 void core_search_short_exons(global_context_t * global_context, thread_context_t * thread_context, char * read_text, char * qualityb0, int rl, unsigned int P1_Pos, unsigned int P2_Pos, short read_coverage_start, short read_coverage_end)
4373 {
4374 char inb[MAX_READ_LENGTH], qualityb[MAX_READ_LENGTH];
4375 if ( (rl <= EXON_LONG_READ_LENGTH ) && (!EXON_EXTENDING_SCAN)) return;
4376 //return;
4377 gene_value_index_t * base_index = thread_context?thread_context->current_value_index:global_context->current_value_index ;
4378 //insert event
4379 HashTable * event_table = NULL;
4380 chromosome_event_t * event_space = NULL;
4381 if(thread_context)
4382 {
4383 event_table = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_entry_table;
4384 event_space = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
4385 }
4386 else
4387 {
4388 event_table = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_entry_table;
4389 event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
4390 }
4391
4392 strcpy(inb, read_text);
4393 strcpy(qualityb, qualityb0);
4394
4395 unsigned int pos_small=min(P1_Pos, P2_Pos), pos_big = max(P1_Pos, P2_Pos);
4396
4397 int max_score , test_score;
4398 unsigned int best_j1_edge=0 , best_j2_edge=0;
4399 int need_to_test = 0;
4400
4401 //////////////////////////////////////////////////////////////////////////////////////////////
4402 //////////////////////////////////////////////////////////////////////////////////////////////
4403 // SCAN TO THE HEAD /////////////////////////////////////////////////////////////////////////
4404 //////////////////////////////////////////////////////////////////////////////////////////////
4405 //////////////////////////////////////////////////////////////////////////////////////////////
4406
4407 if (read_coverage_start > SHORT_EXON_MIN_LENGTH)
4408 {
4409 max_score = -1;
4410
4411 int need_check2 = 1;
4412 if(qualityb[0])
4413 {
4414 float head_quality = read_quality_score(qualityb , SHORT_EXON_MIN_LENGTH , global_context->config.phred_score_format);
4415 if(head_quality < 6 )
4416 need_check2 = 0;
4417 }
4418
4419
4420 if(need_check2)
4421 if(SHORT_EXON_MIN_LENGTH *0.6 < match_chro(inb, base_index, pos_small, SHORT_EXON_MIN_LENGTH , 0, global_context->config.space_type))
4422 need_check2 = 0;
4423
4424
4425 if(need_check2)
4426 {
4427
4428 int delta_pos, is_indel = 0;
4429 for(delta_pos=-3; delta_pos <=3; delta_pos ++)
4430 {
4431 if(match_chro(inb, base_index, pos_small + delta_pos, SHORT_EXON_MIN_LENGTH , 0, global_context->config.space_type) >= SHORT_EXON_MIN_LENGTH*.7)
4432 {
4433 is_indel = 1;
4434 break;
4435 }
4436 }
4437 // The head of the read is incorrect. Do we need to search a long way?
4438 // See if there is a donor in the head area.
4439 int test_donor_pos;
4440 char cc[3];
4441 cc[2]=0;
4442
4443 if(!is_indel)
4444 for(test_donor_pos = SHORT_EXON_MIN_LENGTH ; test_donor_pos < read_coverage_start ; test_donor_pos ++)
4445 {
4446 get_chro_2base(cc, base_index, pos_small + test_donor_pos, 0);
4447 if(is_donar_chars_part(cc))
4448 {
4449 need_to_test = 1;
4450 break;
4451 }
4452 }
4453 }
4454 }
4455
4456 max_score = -999;
4457 int max_is_GTAG = 0;
4458
4459 if(need_to_test && pos_small >= SHORT_EXON_MIN_LENGTH)
4460 {
4461 unsigned int test_end = pos_small - SHORT_EXON_EXTEND;
4462 if(SHORT_EXON_EXTEND > pos_small) test_end = 0;
4463
4464 unsigned int new_pos = pos_small-SHORT_EXON_MIN_LENGTH;
4465 while(1)
4466 {
4467 new_pos = match_chro_range(inb, base_index, new_pos, 7 , new_pos - test_end , SEARCH_BACK);
4468 if(new_pos==0xffffffff) break;
4469 // There is an exact match. See if the donor/receptors are matched.
4470 // new_pos is the new head position of the read.
4471 int splice_point;
4472 for(splice_point = SHORT_EXON_MIN_LENGTH; splice_point < read_coverage_start ; splice_point ++)
4473 {
4474 char cc[3];
4475 cc[2]=0;
4476 char cc2[3];
4477 cc2[2]=0;
4478
4479 get_chro_2base(cc, base_index, pos_small + splice_point -2, 0);
4480 if(is_donar_chars_part(cc))
4481 {
4482 // <<< EXON---|CC2---INTRON---CC|---EXON
4483 get_chro_2base(cc2, base_index, new_pos + splice_point, 0);
4484 if(is_donar_chars_part(cc2) && paired_chars_part(cc2 , cc, 0))
4485 {
4486 int matched_in_exon_old = match_chro(inb + splice_point, base_index, pos_small + splice_point , SHORT_EXON_WINDOW , 0, global_context->config.space_type);
4487 int matched_in_exon_new = match_chro(inb, base_index, new_pos , splice_point, 0, global_context->config.space_type);
4488
4489
4490 test_score = 1000000+ (matched_in_exon_new )*10000 + matched_in_exon_old * 1000 + new_pos - test_end;
4491 if(test_score <= max_score) continue;
4492 max_score = test_score;
4493
4494 if(matched_in_exon_new < splice_point || matched_in_exon_old < SHORT_EXON_WINDOW )
4495 continue;
4496
4497 max_is_GTAG = (cc2[0]=='G' || cc2[1]=='G');
4498 //printf("EX CC=%s\tCC2=%s\tis_GTAG=%d\n",cc,cc2,max_is_GTAG);
4499 best_j1_edge = new_pos + splice_point - 1;
4500 best_j2_edge = pos_small + splice_point;
4501 }
4502 }
4503 }
4504 }
4505 }
4506
4507
4508 if(best_j1_edge>0)
4509 {
4510 int event_no;
4511 chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
4512 chromosome_event_t * found = NULL;
4513
4514 int found_events = search_event(global_context, event_table, event_space, best_j1_edge , EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_JUNCTION|CHRO_EVENT_TYPE_FUSION, search_return);
4515
4516 if(found_events)
4517 {
4518 int kx1;
4519 for(kx1 = 0; kx1 < found_events ; kx1++)
4520 {
4521 if(search_return[kx1] -> event_large_side == best_j2_edge)
4522 {
4523 found = search_return[kx1];
4524 break;
4525 }
4526 }
4527 }
4528
4529 if(found) found -> supporting_reads ++;
4530 else
4531 {
4532 if(thread_context)
4533 event_no = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> total_events ++;
4534 else
4535 event_no = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> total_events ++;
4536
4537 event_space = reallocate_event_space(global_context, thread_context, event_no);
4538
4539 chromosome_event_t * new_event = event_space+event_no;
4540 memset(new_event,0,sizeof(chromosome_event_t));
4541 new_event -> event_small_side = best_j1_edge;
4542 new_event -> event_large_side = best_j2_edge;
4543
4544 new_event -> is_negative_strand= !max_is_GTAG;
4545 new_event -> event_type = CHRO_EVENT_TYPE_JUNCTION;
4546
4547 new_event -> supporting_reads = 1;
4548 new_event -> indel_length = 0;
4549
4550 put_new_event(event_table, new_event , event_no);
4551 }
4552 //printf("FOUND NEW JUNCTION HEAD: %u - %u\n", best_j1_edge, best_j2_edge);
4553 }
4554
4555
4556 //////////////////////////////////////////////////////////////////////////////////////////////
4557 //////////////////////////////////////////////////////////////////////////////////////////////
4558 // SCAN TO THE TAIL /////////////////////////////////////////////////////////////////////////
4559 //////////////////////////////////////////////////////////////////////////////////////////////
4560 //////////////////////////////////////////////////////////////////////////////////////////////
4561
4562 need_to_test = 0;
4563 max_score = -999;
4564
4565
4566 if (read_coverage_end< rl - SHORT_EXON_MIN_LENGTH)
4567 {
4568 int need_check2 = 1;
4569 if(qualityb[0])
4570 {
4571 float head_quality = read_quality_score(qualityb + rl - SHORT_EXON_MIN_LENGTH , SHORT_EXON_MIN_LENGTH , global_context->config.phred_score_format);
4572 if(head_quality < 6 )
4573 need_check2 = 0;
4574 }
4575
4576
4577 if(SHORT_EXON_MIN_LENGTH *0.6 < match_chro(inb + rl - SHORT_EXON_MIN_LENGTH, base_index, pos_big + rl - SHORT_EXON_MIN_LENGTH , SHORT_EXON_MIN_LENGTH , 0, global_context->config.space_type))
4578 need_check2 = 0;
4579 if(need_check2)
4580 {
4581 int delta_pos, is_indel = 0;
4582 for(delta_pos=-3; delta_pos <=3; delta_pos ++)
4583 {
4584 if(match_chro(inb + rl - SHORT_EXON_MIN_LENGTH, base_index, pos_big + rl - SHORT_EXON_MIN_LENGTH + delta_pos, SHORT_EXON_MIN_LENGTH , 0, global_context->config.space_type) >= SHORT_EXON_MIN_LENGTH*.7)
4585 {
4586 is_indel = 1;
4587 break;
4588 }
4589 }
4590 // The head of the read is incorrect. Do we need to search a long way?
4591 // See if there is a donor in the head area.
4592 int test_donor_pos;
4593 char cc[3];
4594 cc[2]=0;
4595
4596 if(!is_indel)
4597 for(test_donor_pos = read_coverage_end ; test_donor_pos < rl ; test_donor_pos ++)
4598 {
4599 get_chro_2base(cc, base_index, pos_big + test_donor_pos, 0);
4600 if(is_donar_chars_part(cc))
4601 {
4602 need_to_test = 1;
4603 break;
4604 }
4605 }
4606 }
4607 }
4608
4609 best_j1_edge = 0;
4610 max_is_GTAG = 0;
4611
4612 if(need_to_test)
4613 {
4614 unsigned int test_end = pos_big + SHORT_EXON_EXTEND;
4615 if(test_end > base_index -> length + base_index -> start_point) test_end = base_index -> length + base_index -> start_point;
4616
4617 unsigned int new_pos = pos_big +rl - SHORT_EXON_MIN_LENGTH +16;
4618
4619 while(1)
4620 {
4621 if(new_pos + test_end - new_pos < base_index-> start_base_offset + base_index->length)
4622 {
4623 assert(new_pos<0xffff0000);
4624 new_pos = match_chro_range(inb + rl - SHORT_EXON_MIN_LENGTH, base_index, new_pos, 7 , test_end - new_pos , SEARCH_FRONT);
4625 }
4626 else break;
4627
4628 if(new_pos==0xffffffff) break;
4629 // There is an exact match. See if the donor/receptors are matched.
4630 // (new_pos + SHORT_EXON_MIN_LENGTH -rl + splice_point) is the new exon start.
4631
4632 int splice_point;
4633 for(splice_point = read_coverage_end ; splice_point < rl - SHORT_EXON_MIN_LENGTH; splice_point ++)
4634 {
4635 char cc[3];
4636 cc[2]=0;
4637 char cc2[3];
4638 cc2[2]=0;
4639
4640 unsigned int new_pos_tail = (new_pos + SHORT_EXON_MIN_LENGTH -rl + splice_point);
4641
4642 get_chro_2base(cc, base_index, pos_big + splice_point, 0);
4643 if(is_donar_chars_part(cc))
4644 {
4645 get_chro_2base(cc2, base_index, new_pos_tail -2, 0);
4646 if(is_donar_chars_part(cc2) && paired_chars_part(cc , cc2, 0))
4647 {
4648 int matched_in_exon_new = match_chro(inb + splice_point, base_index, new_pos_tail , rl - splice_point , 0, global_context->config.space_type);
4649 int matched_in_exon_old = match_chro(inb + splice_point - SHORT_EXON_WINDOW , base_index, pos_big + splice_point - SHORT_EXON_WINDOW , SHORT_EXON_WINDOW, 0, global_context->config.space_type);
4650
4651 test_score = 1000000+ (matched_in_exon_new)*10000 + matched_in_exon_old * 1000 + test_end - new_pos;
4652 if(test_score <= max_score) continue;
4653 max_score = test_score;
4654
4655 if(matched_in_exon_new < (rl - splice_point) || matched_in_exon_old < SHORT_EXON_WINDOW)
4656 continue;
4657
4658 // EXON ---|CC---INTRON---CC2|--- EXON >>>
4659 max_is_GTAG = (cc[0]=='G'|| cc[1]=='G');
4660 best_j1_edge = pos_big + splice_point - 1;
4661 best_j2_edge = new_pos_tail;
4662 }
4663 }
4664 }
4665
4666 }
4667 }
4668
4669
4670 if(best_j1_edge>0)
4671 {
4672 int event_no;
4673 chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
4674 chromosome_event_t * found = NULL;
4675
4676 int found_events = search_event(global_context, event_table, event_space, best_j1_edge , EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_JUNCTION|CHRO_EVENT_TYPE_FUSION, search_return);
4677
4678 if(found_events)
4679 {
4680 int kx1;
4681 for(kx1 = 0; kx1 < found_events ; kx1++)
4682 {
4683 if(search_return[kx1] -> event_large_side == best_j2_edge)
4684 {
4685 found = search_return[kx1];
4686 break;
4687 }
4688 }
4689 }
4690
4691 if(found) found -> supporting_reads ++;
4692 else
4693 {
4694 if(thread_context)
4695 event_no = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> total_events ++;
4696 else
4697 event_no = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> total_events ++;
4698
4699
4700 event_space = reallocate_event_space(global_context, thread_context, event_no);
4701
4702 chromosome_event_t * new_event = event_space+event_no;
4703 memset(new_event,0,sizeof(chromosome_event_t));
4704 new_event -> event_small_side = best_j1_edge;
4705 new_event -> event_large_side = best_j2_edge;
4706
4707 new_event -> is_negative_strand= !max_is_GTAG;
4708 new_event -> event_type = CHRO_EVENT_TYPE_JUNCTION;
4709
4710 new_event -> supporting_reads = 1;
4711 new_event -> indel_length = 0;
4712
4713 put_new_event(event_table, new_event , event_no);
4714 //printf("FOUND NEW JUNCTION TAIL: %u - %u\n", best_j1_edge, best_j2_edge);
4715 }
4716 }
4717 }
4718
4719
4720
4721
4722
4723
4724
4725
4726
/*
 * Given the best-voted location of a read (max_pos, max_votes, max_start,
 * max_end, max_mask, max_indel_recorder), scans every other candidate in the
 * vote table and selects the best partner location for the other part of
 * the read.
 *
 * A candidate is skipped when
 *   - its covered read range overlaps the best one by 14 bases or more,
 *   - it is closer than 6 bases to max_pos or farther than 500000 bases,
 *   - it lies on another chromosome (chromosome name pointers differ), or
 *   - either side has fewer than one vote.
 * Remaining candidates are ranked by
 *   8888888 + votes * 1000000 - distance (+100 / +200 when close to hint_pos);
 * among equal scores the candidate visited last wins.
 *
 * For the selected candidate the function fills best_pos1/2, best_vote1/2,
 * read_coverage_start/end (always the range of the best-voted location),
 * indel_in_p1/2, is_reversed_halves and the flag bits in *half_marks.
 * *best_select_max_votes receives the winning score (-1 if none).
 *
 * is_abnormal, accept_rate, tolerable_bases and rl are not used.
 *
 * Returns the splicing point on the read (midpoint of the gap between the
 * two covered ranges), or -1 when no partner was found.
 */
int core_select_best_matching_halves_maxone(global_context_t * global_context, gene_vote_t * vote, unsigned int * best_pos1, unsigned int * best_pos2, int * best_vote1, int * best_vote2, char * is_abnormal, short * half_marks, int * is_reversed_halves, float accept_rate, int read_len, long long int hint_pos, int tolerable_bases, short * read_coverage_start, short * read_coverage_end, gene_vote_number_t * indel_in_p1, gene_vote_number_t * indel_in_p2, gehash_data_t max_pos, gene_vote_number_t max_votes, short max_start, short max_end, short max_mask, gene_vote_number_t * max_indel_recorder, int* best_select_max_votes, int rl)
{
	int best_splicing_point = -1, i,j;
	char * best_chro_name, is_reversed;
	int best_chro_pos;
	int selected_max_votes = -1;

	is_reversed = (max_mask & IS_NEGATIVE_STRAND)?1:0;
	for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
		for(j=0; j< vote->items[i]; j++)
		{
			char * chro_name;
			char is_partner_reversed;
			int chro_pos;

			int overlapped_len, overlap_start, overlap_end;

			is_partner_reversed = (vote->masks [i][j] & IS_NEGATIVE_STRAND) ? 1:0;
			overlap_start = max(max_start , vote->coverage_start[i][j]);
			overlap_end = min(max_end , vote->coverage_end[i][j]);
			overlapped_len =overlap_end - overlap_start;

			// The two parts must cover (mostly) different bases of the read.
			if(overlapped_len >=14)
				continue;

			// Distances are 64-bit; llabs() is required here, abs() would
			// truncate the value to int before taking the magnitude.
			long long int dist = vote->pos[i][j];
			dist -= max_pos;
			long long int abs_dist = llabs(dist);

			if (abs_dist<6)
				continue;

			int support_r1 = 1;
			int support_r2 = 1;

			if (max_votes < support_r1 || vote->votes[i][j]<support_r2)
				continue;

			// Same chromosome: the far end of whichever part lies upstream is located.
			if ((vote->coverage_start[i][j] < max_start) + is_reversed == 1)
			{
				locate_gene_position(max_pos + read_len, &(global_context -> chromosome_table) , &best_chro_name, &best_chro_pos);
				locate_gene_position(vote->pos[i][j] , &(global_context -> chromosome_table), &chro_name, &chro_pos);
			}else
			{
				locate_gene_position(max_pos , &(global_context -> chromosome_table), &best_chro_name, &best_chro_pos);
				locate_gene_position(vote->pos[i][j] +read_len, &(global_context -> chromosome_table), &chro_name, &chro_pos);
			}

			if (chro_name != best_chro_name)	// The pointers can be compared because they can be the same.
				continue;

			// A fusion is flagged when the strands differ or when the order of
			// the two parts on the read disagrees with their order on the chromosome.
			int is_fusion = 0;

			if(is_reversed != is_partner_reversed) is_fusion = 1;

			if( is_reversed && ((max_pos > vote->pos[i][j]) + (vote->coverage_start[i][j] < max_start) != 1))is_fusion = 1;
			if((! is_reversed) && ((max_pos > vote->pos[i][j]) + (vote->coverage_start[i][j] > max_start) != 1)) is_fusion = 1;

			if(abs_dist > 500000) continue;

			// abs_dist is at most 500000 here, so the score fits an int.
			int test_vote_value ;
			test_vote_value = (int)(8888888 + vote->votes[i][j]* 1000000 - abs_dist);
			if (hint_pos>=0)
			{
				long long int hint_dist = hint_pos;
				hint_dist -= vote->pos[i][j];
				if (llabs (hint_dist) < 100000)
					test_vote_value += 100;
				if (llabs (hint_dist) < 5000)
					test_vote_value += 100;
			}

			if (test_vote_value<selected_max_votes)continue;

			// Conditions of order of R3 and R5
			*half_marks &= ~IS_REVERSED_HALVES;
			if (vote->coverage_start[i][j] < max_start && (((max_pos < vote->pos[i][j]) && !is_reversed) || ((max_pos > vote->pos[i][j]) && is_reversed) ) )
				*half_marks |= IS_REVERSED_HALVES;
			if (vote->coverage_start[i][j] >= max_end && (((max_pos > vote->pos[i][j]) && !is_reversed) || ((max_pos < vote->pos[i][j]) && is_reversed) ) )
				*half_marks |= IS_REVERSED_HALVES;

			if (vote->coverage_start[i][j] < max_start)
			{
				(*half_marks) = (*half_marks) & ~IS_R1_CLOSE_TO_5;
			}
			else
			{
				(*half_marks) |= IS_R1_CLOSE_TO_5;
			}

			if(max_mask & IS_NEGATIVE_STRAND)
				*half_marks = (*half_marks) | IS_NEGATIVE_STRAND_R1;
			else
				*half_marks = (*half_marks) & ~IS_NEGATIVE_STRAND_R1;

			if(vote->masks[i][j] & IS_NEGATIVE_STRAND)
				*half_marks = (*half_marks) | IS_NEGATIVE_STRAND_R2;
			else
				*half_marks = (*half_marks) & ~IS_NEGATIVE_STRAND_R2;

			// Midpoint between the end of the first covered range and the start of the second.
			best_splicing_point = ((vote->coverage_start[i][j] < max_start)? (vote->coverage_end[i][j]):(max_end)) + ((vote->coverage_start[i][j] < max_start)? (max_start):(vote->coverage_start[i][j]));
			best_splicing_point /=2;

			* best_pos1 = max_pos ;
			* best_pos2 = vote->pos[i][j] ;
			* best_vote1 = max_votes ;
			* best_vote2 = vote->votes[i][j] ;

			// Only the range of the best-voted location is reported (the former
			// min/max assignments were overwritten immediately and are removed).
			* read_coverage_start = max_start;
			* read_coverage_end = max_end;

			// Indel recorders are read in groups of three; the value taken is at
			// offset +2 of the last group that is followed by a zero entry.
			int k;
			for(k=0; k<MAX_INDEL_TOLERANCE ; k+=3)
				if(!max_indel_recorder[k+3])break;
			* indel_in_p1 = max_indel_recorder[k+2];

			for(k=0; k<MAX_INDEL_TOLERANCE ; k+=3)
				if(!vote->indel_recorder[i][j][k+3])break;
			* indel_in_p2 = vote->indel_recorder[i][j][k+2];

			* is_reversed_halves = is_reversed;

			// NOTE(review): with the 8888888 base score this test looks always
			// true, so IS_PAIRED_HINTED is set for every selection -- confirm intent.
			if (test_vote_value >=100)
				*half_marks = (*half_marks) | IS_PAIRED_HINTED;
			else
				*half_marks = (*half_marks) & ~(IS_PAIRED_HINTED);

			if (is_fusion)
				*half_marks = (*half_marks) | IS_FUSION;
			else
				*half_marks = (*half_marks) & ~( IS_FUSION);

			selected_max_votes = test_vote_value;
		}
	*best_select_max_votes = selected_max_votes ;
	return best_splicing_point;
}
4883
4884
4885
// Evaluates the top-voted location of `vote` as the main half of a split read and
// asks core_select_best_matching_halves_maxone() for its best partner half.
//
// When repeated_pos_base >= 0, up to four (coverage_start, coverage_end, strand|votes)
// triplets are first written into repeat_record[repeated_pos_base .. repeated_pos_base+11]:
// locations having at least max_vote votes are recorded first, then locations having
// exactly max_vote-1 votes. Only locations whose position is <= index_valid_range
// are recorded. Coverage values are right-shifted by 4 bits when rl > 220
// (presumably so that they fit in one char -- TODO confirm).
//
// The output pointers (best_pos1 ... max_cover_end) are written only when the
// candidate's score exceeds 1000000; the return value is then whatever the
// "maxone" call returned. Otherwise nothing is written and 0 is returned.
int core_select_best_matching_halves(global_context_t * global_context , gene_vote_t * vote, unsigned int * best_pos1, unsigned int * best_pos2, int * best_vote1, int * best_vote2, char * is_abnormal, short * half_marks, int * is_reversed_halves, float accept_rate, int read_len, long long int hint_pos, int tolerable_bases, short * read_coverage_start, short * read_coverage_end, char * indel_in_p1, char * indel_in_p2 , int * max_cover_start, int * max_cover_end, int rl, int repeated_pos_base, int is_negative, char * repeat_record, unsigned int index_valid_range)
{
	unsigned int cand_pos1 = 0, cand_pos2 = 0;
	int cand_vote1 = 0, cand_vote2 = 0, cand_reversed = 0;
	char cand_abnormal = 0;
	gene_vote_number_t cand_indel_p1 = 0, cand_indel_p2 = 0;
	short cand_marks = 0, cand_cover_start = 0, cand_cover_end = 0;
	int cand_score = -1;
	int splice_point = 0;
	const int minimum_score = 1000000;

	if(repeated_pos_base >= 0)
	{
		const int shift = (rl > 220)?4:0;
		const int record_limit = repeated_pos_base + 12;
		int record_cursor = repeated_pos_base;
		int pass, bucket, slot;

		// pass 0: locations with >= max_vote votes; pass 1: locations with exactly max_vote-1 votes.
		for(pass = 0; pass < 2; pass++)
			for(bucket = 0; bucket < GENE_VOTE_TABLE_SIZE; bucket++)
				for(slot = 0; slot < vote->items[bucket]; slot++)
				{
					int wanted;

					if(vote->pos[bucket][slot] > index_valid_range) continue;

					if(pass == 0)
						wanted = (vote->votes[bucket][slot] >= vote->max_vote);
					else
						wanted = (vote->votes[bucket][slot] == vote->max_vote -1);

					if(!wanted) continue;
					if(record_cursor >= record_limit) continue;

					repeat_record[record_cursor] = (vote-> coverage_start[bucket][slot] >> shift);
					repeat_record[record_cursor+1] = (vote-> coverage_end[bucket][slot] >> shift);
					repeat_record[record_cursor+2] = (is_negative?0x80:0) | (vote->votes[bucket][slot]&0x7f);
					record_cursor += 3;
				}
	}

	splice_point = core_select_best_matching_halves_maxone(global_context, vote, &cand_pos1, &cand_pos2, &cand_vote1, &cand_vote2, &cand_abnormal, &cand_marks, &cand_reversed, accept_rate, read_len, hint_pos, tolerable_bases, &cand_cover_start, &cand_cover_end, &cand_indel_p1, &cand_indel_p2, vote -> max_position, vote->max_vote, vote-> max_coverage_start, vote-> max_coverage_end, vote-> max_mask, vote->max_indel_recorder, &cand_score, rl);
	cand_score += vote->max_vote*1000000;

	// The candidate is accepted only if it beats the fixed baseline score.
	if(cand_score <= minimum_score)
		return 0;

	*best_pos1 = cand_pos1;
	*best_pos2 = cand_pos2;
	*is_reversed_halves = cand_reversed;

	*best_vote1 = cand_vote1;
	*best_vote2 = cand_vote2;
	*is_abnormal = cand_abnormal;
	*indel_in_p1 = cand_indel_p1;
	*indel_in_p2 = cand_indel_p2;

	*half_marks = cand_marks;
	*read_coverage_start = cand_cover_start;
	*read_coverage_end = cand_cover_end;

	*max_cover_start = vote-> max_coverage_start;
	*max_cover_end = vote-> max_coverage_end;

	return splice_point;
}
5006
5007
5008
5009 #define EXON_DONOR_TEST_WINDOW 17
5010
5011
// pos1 must be smaller than pos2.
// Searches for a splice break point inside the read, around guess_break_point.
//
// Candidate break points are scanned in
//   [ max(10, guess_break_point - test_range), min(read_len-10, guess_break_point + test_range) ).
// For each candidate, two reference bases are fetched next to each half
// (get_chro_2base); the candidate is considered only if the two dinucleotides
// differ, both pass is_donar_chars_part() and paired_chars_part() is non-zero.
// The flanking windows (at most EXON_DONOR_TEST_WINDOW bases) are then compared
// with the reference: m1/m2 are the matches of each read side against its own
// half, x1/x2 the matches against the opposite half. A candidate is accepted
// when m1 and m2 have at most one mismatch each and x1 and x2 both fall short of
// the window length by more than 3. The highest-scoring accepted candidate wins;
// on ties the first one is kept.
//
// On success *real_break_point, *best_donor_score and *is_GTAG are set and 1 is
// returned; otherwise 0 is returned (*real_break_point is left untouched).
// negative_strand, is_soft_condition and EXON_INDEL_TOLERANCE are not used here.
int core13_test_donor(char *read, int read_len, unsigned int pos1, unsigned int pos2, int guess_break_point, char negative_strand, int test_range, char is_soft_condition, int EXON_INDEL_TOLERANCE, int* real_break_point, gene_value_index_t * my_value_array_index, int indel_offset1, int indel_offset2, int is_reversed, int space_type, int * best_donor_score, int * is_GTAG)
{
	char left_2bases[3], right_2bases[3];
	int scan_from = guess_break_point - test_range;
	int scan_to = guess_break_point + test_range;
	int chosen_break = -1;
	int top_score = -9099;
	const int cross_threshold = 3;
	int candidate;

	left_2bases[2] = 0;
	right_2bases[2] = 0;
	scan_from = max(10, scan_from);
	scan_to = min(read_len-10, scan_to);

	for(candidate = scan_from; candidate < scan_to; candidate++)
	{
		int pair_kind;
		int half_point, window;
		int exon_end, next_start;
		int m1_at, m2_at, x1_at, x2_at;
		int m1, m2, x1, x2;
		int score;
		char * read_before, * read_after;

		get_chro_2base(left_2bases, my_value_array_index, pos1 - indel_offset1+ candidate , is_reversed);
		get_chro_2base(right_2bases, my_value_array_index, pos2 - 2 - indel_offset2 + candidate, is_reversed);

		// Identical dinucleotides on both sides cannot form a donor/receptor pair.
		if(left_2bases[0]==right_2bases[0] && left_2bases[1]==right_2bases[1]) continue;

		if(!(is_donar_chars_part(left_2bases) && is_donar_chars_part(right_2bases))) continue;

		pair_kind = paired_chars_part(left_2bases, right_2bases, is_reversed);
		if(!pair_kind) continue;

		half_point = is_reversed?(read_len - candidate):candidate;

		// The confirmation window is capped by the read bases available on either side.
		window = min(half_point , EXON_DONOR_TEST_WINDOW);
		window = min(read_len - half_point, window);

		if (is_reversed)
		{
			exon_end = pos2 + candidate - indel_offset2;
			next_start = pos1 + candidate- indel_offset1;

			m1_at = exon_end;
			m2_at = next_start - window;
			x1_at = exon_end - window;
			x2_at = next_start;
		}
		else
		{
			exon_end = pos1 + candidate - indel_offset1;
			next_start = pos2 + candidate - indel_offset2;

			m1_at = exon_end - window;
			m2_at = next_start;
			x1_at = exon_end;
			x2_at = next_start - window;
		}

		read_before = read + half_point - window;
		read_after = read + half_point;

		m1 = match_chro(read_before, my_value_array_index, m1_at, window, is_reversed, space_type);
		m2 = match_chro(read_after, my_value_array_index, m2_at, window, is_reversed, space_type);

		x1 = match_chro(read_after, my_value_array_index, x1_at, window, is_reversed, space_type);
		x2 = match_chro(read_before, my_value_array_index, x2_at, window, is_reversed, space_type);

		#ifdef TEST_TARGET
		if(memcmp(read, TEST_TARGET, 15)==0)
		{
			SUBREADprintf("DONOR TEST STR=%s, %s ; pos=%d %d %d ; M=%d %d ; X=%d %d\n", left_2bases, right_2bases, candidate, indel_offset1, indel_offset2, m1, m2, x1, x2);
		}
		#endif

		// Both halves must match their own side almost perfectly ...
		if(m1 < window-1 || m2 < window-1) continue;
		// ... and must clearly fail to match the opposite side.
		if(x1 >= window - cross_threshold || x2 >= window - cross_threshold) continue;

		score = 3000-(x1 + x2) + (m1+ m2) ;
		if(top_score < score)
		{
			top_score = score;
			chosen_break = candidate;
			*is_GTAG = 1==((is_reversed) + (left_2bases[0]=='G' || left_2bases[1]=='G')); //"GT" or "AG"
			*best_donor_score = score;
		}
	}

	if(chosen_break <= 0)
	{
		#ifdef TEST_TARGET
		if(memcmp(read, TEST_TARGET, 15)==0)
			SUBREADprintf("KILLED!!!_BREAKPOINT=%d, R=%s\n", chosen_break+ pos1, read);
		#endif
		return 0;
	}

	#ifdef TEST_TARGET
	if(memcmp(read, TEST_TARGET, 15)==0)
		SUBREADprintf("SELECRED!!!_BREAKPOINT=%d, RAW POS=%u,%u, R=%s\n", chosen_break, pos1 , pos2, read);
	#endif

	*real_break_point = chosen_break;
	return 1;
}
5128
5129
5130
5131
5132
5133
5134 #define EXON_LARGE_WINDOW 60
5135 #define ACCEPTED_SUPPORT_RATE 0.3
5136
core_fragile_junction_voting(global_context_t * global_context,thread_context_t * thread_context,char * rname,char * read,char * qual,unsigned int full_rl,int negative_strand,int color_space,unsigned int low_border,unsigned int high_border,gene_vote_t * vote_p1)5137 void core_fragile_junction_voting(global_context_t * global_context, thread_context_t * thread_context, char * rname, char * read, char * qual, unsigned int full_rl, int negative_strand, int color_space, unsigned int low_border, unsigned int high_border, gene_vote_t *vote_p1)
5138 {
5139 int windows = full_rl / EXON_LARGE_WINDOW +1;
5140 float overlap = (1.0*windows * EXON_LARGE_WINDOW - full_rl) / (windows-1);
5141
5142 int ww;
5143 int window_cursor = 0;
5144
5145 HashTable * event_table = NULL;
5146 chromosome_event_t * event_space = NULL;
5147 if(thread_context)
5148 {
5149 event_table = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_entry_table;
5150 event_space = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
5151 }
5152 else
5153 {
5154 event_table = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_entry_table;
5155 event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
5156 }
5157
5158 int GENE_SLIDING_STEP = global_context->current_index -> index_gap;
5159
5160
5161 for(ww=0; ww<windows;ww++)
5162 {
5163 window_cursor = (int)(ww * EXON_LARGE_WINDOW - ww * overlap);
5164 int read_len = EXON_LARGE_WINDOW;
5165 if(ww == windows-1)
5166 read_len = full_rl -window_cursor;
5167
5168 float subread_step = 3.00001;
5169 int i;
5170 int subread_no;
5171 char * InBuff;
5172 InBuff = read + window_cursor;
5173 char tmp_char = InBuff[read_len];
5174 InBuff[read_len] = 0;
5175
5176 init_gene_vote(vote_p1);
5177 for(subread_no=0; ; subread_no++)
5178 {
5179 int subread_offset1 = (int)(subread_step * (subread_no+1));
5180 subread_offset1 -= subread_offset1%GENE_SLIDING_STEP;
5181 subread_offset1 += GENE_SLIDING_STEP-1;
5182
5183 for(i=0; i<GENE_SLIDING_STEP ; i++)
5184 {
5185 int subread_offset = (int)(subread_step * subread_no);
5186 subread_offset -= subread_offset%GENE_SLIDING_STEP -i;
5187
5188 char * subread_string = InBuff + subread_offset;
5189 gehash_key_t subread_integer = genekey2int(subread_string, color_space);
5190
5191 gehash_go_q(global_context->current_index, subread_integer , subread_offset, read_len,negative_strand, vote_p1, 5, subread_no, low_border, high_border - read_len);
5192 }
5193 if(subread_offset1 >= read_len -16)
5194 break;
5195 }
5196
5197 int ii, jj, kk;
5198 for(ii = 0; ii < GENE_VOTE_TABLE_SIZE; ii++) {
5199 for(jj = 0; jj < vote_p1 -> items[ii] ; jj++) {
5200 if(vote_p1 -> votes[ii][jj] < vote_p1 -> max_vote) continue;
5201
5202 gene_vote_number_t * indel_recorder = vote_p1 -> indel_recorder[ii][jj];
5203 unsigned int voting_position = vote_p1 -> pos[ii][jj];
5204 int last_indel = 0, last_correct_subread=0;
5205
5206 for(kk =0; indel_recorder[kk] && (kk < MAX_INDEL_SECTIONS); kk+=3){
5207 char movement_buffer[MAX_READ_LENGTH * 10 / 7];
5208 //chromosome_event_t * last_event = NULL;
5209 int last_event_id = -1;
5210
5211 int indels = indel_recorder[kk+2] - last_indel;
5212 if(indels==0) continue;
5213
5214 int next_correct_subread = indel_recorder[kk] -1;
5215
5216 int last_correct_base = find_subread_end(read_len, global_context->config.total_subreads , last_correct_subread) - 9;
5217 int first_correct_base = find_subread_end(read_len, global_context->config.total_subreads , next_correct_subread) - 16 + 9;
5218 first_correct_base = min(first_correct_base+10, read_len);
5219 last_correct_base = max(0, last_correct_base);
5220 last_correct_base = min(read_len-1, last_correct_base);
5221
5222 int x1, dyna_steps;
5223
5224 dyna_steps = core_dynamic_align(global_context, thread_context, InBuff + last_correct_base, first_correct_base - last_correct_base, voting_position + last_correct_base + last_indel, movement_buffer, indels, rname);
5225
5226 movement_buffer[dyna_steps]=0;
5227
5228 if(0 && strcmp("MISEQ:13:000000000-A1H1M:1:1112:12194:5511", rname) == 0)
5229 {
5230 SUBREADprintf("IR= %d %d~%d\n", dyna_steps, last_correct_base, first_correct_base);
5231
5232 for(x1=0; x1<dyna_steps;x1++)
5233 {
5234 int mc, mv=movement_buffer[x1];
5235 if(mv==0)mc='=';
5236 else if(mv==1)mc='D';
5237 else if(mv==2)mc='I';
5238 else mc='X';
5239 SUBREADprintf("%c",mc);
5240 }
5241 SUBREADputs("");
5242 }
5243 unsigned int cursor_on_chromosome = voting_position + last_correct_base + last_indel, cursor_on_read = last_correct_base;
5244 int last_mv = 0;
5245 unsigned int indel_left_boundary = 0;
5246 int is_in_indel = 0, current_indel_len = 0, total_mismatch = 0;
5247
5248 for(x1=0; x1<dyna_steps;x1++)
5249 {
5250 int mv=movement_buffer[x1];
5251 if(mv==3) total_mismatch++;
5252 }
5253
5254 if(total_mismatch<2 || (global_context->config.maximise_sensitivity_indel && total_mismatch <= 2 ))
5255 for(x1=0; x1<dyna_steps;x1++)
5256 {
5257 int mv=movement_buffer[x1];
5258
5259 if(last_mv != mv)
5260 {
5261 if( ( mv==1 || mv==2 ) && ! is_in_indel)
5262 {
5263 indel_left_boundary = cursor_on_chromosome;
5264 is_in_indel = 1;
5265 current_indel_len = 0;
5266 }
5267 else if ( is_in_indel && (mv == 0 || mv == 3) )
5268 {
5269 gene_value_index_t * current_value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
5270 int ambiguous_i, ambiguous_count=0;
5271 int best_matched_bases = match_chro(InBuff + cursor_on_read - 6, current_value_index, indel_left_boundary - 6, 6, 0, global_context->config.space_type) +
5272 match_chro(InBuff + cursor_on_read - min(current_indel_len,0), current_value_index, indel_left_boundary + max(0, current_indel_len), 6, 0, global_context->config.space_type);
5273 for(ambiguous_i=-5; ambiguous_i<=5; ambiguous_i++)
5274 {
5275 int left_match = match_chro(InBuff + cursor_on_read - 6, current_value_index, indel_left_boundary - 6, 6+ambiguous_i, 0, global_context->config.space_type);
5276 int right_match = match_chro(InBuff + cursor_on_read + ambiguous_i - min(current_indel_len,0), current_value_index, indel_left_boundary + ambiguous_i + max(0, current_indel_len), 6-ambiguous_i, 0,global_context->config.space_type);
5277 if(left_match+right_match == best_matched_bases) ambiguous_count ++;
5278 }
5279
5280 if(0 && strcmp("MISEQ:13:000000000-A1H1M:1:1112:12194:5511", rname) == 0)
5281 SUBREADprintf("INDEL_DDADD: abs(I=%d); INDELS=%d; LOC=%u\n",i, current_indel_len, indel_left_boundary-1);
5282 if(abs(current_indel_len)<=global_context -> config.max_indel_length)
5283 {
5284 chromosome_event_t * new_event = local_add_indel_event(global_context, thread_context, event_table, InBuff + cursor_on_read + min(0,current_indel_len), indel_left_boundary - 1, current_indel_len, 1, ambiguous_count, 0, NULL);
5285 if(last_event_id >=0 && new_event){
5286 // the event space can be changed when the new event is added. the location is updated everytime.
5287 chromosome_event_t * event_space = NULL;
5288 if(thread_context)
5289 event_space = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
5290 else
5291 event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
5292 chromosome_event_t * last_event = event_space + last_event_id;
5293
5294 int dist = new_event -> event_small_side - last_event -> event_large_side +1;
5295
5296 new_event -> connected_previous_event_distance = dist;
5297 last_event -> connected_next_event_distance = dist;
5298 }
5299
5300 if (new_event)
5301 last_event_id = new_event -> global_event_id;
5302 else last_event_id = -1;
5303 }
5304 }
5305
5306
5307 if(mv == 0 || mv == 3)
5308 is_in_indel = 0;
5309 }
5310
5311 if(is_in_indel && mv == 1)
5312 current_indel_len += 1;
5313 if(is_in_indel && mv == 2)
5314 current_indel_len -= 1;
5315
5316 if(mv == 1 || mv == 3 || mv == 0) cursor_on_chromosome++;
5317 if(mv == 2 || mv == 3 || mv == 0) cursor_on_read++;
5318
5319 last_mv = mv;
5320 }
5321 last_correct_subread = indel_recorder[i+1]-1;
5322 }
5323
5324 }
5325 }
5326
5327
5328
5329 if(1)
5330 {
5331 finalise_vote(vote_p1);
5332 select_best_vote(vote_p1);
5333 //print_votes(vote_p1, global_context -> config.index_prefix);
5334 unsigned int best_pos1=0;
5335 unsigned int best_pos2=0;
5336 int best_vote1=0;
5337 int best_vote2=0;
5338 char is_abnormal=0;
5339 short half_marks=0;
5340 int is_reversed_halves=0, max_cover_start=0, max_cover_end=0;
5341 char indel_in_p1=0, indel_in_p2=0;
5342 short read_coverage_start =0, read_coverage_end=0;
5343 gene_value_index_t * base_index = thread_context?thread_context->current_value_index:global_context->current_value_index ;
5344
5345 int splice_point = core_select_best_matching_halves(global_context, vote_p1, &best_pos1, &best_pos2, &best_vote1, &best_vote2, &is_abnormal ,&half_marks, &is_reversed_halves, ACCEPTED_SUPPORT_RATE, read_len, -1, 0, &read_coverage_start, &read_coverage_end, &indel_in_p1, &indel_in_p2, &max_cover_start, &max_cover_end, read_len, -1 , 0, NULL , 0xffffffff);
5346
5347 //SUBREADprintf("RN=%s , WINDOW = %d ~ %d , SP=%d; BV=%d; BV2=%d\n", rname , window_cursor , window_cursor + read_len , splice_point, best_vote1, best_vote2);
5348 if (splice_point>0 && best_vote1 >= 1 && best_vote2>=1)
5349 {
5350 int test_real_break_point = -1, test_donor_score=-1;
5351 int is_GTAG = 0;
5352 int is_accepted = core13_test_donor(InBuff, read_len, min(best_pos1, best_pos2), max(best_pos1,best_pos2), splice_point, negative_strand, read_len/4, 0, 5, &test_real_break_point, base_index, 0, 0, negative_strand, color_space, &test_donor_score, &is_GTAG);
5353
5354 if (is_accepted ){
5355 unsigned int pos_small = min(test_real_break_point+ best_pos1, test_real_break_point+ best_pos2) - 1;
5356 unsigned int pos_big = max(test_real_break_point+ best_pos1, test_real_break_point+ best_pos2);
5357
5358 int event_no;
5359 chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
5360 chromosome_event_t * found = NULL;
5361
5362 int found_events = search_event(global_context, event_table, event_space, pos_small , EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_JUNCTION|CHRO_EVENT_TYPE_FUSION, search_return);
5363
5364 if(found_events)
5365 {
5366 int kx1;
5367 for(kx1 = 0; kx1 < found_events ; kx1++)
5368 {
5369 if(search_return[kx1] -> event_large_side == pos_big)
5370 {
5371 found = search_return[kx1];
5372 break;
5373 }
5374 }
5375 }
5376
5377 if(found) found -> supporting_reads ++;
5378 else
5379 {
5380 if(thread_context)
5381 event_no = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> total_events ++;
5382 else
5383 event_no = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> total_events ++;
5384
5385 event_space = reallocate_event_space(global_context, thread_context, event_no);
5386
5387 chromosome_event_t * new_event = event_space+event_no;
5388 memset(new_event,0,sizeof(chromosome_event_t));
5389 new_event -> event_small_side = pos_small;
5390 new_event -> event_large_side = pos_big;
5391
5392 new_event -> is_negative_strand= !is_GTAG;
5393 new_event -> event_type = CHRO_EVENT_TYPE_JUNCTION;
5394
5395 new_event -> supporting_reads = 1;
5396 new_event -> indel_length = 0;
5397
5398 put_new_event(event_table, new_event , event_no);
5399 // SUBREADprintf("ADD JUNCTION BY FRAGILE, %d-%d\n", pos_small, pos_big);
5400 }
5401
5402 }
5403
5404 }
5405 }
5406 InBuff[read_len] = tmp_char;
5407 }
5408 }
5409
5410
// Debugging aid: walks through a fragment list and prepares, for every entry,
// the position strings and the strands of the read and of its mate.
// Each fragment number encodes (pair number * 2 + is-second-read).
// The trace line itself is currently disabled, so nothing is printed.
void print_frags(global_context_t * global_context, fragment_list_t * fls){
	int frag_i;

	for(frag_i = 0; frag_i < fls -> fragments; frag_i++){
		subread_read_number_t pair_no = fls -> fragment_numbers[frag_i] / 2;
		int is_second_read = fls -> fragment_numbers[frag_i] % 2;

		mapping_result_t * this_res = _global_retrieve_alignment_ptr(global_context, pair_no, is_second_read, 0);
		mapping_result_t * other_res = _global_retrieve_alignment_ptr(global_context, pair_no, !is_second_read, 0);

		char this_pos_text[100];
		char other_pos_text[100];
		absoffset_to_posstr(global_context, this_res -> selected_position, this_pos_text);
		absoffset_to_posstr(global_context, other_res -> selected_position, other_pos_text);

		int this_negative = (this_res -> result_flags & CORE_IS_NEGATIVE_STRAND) != 0;
		int other_negative = (other_res -> result_flags & CORE_IS_NEGATIVE_STRAND) != 0;

		// The strand of whichever entry is the second read of the pair is flipped.
		if(is_second_read)
			this_negative = !this_negative;
		else
			other_negative = !other_negative;

		(void) this_negative;
		(void) other_negative;

		//SUBREADprintf("TRALOG: READ %09u %c AT %s (%c) ; MATE: %s (%c)\n", pair_no, is_second_read?'B':'A' , this_pos_text, this_negative?'N':'P' , other_pos_text, other_negative?'N':'P');
	}
}
5435
5436 // fragnos_paired_B = B_fragment_no * 2 + is_mate_b (is_mate_b points the mate that has the location in locations_mate_B)
5437 // fragnos_paired_C = C_fragment_no * 2 + is_mate_c (is_mate_c points the mate that has the location in locations_mate_C)
5438 //
5439 // locations_mate_B and locations_mate_C are the locations where the sequence is moved to. I.e., locations_mate_B and locations_mate_C are far far away from fragment A.
5440 //
// Pairs every fragment of listB with at most one still-unused fragment of listC.
//
// For a fragment in either list, the read flagged by the low bit of its fragment
// number is loaded as res_Ba / res_Ca and the other read of the pair ("mate") as
// res_Bb / res_Cc. A C fragment is a candidate for a B fragment when
//   - the mates are on opposite strands (after the per-read strand flip),
//   - the B mate lies before the C mate, and
//   - their distance is below config.maximum_translocation_length.
// The candidate with the smallest distance is taken and marked as used.
//
// For the k-th accepted pair (k = 0 .. return value - 1):
//   fragnos_paired_B[k] / fragnos_paired_C[k] receive the encoded fragment numbers,
//   locations_mate_B[k] / locations_mate_C[k] receive the positions of the two mates.
// The caller must provide room for listB->fragments entries in each array.
//
// If at least one pair was found, *guessed_brkP_small_sum, *guessed_moved_length_sum
// and *guessed_brkQ_small_sum receive the AVERAGE (despite their names) of the
// per-pair estimates; otherwise they are left untouched.
//
// Returns the number of pairs found; 0 is also returned if memory is exhausted.
int find_translocation_BC_mates(global_context_t * global_context, mapping_result_t * res_A1, mapping_result_t * res_A2, fragment_list_t * listB, fragment_list_t * listC, int is_INV, unsigned long long * fragnos_paired_B, unsigned long long * fragnos_paired_C, unsigned int * locations_mate_B, unsigned int * locations_mate_C,unsigned int * guessed_brkP_small_sum, unsigned int * guessed_moved_length_sum , unsigned int * guessed_brkQ_small_sum){

	int ret = 0, xk1, xk2;
	char * is_C_used = NULL;
	long long tmp_guessed_brkP_small_sum = 0, tmp_guessed_moved_length_sum = 0, tmp_guessed_brkQ_small_sum = 0;

	// An empty list C needs no bookkeeping; the inner loop never touches the array then.
	if(listC->fragments > 0){
		is_C_used = calloc(listC->fragments, sizeof(char));
		if(!is_C_used){
			SUBREADprintf("ERROR: out of memory when pairing translocation fragments.\n");
			return 0;
		}
	}

	for(xk1 = 0; xk1 < listB->fragments; xk1++)
	{
		long long minimum_mate_distance = 0x7fffffff;
		int minimum_xk2 = -1;
		int loaded_xk2 = -1;	// which C fragment currently sits in the two C buffers
		unsigned int mate_C_pos = 0;

		mapping_result_t meta_B_res_body, res_Ba_body;
		mapping_result_t meta_C_res_body, res_Ca_body;
		mapping_result_t * meta_B_res = &meta_B_res_body, * res_Ba = &res_Ba_body;
		mapping_result_t * meta_C_res = &meta_C_res_body, * res_Ca = &res_Ca_body;
		mapping_result_t * res_Bb = meta_B_res, * res_Cc = meta_C_res;

		subread_read_number_t B_read_no = listB->fragment_numbers[xk1]/2;
		int B_read_is_b = listB->fragment_numbers[xk1]%2;

		bigtable_readonly_result(global_context, NULL, B_read_no, 0, !B_read_is_b, meta_B_res, NULL);
		bigtable_readonly_result(global_context, NULL, B_read_no, 0, B_read_is_b, res_Ba, NULL);

		int is_meta_B_negative = (meta_B_res -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
		if(!B_read_is_b) is_meta_B_negative = !is_meta_B_negative;

		for(xk2 = 0; xk2 < listC->fragments; xk2++)
		{
			if(is_C_used[xk2]) continue;

			subread_read_number_t C_read_no = listC->fragment_numbers[xk2]/2;
			int C_read_is_b = listC->fragment_numbers[xk2]%2;

			bigtable_readonly_result(global_context, NULL, C_read_no, 0, !C_read_is_b, meta_C_res, NULL);
			bigtable_readonly_result(global_context, NULL, C_read_no, 0, C_read_is_b, res_Ca, NULL);
			loaded_xk2 = xk2;

			int is_meta_C_negative = (meta_C_res -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
			if(!C_read_is_b) is_meta_C_negative = !is_meta_C_negative;

			//SUBREADprintf("TRALOG: MATES : B[%d] = %u (%c); C[%d] = %u (%c)\n", xk1, meta_B_res -> selected_position, is_meta_B_negative?'N':'P' , xk2, meta_C_res -> selected_position, is_meta_C_negative?'N':'P');

			// The distance is measured from the B mate to the C mate; the subtraction is
			// safe because the B mate is required to lie before the C mate.
			if(is_meta_B_negative != is_meta_C_negative &&
			   meta_B_res -> selected_position < meta_C_res -> selected_position &&
			   meta_C_res -> selected_position - meta_B_res -> selected_position < global_context -> config.maximum_translocation_length &&
			   meta_C_res -> selected_position - meta_B_res -> selected_position < minimum_mate_distance)
			{
				minimum_mate_distance = meta_C_res -> selected_position - meta_B_res -> selected_position;
				minimum_xk2 = xk2;
				mate_C_pos = meta_C_res -> selected_position;
			}
		}

		// read B has a mate of C[minimum xk2] if there is one.
		if(minimum_xk2>=0)
		{
			subread_read_number_t C_mate_fno = listC -> fragment_numbers[minimum_xk2] / 2;
			int C_mate_is_b = listC -> fragment_numbers[minimum_xk2] % 2;

			// The buffers hold the C fragment examined last, which is not necessarily
			// the chosen one; reload so the gaps below are measured on the chosen mate.
			if(loaded_xk2 != minimum_xk2){
				bigtable_readonly_result(global_context, NULL, C_mate_fno, 0, !C_mate_is_b, meta_C_res, NULL);
				bigtable_readonly_result(global_context, NULL, C_mate_fno, 0, C_mate_is_b, res_Ca, NULL);
			}

			fragnos_paired_B[ret] = (B_read_no*2)+(!B_read_is_b);
			locations_mate_B[ret] = meta_B_res -> selected_position;

			fragnos_paired_C[ret] = (C_mate_fno*2)+(C_mate_is_b);
			locations_mate_C[ret] = mate_C_pos;

			is_C_used[minimum_xk2] = 1;

			int gapA, gapB, gapC;

			// For an inversion the roles of the B and C reads next to fragment A are swapped.
			if(is_INV){
				gapA = res_Ca -> selected_position - res_A1 -> selected_position - res_A1 -> read_length;
				gapB = res_A2 -> selected_position - res_Ba -> selected_position - res_Ba -> read_length;
				gapC = res_Cc -> selected_position - res_Bb -> selected_position - res_Bb -> read_length;
			}else{
				gapA = res_Ba -> selected_position - res_A1 -> selected_position - res_A1 -> read_length;
				gapB = res_A2 -> selected_position - res_Ca -> selected_position - res_Ca -> read_length;
				gapC = res_Cc -> selected_position - res_Bb -> selected_position - res_Bb -> read_length;
			}

			// Each break point is guessed to be in the middle of the corresponding gap.
			tmp_guessed_brkP_small_sum += res_A1 -> selected_position + res_A1 -> read_length + gapA/2;
			tmp_guessed_moved_length_sum += res_A2 -> selected_position - res_A1 -> selected_position - res_A1 -> read_length - gapB/2 + gapA/2;
			tmp_guessed_brkQ_small_sum += res_Bb -> selected_position + res_Bb -> read_length + gapC/2;

			ret ++;
		}
	}

	free(is_C_used);

	if(ret>0){
		*guessed_brkP_small_sum= tmp_guessed_brkP_small_sum / ret;
		*guessed_moved_length_sum = tmp_guessed_moved_length_sum/ ret;
		*guessed_brkQ_small_sum = tmp_guessed_brkQ_small_sum / ret;
	}

	return ret;
}
5546
5547
5548 // This function sees if all the mates of read B_x and C_y are at the same location.
5549 // If mates of B_x and C_y spread on a large region, it is usually unreliable.
// posesB and posesC are linear absolute positions of the mate reads.
// Tells whether the first PEmates entries of posesB and posesC all fall into one
// narrow region: returns 1 when the span between the smallest and the largest
// position is below twice config.maximum_pair_distance, otherwise 0.
// Returns 0 when PEmates is less than 1.
int find_translocation_BC_conformation(global_context_t * global_context, int PEmates, unsigned int * posesB, unsigned int * posesC){

	unsigned int lowest = 0xffffffff, highest = 0, mate_i;

	if(PEmates<1) return 0;

	for(mate_i = 0; mate_i < PEmates; mate_i++)
	{
		unsigned int pos_b = posesB[mate_i];
		unsigned int pos_c = posesC[mate_i];

		if(pos_b < lowest) lowest = pos_b;
		if(pos_c < lowest) lowest = pos_c;

		if(pos_b > highest) highest = pos_b;
		if(pos_c > highest) highest = pos_c;
	}

	return (highest - lowest< 2*global_context -> config.maximum_pair_distance) ? 1 : 0;
}
5568
5569
// fliB and fliC are : frag_[BC]_no * 2 + is_Read_b_close_to_BreakPoint_P
breakpoint_PQR_supported(global_context_t * global_context,unsigned int brkPno,unsigned int brkQno,unsigned int brkRno,fragment_list_t * fliB,fragment_list_t * fliC,int isInv)5571 int breakpoint_PQR_supported(global_context_t * global_context , unsigned int brkPno , unsigned int brkQno, unsigned int brkRno, fragment_list_t * fliB, fragment_list_t * fliC, int isInv){
5572 int fli_i;
5573 int isFliB, nSupB=0, nSupC=0;
5574
5575 for(isFliB = 0; isFliB < 2; isFliB++){
5576 fragment_list_t * fli = isFliB?fliB:fliC;
5577 int * nSup = isFliB?&nSupB:&nSupC;
5578 // fliB => support source_small ~ target_large if inv, or source_small ~ target_small if !inv
5579 // fliC => support source_large ~ target_small if inv, or source_large ~ target_large if !inv
5580
5581 // the read that is close to BreakPoint_P should support source, the other read should support target
5582 for(fli_i = 0; fli_i < fli -> fragments; fli_i ++){
5583 subread_read_number_t frag_BC_no = fli -> fragment_numbers[fli_i]/2;
5584 int is_Read_b_close_to_BreakPoint_P = fli -> fragment_numbers[fli_i]%2;
5585 unsigned int source_small, source_large, target_smallQ, target_largeQ, target_smallR, target_largeR, target_large, target_small;
5586
5587 get_event_two_coordinates(global_context, brkPno, NULL, NULL, &source_small, NULL, NULL, &source_large);
5588 get_event_two_coordinates(global_context, brkQno, NULL, NULL, &target_smallQ, NULL, NULL, &target_largeQ);
5589 get_event_two_coordinates(global_context, brkRno, NULL, NULL, &target_smallR, NULL, NULL, &target_largeR);
5590
5591
5592 if(target_smallQ <= target_smallR + BREAK_POINT_MAXIMUM_TOLERANCE && target_smallQ >= target_smallR - BREAK_POINT_MAXIMUM_TOLERANCE)
5593 {
5594 //target_smallQ is target, target_smallR is target
5595 target_large = target_smallR;
5596 target_small = target_smallQ;
5597 }else{
5598
5599 //target_largeQ is target, target_largeR is target
5600 target_large = target_largeQ;
5601 target_small = target_largeR;
5602 }
5603
5604
5605 mapping_result_t res_BC_close_P_body, res_BC_close_Q_body;
5606
5607 mapping_result_t * res_BC_close_P = &res_BC_close_P_body, * res_BC_close_Q = & res_BC_close_Q_body;
5608
5609 bigtable_readonly_result(global_context, NULL, frag_BC_no, 0, is_Read_b_close_to_BreakPoint_P, res_BC_close_P, NULL);
5610 bigtable_readonly_result(global_context, NULL, frag_BC_no, 0, !is_Read_b_close_to_BreakPoint_P, res_BC_close_Q, NULL);
5611
5612 unsigned int P_pos = isInv?( isFliB?source_large:source_small ):( isFliB?source_small:source_large );
5613 unsigned int Q_pos = isInv?( isFliB?target_large:target_small ):( isFliB?target_small:target_large );
5614
5615 SUBREADprintf("TRALOG: PQR_TARGET P=%u~%u; Q=%u~%u, R=%u~%u ; Ppos=%u, Qpos=%u, Pread=%u, Qread=%u on %s\n", source_small, source_large, target_smallQ, target_largeQ, target_smallR, target_largeR, P_pos, Q_pos, res_BC_close_P -> selected_position, res_BC_close_Q -> selected_position, isInv?"INV":"STR");
5616
5617 long long dist;
5618 dist = res_BC_close_P -> selected_position;
5619 dist -= P_pos;
5620 if(abs(dist) < global_context -> config.maximum_pair_distance){
5621 dist = res_BC_close_Q -> selected_position;
5622 dist -= Q_pos;
5623 if(abs(dist) < global_context -> config.maximum_pair_distance)
5624 (*nSup)++;
5625 }
5626 }
5627 }
5628 //return nSupB + 1 >= fliB -> fragments/2 && nSupC + 1 >= fliC-> fragments/2 ;
5629 SUBREADprintf("TRALOG: PQR_NSUP: B=%d, C=%d on %s\n", nSupB, nSupC, isInv?"INV":"STR");
5630 return nSupB > 0 && nSupC > 0 && nSupB + 2 >= fliB->fragments / 2 && nSupC + 2 >= fliC->fragments / 2;
5631 }
5632
5633 // fragnoD1_mates and fragnoD2_mates are poteltial E reads 1/2.
5634 // D1: D's small read; D2: D's large read
5635 // E2 ~ D2
5636 // E1 ~ D1
5637 // E2.start > Y.large
5638 // E1.start > Y.small
5639
breakpoint_YZ_supported(global_context_t * global_context,unsigned int brkYno,unsigned int brkZno,unsigned long long * fragnoD1_mates,int fragnoD1len,unsigned long long * fragnoD2_mates,int fragnoD2len)5640 int breakpoint_YZ_supported(global_context_t * global_context, unsigned int brkYno, unsigned int brkZno, unsigned long long * fragnoD1_mates, int fragnoD1len, unsigned long long * fragnoD2_mates, int fragnoD2len){
5641 int x1;
5642 int is_D2_mates;
5643
5644 unsigned int inversion_small_edge, inversion_large_edge;
5645 get_event_two_coordinates(global_context, brkYno, NULL, NULL, &inversion_small_edge, NULL, NULL, &inversion_large_edge);
5646
5647
5648 int nSupD1mates = 0, nSupD2mates = 0;
5649 for(is_D2_mates = 0; is_D2_mates < 2; is_D2_mates ++){
5650 unsigned long long * fragno_Dmates = is_D2_mates?fragnoD2_mates:fragnoD1_mates;
5651 int fragno_Dno = is_D2_mates?fragnoD2len:fragnoD1len;
5652 int * nSupMates = is_D2_mates?&nSupD2mates:&nSupD1mates;
5653 for(x1 = 0; x1 < fragno_Dno; x1++){
5654 subread_read_number_t fragno_Dmate = fragno_Dmates[x1] / 2;
5655 int is_large_read_far_from_D = fragno_Dmates[x1] % 2;
5656
5657 mapping_result_t frag_D_mate_a_body, frag_D_mate_b_body;
5658 mapping_result_t * frag_D_mate_a = &frag_D_mate_a_body, * frag_D_mate_b = & frag_D_mate_b_body;
5659
5660 bigtable_readonly_result(global_context, NULL, fragno_Dmate, 0, 0, frag_D_mate_a, NULL);
5661 bigtable_readonly_result(global_context, NULL, fragno_Dmate, 0, 1, frag_D_mate_b, NULL);
5662
5663 mapping_result_t * frag_D_mate_1 = (frag_D_mate_a -> selected_position > frag_D_mate_b -> selected_position)?frag_D_mate_b:frag_D_mate_a;
5664 mapping_result_t * frag_D_mate_2 = (frag_D_mate_a -> selected_position <=frag_D_mate_b -> selected_position)?frag_D_mate_b:frag_D_mate_a;
5665
5666 mapping_result_t * res_to_support_small_edge = (is_D2_mates ^ is_large_read_far_from_D)?frag_D_mate_2:frag_D_mate_1;
5667 mapping_result_t * res_to_support_large_edge = (is_D2_mates ^ is_large_read_far_from_D)?frag_D_mate_1:frag_D_mate_2;
5668
5669 long long distsm;
5670 distsm = res_to_support_small_edge -> selected_position;
5671 distsm -= inversion_small_edge;
5672
5673 long long distla;
5674 distla = res_to_support_large_edge -> selected_position;
5675 distla -= inversion_large_edge;
5676
5677 //SUBREADprintf("INVLOG: Dist_SM=%lld, Dist_LA=%lld\n", distsm, distla);
5678
5679 if(distsm > -8 && distsm < global_context -> config.maximum_pair_distance){
5680
5681 if(distla > -8 && distla < global_context -> config.maximum_pair_distance)
5682 (*nSupMates) ++;
5683 }
5684
5685
5686 }
5687 }
5688
5689 //SUBREADprintf("INVLOG: breakpoint_YZ_supported nSupD1=%d >= %d, nSupD2=%d >= %d\n", nSupD1mates, fragnoD1len, nSupD2mates, fragnoD2len);
5690 return nSupD1mates > 0 && nSupD2mates > 0 && nSupD1mates + 2 >= fragnoD1len / 2 && nSupD2mates + 2 >= fragnoD2len / 2;
5691 }
5692
5693 #define _PQR_LIST_SIZE 48
5694
find_translocation_brk_PQR(global_context_t * global_context,mapping_result_t * resA1,mapping_result_t * resA2,fragment_list_t * fliB,fragment_list_t * fliC,unsigned int * brkPno,unsigned int * brkQno,unsigned int * brkRno,int isInv,unsigned int * is_cand_P_found)5695 int find_translocation_brk_PQR(global_context_t * global_context, mapping_result_t * resA1, mapping_result_t * resA2, fragment_list_t * fliB, fragment_list_t * fliC, unsigned int * brkPno, unsigned int * brkQno, unsigned int * brkRno, int isInv, unsigned int * is_cand_P_found)
5696 {
5697 unsigned int event_pos_list_A1[_PQR_LIST_SIZE];
5698 void * event_ptr_list_A1[_PQR_LIST_SIZE];
5699
5700 char * chroA=NULL;
5701 int posA1=0;
5702
5703 locate_gene_position(resA1 -> selected_position, &global_context -> chromosome_table, &chroA, &posA1);
5704
5705
5706 int candA1i, found_PQR = 0;
5707 int candA1Number = bktable_lookup(&global_context -> breakpoint_table_P, chroA, posA1, global_context -> config.maximum_pair_distance , event_pos_list_A1, event_ptr_list_A1, _PQR_LIST_SIZE);
5708 indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
5709 int candBrkPi , candBrkPNumber=0;
5710
5711 //SUBREADprintf("A FOUND %d P ", candA1Number);
5712
5713 for(candA1i = 0; candA1i < candA1Number ; candA1i++){
5714 int event_no = event_ptr_list_A1[candA1i] - NULL;
5715 chromosome_event_t * event_body = indel_context -> event_space_dynamic + event_no;
5716
5717 long long small_dist = event_body -> event_small_side, large_dist = event_body -> event_large_side;
5718 small_dist -= resA1 -> selected_position;
5719 large_dist -= resA2 -> selected_position;
5720 }
5721
5722 //SUBREADprintf(", (%d may be used)\n", candBrkPNumber);
5723
5724 *is_cand_P_found = candBrkPNumber;
5725
5726 for(candBrkPi = 0; candBrkPi < candBrkPNumber; candBrkPi++){
5727 unsigned int event_no_P = event_ptr_list_A1[candBrkPi] - NULL;
5728 chromosome_event_t * event_body_P = indel_context -> event_space_dynamic + event_no_P;
5729
5730 unsigned int anchor_for_brkQ = isInv?event_body_P -> event_large_side:event_body_P -> event_small_side;
5731 unsigned int anchor_for_brkR = isInv?event_body_P -> event_small_side:event_body_P -> event_large_side;
5732
5733 unsigned int event_pos_list_Q[_PQR_LIST_SIZE];
5734 void * event_ptr_list_Q[_PQR_LIST_SIZE];
5735
5736 unsigned int event_pos_list_R[_PQR_LIST_SIZE];
5737 void * event_ptr_list_R[_PQR_LIST_SIZE];
5738
5739 char * charAncQ = NULL, * charAncR = NULL;
5740 int posAncQ=0, posAncR = 0;
5741 locate_gene_position(anchor_for_brkQ, &global_context -> chromosome_table, &charAncQ, &posAncQ);
5742 locate_gene_position(anchor_for_brkR, &global_context -> chromosome_table, &charAncR, &posAncR);
5743
5744 int candQi, candQnumber = bktable_lookup(&global_context -> breakpoint_table_QR, charAncQ, posAncQ - BREAK_POINT_MAXIMUM_TOLERANCE , 2* BREAK_POINT_MAXIMUM_TOLERANCE , event_pos_list_Q, event_ptr_list_Q, _PQR_LIST_SIZE);
5745 int candRi, candRnumber = bktable_lookup(&global_context -> breakpoint_table_QR, charAncR, posAncR - BREAK_POINT_MAXIMUM_TOLERANCE , 2* BREAK_POINT_MAXIMUM_TOLERANCE , event_pos_list_R, event_ptr_list_R, _PQR_LIST_SIZE);
5746
5747 SUBREADprintf("P [%s] FOUND %d Q AT %s:%u and %d R AT %s:%u\n", isInv?"INV":"STR", candQnumber, charAncQ, posAncQ, candRnumber, charAncR, posAncR);
5748
5749 for(candQi = 0 ; candQi < candQnumber ; candQi++){
5750 unsigned int event_no_Q = event_ptr_list_Q[candQi] - NULL;
5751 chromosome_event_t * event_body_Q = indel_context -> event_space_dynamic + event_no_Q;
5752
5753 long long cand_Q_small_dist = event_body_Q -> event_small_side;
5754 cand_Q_small_dist -= isInv?event_body_P -> event_large_side:event_body_P -> event_small_side;
5755
5756 int is_Q_small_side_close_to_P = abs(cand_Q_small_dist) <= BREAK_POINT_MAXIMUM_TOLERANCE;
5757
5758 SUBREADprintf("Q: SMALL_CLOSE_P = %d, DIR = %c %c\n", is_Q_small_side_close_to_P, event_body_Q -> small_side_increasing_coordinate?'>':'<', event_body_Q -> large_side_increasing_coordinate?'>':'<');
5759
5760 if( is_Q_small_side_close_to_P && event_body_Q -> large_side_increasing_coordinate == 1) continue; // the large side is the target location.
5761 if((!is_Q_small_side_close_to_P) && event_body_Q -> small_side_increasing_coordinate == 1) continue; // the small side is the target location.
5762
5763
5764 if( isInv && event_body_Q -> large_side_increasing_coordinate != event_body_Q -> small_side_increasing_coordinate) continue;
5765 if((!isInv) && event_body_Q -> large_side_increasing_coordinate == event_body_Q -> small_side_increasing_coordinate) continue;
5766
5767 for(candRi = 0 ; candRi < candRnumber ; candRi++){
5768 unsigned int event_no_R = event_ptr_list_R[candRi] - NULL;
5769 chromosome_event_t * event_body_R = indel_context -> event_space_dynamic + event_no_R;
5770
5771 srInt_64 cand_R_dist_to_Q = is_Q_small_side_close_to_P?event_body_Q -> event_large_side:event_body_Q -> event_small_side;
5772 cand_R_dist_to_Q -= is_Q_small_side_close_to_P?event_body_R -> event_large_side:event_body_R-> event_small_side;
5773
5774 SUBREADprintf("R: candDist=%lld, DIR = %c %c\n", cand_R_dist_to_Q, event_body_Q -> small_side_increasing_coordinate?'>':'<', event_body_Q -> large_side_increasing_coordinate?'>':'<');
5775
5776 if(abs(cand_R_dist_to_Q) > BREAK_POINT_MAXIMUM_TOLERANCE) continue;
5777 int is_R_small_side_close_to_P = is_Q_small_side_close_to_P;
5778
5779 if( is_R_small_side_close_to_P && !event_body_R -> large_side_increasing_coordinate) continue;
5780 if(!(is_R_small_side_close_to_P) && !event_body_R -> small_side_increasing_coordinate) continue;
5781
5782 if( isInv && event_body_R -> large_side_increasing_coordinate != event_body_R -> small_side_increasing_coordinate) continue;
5783 if(!(isInv) && event_body_R -> large_side_increasing_coordinate == event_body_R -> small_side_increasing_coordinate) continue;
5784 (*brkPno) = event_no_P;
5785 (*brkQno) = event_no_Q;
5786 (*brkRno) = event_no_R;
5787 found_PQR++;
5788 return 1;
5789 }
5790 }
5791 }
5792
5793 return found_PQR;
5794 }
5795
5796
// Reports the two sides of event number event_no, taken from the indel module's
// event_space_dynamic table.
//
// Every output is optional:
//   small_abs / large_abs : when non-NULL, receive event_small_side / event_large_side.
//   small_chro + small_pos: when BOTH are non-NULL, receive the result of
//                           locate_gene_position() for the small side.
//   large_chro + large_pos: likewise for the large side.
void get_event_two_coordinates(global_context_t * global_context, unsigned int event_no, char ** small_chro, int * small_pos, unsigned int * small_abs, char ** large_chro, int * large_pos, unsigned int * large_abs){
	indel_context_t * indel_ctx = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
	chromosome_event_t * evt = &indel_ctx -> event_space_dynamic[event_no];

	if(small_abs != NULL)
		*small_abs = evt -> event_small_side;

	if(large_abs != NULL)
		*large_abs = evt -> event_large_side;

	if(small_chro != NULL && small_pos != NULL){
		locate_gene_position(evt -> event_small_side, &global_context -> chromosome_table, small_chro, small_pos);
	}

	if(large_chro != NULL && large_pos != NULL){
		locate_gene_position(evt -> event_large_side, &global_context -> chromosome_table, large_chro, large_pos);
	}
}
5810
5811
// Records an imprecisely located translocation in translocation_result_table.
//
//   guessed_P_small : guessed absolute position of the source (stored as source_left_side).
//   guessed_tra_len : guessed length of the moved region.
//   guessed_Q_small : guessed absolute position of the target (stored as target_left_side).
//   paired_BC_reads : support count, kept as max_sup_QR.
//   isInv           : non-zero for an inverted translocation.
//
// Results already stored within BREAK_POINT_MAXIMUM_TOLERANCE of the source
// position are scanned first. If one has the same isInv flag and both its target
// position and its length differ from the guessed values by less than
// BREAK_POINT_MAXIMUM_TOLERANCE, that result is updated (all_sup_P is incremented,
// max_sup_QR is raised when needed). Otherwise a new result with
// is_precisely_called = 0 is allocated and appended to the table.
void create_or_update_translocation_imprecise_result(global_context_t * global_context , unsigned int guessed_P_small, unsigned int guessed_tra_len, unsigned int guessed_Q_small , int paired_BC_reads, int isInv){

	char * brkPchr = NULL;
	int brkPsmall = 0;
	void * trans_old_ptrs [_PQR_LIST_SIZE];
	unsigned int trans_old_poses [_PQR_LIST_SIZE];

	locate_gene_position(guessed_P_small, &global_context -> chromosome_table, &brkPchr, &brkPsmall);

	int old_res_i, old_res_number = bktable_lookup(&global_context -> translocation_result_table, brkPchr, brkPsmall - BREAK_POINT_MAXIMUM_TOLERANCE, 2*BREAK_POINT_MAXIMUM_TOLERANCE, trans_old_poses, trans_old_ptrs, _PQR_LIST_SIZE);
	for(old_res_i = 0; old_res_i < old_res_number; old_res_i++){
		translocation_result_t * old_res = (translocation_result_t * )trans_old_ptrs[old_res_i];

		long long target_dist = old_res -> target_left_side;
		target_dist -= guessed_Q_small;

		// llabs() rather than abs(): the differences are 64-bit and abs() would
		// truncate them to int before taking the absolute value.
		if(llabs(target_dist) >= BREAK_POINT_MAXIMUM_TOLERANCE || isInv != old_res -> is_inv) continue;

		long long length_dist = old_res -> length;
		length_dist -= guessed_tra_len;
		if(llabs(length_dist) >= BREAK_POINT_MAXIMUM_TOLERANCE) continue;

		// same translocation as a stored one: only update its support
		old_res -> all_sup_P ++;
		old_res -> max_sup_QR = max(old_res -> max_sup_QR , paired_BC_reads);
		return;
	}

	// calloc() zero-fills the record, so fields that are not set below are 0
	translocation_result_t * new_res = calloc(1, sizeof(translocation_result_t));
	if(NULL == new_res){
		SUBREADprintf("ERROR: out of memory when recording a translocation.\n");
		return;
	}

	new_res -> target_left_side = guessed_Q_small;
	new_res -> length = guessed_tra_len;
	new_res -> source_left_side = guessed_P_small;
	new_res -> is_precisely_called = 0;
	new_res -> all_sup_P = 1;
	new_res -> max_sup_QR = paired_BC_reads;
	new_res -> is_inv = isInv;

	bktable_append(&global_context -> translocation_result_table,brkPchr, brkPsmall, new_res);
}
5855
// Records a precisely called translocation, described by the breakpoint events
// brkPno (source), brkQno and brkRno (target), in translocation_result_table.
//
//   paired_BC_reads : support count, kept as max_sup_QR.
//   isInv           : non-zero for an inverted translocation.
//
// The target position is the small side of Q when the small sides of Q and R are
// within BREAK_POINT_MAXIMUM_TOLERANCE of each other, and the large side of Q
// otherwise. The length is (large position of P) - (small position of P) - 1, both
// taken as positions inside their chromosome.
//
// Results already stored within BREAK_POINT_MAXIMUM_TOLERANCE of the small side of
// P are scanned first. If one has the same isInv flag and both its target position
// and its length differ from the new values by less than
// BREAK_POINT_MAXIMUM_TOLERANCE, that result is updated (all_sup_P is incremented,
// max_sup_QR is raised when needed). Otherwise a new result with
// is_precisely_called = 1 is allocated and appended to the table.
void create_or_update_translocation_result(global_context_t * global_context , unsigned int brkPno, unsigned int brkQno, unsigned int brkRno , int paired_BC_reads, int isInv){

	char *brkPchr, *brkQchr, *tmpchr;
	int brkPsmall, brkPlarge, brkQsmall, tmpint;
	unsigned int brkPabs_small, brkQabs_small, brkRabs_small, brkQabs_large;

	SUBREADprintf("\nTRALOG: FINALLY_CONFIRMED: %s ; %d PE_MATES\n", isInv?"INV":"STR", paired_BC_reads);

	get_event_two_coordinates(global_context, brkPno, &brkPchr, &brkPsmall, &brkPabs_small, &tmpchr, &brkPlarge, NULL);
	get_event_two_coordinates(global_context, brkQno, &brkQchr, &brkQsmall, &brkQabs_small, &tmpchr, &tmpint, &brkQabs_large);
	// only the small side of R is needed
	get_event_two_coordinates(global_context, brkRno, NULL, NULL, &brkRabs_small, NULL, NULL, NULL);

	SUBREADprintf("TRARES: %s:%u (len=%d) => %s:%u (Coor: last_base_before)\n", brkPchr, brkPsmall, brkPlarge - brkPsmall - 1, brkQchr, brkQsmall);

	void * trans_old_ptrs [_PQR_LIST_SIZE];
	unsigned int trans_old_poses [_PQR_LIST_SIZE];

	unsigned int new_target_left_side, new_length;

	// NOTE(review): this window test is done in unsigned arithmetic and wraps when
	// brkRabs_small is smaller than BREAK_POINT_MAXIMUM_TOLERANCE.
	if(brkQabs_small >= brkRabs_small - BREAK_POINT_MAXIMUM_TOLERANCE && brkQabs_small <= brkRabs_small + BREAK_POINT_MAXIMUM_TOLERANCE)
	{
		// the small sides of Q and R coincide: the small side of Q is the target
		new_target_left_side = brkQabs_small;
	} else{
		// otherwise the large side of Q is the target
		new_target_left_side = brkQabs_large;
	}

	new_length = brkPlarge - brkPsmall - 1;

	int old_res_i, old_res_number = bktable_lookup(&global_context -> translocation_result_table, brkPchr, brkPsmall - BREAK_POINT_MAXIMUM_TOLERANCE, 2*BREAK_POINT_MAXIMUM_TOLERANCE, trans_old_poses, trans_old_ptrs, _PQR_LIST_SIZE);
	for(old_res_i = 0; old_res_i < old_res_number; old_res_i++){
		translocation_result_t * old_res = (translocation_result_t * )trans_old_ptrs[old_res_i];

		long long target_dist = old_res -> target_left_side;
		target_dist -= new_target_left_side;

		// llabs() rather than abs(): the differences are 64-bit and abs() would
		// truncate them to int before taking the absolute value.
		if(llabs(target_dist) >= BREAK_POINT_MAXIMUM_TOLERANCE || isInv != old_res -> is_inv) continue;

		long long length_dist = old_res -> length;
		length_dist -= new_length;
		if(llabs(length_dist) >= BREAK_POINT_MAXIMUM_TOLERANCE) continue;

		// same translocation as a stored one: only update its support
		old_res -> all_sup_P ++;
		old_res -> max_sup_QR = max(old_res -> max_sup_QR , paired_BC_reads);
		return;
	}

	// calloc() zero-fills the record, so fields that are not set below are 0
	translocation_result_t * new_res = calloc(1, sizeof(translocation_result_t));
	if(NULL == new_res){
		SUBREADprintf("ERROR: out of memory when recording a translocation.\n");
		return;
	}

	new_res -> target_left_side = new_target_left_side;
	new_res -> length = new_length;
	new_res -> source_left_side = brkPabs_small;
	new_res -> is_precisely_called = 1;
	new_res -> event_P_number = brkPno;
	new_res -> event_Q_number = brkQno;
	new_res -> event_R_number = brkRno;
	new_res -> all_sup_P = 1;
	new_res -> max_sup_QR = paired_BC_reads;
	new_res -> is_inv = isInv;

	bktable_append(&global_context -> translocation_result_table,brkPchr, brkPsmall, new_res);
}
5924
5925
// Walks through every fragment in funky_list_A and tries to call a translocation
// (straight, "STR", or inverted, "INV") from it.
//
// For each A fragment:
//   1. Its two reads are ordered by selected_position (q_res_1 is the smaller one).
//   2. For each of the two reads, funky_table_BC is searched within
//      FUNKY_COLOCATION_TOLERANCE of the read. Every hit is classified by the
//      strands of its two reads and appended to one of four lists
//      (fli_INV_B, fli_INV_C, fli_STR_B, fli_STR_C).
//   3. For INV and then for STR, when both lists of that kind are non-empty:
//        - find_translocation_BC_mates() pairs the B and C fragments;
//        - find_translocation_brk_PQR() looks for the breakpoint events P, Q, R;
//        - breakpoint_PQR_supported() checks them against the B/C lists;
//        - a supported triple is stored by create_or_update_translocation_result();
//          otherwise, if the mates are conformable, both lists of that kind hold
//          more than two fragments and a P candidate was seen, an imprecise
//          result is stored instead.
void finalise_translocations(global_context_t * global_context){

	// work buffers shared by all A fragments
	void ** s1_ptrs = malloc(sizeof(void *) * S12_LIST_CAPACITY);
	void ** s2_ptrs = malloc(sizeof(void *) * S12_LIST_CAPACITY);

	unsigned int * s1_poses = malloc(sizeof(unsigned int) * S12_LIST_CAPACITY);
	unsigned int * s2_poses = malloc(sizeof(unsigned int) * S12_LIST_CAPACITY);

	unsigned long long * s1_selected_list = malloc(sizeof(unsigned long long) * S12_LIST_CAPACITY);	// fragment_no * 2 + is_second_read
	unsigned long long * s2_selected_list = malloc(sizeof(unsigned long long) * S12_LIST_CAPACITY);

	int frag_Q_larger_read;
	subread_read_number_t frag_A_i;

	if(!s1_ptrs || !s2_ptrs || !s1_poses || !s2_poses || !s1_selected_list || !s2_selected_list){
		SUBREADprintf("ERROR: out of memory when finalising translocations.\n");
		free(s1_ptrs);
		free(s2_ptrs);
		free(s1_poses);
		free(s2_poses);
		free(s1_selected_list);
		free(s2_selected_list);
		return;
	}

	for(frag_A_i = 0; frag_A_i < global_context -> funky_list_A.fragments; frag_A_i ++){
		fragment_list_t fli_STR_B, fli_STR_C, fli_INV_B, fli_INV_C;

		fraglist_init(&fli_STR_B);
		fraglist_init(&fli_STR_C);
		fraglist_init(&fli_INV_B);
		fraglist_init(&fli_INV_C);

		subread_read_number_t frag_A_no = global_context -> funky_list_A.fragment_numbers[frag_A_i];

		mapping_result_t q_res_A_body, q_res_B_body;

		mapping_result_t * q_res_A = &q_res_A_body;
		mapping_result_t * q_res_B = &q_res_B_body;

		bigtable_readonly_result(global_context, NULL, frag_A_no, 0, 0, q_res_A, NULL);
		bigtable_readonly_result(global_context, NULL, frag_A_no, 0, 1, q_res_B, NULL);

		// q_res_1 is the read with the smaller position (the first read on a tie)
		mapping_result_t * q_res_1 = q_res_A -> selected_position > q_res_B -> selected_position?q_res_B:q_res_A;
		mapping_result_t * q_res_2 = q_res_A -> selected_position <= q_res_B -> selected_position?q_res_B:q_res_A;

		/***************************************************************************************************
		 *
		 * is_q1_negative and is_q2_negative describes the strandness of the original FASTQ read sequence.
		 *
		 * For the very normal mappings, is_q1_negative must be 0 and is_q2_negative must be 1.
		 *
		 * If is_q1_negative != is_q2_negative, then there is a strand-jumping fusion between the two reads.
		 */

		int is_q1_negative = (q_res_1 -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
		int is_q2_negative = (q_res_2 -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;

		// the flag of the second read is flipped
		if(q_res_B == q_res_1)is_q1_negative=!is_q1_negative;
		if(q_res_B == q_res_2)is_q2_negative=!is_q2_negative;

		long long dist = q_res_A ->selected_position;
		dist -= q_res_B->selected_position;

		// llabs(): dist is 64-bit; abs() would truncate it to int first
		if( llabs(dist) < 1000 && !(is_q1_negative == 0 && is_q2_negative == 1))
		{
			SUBREADprintf("TRALOG: STRANDNESS_BUG %08llu\n", frag_A_no);
		}

		for(frag_Q_larger_read = 0; frag_Q_larger_read < 2; frag_Q_larger_read++){
			void ** s_ptrs = frag_Q_larger_read?s2_ptrs:s1_ptrs;
			unsigned int * s_poses = frag_Q_larger_read?s2_poses:s1_poses;
			int q_res_offset = 0;
			mapping_result_t * q_res = frag_Q_larger_read?q_res_2:q_res_1;

			char * q_res_chro = NULL;
			locate_gene_position(q_res -> selected_position, &global_context -> chromosome_table, &q_res_chro, &q_res_offset);
			q_res_offset +=1 ; // all tables are one-based.

			// search window start, clamped at zero
			unsigned int q_search_start = q_res_offset;
			if(q_search_start > FUNKY_COLOCATION_TOLERANCE) q_search_start -= FUNKY_COLOCATION_TOLERANCE;
			else q_search_start = 0;

			int cand_i, canidate_s_items = bktable_lookup(&global_context -> funky_table_BC, q_res_chro, q_search_start, 2*FUNKY_COLOCATION_TOLERANCE, s_poses, s_ptrs, S12_LIST_CAPACITY);

			// scan if candidate is reversed.
			// s_ptrs - NULL is the fragment no * 2 + is_read_B.
			for(cand_i = 0; cand_i < canidate_s_items; cand_i ++){
				subread_read_number_t frag_S_no = (s_ptrs[cand_i] - NULL)/ 2;
				int frag_S_is_read_B = (s_ptrs[cand_i] - NULL) % 2;

				mapping_result_t read_S_res_body, mate_S_res_body;
				mapping_result_t * read_S_res = &read_S_res_body;
				mapping_result_t * mate_S_res = &mate_S_res_body;

				bigtable_readonly_result(global_context, NULL, frag_S_no, 0, frag_S_is_read_B, read_S_res, NULL);
				bigtable_readonly_result(global_context, NULL, frag_S_no, 0, !frag_S_is_read_B, mate_S_res, NULL);

				int is_read_S_negative = (read_S_res -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
				int is_mate_S_negative = (mate_S_res -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
				if(frag_S_is_read_B) is_read_S_negative = !is_read_S_negative;
				else is_mate_S_negative = !is_mate_S_negative;

				int is_INV_TRA = is_mate_S_negative == is_read_S_negative;

				// only candidates whose co-located read has the required strand are kept
				if(is_read_S_negative == !frag_Q_larger_read){
					fragment_list_t * dest_list;

					if(is_INV_TRA)
						dest_list = frag_Q_larger_read?&fli_INV_B:&fli_INV_C;
					else
						dest_list = frag_Q_larger_read?&fli_STR_C:&fli_STR_B;

					fraglist_append(dest_list, frag_S_no * 2 + frag_S_is_read_B);
				}
			}
		}

		if(fli_INV_B.fragments >= 1 && fli_INV_C.fragments >= 1){
			unsigned int guessed_p_small = 0, guessed_tra_length = 0, guessed_q_small = 0, is_brkP_cand_found = 0;
			unsigned int brkPno = 0, brkQno = 0, brkRno = 0;
			int brkPQR_are_found = 0;

			int PEmates = find_translocation_BC_mates(global_context, q_res_1, q_res_2, &fli_INV_B, &fli_INV_C, 1, s1_selected_list, s2_selected_list, s1_poses, s2_poses, &guessed_p_small, &guessed_tra_length, &guessed_q_small);
			int ConformPE = find_translocation_BC_conformation(global_context, PEmates, s1_poses, s2_poses);

			char out1pos[100], out2pos[100];
			absoffset_to_posstr(global_context, q_res_1 -> selected_position, out1pos);
			absoffset_to_posstr(global_context, q_res_2 -> selected_position, out2pos);
			SUBREADprintf("TRALOG: A_READ: %09llu: INV : %s ~ %s ; %d PE_MATES (%s)\n", frag_A_no, out1pos, out2pos, PEmates, ConformPE?"CONFORMABLE":"INCONSISTENT");

			if(PEmates)
				brkPQR_are_found = find_translocation_brk_PQR(global_context, q_res_1, q_res_2, &fli_INV_B, &fli_INV_C, &brkPno, &brkQno, &brkRno, 1, &is_brkP_cand_found);

			if(brkPQR_are_found){
				brkPQR_are_found = breakpoint_PQR_supported(global_context , brkPno , brkQno, brkRno, &fli_INV_B, &fli_INV_C, 1);
				SUBREADprintf("TRALOG: A_READ: INV BRK_PQR_SUPPED=%d\n", brkPQR_are_found);
			}

			if(brkPQR_are_found)
				create_or_update_translocation_result( global_context , brkPno, brkQno, brkRno , PEmates, 1);
			else if(ConformPE && fli_INV_B.fragments > 2 && fli_INV_C.fragments > 2 && is_brkP_cand_found)
				create_or_update_translocation_imprecise_result(global_context, guessed_p_small, guessed_tra_length, guessed_q_small, PEmates, 1);
		}

		if(fli_STR_B.fragments >= 1 && fli_STR_C.fragments >= 1){
			unsigned int guessed_p_small = 0, guessed_tra_length = 0, guessed_q_small = 0, is_brkP_cand_found = 0;
			unsigned int brkPno = 0, brkQno = 0, brkRno = 0;
			int brkPQR_are_found = 0;

			int PEmates = find_translocation_BC_mates(global_context, q_res_1, q_res_2, &fli_STR_B, &fli_STR_C, 0, s1_selected_list, s2_selected_list, s1_poses, s2_poses, &guessed_p_small, &guessed_tra_length, &guessed_q_small);
			int ConformPE = find_translocation_BC_conformation(global_context, PEmates, s1_poses, s2_poses);

			char out1pos[100], out2pos[100];
			absoffset_to_posstr(global_context, q_res_1 -> selected_position, out1pos);
			absoffset_to_posstr(global_context, q_res_2 -> selected_position, out2pos);

			SUBREADprintf("TRALOG: A_READ: %09llu: TRA : %s ~ %s ; %d PE_MATES (%s)\n", frag_A_no, out1pos, out2pos, PEmates, ConformPE?"CONFORMABLE":"INCONSISTENT");

			if(PEmates)
				brkPQR_are_found = find_translocation_brk_PQR(global_context, q_res_1, q_res_2, &fli_STR_B, &fli_STR_C, &brkPno, &brkQno, &brkRno, 0, &is_brkP_cand_found);

			if(brkPQR_are_found){
				brkPQR_are_found = breakpoint_PQR_supported(global_context , brkPno , brkQno, brkRno, &fli_STR_B, &fli_STR_C, 0);
			}

			if(brkPQR_are_found)
				create_or_update_translocation_result( global_context , brkPno, brkQno, brkRno , PEmates, 0);
			// the straight case is judged on the STR lists, not on the INV ones
			else if(ConformPE && fli_STR_B.fragments > 2 && fli_STR_C.fragments > 2 && is_brkP_cand_found)
				create_or_update_translocation_imprecise_result(global_context, guessed_p_small, guessed_tra_length, guessed_q_small, PEmates, 0);
		}

		fraglist_destroy(&fli_STR_B);
		fraglist_destroy(&fli_STR_C);
		fraglist_destroy(&fli_INV_B);
		fraglist_destroy(&fli_INV_C);
	}

	free(s1_ptrs);
	free(s2_ptrs);
	free(s1_poses);
	free(s2_poses);
	free(s1_selected_list);
	free(s2_selected_list);

}
6124
finalise_inversions(global_context_t * global_context)6125 void finalise_inversions(global_context_t * global_context){
6126 subread_read_number_t frag_A_i;
6127 void ** s1_ptrs, **s2_ptrs;
6128 unsigned int * s1_poses, * s2_poses;
6129
6130 s1_ptrs = malloc(sizeof(void *) * S12_LIST_CAPACITY);
6131 s2_ptrs = malloc(sizeof(void *) * S12_LIST_CAPACITY);
6132
6133 s1_poses = malloc(sizeof(int) * S12_LIST_CAPACITY);
6134 s2_poses = malloc(sizeof(int) * S12_LIST_CAPACITY);
6135
6136 unsigned long long * s1_selected_list = malloc(sizeof(long long) * S12_LIST_CAPACITY); // fragment_no * 2 + is_second_read
6137 unsigned long long * s2_selected_list = malloc(sizeof(long long) * S12_LIST_CAPACITY);
6138
6139 mapping_result_t ** s1_result_ptr_list = malloc(sizeof(mapping_result_t *) * S12_LIST_CAPACITY);
6140 mapping_result_t ** s2_result_ptr_list = malloc(sizeof(mapping_result_t *) * S12_LIST_CAPACITY);
6141
6142 int frag_Q_larger_read, xk1, xk2;
6143
6144 for(frag_A_i = 0; frag_A_i < global_context -> funky_list_DE.fragments; frag_A_i ++){
6145 int s1_list_items = 0, s2_list_items = 0;
6146
6147 subread_read_number_t frag_A_no = global_context -> funky_list_DE.fragment_numbers[frag_A_i];
6148
6149 mapping_result_t q_res_A_body, q_res_B_body;
6150
6151 mapping_result_t * q_res_A = &q_res_A_body, * q_res_B = &q_res_B_body;
6152
6153 bigtable_readonly_result(global_context, NULL, frag_A_no, 0, 0, q_res_A, NULL);
6154 bigtable_readonly_result(global_context, NULL, frag_A_no, 0, 1, q_res_B, NULL);
6155
6156 mapping_result_t * q_res_1 = q_res_A -> selected_position > q_res_B -> selected_position?q_res_B:q_res_A;
6157 mapping_result_t * q_res_2 = q_res_A -> selected_position <= q_res_B -> selected_position?q_res_B:q_res_A;
6158
6159
6160 /***************************************************************************************************
6161 *
6162 * is_q1_negative and is_q2_negative describes the strandness of the original FASTQ read sequence.
6163 *
6164 * For the very normal mappings, is_q1_negative must be 0 and is_q2_negative must be 1.
6165 *
6166 * If is_q1_negative != is_q2_negative, then there is a strand-jumpping fusion between the two reads.
6167 */
6168
6169 int is_q1_negative = (q_res_1 -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
6170 int is_q2_negative = (q_res_2 -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
6171
6172 if(q_res_B == q_res_1)is_q1_negative=!is_q1_negative;
6173 if(q_res_B == q_res_2)is_q2_negative=!is_q2_negative;
6174
6175 if(is_q1_negative == 0 && is_q2_negative == 0) // D READ
6176 {
6177 for(frag_Q_larger_read = 0; frag_Q_larger_read < 2; frag_Q_larger_read++){
6178 int * s_list_items = frag_Q_larger_read?&s2_list_items:&s1_list_items;
6179 void ** s_ptrs = frag_Q_larger_read?s2_ptrs:s1_ptrs;
6180 unsigned int * s_poses = frag_Q_larger_read?s2_poses:s1_poses;
6181 int q_res_offset = 0;
6182 mapping_result_t * q_res = frag_Q_larger_read?q_res_2:q_res_1;
6183 unsigned long long * s_selected_list = frag_Q_larger_read?s2_selected_list:s1_selected_list;
6184 mapping_result_t ** s_result_ptr_list = frag_Q_larger_read?s2_result_ptr_list:s1_result_ptr_list;
6185
6186
6187 char * q_res_chro = NULL;
6188 locate_gene_position(q_res -> selected_position, &global_context -> chromosome_table, &q_res_chro, &q_res_offset);
6189 q_res_offset +=1 ; // all tables are one-based.
6190
6191 unsigned int q_search_start = q_res_offset;
6192 if(q_search_start > FUNKY_COLOCATION_TOLERANCE) q_search_start -= FUNKY_COLOCATION_TOLERANCE;
6193 else q_search_start = 0;
6194
6195 int cand_i, canidate_s_items = bktable_lookup(&global_context -> funky_table_DE, q_res_chro, q_search_start, 2*FUNKY_COLOCATION_TOLERANCE, s_poses, s_ptrs, S12_LIST_CAPACITY);
6196 // scan if candidate is reversed.
6197 // s_ptrs - NULL is the fragment no.
6198 for(cand_i = 0; cand_i < canidate_s_items; cand_i ++){
6199 subread_read_number_t frag_S_no = (s_ptrs[cand_i] - NULL)/2;
6200 int frag_S_larger_read = (s_ptrs[cand_i] - NULL)%2;
6201
6202 if(frag_S_no == frag_A_no) continue;
6203
6204 if(frag_S_larger_read == frag_Q_larger_read){
6205
6206 mapping_result_t res_S_A_body, res_S_B_body;
6207 mapping_result_t * res_S_A = &res_S_A_body , * res_S_B = &res_S_B_body;
6208
6209 bigtable_readonly_result(global_context, NULL, frag_S_no, 0, 0, res_S_A, NULL);
6210 bigtable_readonly_result(global_context, NULL, frag_S_no, 0, 1, res_S_B, NULL);
6211
6212 mapping_result_t * res_S_1 = res_S_A -> selected_position > res_S_B -> selected_position?res_S_B:res_S_A;
6213 mapping_result_t * res_S_2 = res_S_A -> selected_position <= res_S_B -> selected_position?res_S_B:res_S_A;
6214
6215 mapping_result_t * co_locatted_S_res = frag_S_larger_read?res_S_2:res_S_1;
6216
6217 int is_s1_negative = (res_S_1 -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
6218 int is_s2_negative = (res_S_2 -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
6219
6220 if(res_S_B == res_S_1) is_s1_negative = !is_s1_negative;
6221 if(res_S_B == res_S_2) is_s2_negative = !is_s2_negative;
6222
6223
6224 if( is_s1_negative != 0 && is_s2_negative != 0 ){ // E READ
6225 s_selected_list[*s_list_items] = frag_S_no * 2 + frag_S_larger_read;
6226 s_result_ptr_list[*s_list_items] = co_locatted_S_res;
6227 (*s_list_items)++;
6228 }
6229 }
6230 }
6231 }
6232 }
6233
6234 int found_INV_frags = 0;
6235 srInt_64 guessed_Z_large_abs_sum = 0, guessed_Y_small_abs_sum = 0;
6236
6237 for(xk1 = 0; xk1 < s1_list_items; xk1++){
6238 for(xk2 = 0; xk2 < s2_list_items ; xk2 ++){
6239 if(s1_selected_list[xk1]/2 == s2_selected_list[xk2]/2)
6240 {
6241 found_INV_frags ++;
6242 // now there is only one D fragment. here we found the E fragment for it (E fragment is in s1[xk1] and s2[xk2])
6243 // s1 is the E read that is close to D_1; s2 is the E read that is close to D_2; D_1 is the D read with smaller coordinate.
6244 // res_E1 is the read that is close to D_2; mapping location of E_1 should be larger than D_2
6245
6246 mapping_result_t * res_D1 = q_res_1;
6247 mapping_result_t * res_D2 = q_res_2;
6248
6249 mapping_result_t * res_E1 = s2_result_ptr_list[xk2];
6250 mapping_result_t * res_E2 = s1_result_ptr_list[xk1];
6251
6252 int Gap_a_length = res_E2 -> selected_position - res_D1 -> selected_position - res_D1 -> read_length;
6253 int Gap_b_length = res_E1 -> selected_position - res_D2 -> selected_position - res_D2 -> read_length;
6254 int average_gap_len = (Gap_b_length + Gap_a_length)/2;
6255 guessed_Y_small_abs_sum += res_D1 -> selected_position + res_D1 -> read_length - average_gap_len / 2;
6256 guessed_Z_large_abs_sum += res_E1 -> selected_position - average_gap_len / 2;
6257 SUBREADprintf("INVLOG: GUESSED_LEN = %d + %d / 2 = %d\n", Gap_a_length, Gap_b_length, average_gap_len);
6258 }
6259 }
6260 }
6261
6262 unsigned int brkYno=0xffffffff, brkZno=0xffffffff;
6263 int cand_YZ_breakpoints = 0;
6264 if(found_INV_frags > 0)
6265 {
6266 char * q_small_chro = NULL;
6267 int q_small_pos = 0;
6268
6269 guessed_Y_small_abs_sum /= found_INV_frags;
6270 guessed_Z_large_abs_sum /= found_INV_frags;
6271 SUBREADprintf("INVLOG: GUESSED_YZ=%lld, %lld\n", guessed_Y_small_abs_sum, guessed_Z_large_abs_sum);
6272
6273 locate_gene_position(q_res_1 -> selected_position, &global_context -> chromosome_table, &q_small_chro, &q_small_pos);
6274 int cand_Y, cand_Z;
6275 cand_YZ_breakpoints = bktable_lookup(&global_context -> breakpoint_table_YZ, q_small_chro, q_small_pos, global_context -> config.maximum_pair_distance , s1_poses, s1_ptrs, S12_LIST_CAPACITY);
6276
6277 //SUBREADprintf("INVLOG: %09u FOUND %d CANDIDATE BKs AT %s:%u\n", frag_A_no, cand_YZ_breakpoints, q_small_chro, q_small_pos);
6278
6279 indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
6280
6281 for(cand_Y = 0; cand_Y < cand_YZ_breakpoints ; cand_Y ++){
6282 if(brkYno < 0xffffffff) break;
6283
6284 int event_no_Y = s1_ptrs[cand_Y] - NULL;
6285 chromosome_event_t * event_body_Y = indel_context -> event_space_dynamic + event_no_Y;
6286
6287 if(event_body_Y -> small_side_increasing_coordinate) continue;
6288 if(event_body_Y -> small_side_increasing_coordinate != event_body_Y -> large_side_increasing_coordinate)
6289 assert(0);
6290
6291 if(abs(event_body_Y -> event_large_side - q_res_2 -> selected_position) < global_context -> config.maximum_pair_distance){
6292
6293 for(cand_Z = 0; cand_Z < cand_YZ_breakpoints ; cand_Z ++){
6294 int event_no_Z = s1_ptrs[cand_Z] - NULL;
6295 chromosome_event_t * event_body_Z = indel_context -> event_space_dynamic + event_no_Z;
6296
6297 if(!event_body_Z -> small_side_increasing_coordinate) continue;
6298 if(event_body_Z -> small_side_increasing_coordinate != event_body_Z -> large_side_increasing_coordinate)
6299 assert(0);
6300
6301 long long dist_small = event_body_Z -> event_small_side , dist_large = event_body_Z -> event_large_side;
6302 dist_small -= event_body_Y -> event_small_side;
6303 dist_large -= event_body_Y -> event_large_side;
6304
6305 long long dist_small_large_diff = dist_small;
6306 dist_small_large_diff -= dist_large;
6307
6308 if(abs(dist_small_large_diff) <= BREAK_POINT_MAXIMUM_TOLERANCE && abs(dist_large) <= BREAK_POINT_MAXIMUM_TOLERANCE && event_body_Z -> small_side_increasing_coordinate != event_body_Y -> small_side_increasing_coordinate){
6309
6310 brkYno = event_no_Y;
6311 brkZno = event_no_Z;
6312
6313 break;
6314 }
6315 }
6316
6317
6318 if(1)
6319 {
6320 char outpos1[100], outpos2[100];
6321 absoffset_to_posstr(global_context, event_body_Y -> event_small_side, outpos1);
6322 absoffset_to_posstr(global_context, event_body_Y -> event_large_side, outpos2);
6323
6324 SUBREADprintf("INVLOG: %09llu FOUND BREAKPOINT YZ: %s ~ %s, INC_COR: %c %c , nSUP=%d\n", frag_A_no, outpos1, outpos2, event_body_Y -> small_side_increasing_coordinate?'>':'<', event_body_Y -> large_side_increasing_coordinate?'>':'<' , event_body_Y -> final_counted_reads);
6325
6326 }
6327
6328 }
6329 }
6330 }
6331
6332
6333 char *brkYchr = "NULL";
6334 unsigned int brkYabs_small = 0, brkYabs_large = 0;
6335 int brkYsmall = 0, brkYlarge = 0;
6336 int is_precisely_called = 0, is_roughly_called = 0;
6337 if(brkYno < 0xffffffff){
6338 // s1_selected_list : 2 * fragment_S_no + frag_S_larger_read
6339 int is_passed_YZ = breakpoint_YZ_supported(global_context, brkYno, brkZno, s1_selected_list, s1_list_items, s2_selected_list, s2_list_items);
6340 if(is_passed_YZ)
6341 {
6342 is_precisely_called = 1;
6343
6344 get_event_two_coordinates(global_context, brkYno, &brkYchr, &brkYsmall, &brkYabs_small, &brkYchr, &brkYlarge, &brkYabs_large);
6345
6346 }
6347 else is_roughly_called = 1;
6348 //SUBREADprintf("\nINVLOG: FINALLY_%sCONFIRMED: %09u %s:%u (len=%d) INVERSED.\n", is_passed_YZ?"":"NOT ", frag_A_no, brkYchr, brkYsmall, brkYlarge - brkYsmall);
6349 }
6350
6351 //SUBREADprintf("\nINVLOG: FINALLY_GUESSED: %09u found_INV_frags=%d, s1_list_items=%d, s2_list_items=%d, cand_YZ_breakpoints=%d\n", frag_A_no, found_INV_frags, s1_list_items, s2_list_items, cand_YZ_breakpoints);
6352
6353 //for(xk1 = 0; xk1 < s1_list_items; xk1++) SUBREADprintf("INVLOG: %09d S_1 MATES: %09llu\n" , frag_A_no , s1_selected_list[xk1]/2);
6354 //for(xk1 = 0; xk1 < s2_list_items; xk1++) SUBREADprintf("INVLOG: %09d S_2 MATES: %09llu\n" , frag_A_no , s2_selected_list[xk1]/2);
6355
6356
6357
6358 /*
6359 if(found_INV_frags >= min(s1_list_items , s2_list_items) - 2 && found_INV_frags > 1 && !is_precisely_called && cand_YZ_breakpoints>0){
6360 // guess brkYlarge, brkYsmall, brkZlarge, brkZsmall, brkYabsLarge, brkZabsLarge...
6361 locate_gene_position(guessed_Y_small_abs_sum, &global_context -> chromosome_table, &brkYchr, &brkYsmall);
6362 locate_gene_position(guessed_Z_large_abs_sum, &global_context -> chromosome_table, &brkYchr, &brkYlarge);
6363 //SUBREADprintf("\nINVLOG: FINALLY_GUESSED: %09u %s:%u (len=%llu) INVERSED.\n", frag_A_no, brkYchr, brkYsmall, guessed_Z_large_abs_sum - guessed_Y_small_abs_sum);
6364 is_roughly_called = 1;
6365 }*/
6366
6367 if( is_precisely_called || is_roughly_called )
6368 {
6369 void * old_ptrs[_PQR_LIST_SIZE];
6370 unsigned int old_poses[_PQR_LIST_SIZE];
6371 int old_found = 0, old_i, old_inversions = bktable_lookup(&global_context -> inversion_result_table, brkYchr, brkYsmall - BREAK_POINT_MAXIMUM_TOLERANCE, 2*BREAK_POINT_MAXIMUM_TOLERANCE, old_poses, old_ptrs, _PQR_LIST_SIZE);
6372 for(old_i = 0; old_i < old_inversions; old_i ++){
6373 inversion_result_t * inv_res_old = (inversion_result_t *) old_ptrs[old_i];
6374 long long old_dist = inv_res_old -> length;
6375 old_dist -= brkYlarge - brkYsmall; // the difference on inversion length.
6376 if(abs(old_dist) < BREAK_POINT_MAXIMUM_TOLERANCE){
6377 inv_res_old -> all_sup_D ++;
6378 inv_res_old -> max_sup_E = max(inv_res_old -> max_sup_E , found_INV_frags);
6379 old_found = 1;
6380 break;
6381 }
6382 }
6383
6384 if(0 == old_found){
6385 inversion_result_t * inv_res_new = malloc(sizeof(chromosome_event_t));
6386 memset(inv_res_new, 0 , sizeof(chromosome_event_t));
6387
6388 inv_res_new -> length = brkYlarge - brkYsmall;
6389 inv_res_new -> is_precisely_called = is_precisely_called;
6390 if(is_precisely_called){
6391 inv_res_new -> event_Y_number = brkYno;
6392 inv_res_new -> event_Z_number = brkZno;
6393 inv_res_new -> small_side = brkYabs_small;
6394 }else{
6395 inv_res_new -> event_Y_rough_small_abs = guessed_Y_small_abs_sum;
6396 inv_res_new -> event_Z_rough_large_abs = guessed_Z_large_abs_sum;
6397 inv_res_new -> small_side = guessed_Y_small_abs_sum;
6398 }
6399 inv_res_new -> all_sup_D = 1;
6400 inv_res_new -> max_sup_E = found_INV_frags;
6401
6402 bktable_append(&global_context -> inversion_result_table, brkYchr, brkYsmall, inv_res_new);
6403 }
6404 }
6405 }
6406
6407 free(s1_result_ptr_list);
6408 free(s2_result_ptr_list);
6409 free(s1_ptrs);
6410 free(s2_ptrs);
6411 free(s1_poses);
6412 free(s2_poses);
6413 free(s1_selected_list);
6414 free(s2_selected_list);
6415 }
6416
/*
 * Index every fusion / junction event by the chromosome positions of its two ends.
 *
 * Each qualifying event in the indel module's event space goes into exactly one table:
 *   - breakpoint_table_QR : the two ends are on different chromosomes, or are further
 *                           apart than config.maximum_translocation_length;
 *   - breakpoint_table_YZ : the ends are close together and the event is strand-jumped;
 *   - breakpoint_table_P  : the ends are close together and the event is not strand-jumped.
 *
 * The event is appended twice (once per end). The value stored with each entry is the
 * event number encoded as a pointer offset from NULL.
 */
void build_breakpoint_tables(global_context_t * global_context){
	indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
	int event_no;

	for(event_no = 0; event_no < indel_context -> total_events; event_no++)
	{
		chromosome_event_t * event_body = indel_context -> event_space_dynamic + event_no;
		int is_fusion = event_body -> event_type == CHRO_EVENT_TYPE_FUSION;
		int is_junction = event_body -> event_type == CHRO_EVENT_TYPE_JUNCTION;

		// Only fusions and junctions define breakpoints.
		if(!is_fusion && !is_junction) continue;

		char * chro_name_left = NULL;
		char * chro_name_right = NULL;
		int chro_pos_left = 0;
		int chro_pos_right = 0;

		locate_gene_position(event_body -> event_small_side, &global_context -> chromosome_table, &chro_name_left, &chro_pos_left);
		locate_gene_position(event_body -> event_large_side, &global_context -> chromosome_table, &chro_name_right, &chro_pos_right);

		// Absolute distance between the two ends, computed in 64 bits.
		long long dist = (long long)chro_pos_left - chro_pos_right;
		if(dist < 0) dist = -dist;

		// The chromosome names are compared as pointers, not as strings.
		int is_far_apart = chro_name_left != chro_name_right || dist > global_context -> config.maximum_translocation_length;

		bucketed_table_t * index_table;
		if(is_far_apart)
			index_table = &global_context -> breakpoint_table_QR;
		else if(event_body -> is_strand_jumped)
			index_table = &global_context -> breakpoint_table_YZ;
		else
			index_table = &global_context -> breakpoint_table_P;

		// Lookup code recovers the event number as (stored pointer - NULL).
		void * event_tag = NULL + event_no;
		bktable_append(index_table, chro_name_left, chro_pos_left, event_tag);
		bktable_append(index_table, chro_name_right, chro_pos_right, event_tag);
	}
}
6472
/*
 * Final structural-variant pass.
 *
 * Reports the sizes of the "funky" fragment collections, builds the P / QR / YZ
 * breakpoint tables, reports their sizes, and then runs the translocation and
 * inversion finalisers, in that order.
 */
void finalise_structural_variances(global_context_t * global_context){
	unsigned int funky_A_count = (unsigned int) global_context -> funky_list_A.fragments;
	// The BC count is reported halved (NOTE(review): presumably each BC fragment is entered twice -- confirm).
	unsigned int funky_BC_count = (unsigned int)global_context -> funky_table_BC.fragments / 2;
	unsigned int funky_DE_count = (unsigned int)global_context -> funky_list_DE.fragments;

	SUBREADprintf("Funky Tables: A:%u, BC:%u, DE:%u\n", funky_A_count, funky_BC_count, funky_DE_count);

	build_breakpoint_tables(global_context);

	// These counters are read only after the tables have been built.
	unsigned int bk_P_count = (unsigned int)global_context -> breakpoint_table_P.fragments;
	unsigned int bk_QR_count = (unsigned int)global_context -> breakpoint_table_QR.fragments;
	unsigned int bk_YZ_count = (unsigned int)global_context -> breakpoint_table_YZ.fragments;

	SUBREADprintf("Breakpoint Tables: P:%u, QR:%u, YZ:%u\n", bk_P_count, bk_QR_count, bk_YZ_count);

	finalise_translocations(global_context);
	finalise_inversions(global_context);
}
6481