1 /***************************************************************
2 
3    The Subread software package is free software package:
4    you can redistribute it and/or modify it under the terms
5    of the GNU General Public License as published by the
6    Free Software Foundation, either version 3 of the License,
7    or (at your option) any later version.
8 
9    Subread is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty
11    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 
13    See the GNU General Public License for more details.
14 
15    Authors: Drs Yang Liao and Wei Shi
16 
17   ***************************************************************/
18 
19 
20 #include <stdlib.h>
21 #include <stdio.h>
22 #include <string.h>
23 #include <ctype.h>
24 #include <assert.h>
25 #include <unistd.h>
26 #include "subread.h"
27 #include "sublog.h"
28 #include "gene-value-index.h"
29 #include "gene-algorithms.h"
30 #include "input-files.h"
31 #include "core.h"
32 #include "core-indel.h"
33 #include "core-junction.h"
34 #include "core-bigtable.h"
35 
// A fixed read name. It is not referenced anywhere in the part of the file shown here.
// NOTE(review): presumably a leftover tag for tracing a single read while debugging -- confirm before removing.
#define TTTSNAME "V0112_0155:7:1308:1308:136442"

// The three values below (2, 4 and 1) are distinct single bits, so they can be combined with bitwise OR.
// NOTE(review): presumably flags describing which mate (R1/R2) is mapped and whether the donor is on the negative strand -- confirm against the code that uses them.
#define CLUSTER_ALIGNMENT_DONOR_R1_MAPPED 2
#define CLUSTER_ALIGNMENT_DONOR_R2_MAPPED 4
#define CLUSTER_ALIGNMENT_DONOR_NEGATIVE_STRAND 1
41 
42 
/*
 * Returns x unchanged when it does not exceed 0x7fffffff; otherwise returns
 * (0xffffffff - x) + 1, i.e. the magnitude of x when its bit pattern is read
 * as a negative 32-bit two's-complement number.
 */
unsigned int abs32uint(unsigned int x){
	const unsigned int largest_positive = 0x7fffffff;

	if(x <= largest_positive)
		return x;

	return (0xffffffff - x) + 1;
}
47 
// Record grouping, for one candidate junction, the values kept about a "main" piece and a
// "minor" piece (offsets, coverage ranges, votes, hamming matches, qualities) together with
// split-point and donor/strand flags.
// NOTE(review): none of these fields is read or written in the part of the file shown here, so the
// per-field notes below are inferred from the names only -- confirm against the code that fills them.
typedef struct{
	// presumably absolute (linear genome) offsets of the two pieces -- TODO confirm
	unsigned int piece_main_abs_offset;
	unsigned int piece_minor_abs_offset;
	int piece_main_masks;
	// presumably read positions delimiting the part of the read covered by the main piece -- TODO confirm
	short piece_main_coverage_start;
	short piece_main_coverage_end;

	short piece_main_hamming_match;
	short piece_main_read_quality;
	short piece_minor_hamming_match;
	short piece_minor_read_quality;
	int piece_minor_score;
	short intron_length;

	// Pointer member: the record does not embed the indel data itself.
	// NOTE(review): ownership/lifetime of the pointed-to record is not visible here.
	gene_vote_number_t *piece_main_indel_record;
	unsigned short piece_main_indels;
	unsigned short piece_minor_indel_offset;
	gene_vote_number_t piece_main_votes;
	gene_vote_number_t piece_minor_votes;

	// presumably read positions delimiting the part of the read covered by the minor piece -- TODO confirm
	short piece_minor_coverage_start;
	short piece_minor_coverage_end;
	short split_point;
	char inserted_bases;
	// The following char members look like boolean/bit flags -- TODO confirm their encodings.
	char is_GT_AG_donors;
	char is_donor_found_or_annotation;
	char is_strand_jumped;
	char is_break_even;

	// Two 64-bit score members are disabled here (kept commented out by the authors).
	//unsigned long long int Score_H;
	//unsigned long long int Score_L;
} select_junction_record_t;
80 
81 
/*
 * Debug helper: converts both sides of an event (event_small_side and
 * event_large_side) to position strings with absoffset_to_posstr() and
 * prints them on one line through SUBREADprintf().
 */
void debug_show_event(global_context_t* global_context, chromosome_event_t * event){
	char small_side_text[100];
	char large_side_text[100];

	absoffset_to_posstr(global_context, event -> event_small_side, small_side_text);
	absoffset_to_posstr(global_context, event -> event_large_side, large_side_text);

	SUBREADprintf("Event between %s and %s\n", small_side_text, large_side_text);
}
88 
/*
 * Finds the first entry of chromosome_table.read_offsets that is greater than
 * `linear` and returns the span belonging to that entry:
 *
 *     (read_offsets[n] - previous offset) - 2 * padding + 16
 *
 * where the previous offset is read_offsets[n-1], or nothing is subtracted for n == 0.
 *
 * Returns -1 when `linear`, taken relative to the previous offset, is below
 * `padding` or at/after `padding + span`; returns -2 when no entry of
 * read_offsets is greater than `linear`.
 *
 * thread_context is not used by this function.
 */
int get_offset_maximum_chro_pos(global_context_t * global_context, thread_context_t * thread_context, unsigned int linear){
	gene_offset_t * offset_table = &global_context -> chromosome_table;
	int entry_count = offset_table -> total_offsets;
	int lower = 0, upper = entry_count - 1;

	// Bisection narrows the range only roughly; the forward scan below finds the exact entry.
	while(lower < upper - 1){
		int middle = (lower + upper) / 2;

		if(linear > offset_table -> read_offsets[middle])
			lower = middle;
		else if(linear < offset_table -> read_offsets[middle])
			upper = middle;
		else
			break;
	}

	// Start a little before the bisection result and walk to the first offset above `linear`.
	int idx = max(0, lower - 2);
	while(idx < offset_table -> total_offsets && !(offset_table -> read_offsets[idx] > linear))
		idx++;

	if(!(idx < offset_table -> total_offsets))
		return -2;

	int span;
	unsigned int previous_offset = 0;

	if(idx == 0){
		span = offset_table -> read_offsets[0] - offset_table -> padding  *2 +16;
	}else{
		span = ( offset_table -> read_offsets[idx] - offset_table -> read_offsets[idx-1]  ) - offset_table -> padding  *2 +16;
		previous_offset = offset_table -> read_offsets[idx-1];
	}

	unsigned int relative_pos = linear - previous_offset;
	if(relative_pos < offset_table -> padding || relative_pos >= offset_table -> padding + span)
		return -1;

	return span;
}
123 
124 
// Recursively explains the part of a read that lies after (to the front of) the current position
// by trying to pass through known events (indels, junctions, fusions).
//
// read_head_abs_offset is the offset of the FIRST WANTED base; read_text points at that base and
// remainder_len is the number of read bases still to be explained. sofar_matched is the number of
// matched bases accumulated by the enclosing recursion levels.
// suggested_movement, when non-zero, makes the scan start at suggested_movement-1; otherwise the scan
// starts at 0 (do_not_jump) or at config.realignment_minimum_variant_distance.
//
// For every read position at which an event is found and the bases before it match well enough,
// the function records the section in explain_context->tmp_search_junctions, recurses on the rest
// of the read from the other side of the event, and then restores the temporary state.
// At every level it finally scores the plain (no further event) explanation of the remainder and
// offers it to new_explain_try_replace().
//
// qual_text is only forwarded to the recursive call; it is not otherwise read here.
void search_events_to_front(global_context_t * global_context, thread_context_t * thread_context, explain_context_t * explain_context, char * read_text , char * qual_text, unsigned int read_head_abs_offset, short remainder_len, short sofar_matched, int suggested_movement, int do_not_jump)
{
	short tested_read_pos;
	//				#warning "SUBREAD_151 REMOVE THIS ASSERTION! "
	//				if(remainder_len >= 102)SUBREADprintf("FATAL:%d\n", remainder_len );
	//				assert(remainder_len < 102);

	HashTable * event_table = NULL;
	chromosome_event_t * event_space = NULL;

	// Use the per-thread indel context when a thread context is given, else the global one.
	if(thread_context) {
		event_table = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_entry_table;
		event_space = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
	} else {
		event_table = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_entry_table;
		event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
	}

	gene_value_index_t * value_index = thread_context?thread_context->current_value_index:global_context->current_value_index ;

	// The scan is always done in fusion / long-deletion mode; otherwise only when the range check reports events.
	if((global_context ->  config.do_fusion_detection || global_context ->  config.do_long_del_detection)|| there_are_events_in_range(event_table->appendix1, read_head_abs_offset, remainder_len)) {

		int event_search_method;
		if((global_context ->  config.do_fusion_detection || global_context ->  config.do_long_del_detection))
			event_search_method = EVENT_SEARCH_BY_BOTH_SIDES;
		else
			event_search_method = EVENT_SEARCH_BY_SMALL_SIDE;

		// tested_read_pos is the index of the first base unwanted!


		int move_start = do_not_jump?0:global_context -> config.realignment_minimum_variant_distance;
		if(suggested_movement) move_start = suggested_movement-1;
		// NOTE(review): is_junction_scanned is only written in this function, never read.
		int is_junction_scanned = 0;

		// Scan only while tmp_search_junctions still has room for one more section.
		if(MAX_EVENTS_IN_READ - 1 > explain_context -> tmp_search_sections)
		for(tested_read_pos = move_start ; tested_read_pos <= remainder_len; tested_read_pos++)
		{
			int xk1, matched_bases_to_site;
			chromosome_event_t *site_events[MAX_EVENT_ENTRIES_PER_SITE+1];

			int jump_penalty = 0;

			// Offset of the last base before the tested position; the direction is reversed when the strand is jumped.
			unsigned potential_event_pos;
			if(explain_context -> current_is_strand_jumped)
				potential_event_pos = read_head_abs_offset - tested_read_pos +1;
			else
				potential_event_pos = read_head_abs_offset + tested_read_pos -1;
			// Bitmap pre-check: skip positions for which it reports no event before doing the table search.
			if(!check_event_bitmap(  event_table->appendix1, potential_event_pos )) continue;

			int search_types =  CHRO_EVENT_TYPE_INDEL | CHRO_EVENT_TYPE_JUNCTION | CHRO_EVENT_TYPE_FUSION;
			int site_events_no = search_event(global_context, event_table , event_space , potential_event_pos, event_search_method , search_types , site_events);


			// Disabled debug trace for a single read name.
			if(0 && FIXLENstrcmp("R010442852", explain_context -> read_name) == 0)
			{
				SUBREADprintf("FOUND THE EVENT FRONT:%d at %u\n", site_events_no, potential_event_pos);
				if(site_events_no)
					SUBREADprintf("EVENT0_type = %d\n", site_events[0]->event_type);
			}

			if(!site_events_no)continue;

			unsigned int tested_chro_begin;
			if(explain_context -> current_is_strand_jumped)
				tested_chro_begin = read_head_abs_offset - tested_read_pos + 1;
			else
				tested_chro_begin = read_head_abs_offset;

			// Number of read bases matching the reference between the section head and the event site.
			matched_bases_to_site = match_chro(read_text, value_index, tested_chro_begin, tested_read_pos, explain_context -> current_is_strand_jumped, global_context -> config.space_type);

			/*
			#warning "========= COMMENT TWO LINES ===================="
			SUBREADprintf("MBASETOSITE=%d, tested_read_pos=%d\n", matched_bases_to_site, tested_read_pos);
			SUBREADprintf("TXT=%s, tested_read_pos=%d\n", read_text, tested_chro_begin);
			*/

			int this_round_junction_scanned = 0;

			if(0 && FIXLENstrcmp("R010442852", explain_context -> read_name) == 0)
				SUBREADprintf("F_JUMP?  match=%d / tested=%d\n", matched_bases_to_site , tested_read_pos);

			// Try the events only while the try budget is not exhausted and either more than 70%
			// (9000 - 2000 out of 10000) of the bases before the site match, or maximise_sensitivity_indel is set.
			//#warning "========= remove - 2000 from next line ============="
			if(explain_context -> total_tries < REALIGN_TOTAL_TRIES && tested_read_pos >0 && ( matched_bases_to_site*10000/tested_read_pos > 9000 - 2000 || global_context->config.maximise_sensitivity_indel) )
				for(xk1 = 0; xk1 < site_events_no ; xk1++)
				{
					chromosome_event_t * tested_event = site_events[xk1];

					// For fully covered reads, fusions spanning more than MAX_DELETION_LENGTH are not followed.
					if(explain_context -> is_fully_covered && tested_event -> event_type == CHRO_EVENT_TYPE_FUSION && tested_event -> event_large_side - tested_event -> event_small_side > MAX_DELETION_LENGTH){
						continue;
					}
					//if(explain_context -> pair_number == 23)
					if(0 && FIXLENstrcmp("R010442852", explain_context -> read_name) == 0){
						SUBREADprintf("F_JUMP?%d > %d    %s (%u) ; SEARCH_TAG=%u\n", (1+matched_bases_to_site)*10000 / tested_read_pos , 9000, read_text, tested_chro_begin, potential_event_pos);
						debug_show_event(global_context, tested_event);

					}

					// note that these two values are the index of the first wanted base.
					unsigned int new_read_head_abs_offset;

					// In fusion / long-deletion mode events are found by both sides, so an indel
					// reached from its far side (small side when strand-jumped, large side otherwise) is skipped.
					if((global_context ->  config.do_fusion_detection || global_context ->  config.do_long_del_detection) && tested_event -> event_type == CHRO_EVENT_TYPE_INDEL)
					{
						if(explain_context ->current_is_strand_jumped){
							if(potential_event_pos == tested_event-> event_small_side) continue;
						}else{
							if(potential_event_pos == tested_event-> event_large_side) continue;
						}
					}
					if( tested_event -> event_type != CHRO_EVENT_TYPE_INDEL){
						this_round_junction_scanned = 1;
					}

					// The search continues from the side of the event opposite to the one we arrived at;
					// outside fusion / long-deletion mode that is always the large side.
					if((global_context ->  config.do_fusion_detection || global_context ->  config.do_long_del_detection))// && tested_event->event_type == CHRO_EVENT_TYPE_FUSION)
						new_read_head_abs_offset = (potential_event_pos == tested_event -> event_large_side)?tested_event -> event_small_side:tested_event -> event_large_side;
					else
						new_read_head_abs_offset = tested_event -> event_large_side;


					// min(0, indel_length) is never positive, so only a negative indel_length shortens the remainder;
					// indel_at_junction is subtracted as well.
					// NOTE(review): presumably a negative indel_length denotes bases inserted in the read -- confirm.
					short new_remainder_len = remainder_len - tested_read_pos + min(0, tested_event->indel_length) - tested_event -> indel_at_junction;

	//				#warning "SUBREAD_151 REMOVE THIS ASSERTION! "
				//	assert(new_remainder_len < 102);

					if(new_remainder_len>0)
					{
						//if(explain_context -> pair_number==2074) printf("JUMPPED IN.\n");

						// Close the current section at the event and open the next one on the other side of it.
						explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].read_pos_end = explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].read_pos_start + tested_read_pos;
						explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].event_after_section = tested_event;
						explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].is_connected_to_large_side = (potential_event_pos == tested_event -> event_large_side);
						explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections + 1].read_pos_start = tested_read_pos - min(0, tested_event -> indel_length) + tested_event -> indel_at_junction;
						explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections + 1].abs_offset_for_start = new_read_head_abs_offset;


						// Passing through a fusion costs two matched bases in the score handed to the recursion.
						if(tested_event->event_type == CHRO_EVENT_TYPE_FUSION) jump_penalty = 2;

						// Save the temporary state that is restored after the recursive call.
						int current_is_jumped = explain_context -> current_is_strand_jumped;
						int current_sup_as_complex = explain_context -> tmp_min_support_as_complex;
						int current_sup_as_simple = explain_context -> tmp_support_as_simple;
						//int current_unsup_as_simple = explain_context -> tmp_min_unsupport;
						int current_pure_donor_found = explain_context -> tmp_is_pure_donor_found_explain;

						explain_context -> tmp_support_as_simple = tested_event -> supporting_reads;
						// Events with bit 64 of is_donor_found_or_annotation set count as 0x7fffffff supporting reads here.
						// NOTE(review): the meaning of bit 64 is not visible in this part of the file.
						explain_context -> tmp_min_support_as_complex = min((tested_event -> is_donor_found_or_annotation & 64)?0x7fffffff:tested_event -> supporting_reads,explain_context -> tmp_min_support_as_complex);
						// NOTE(review): tmp_min_unsupport is lowered here but, unlike the other tmp_* values,
						// is not restored after the recursion (its save/restore lines are commented out).
						explain_context -> tmp_min_unsupport = min(tested_event -> anti_supporting_reads,explain_context -> tmp_min_unsupport);
						explain_context -> tmp_is_pure_donor_found_explain = explain_context -> tmp_is_pure_donor_found_explain && tested_event -> is_donor_found_or_annotation;
						explain_context -> tmp_indel_penalty += ( tested_event -> event_type == CHRO_EVENT_TYPE_INDEL );

						// A strand-jumping fusion flips the reading direction for the sections that follow.
						if(tested_event -> event_type == CHRO_EVENT_TYPE_FUSION && tested_event -> is_strand_jumped)
							explain_context -> current_is_strand_jumped = !explain_context -> current_is_strand_jumped;

						explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections + 1].is_strand_jumped = explain_context -> current_is_strand_jumped;
						explain_context -> tmp_search_sections ++;

						explain_context -> total_tries ++;
						// NOTE(review): read_text is advanced by indel_at_junction as well, qual_text is not.
						search_events_to_front(global_context, thread_context, explain_context, read_text + tested_event -> indel_at_junction + tested_read_pos -  min(0, tested_event->indel_length), qual_text + tested_read_pos -  min(0, tested_event->indel_length), new_read_head_abs_offset, new_remainder_len, sofar_matched + matched_bases_to_site - jump_penalty, tested_event -> connected_next_event_distance, 0);
						explain_context -> tmp_search_sections --;

						// Undo the temporary changes made for this event before trying the next one.
						explain_context -> current_is_strand_jumped = current_is_jumped;
						explain_context -> tmp_indel_penalty -= ( tested_event -> event_type == CHRO_EVENT_TYPE_INDEL );
						explain_context -> tmp_min_support_as_complex = current_sup_as_complex;
						explain_context -> tmp_support_as_simple = current_sup_as_simple;
						//explain_context -> tmp_min_unsupport = current_unsup_as_simple;
						explain_context -> tmp_is_pure_donor_found_explain = current_pure_donor_found;
					}
					//if(global_context ->config.limited_tree_scan) break;
				}
			// With limited_tree_scan and reads no longer than EXON_LONG_READ_LENGTH, stop after the first position that had events.
			if( (global_context ->config.limited_tree_scan) && explain_context -> full_read_len <= EXON_LONG_READ_LENGTH) break;
			is_junction_scanned = max(is_junction_scanned, this_round_junction_scanned);
		}
	}
	//#warning "SUBREAD_151 REMOVE THE ASSERT! "
	//assert( remainder_len< 102 );
	// Score the explanation that ends here without any further event: match the whole remainder in place.
	int whole_section_matched = match_chro(read_text , value_index, explain_context -> current_is_strand_jumped?read_head_abs_offset - remainder_len +1:read_head_abs_offset, remainder_len , explain_context -> current_is_strand_jumped, global_context -> config.space_type);

	explain_context -> tmp_total_matched_bases = whole_section_matched + sofar_matched ;

	// Offer the candidate to the best-result bookkeeping (0 = front search).
	new_explain_try_replace(global_context, thread_context, explain_context, remainder_len, 0);
}
306 
/*
 * Compares the candidate explanation held in the tmp_* fields of
 * explain_context with the best explanation recorded so far and stores the
 * candidate in the result_front_* / result_back_* arrays when it wins or ties.
 *
 * Candidates are ranked by (matched bases - indel penalty):
 *   - lower than the best : nothing is changed and the function returns;
 *   - higher than the best: every best_* field is refreshed and the candidate
 *     becomes the only stored alignment (slot 0) for the searched direction;
 *   - equal to the best   : the support counters decide whether the candidate
 *     replaces the best, is appended as a tie, or is dropped.
 *
 * remainder_len is used only for front searches, to set read_pos_end of the
 * last section. search_to_back selects the back (non-zero) or front (zero)
 * result arrays. global_context is used only by a disabled debug trace and
 * thread_context is not used.
 */
void new_explain_try_replace(global_context_t* global_context, thread_context_t * thread_context, explain_context_t * explain_context, int remainder_len, int search_to_back)
{
	explain_context_t * ec = explain_context;
	int candidate_wins = 0, candidate_ties = 0;

	if(0 && FIXLENstrcmp("simulated.11420793", ec->read_name)==0){
		SUBREADprintf("TRY_REPLACE : %s has best=%d, b_evn=%d, tscore=%d, t_evn=%d\n", ec->read_name, ec -> best_matching_bases ,  ec -> best_is_complex , ec-> tmp_total_matched_bases,  ec -> tmp_search_sections );
	}

	const int score_is_higher = ec -> best_matching_bases - ec -> best_indel_penalty < ec -> tmp_total_matched_bases - ec -> tmp_indel_penalty;
	const int score_is_equal = !score_is_higher && ec -> best_matching_bases - ec -> best_indel_penalty == ec -> tmp_total_matched_bases - ec -> tmp_indel_penalty;

	// A lower-scoring candidate changes nothing.
	if(!score_is_higher && !score_is_equal)
		return;

	if(score_is_higher){
		candidate_wins = 1;
		ec -> best_is_complex = ec -> tmp_search_sections;
		ec -> is_currently_tie = 0;
		ec -> best_support_as_simple = ec -> tmp_support_as_simple;
		ec -> best_min_unsupport_as_simple = ec -> tmp_min_unsupport;
		ec -> best_min_support_as_complex = ec -> tmp_min_support_as_complex;
		ec -> best_is_pure_donor_found_explain = ec -> tmp_is_pure_donor_found_explain;
		ec -> second_best_matching_bases = max(ec -> second_best_matching_bases, ec -> best_matching_bases);
		ec -> best_matching_bases = ec -> tmp_total_matched_bases;
		ec -> best_indel_penalty = ec -> tmp_indel_penalty;
	}else{
		// Equal score. Only gapped explanations add to the complexity counter.
		ec -> best_is_complex += ec -> tmp_search_sections;
		ec -> second_best_matching_bases = ec -> best_matching_bases;
		ec -> best_indel_penalty = ec -> tmp_indel_penalty;

		if(0 && FIXLENstrcmp("R010442852", ec -> read_name) == 0){
			SUBREADprintf("complexity: curr=%d, new=%d   ;   sections=%d\n", ec->best_min_support_as_complex, ec -> tmp_min_support_as_complex, ec -> tmp_search_sections );
		}

		if(ec -> best_is_complex > 1 && ec -> tmp_search_sections == 0){
			// Complex so far, ungapped candidate: its minimum unsupport is compared with the best minimum support.
			if(ec -> tmp_min_unsupport > ec -> best_min_support_as_complex){
				candidate_wins = 1;
				ec -> best_min_support_as_complex = ec -> tmp_min_unsupport;
				ec -> best_is_pure_donor_found_explain = ec -> tmp_is_pure_donor_found_explain;
				ec -> is_currently_tie = 0;
			}else if(ec -> tmp_min_unsupport == ec -> best_min_support_as_complex){
				ec -> is_currently_tie = 1;
				candidate_ties = 1;
			}
		}else if(ec -> best_is_complex > 1){
			// Complex so far, gapped candidate: the minimum supports are compared.
			if(ec -> tmp_min_support_as_complex > ec -> best_min_support_as_complex){
				candidate_wins = 1;
				ec -> best_min_support_as_complex = ec -> tmp_min_support_as_complex;
				ec -> best_is_pure_donor_found_explain = ec -> tmp_is_pure_donor_found_explain;
				ec -> is_currently_tie = 0;
			}else if(ec -> tmp_min_support_as_complex == ec -> best_min_support_as_complex){
				ec -> is_currently_tie = 1;
				candidate_ties = 1;
			}
		}else if(ec -> best_is_pure_donor_found_explain){
			// this branch is reached ONLY if the last best is ONE-gapped (50M3D50M) and the current best is ungapped (100M)!
			if(ec -> best_min_unsupport_as_simple >= ec -> best_support_as_simple+2){
				candidate_wins = 1;
				ec -> best_min_support_as_complex = ec -> best_min_unsupport_as_simple;
				ec -> best_is_pure_donor_found_explain = ec -> tmp_is_pure_donor_found_explain;
				ec -> is_currently_tie = 0;
			}
		}
		//#warning "======= MAKE if(0) IS CORRECT BEFORE RELEASE ======"
		else if(0){
			// Deliberately disabled alternative for explanations without a pure donor.
			if(ec -> best_min_unsupport_as_simple >= ec -> best_support_as_simple){
				candidate_wins = 1;
				ec -> best_min_support_as_complex = ec -> best_min_unsupport_as_simple;
				ec -> best_is_pure_donor_found_explain = ec -> tmp_is_pure_donor_found_explain;
				ec -> is_currently_tie = 0;
			}
		}
	}

	// Neither a win nor a tie: there is nothing to store.
	if(!candidate_wins && !candidate_ties)
		return;

	// Complete the last section of the candidate before it is copied.
	if(search_to_back){
		ec -> tmp_search_junctions[ec -> tmp_search_sections].read_pos_start =  0;
	}else{
		ec -> tmp_search_junctions[ec -> tmp_search_sections].read_pos_end = ec -> tmp_search_junctions[ec -> tmp_search_sections].read_pos_start + remainder_len;
		ec -> tmp_search_junctions[ec -> tmp_search_sections].event_after_section = NULL;
	}

	if(0 && FIXLENstrcmp("R010442852", ec -> read_name) == 0){
		SUBREADprintf("TRY_REPLACE_DESICION TO %s: BETTER=%d, SAME=%d ; CURRENT : %d secs ; NEWBEST : %d secs\n", search_to_back?"BACK":"FRONT", candidate_wins, candidate_ties, search_to_back? ec -> result_back_junction_numbers[0]:ec -> result_front_junction_numbers[0] ,ec -> tmp_search_sections);
		int sec_i;
		for(sec_i = 0; sec_i < ec -> tmp_search_sections;sec_i++){
			SUBREADprintf("  Event : %d ~ %d in read\n", ec -> tmp_search_junctions[sec_i].read_pos_start, ec -> tmp_search_junctions[sec_i].read_pos_end);
			if(ec -> tmp_search_junctions[sec_i].event_after_section){
				SUBREADprintf("    ");
				debug_show_event(global_context, ec -> tmp_search_junctions[sec_i].event_after_section);
			}
		}
	}

	// A winner restarts the list at slot 0; a tie is appended while there is still room.
	const size_t copied_bytes = sizeof(perfect_section_in_read_t) * (ec -> tmp_search_sections +1);
	int slot = 0;

	if(search_to_back){
		if(!candidate_wins){
			if(!(ec -> all_back_alignments < MAX_ALIGNMENT_PER_ANCHOR))
				return;
			slot = ec -> all_back_alignments;
		}
		ec -> result_back_junction_numbers[slot] = ec -> tmp_search_sections +1;
		// checked: memory boundary
		memcpy(ec -> result_back_junctions[slot], ec -> tmp_search_junctions , copied_bytes);
		ec -> all_back_alignments = slot + 1;
	}else{
		if(!candidate_wins){
			if(!(ec -> all_front_alignments < MAX_ALIGNMENT_PER_ANCHOR))
				return;
			slot = ec -> all_front_alignments;
		}
		ec -> result_front_junction_numbers[slot] = ec -> tmp_search_sections +1;
		// checked: memory boundary
		memcpy(ec -> result_front_junctions[slot], ec -> tmp_search_junctions , copied_bytes);
		ec -> all_front_alignments = slot + 1;
	}
}
447 
448 
449 
/*
 * Variant of new_explain_try_replace() that keeps exactly one best
 * explanation per search direction and never records ties.
 *
 * The candidate held in the tmp_* fields of explain_context replaces the
 * stored best when
 *   - it matches more bases than the best (tmp_total_matched_bases >
 *     best_matching_bases), or
 *   - it matches the same number of bases with fewer events than the
 *     explanation currently stored for the searched direction.
 * In every other case nothing is changed.
 *
 * The second condition is written with explicit grouping. Without it, the
 * conditional operator binds more loosely than both `<` and `&&`, so a form
 * such as `a < b ? x : y - 1 && c` is parsed as `(a < b) ? x : ((y - 1) && c)`.
 * That reading lets an ungapped back-search candidate replace the best even
 * with fewer matched bases, and makes the other back searches look at the
 * front junction count.
 *
 * remainder_len is used only for front searches, to set read_pos_end of the
 * last section. search_to_back selects the back (non-zero) or front (zero)
 * result arrays. global_context is used only by a disabled debug trace and
 * thread_context is not used.
 */
void new_explain_try_replace_xe(global_context_t* global_context, thread_context_t * thread_context, explain_context_t * explain_context, int remainder_len, int search_to_back)
{
	int is_better_result = 0;

	//SUBREADprintf("TRYING SET %s %s : Matched_bases : %d -> %d ; SECS : %d -> %d\n", explain_context -> read_name, search_to_back?"BACK":"FRONT", explain_context -> best_matching_bases, explain_context-> tmp_total_matched_bases, search_to_back? explain_context -> result_back_junction_numbers[0]:explain_context -> result_front_junction_numbers[0] ,explain_context -> tmp_search_sections );

	// result_*_junction_numbers[0] is stored below as tmp_search_sections + 1, so subtracting one
	// gives the number of events in the explanation currently kept for this direction.
	int kept_sections = (search_to_back ? explain_context -> result_back_junction_numbers[0] : explain_context -> result_front_junction_numbers[0]) - 1;

	if(explain_context -> best_matching_bases < explain_context -> tmp_total_matched_bases)
		is_better_result = 1;
	else if(explain_context -> best_matching_bases == explain_context -> tmp_total_matched_bases && explain_context -> tmp_search_sections < kept_sections)
		is_better_result = 1;

	// Ties are not recorded by this variant: anything that is not strictly better is dropped.
	if(!is_better_result)
		return;

	explain_context -> best_is_complex = explain_context -> tmp_search_sections ;
	explain_context -> is_currently_tie = 0;
	explain_context -> best_support_as_simple = explain_context -> tmp_support_as_simple;
	explain_context -> best_min_unsupport_as_simple = explain_context -> tmp_min_unsupport;
	explain_context -> best_min_support_as_complex = explain_context -> tmp_min_support_as_complex;
	explain_context -> best_is_pure_donor_found_explain = explain_context -> tmp_is_pure_donor_found_explain;
	explain_context -> second_best_matching_bases = max(explain_context -> second_best_matching_bases, explain_context -> best_matching_bases);
	explain_context -> best_matching_bases = explain_context-> tmp_total_matched_bases ;

	// Complete the last section of the candidate before it is copied.
	if(search_to_back){
		explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].read_pos_start =  0;
	}else{
		explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].read_pos_end = explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].read_pos_start + remainder_len;
		explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].event_after_section = NULL;
	}

	// Disabled debug trace for a single read name.
	if(0 && FIXLENstrcmp("R010442852", explain_context -> read_name) == 0){
		SUBREADprintf("RNAME=%s TRY_REPLACE_DESICION TO %s: BETTER=%d, SAME=%d ; CURRENT : %d secs ; NEWBEST : %d secs MM_bases : %d -> %d\n", explain_context -> read_name, search_to_back?"BACK":"FRONT", is_better_result, 0, search_to_back? explain_context -> result_back_junction_numbers[0]:explain_context -> result_front_junction_numbers[0] ,explain_context -> tmp_search_sections, explain_context -> best_matching_bases , explain_context-> tmp_total_matched_bases);
		int xx1;
		for(xx1 = 0; xx1 < explain_context -> tmp_search_sections;xx1++){
			SUBREADprintf("  Event : %d ~ %d in read\n", explain_context -> tmp_search_junctions[xx1].read_pos_start, explain_context -> tmp_search_junctions[xx1].read_pos_end);
			if(explain_context -> tmp_search_junctions[xx1].event_after_section){
				SUBREADprintf("    ");
				debug_show_event(global_context, explain_context -> tmp_search_junctions[xx1].event_after_section);
			}
		}
	}

	// The winner becomes the only stored alignment (slot 0) for the searched direction.
	if(search_to_back){
		explain_context -> all_back_alignments = 1;
		explain_context -> result_back_junction_numbers[0] = explain_context -> tmp_search_sections +1;
		// checked: memory boundary
		memcpy(explain_context -> result_back_junctions[0], explain_context -> tmp_search_junctions , sizeof(perfect_section_in_read_t) * (explain_context -> tmp_search_sections +1));
	}else{
		explain_context -> all_front_alignments = 1;
		explain_context -> result_front_junction_numbers[0] = explain_context -> tmp_search_sections +1;
		// checked: memory boundary
		memcpy(explain_context -> result_front_junctions[0], explain_context -> tmp_search_junctions , sizeof(perfect_section_in_read_t) * (explain_context -> tmp_search_sections +1));
	}
}
583 
584 
585 // read_tail_abs_offset is actually the offset of the base next to the last base in read tail.
586 // read_tail_pos is the FIRST UNWANTED BASE, after the read.
/*
 * Recursively explains the part of a read that lies BEFORE read_tail_pos by
 * walking towards the read start and trying every known event (indel, junction
 * or fusion) found at the corresponding chromosome location.
 *
 * For each tried event the current section is pushed onto
 * explain_context -> tmp_search_junctions, the function recurses with the tail
 * moved to the other side of the event, and the temporary state is popped again.
 * At the end of every invocation the remaining read head [0, read_tail_pos) is
 * matched against the chromosome and the candidate is handed to
 * new_explain_try_replace().
 *
 * Parameters:
 *   thread_context        may be NULL; the event table, event space and value index
 *                         are then taken from global_context instead.
 *   read_text             read bases; indexed by read position.
 *   qual_text             not examined here; only forwarded to the recursive call.
 *   read_tail_abs_offset  chromosome offset of the base next to the section tail.
 *   read_tail_pos         first unwanted read base after the section.
 *   sofar_matched         matched bases accumulated by the callers up the recursion.
 *   suggested_movement    when non-zero, the scan starts at
 *                         read_tail_pos - suggested_movement + 1.
 *   do_not_jump           when non-zero, the scan start is not pulled back by
 *                         config.realignment_minimum_variant_distance.
 */
void search_events_to_back(global_context_t * global_context, thread_context_t * thread_context, explain_context_t * explain_context, char * read_text , char * qual_text, unsigned int read_tail_abs_offset, short read_tail_pos, short sofar_matched, int suggested_movement, int do_not_jump)
{
	short tested_read_pos;

	HashTable * event_table = NULL;
	chromosome_event_t * event_space = NULL;

	// Prefer the per-thread event storage; fall back to the global one when no thread context is given.
	if(thread_context) {
		event_table = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_entry_table;
		event_space = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
	} else {
		event_table = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_entry_table;
		event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
	}

	gene_value_index_t * value_index = thread_context?thread_context->current_value_index:global_context->current_value_index ;
	// The scan is skipped entirely when no event is flagged in the covered range,
	// unless fusion or long-deletion detection is enabled.
	if( there_are_events_in_range(event_table -> appendix2, read_tail_abs_offset - read_tail_pos, read_tail_pos) || global_context ->  config.do_fusion_detection ||global_context ->  config.do_long_del_detection ){
		int event_search_method;
		if((global_context ->  config.do_fusion_detection || global_context ->  config.do_long_del_detection))
			event_search_method = EVENT_SEARCH_BY_BOTH_SIDES;
		else
			event_search_method = EVENT_SEARCH_BY_LARGE_SIDE;


		int is_junction_scanned = 0;
		// minimum perfect section length is 1
		// tested_read_pos is the first WANTED BASE in section.
		int move_start = read_tail_pos - (do_not_jump?0:global_context -> config.realignment_minimum_variant_distance);
		// A non-zero suggestion overrides the default starting point.
		if(suggested_movement) move_start = read_tail_pos - suggested_movement + 1;

	//#warning ">>>>>>>>>>>>>> COMMENT THIS <<<<<<<<<<<<<<<<<<<<<"
	//printf("OCT27-STEP-BKIN : %s , STT=%d, %u, %d\n", explain_context -> read_name, move_start, read_tail_abs_offset, read_tail_pos);

		// Only scan when there is still room for one more section in tmp_search_junctions.
		if(MAX_EVENTS_IN_READ - 1> explain_context -> tmp_search_sections)
		for(tested_read_pos =  move_start; tested_read_pos >=0;tested_read_pos --)
		{
			int xk1, matched_bases_to_site;
			// NOTE(review): declared once per tested_read_pos; after a fusion event sets it to 2
			// it stays 2 for the remaining events of the same site -- confirm this is intended.
			int jump_penalty = 0;
			chromosome_event_t *site_events[MAX_EVENT_ENTRIES_PER_SITE];

			int potential_event_pos;

			// On a strand-jumped section the chromosome coordinate grows while the read position shrinks.
			if(explain_context -> current_is_strand_jumped)
				potential_event_pos = read_tail_abs_offset + ( read_tail_pos - tested_read_pos);
			else
				potential_event_pos = read_tail_abs_offset - ( read_tail_pos - tested_read_pos);


			// Cheap bitmap test before the full event lookup.
			if(!check_event_bitmap(  event_table->appendix2, potential_event_pos )) continue;
			int search_types = CHRO_EVENT_TYPE_INDEL | CHRO_EVENT_TYPE_JUNCTION | CHRO_EVENT_TYPE_FUSION;
			int site_events_no = search_event(global_context, event_table , event_space , potential_event_pos, event_search_method , search_types, site_events);
			//#warning ">>>>>>>>>>>>>> COMMENT THIS <<<<<<<<<<<<<<<<<<<<<"
			//printf("OCT27-STEP-BKIN-SR: %s at %u, FOUND=%d\n" , explain_context -> read_name,potential_event_pos,site_events_no);

			if(!site_events_no)continue;

			unsigned int tested_chro_begin;
			if(explain_context -> current_is_strand_jumped)
				tested_chro_begin = read_tail_abs_offset + 1;
			else
				tested_chro_begin = read_tail_abs_offset - (read_tail_pos - tested_read_pos);

			// Matched bases of the section [tested_read_pos, read_tail_pos) against the chromosome.
			matched_bases_to_site = match_chro(read_text + tested_read_pos, value_index, tested_chro_begin , read_tail_pos - tested_read_pos, explain_context -> current_is_strand_jumped, global_context -> config.space_type);

			int this_round_junction_scanned = 0;

			//#warning ">>>>>>>>>>>>>>>> REMOVE IT <<<<<<<<<<<<<<<<<<<<<<"
			//printf("OCT27-STEPSB-JB-%s: test %u = %d events; TEST=%d > 7000 : MA=%d; %s ; %u = %u - (%d - %d) ; LEV=%d\n", explain_context -> read_name, potential_event_pos, site_events_no, (read_tail_pos<=tested_read_pos)?(-1234):( matched_bases_to_site*10000/(read_tail_pos - tested_read_pos)) , matched_bases_to_site, read_text + tested_read_pos, potential_event_pos, read_tail_abs_offset, read_tail_pos, tested_read_pos, explain_context -> tmp_search_sections);
			//#warning "========= remove - 2000 from next line ============="
			// Events are tried only while the try budget is not exhausted, the section is non-empty,
			// and more than 70% (9000 - 2000 out of 10000) of the section matches -- or always when
			// maximise_sensitivity_indel is set.
			if(explain_context -> total_tries < REALIGN_TOTAL_TRIES && (read_tail_pos>tested_read_pos) && ( matched_bases_to_site*10000/(read_tail_pos - tested_read_pos) > 9000 - 2000 || global_context->config.maximise_sensitivity_indel) )
				for(xk1 = 0; xk1 < site_events_no ; xk1++)
				{
					chromosome_event_t * tested_event = site_events[xk1];

					// A fully covered read does not follow fusions longer than MAX_DELETION_LENGTH.
					if(explain_context -> is_fully_covered && tested_event -> event_type == CHRO_EVENT_TYPE_FUSION && tested_event -> event_large_side - tested_event -> event_small_side > MAX_DELETION_LENGTH){
						continue;
					}

					// With both-sides search an indel is found from either end; skip the end that
					// does not apply to the current scanning direction.
					if((global_context ->  config.do_fusion_detection || global_context ->  config.do_long_del_detection) && tested_event -> event_type == CHRO_EVENT_TYPE_INDEL)
					{
						if(explain_context->current_is_strand_jumped){
							if(potential_event_pos == tested_event-> event_large_side) continue;
						}else{
							if(potential_event_pos == tested_event-> event_small_side) continue;
						}
					}
					if( tested_event -> event_type != CHRO_EVENT_TYPE_INDEL){
						this_round_junction_scanned = 1;
					}


					if(0 && strcmp("S_chr901_565784_72M8D28M", explain_context -> read_name) == 0)
						SUBREADprintf("B_JUMP?%d > %d TLEN=%d \n", (1+matched_bases_to_site)*10000 / (read_tail_pos - tested_read_pos) , 9000, read_tail_pos - tested_read_pos);
					// note that read_tail_pos is the first unwanted base.
					int new_read_tail_pos = tested_read_pos;
					// Only a negative indel_length moves the tail in the read
					// (presumably an insertion that consumes read bases -- TODO confirm the sign convention).
					if(tested_event->event_type == CHRO_EVENT_TYPE_INDEL) new_read_tail_pos +=  min(0, tested_event -> indel_length);
					// note that read_tail_abs_offset is the first unwanted base.
					unsigned int new_read_tail_abs_offset;

					if((global_context ->  config.do_fusion_detection || global_context ->  config.do_long_del_detection))// && tested_event->event_type == CHRO_EVENT_TYPE_FUSION)
					{
						// Continue from the opposite side of the event, stepping one base in the
						// direction implied by the combined strand state.
						new_read_tail_abs_offset = (potential_event_pos == tested_event -> event_small_side)? tested_event -> event_large_side : tested_event -> event_small_side;
						if(tested_event->is_strand_jumped + explain_context -> current_is_strand_jumped == 1)
							new_read_tail_abs_offset--;
						else
							new_read_tail_abs_offset++;
					}
					else
						new_read_tail_abs_offset = tested_event -> event_small_side + 1;

					new_read_tail_pos -= tested_event -> indel_at_junction;

					// Recurse only if at least one read base remains before the event.
					if(new_read_tail_pos>0)
					{
						// Push: close the current section and open the next one behind the event.
						explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections].read_pos_start = tested_read_pos;
						explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections + 1].event_after_section = tested_event;
						explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections + 1].is_connected_to_large_side = (potential_event_pos == tested_event -> event_small_side);
						explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections + 1].read_pos_end = tested_read_pos + min(0, tested_event->indel_length) - tested_event -> indel_at_junction;
						explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections + 1].abs_offset_for_start = new_read_tail_abs_offset;

						if(tested_event->event_type == CHRO_EVENT_TYPE_FUSION) jump_penalty = 2;

						// Snapshot of the state that is restored after the recursive call.
						int current_is_jumped = explain_context -> current_is_strand_jumped ;
						int current_sup_as_complex = explain_context -> tmp_min_support_as_complex;
						int current_sup_as_simple = explain_context -> tmp_support_as_simple;
						int current_pure_donor_found = explain_context -> tmp_is_pure_donor_found_explain;

						explain_context -> tmp_support_as_simple = tested_event -> supporting_reads;
						// Events with bit 64 set in is_donor_found_or_annotation never lower the minimum support.
						explain_context -> tmp_min_support_as_complex = min((tested_event -> is_donor_found_or_annotation & 64)?0x7fffffff:tested_event -> supporting_reads,explain_context -> tmp_min_support_as_complex);
						// NOTE(review): tmp_min_unsupport is updated here but not restored after the recursion below.
						explain_context -> tmp_min_unsupport = min(tested_event -> anti_supporting_reads,explain_context -> tmp_min_unsupport);
						explain_context -> tmp_is_pure_donor_found_explain = explain_context -> tmp_is_pure_donor_found_explain && tested_event -> is_donor_found_or_annotation;
						explain_context -> tmp_indel_penalty += ( tested_event -> event_type == CHRO_EVENT_TYPE_INDEL );

						// A strand-jumping fusion flips the strand state for everything behind it.
						if(tested_event -> event_type == CHRO_EVENT_TYPE_FUSION && tested_event -> is_strand_jumped)
							explain_context -> current_is_strand_jumped = !explain_context -> current_is_strand_jumped;
						explain_context -> tmp_search_junctions[explain_context -> tmp_search_sections + 1].is_strand_jumped = explain_context -> current_is_strand_jumped;
						explain_context -> tmp_search_sections ++;
						// total_tries is a budget over the whole search; it is never decremented.
						explain_context -> total_tries ++;

						search_events_to_back(global_context, thread_context, explain_context, read_text , qual_text, new_read_tail_abs_offset , new_read_tail_pos, sofar_matched + matched_bases_to_site - jump_penalty, tested_event -> connected_previous_event_distance, 0);

						// Pop: undo the temporary state changes made for this event.
						explain_context -> tmp_search_sections --;
						explain_context -> tmp_indel_penalty -= ( tested_event -> event_type == CHRO_EVENT_TYPE_INDEL );
						explain_context -> current_is_strand_jumped = current_is_jumped;
						explain_context -> tmp_min_support_as_complex = current_sup_as_complex;
						explain_context -> tmp_support_as_simple = current_sup_as_simple;
						explain_context -> tmp_is_pure_donor_found_explain = current_pure_donor_found;
					}
				}
			// With limited_tree_scan, reads no longer than EXON_LONG_READ_LENGTH stop after the
			// first read position at which events were found (the `continue`s above bypass this line).
			if(( global_context ->config.limited_tree_scan) && explain_context -> full_read_len <= EXON_LONG_READ_LENGTH) break;
			// NOTE(review): dead store -- this_round_junction_scanned goes out of scope right after,
			// and is_junction_scanned is never assigned anything but 0.
			this_round_junction_scanned = max(this_round_junction_scanned, is_junction_scanned);
		}
	}
	// Candidate without any further event: match the whole remaining head [0, read_tail_pos).
	int whole_section_matched = match_chro(read_text , value_index, read_tail_abs_offset - (explain_context -> current_is_strand_jumped?-1:read_tail_pos), read_tail_pos , explain_context -> current_is_strand_jumped, global_context -> config.space_type);

	explain_context -> tmp_total_matched_bases = whole_section_matched + sofar_matched ;

	// Offer the candidate for evaluation (presumably remainder_len = 0, search_to_back = 1 -- confirm against the callee's signature).
	new_explain_try_replace(global_context, thread_context, explain_context, 0, 1);
}
746 
/*
 * Initialises the fragment lists and bucketed tables kept in the global context
 * (funky_*, breakpoint_table_*, translocation_result_table, inversion_result_table).
 * Always returns 0; the return values of the individual init calls are not inspected.
 */
int init_junction_tables(global_context_t * context)
{
	fraglist_init(&context -> funky_list_A);
	fraglist_init(&context -> funky_list_DE);

	// The second argument is derived from a distance tolerance (presumably the bucket width -- TODO confirm against bktable_init).
	bktable_init(&context -> funky_table_BC, FUNKY_COLOCATION_TOLERANCE * 2, 10000000);
	bktable_init(&context -> funky_table_DE, FUNKY_COLOCATION_TOLERANCE * 2, 10000000);

	// Tables P and YZ are sized from the configured maximum pair distance, QR from the break-point tolerance.
	bktable_init(&context -> breakpoint_table_P, 2 * context -> config.maximum_pair_distance, 1000000);
	bktable_init(&context -> breakpoint_table_QR, 2 * BREAK_POINT_MAXIMUM_TOLERANCE, 1000000);
	bktable_init(&context -> breakpoint_table_YZ, 2 * context -> config.maximum_pair_distance, 1000000);

	bktable_init(&context -> translocation_result_table, 2*BREAK_POINT_MAXIMUM_TOLERANCE, 1000000);
	bktable_init(&context -> inversion_result_table, 2*BREAK_POINT_MAXIMUM_TOLERANCE, 1000000);
	return 0;
}
763 
/*
 * Releases the fragment lists and bucketed tables kept in the global context.
 * Always returns 0.
 */
int destroy_junction_tables(global_context_t * context)
{
	fraglist_destroy(&context -> funky_list_A);
	fraglist_destroy(&context -> funky_list_DE);

	bktable_destroy(&context -> funky_table_BC);
	bktable_destroy(&context -> funky_table_DE);
	bktable_destroy(&context -> breakpoint_table_P);
	bktable_destroy(&context -> breakpoint_table_QR);
	bktable_destroy(&context -> breakpoint_table_YZ);

	// The two result tables get bktable_free_ptrs applied to every entry before being destroyed
	// (presumably because their entries point to separately allocated records -- TODO confirm).
	HashTableIteration(context -> inversion_result_table.entry_table , bktable_free_ptrs);
	bktable_destroy(&context -> inversion_result_table);

	HashTableIteration(context -> translocation_result_table.entry_table , bktable_free_ptrs);
	bktable_destroy(&context -> translocation_result_table);

	return 0;
}
/*
 * Per-thread initialisation hook of this module.
 * Nothing is set up here: all three arguments are ignored and 0 is returned.
 */
int init_junction_thread_contexts(global_context_t * global_context, thread_context_t * thread_context, int task)
{
	(void) global_context;
	(void) thread_context;
	(void) task;

	return 0;
}
787 
/*
 * Inserts a (votes, start, end) triple into big_margin_record, which is laid out as
 * consecutive triples. The triple is placed before the first existing triple
 * whose vote number is not larger than `votes`; later values are shifted three
 * slots towards the end and the values pushed past the end are dropped.
 * Nothing is stored when every existing triple has more votes.
 *
 * For negative-strand reads the section is mirrored to (read_len - end, read_len - start).
 * The array length is config.big_margin_record_size; sizes below 3 disable the record.
 */
void insert_big_margin_record(global_context_t * global_context , unsigned short * big_margin_record, unsigned char votes, short read_pos_start, short read_pos_end, int read_len, int is_negative)
{
	int capacity = global_context->config.big_margin_record_size;
	int triples = capacity / 3;
	int slot = 0;
	unsigned short section_start, section_end;

	if(capacity < 3) return;

	if(is_negative){
		section_start = read_len - read_pos_end;
		section_end = read_len - read_pos_start;
	}else{
		section_start = read_pos_start;
		section_end = read_pos_end;
	}

	// locate the first triple that does not out-vote the new one
	while(slot < triples && votes < big_margin_record[slot * 3])
		slot++;

	if(slot >= triples) return;

	// make room: move everything from the chosen triple onwards three slots up
	if(capacity - 3 - slot * 3 > 0)
		memmove(big_margin_record + slot * 3 + 3, big_margin_record + slot * 3, sizeof(unsigned short) * (capacity - 3 - slot * 3));

	big_margin_record[slot * 3] = votes;
	big_margin_record[slot * 3 + 1] = section_start;
	big_margin_record[slot * 3 + 2] = section_end;
}
812 
/*
 * Tells whether two mapping locations look like a proper pair.
 *
 * The distance is pos2 - pos1, extended by the length of the read that lies
 * further downstream (rlen2 when pos1 < pos2, rlen1 subtracted when pos1 > pos2,
 * the longer read when both positions are equal).
 *
 * Returns 1 only if
 *   - the absolute distance lies within
 *     [config.minimum_pair_distance, config.maximum_pair_distance],
 *   - both reads are on the same strand (any positive flag counts as negative strand), and
 *   - the read order agrees with the strand: pos1 <= pos2 on the positive strand,
 *     pos1 >= pos2 on the negative strand.
 * Returns 0 otherwise.
 */
int is_PE_distance(global_context_t * global_context, unsigned int pos1,  unsigned int pos2, int rlen1, int rlen2, int is_negative_R1, int is_negative_R2)
{
	long long int dist = pos2;
	dist -= pos1;

	is_negative_R1 = (is_negative_R1>0)?1:0;
	is_negative_R2 = (is_negative_R2>0)?1:0;

	if(pos1 > pos2) dist -= rlen1;
	else if(pos1 < pos2) dist += rlen2;
	else dist += max(rlen2, rlen1);

	// llabs() rather than abs(): dist is 64-bit and can exceed the int range when the two
	// positions are far apart; abs() would truncate it and could accept a bogus distance.
	long long int abs_dist = llabs(dist);
	if(abs_dist > global_context->config.maximum_pair_distance || abs_dist < global_context->config.minimum_pair_distance) return 0;

	if(is_negative_R1 != is_negative_R2) return 0;
	if(pos1 > pos2 && !is_negative_R1) return 0;
	if(pos1 < pos2 && is_negative_R1) return 0;
	return 1;
}
832 
833 
834 #define MAX_VOTE_TOLERANCE 1
835 //returns 1 if the vote number is not significantly higher than the vote numbers in the vote list.
/*
 * Checks whether the minor candidate votes[minor_i][minor_j] is ambiguous.
 *
 * The test only applies when the minor and the major candidate are further apart
 * than config.maximum_intron_length. In that case every other candidate in the
 * vote table is examined; the minor candidate is called "small margin" if some
 * other candidate
 *   - has at least as many votes as the minor candidate (the minor one must lead
 *     by one vote or more to be safe), and
 *   - covers nearly the same part of the read: both coverage boundaries differ by
 *     less than 7 bases after negative-strand coverage has been mirrored using read_len.
 *
 * Returns 1 if such a competitor exists, 0 otherwise.
 */
int test_small_minor_votes(global_context_t * global_context, int minor_i, int minor_j, int major_i, int major_j , gene_vote_t * votes, int read_len)
{
	long long dist = votes -> pos[minor_i][minor_j];
	dist -= votes -> pos[major_i][major_j];

	// llabs() rather than abs(): dist is 64-bit and abs() would truncate it to int.
	if(llabs(dist) <= global_context->config.maximum_intron_length) return 0;

	int iii, jjj;
	for(iii=0; iii<GENE_VOTE_TABLE_SIZE; iii++)
	{
		for(jjj = 0; jjj < votes->items[iii]; jjj++)
		{
			if(iii == minor_i && jjj == minor_j) continue;
			// A lead of one vote or more is enough to ignore this competitor.
			if(votes -> votes[minor_i][minor_j] - votes -> votes[iii][jjj] >=1) continue;

			int minor_coverage_start = votes -> coverage_start[minor_i][minor_j] ;
			int minor_coverage_end = votes -> coverage_end[minor_i][minor_j] ;

			int other_coverage_start = votes -> coverage_start[iii][jjj];
			int other_coverage_end = votes -> coverage_end[iii][jjj];

			int minor_negative = votes -> masks[minor_i][minor_j] & IS_NEGATIVE_STRAND;
			int other_negative = votes -> masks[iii][jjj] & IS_NEGATIVE_STRAND;

			// Mirror negative-strand coverage so both candidates use the same read orientation.
			if(minor_negative) {
				int ttt = read_len - minor_coverage_end;
				minor_coverage_end = read_len - minor_coverage_start;
				minor_coverage_start = ttt;
			}

			if(other_negative){
				int ttt = read_len - other_coverage_end;
				other_coverage_end = read_len - other_coverage_start;
				other_coverage_start = ttt;
			}

			if(abs(minor_coverage_end - other_coverage_end) < 7 && abs(minor_coverage_start - other_coverage_start)<7)
				return 1;
		}
	}
	return 0;
}
884 
885 
886 // function test_junction_minor returns 1 if the current anchor and current_vote[i][j] are not good mates in terms of junction reads:
887 // for example, if the distance is too far, if the coverered region overlapped or if the two mapped parts in the read are reversely arranged (expect in fusion detection)
test_junction_minor(global_context_t * global_context,thread_context_t * thread_context,gene_vote_t * votes,int vote_i,int vote_j,int i,int j,long long int dist)888 int test_junction_minor(global_context_t * global_context, thread_context_t * thread_context, gene_vote_t * votes, int vote_i, int vote_j, int i, int j, long long int dist)
889 {
890 	if(abs(dist)> global_context->config.maximum_intron_length) return 1;
891 	if(votes -> coverage_start[vote_i][vote_j] == votes -> coverage_start[i][j])return 2;
892 	if(votes -> coverage_end[vote_i][vote_j]   == votes -> coverage_end[i][j])return 3;
893 
894 	//SUBREADprintf( " COV_IN_READ: %d ~ %d         CHRO_POS: %u ~ %u \n", votes -> coverage_start[vote_i][vote_j] , votes -> coverage_start[i][j] , votes -> pos[vote_i][vote_j] ,  votes -> pos[i][j] );
895 	if(votes -> coverage_start[vote_i][vote_j] > votes -> coverage_start[i][j])
896 	{
897 		if(votes -> pos[vote_i][vote_j] < votes -> pos[i][j])return 4;
898 	}
899 	else
900 	{
901 		if(votes -> pos[vote_i][vote_j] > votes -> pos[i][j])return 5;
902 	}
903 
904 	return 0;
905 }
906 
/*
 * Keeps top_buffer_3i as a descending list of the config.top_scores largest
 * distinct values seen so far.
 *
 * new_value is inserted at its rank (pushing the smallest entry out) if it is
 * larger than the current last entry and not already present; otherwise the
 * buffer is left untouched.
 */
void update_top_three(global_context_t * global_context, int * top_buffer_3i, int new_value){
	int slots = global_context -> config.top_scores;
	int rank;

	// not better than the weakest kept value: nothing to do
	if(new_value <= top_buffer_3i[slots - 1]) return;

	for(rank = 0; rank < slots; rank++){
		if(new_value == top_buffer_3i[rank]) return;	// already recorded
		if(new_value < top_buffer_3i[rank]) continue;

		// shift the tail down by one, dropping the last value, and insert here
		memmove(top_buffer_3i + rank + 1, top_buffer_3i + rank, sizeof(int) * (slots - 1 - rank));
		top_buffer_3i[rank] = new_value;
		return;
	}
}
922 
923 
924 
comb_sort_compare(void * Vcomb_buffer,int i,int j)925 int comb_sort_compare(void * Vcomb_buffer, int i, int j){
926 	vote_combination_t * comb_buffer = (vote_combination_t *)Vcomb_buffer;
927 	return comb_buffer[i].score_adj - comb_buffer[j].score_adj;
928 }
929 
comb_sort_exchange(void * Vcomb_buffer,int i,int j)930 void comb_sort_exchange(void * Vcomb_buffer, int i, int j){
931 	vote_combination_t * comb_buffer = (vote_combination_t *)Vcomb_buffer;
932 	vote_combination_t tmpv;
933 	memcpy(&tmpv, comb_buffer + i, sizeof(vote_combination_t));
934 	memcpy(comb_buffer + i, comb_buffer + j, sizeof(vote_combination_t));
935 	memcpy(comb_buffer + j, &tmpv, sizeof(vote_combination_t));
936 }
937 
comb_sort_merge(void * Vcomb_buffer,int start,int items,int items2)938 void comb_sort_merge(void * Vcomb_buffer, int start, int items, int items2){
939 	vote_combination_t * comb_buffer = (vote_combination_t *)Vcomb_buffer;
940 	vote_combination_t * merge_target = malloc(sizeof(vote_combination_t) * (items + items2));
941 
942 	int items1_cursor = start, items2_cursor = start + items, x1;
943 
944 	for(x1=0; x1 < items+items2; x1++){
945 		int select_items_1 = (items1_cursor < items + start && comb_sort_compare(comb_buffer, items1_cursor, items2_cursor) <=0) || (items2_cursor == start + items + items2);
946 		if(select_items_1){
947 			memcpy(merge_target+x1, comb_buffer+items1_cursor, sizeof(vote_combination_t));
948 			items1_cursor++;
949 		}else{
950 			memcpy(merge_target+x1, comb_buffer+items2_cursor, sizeof(vote_combination_t));
951 			items2_cursor++;
952 		}
953 
954 	}
955 
956 	memcpy(comb_buffer + start, merge_target, (items+items2) * sizeof(vote_combination_t));
957 	free(merge_target);
958 
959 }
960 
/*
 * Decides whether a new minor candidate beats the one stored in junc_res.
 *
 * Candidates are ranked by, in this order:
 *   1. more votes (Vote_minor vs. junc_res -> minor_votes),
 *   2. longer coverage (coverage_minor_length vs. the stored minor coverage span),
 *   3. shorter intron (intron vs. old_intron_length).
 *
 * Returns 1 if the new candidate is strictly better, 0 otherwise.
 * global_context and thread_context are not used.
 */
int is_better_inner(global_context_t * global_context, thread_context_t * thread_context, subjunc_result_t * junc_res, int old_intron_length,  gene_vote_number_t Vote_minor, int coverage_minor_length, int intron)
{
	// 1st key: vote number
	if(Vote_minor > junc_res -> minor_votes) return 1;
	if(!(Vote_minor == junc_res -> minor_votes)) return 0;

	// 2nd key: covered length in the read
	if(coverage_minor_length > junc_res -> minor_coverage_end - junc_res -> minor_coverage_start) return 1;
	if(!(coverage_minor_length == junc_res -> minor_coverage_end - junc_res -> minor_coverage_start)) return 0;

	// 3rd key: the shorter intron wins
	if(intron < old_intron_length) return 1;
	return 0;
}
969 
970 
971 #define COVERAGE_STAB_NUMBER 100
/*
 * Tests whether the read is (almost) entirely covered by votes around one location.
 *
 * Pass 1 collects up to COVERAGE_STAB_NUMBER anchor locations: candidates with
 * more than 2 votes that are not within MAX_DELETION_LENGTH of an already
 * collected anchor on the same strand.
 * Pass 2 projects the read coverage of every candidate (at least 1 vote) that
 * lies within MAX_DELETION_LENGTH of an anchor on the same strand onto a 64-bin
 * bitmap of the read kept for that anchor.
 *
 * Returns 1 if any anchor ends up with more than 54 of its 64 bins covered,
 * 0 otherwise (including when there is no anchor or read_length is not positive).
 * global_context is not used.
 */
int test_fully_covered(global_context_t * global_context, gene_vote_t *  vote, int read_length){
	int i,j,xk1,xk2;
	char local_strands[COVERAGE_STAB_NUMBER];
	unsigned int local_locations[COVERAGE_STAB_NUMBER];
	unsigned long long local_coverage[COVERAGE_STAB_NUMBER];
	int used_stabs = 0;

	// The bin computation below divides by read_length.
	if(read_length < 1) return 0;

	for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
	{
		for (j=0; j< vote->items[i]; j++)
		{
			if(vote -> votes[i][j]>2 && used_stabs < COVERAGE_STAB_NUMBER)
			{
				int is_fresh = 1;
				int is_negative = (vote -> masks[i][j] & IS_NEGATIVE_STRAND)?1:0;
				for(xk1=0; xk1<used_stabs; xk1++){
					if(local_strands[xk1] == is_negative){
						long long dist = vote -> pos[i][j];
						dist -= local_locations[xk1];
						// llabs() rather than abs(): dist is 64-bit and abs() would truncate it to int.
						if(llabs(dist) < MAX_DELETION_LENGTH)
						{
							is_fresh=0;
							break;
						}
					}
				}

				if(is_fresh){
					local_strands[used_stabs]=is_negative;
					local_locations[used_stabs]= vote -> pos[i][j];
					local_coverage[used_stabs] = 0;
					used_stabs++;
				}
			}
		}
	}
	if(!used_stabs) return 0;

	for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
	{
		for (j=0; j< vote->items[i]; j++)
		{
			if(vote -> votes[i][j]>=1)
			{
				int is_negative = (vote -> masks[i][j] & IS_NEGATIVE_STRAND)?1:0;

				// Bins of the read touched by this candidate. They are clamped to 0..63 because
				// a coverage end equal to read_length maps to bin 64, and shifting a 64-bit
				// value by 64 (or by a negative amount) is undefined behaviour.
				int first_bin = vote -> coverage_start[i][j] * 64 / read_length;
				int last_bin = vote -> coverage_end[i][j] * 64 / read_length;
				if(first_bin < 0) first_bin = 0;
				if(last_bin > 63) last_bin = 63;

				for(xk1=0; xk1<used_stabs; xk1++){
					if(local_strands[xk1] == is_negative){
						long long dist = vote -> pos[i][j];
						dist -= local_locations[xk1];
						if(llabs(dist) < MAX_DELETION_LENGTH)
						{
							for(xk2 = first_bin; xk2 <= last_bin; xk2++){
								local_coverage[xk1] |= 1llu<<xk2;
							}
						}
					}
				}
			}
		}
	}

	for(xk1=0; xk1<used_stabs; xk1++){
		// Count the covered bins of this anchor.
		int covered = 0;
		for(xk2 = 0; xk2<64; xk2++){
			covered += ( local_coverage[xk1] & (1llu<<xk2) )?1:0;
		}
		//SUBREADprintf("COVERAGE LEVEL=%d\n", covered);

		if(covered > 54){
			return 1;
		}
	}

	return 0;
}
1048 
/*
 * Quality filter for a long deletion supported by two read parts (p1 and p2).
 *
 * Returns 1 only if both parts have at least 3 votes, the earlier of the two
 * starts is within the first 10 bases of the read, and the later of the two
 * ends is within the last 10 bases of the read; returns 0 otherwise.
 * global_context and thread_context are not used.
 */
int is_long_del_high_quality(global_context_t * global_context, thread_context_t * thread_context, int p1_start, int p1_end, int p2_start, int p2_end, int read_len, int p1_votes, int p2_votes){
	int leftmost_start = (p1_start < p2_start) ? p1_start : p2_start;
	int rightmost_end = (p1_end < p2_end) ? p2_end : p1_end;
	int well_voted = (p1_votes >= 3) && (p2_votes >= 3);

	if(!well_voted) return 0;

	// unexplained bases allowed at either end of the read: at most 10
	if(leftmost_start > 10) return 0;
	if(read_len - rightmost_end > 10) return 0;

	return 1;
}
1055 #define SE_READ_IN_KNOWN_EXON_REWARD 1
1056 
copy_vote_to_alignment_res(global_context_t * global_context,thread_context_t * thread_context,mapping_result_t * align_res,subjunc_result_t * junc_res,gene_vote_t * current_vote,int vote_i,int vote_j,int curr_read_len,char * read_name,char * curr_read_text,int used_subreads_in_vote,int noninformative_subreads_in_vote,subread_read_number_t pair_number,int is_second_read,int * is_fully_covered)1057 void copy_vote_to_alignment_res(global_context_t * global_context, thread_context_t * thread_context, mapping_result_t * align_res, subjunc_result_t * junc_res, gene_vote_t * current_vote, int vote_i, int vote_j, int curr_read_len, char * read_name, char * curr_read_text, int used_subreads_in_vote, int noninformative_subreads_in_vote, subread_read_number_t pair_number, int is_second_read, int * is_fully_covered)
1058 {
1059 	int vv = current_vote -> votes[vote_i][vote_j];
1060 	if(global_context->config.scRNA_input_mode && !global_context -> input_reads.is_paired_end_reads) vv += SE_READ_IN_KNOWN_EXON_REWARD *is_pos_in_annotated_exon_regions(global_context, current_vote -> pos[vote_i][vote_j]);
1061 	align_res -> selected_position = current_vote -> pos[vote_i][vote_j];
1062 	align_res -> selected_votes = vv;
1063 	align_res -> indels_in_confident_coverage = indel_recorder_copy(align_res -> selected_indel_record, current_vote -> indel_recorder[vote_i][vote_j]);
1064 	align_res -> confident_coverage_end = current_vote -> coverage_end[vote_i][vote_j];
1065 	align_res -> confident_coverage_start = current_vote -> coverage_start[vote_i][vote_j];
1066 	align_res -> result_flags = (current_vote -> masks[vote_i][vote_j] & IS_NEGATIVE_STRAND)?(CORE_IS_NEGATIVE_STRAND):0;
1067 	align_res -> used_subreads_in_vote = used_subreads_in_vote;
1068 	align_res -> noninformative_subreads_in_vote = noninformative_subreads_in_vote;
1069 	align_res -> is_fully_covered = *is_fully_covered ;
1070 
1071 	if(global_context -> config.do_breakpoint_detection)
1072 	{
1073 		int i,j;
1074 
1075 		// iterate all the anchors we have found in step 1:
1076 		for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
1077 		{
1078 			for (j=0; j< current_vote->items[i]; j++)
1079 			{
1080 				if(i == vote_i && j == vote_j) continue;
1081 				if(align_res -> selected_votes < current_vote -> votes[i][j]) continue;	// major half must be the anchor
1082 
1083 				long long int dist = current_vote -> pos[vote_i][vote_j];
1084 				dist -= current_vote -> pos[i][j];
1085 
1086 				int is_strand_jumpped = (current_vote -> masks[vote_i][vote_j] & IS_NEGATIVE_STRAND)!=(current_vote -> masks[i][j] & IS_NEGATIVE_STRAND);
1087 				if((global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection) && (*is_fully_covered) && (dist > MAX_DELETION_LENGTH || is_strand_jumpped)) continue;
1088 
1089 				if((global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection)){
1090 					// function test_small_minor_votes returns 1 if the vote number is not significantly
1091 					// higher than the vote numbers in the vote list.
1092 					//#warning "SUBREAD_151 =========== THE TWO LINES SHOULD BE UNCOMMENTED IN RELEASED VERSION ==== WE COMMENT IT FOR A BETTER FUSION SENSITIVITY BUT ONLY FOR TEST ==================="
1093 					if(1){
1094 						int small_minor_bigmargin = test_small_minor_votes(global_context , i, j, vote_i, vote_j, current_vote, curr_read_len);
1095 						if(small_minor_bigmargin) continue;
1096 					}
1097 				}else{
1098 					// function test_junction_minor returns 1 if the current anchor and current_vote[i][j]
1099 					// are not good mates in terms of junction reads:
1100 					//
1101 					// for example, if the distance is too far, if the coverered region overlapped or
1102 					// if the covered region has a wrong arrangement to their relative positions.
1103 					int test_minor_res = test_junction_minor(global_context, thread_context, current_vote, vote_i, vote_j, i, j, dist);
1104 //#warning " ============ DEBUG 1 ==================== "
1105 					if(0 && FIXLENstrcmp("R002403247", read_name) == 0) {
1106 						char posout2[100];
1107 						char posout1[100];
1108 						absoffset_to_posstr(global_context, current_vote -> pos[vote_i][vote_j], posout1);
1109 						absoffset_to_posstr(global_context, current_vote -> pos[i][j], posout2);
1110 						SUBREADprintf("SMALL_MARGIN=%d at %s ~ %s\n", test_minor_res, posout1, posout2);
1111 					}
1112 					//	SUBREADprintf("TMR=%d (V=%d)\n", test_minor_res, current_vote -> votes[i][j]);
1113 					if(test_minor_res)continue;
1114 				}
1115 
1116 				int is_better = is_better_inner(global_context, thread_context,
1117 							junc_res, abs32uint(current_vote -> pos[vote_i][vote_j] - junc_res -> minor_position), current_vote -> votes[i][j], current_vote -> coverage_end[i][j] - current_vote -> coverage_start[i][j],
1118 							abs32uint(current_vote -> pos[vote_i][vote_j] - current_vote -> pos[i][j]));
1119 
1120 				int replace_minor = 0, minor_indel_offset = 0, inserted_bases = 0, is_GT_AG_donors = 0, is_donor_found_or_annotation = 0, final_split_point = 0, major_indels = 0, small_side_increasing_coordinate = 0, large_side_increasing_coordinate = 0;
1121 
1122 				if(0 && FIXLENstrcmp("R002403247", read_name) == 0)
1123 				{
1124 					char posout[100];
1125 					absoffset_to_posstr(global_context, current_vote -> pos[i][j], posout);
1126 					SUBREADprintf("IBT=%d (V=%d , OV=%d) at %s\n", is_better, current_vote -> votes[i][j], junc_res -> minor_votes, posout);
1127 					SUBREADprintf("IBT OLD_INTRON=%d, INTRON=%d\n", abs32uint(current_vote -> pos[vote_i][vote_j] - junc_res -> minor_position),
1128 							abs32uint(current_vote -> pos[vote_i][vote_j] - current_vote -> pos[i][j])
1129 						);
1130 				}
1131 
1132 				if(is_better){
1133 					// Determine the splicing point of the fusion or the junction
1134 					// If the splicing point is determined, then set replace_minor = 1
1135 					if(is_strand_jumpped){
1136 						if(!global_context -> config.do_fusion_detection) continue;
1137 
1138 						int minor_cover_end_as_reversed = (current_vote -> masks[i][j] & IS_NEGATIVE_STRAND)? current_vote -> coverage_end[i][j]:(curr_read_len - current_vote -> coverage_start[i][j]);
1139 						int minor_cover_start_as_reversed = (current_vote -> masks[i][j] & IS_NEGATIVE_STRAND)? current_vote -> coverage_start[i][j]:(curr_read_len - current_vote -> coverage_end[i][j]);
1140 						int main_cover_end_as_reversed = (current_vote -> masks[vote_i][vote_j] & IS_NEGATIVE_STRAND)?current_vote -> coverage_end[vote_i][vote_j]:(curr_read_len - current_vote -> coverage_start[vote_i][vote_j]);
1141 						int main_cover_start_as_reversed = (current_vote -> masks[vote_i][vote_j] & IS_NEGATIVE_STRAND)?current_vote -> coverage_start[vote_i][vote_j]:(curr_read_len - current_vote -> coverage_end[vote_i][vote_j]);
1142 
1143 
1144 						int overlapped ;
1145 						if(main_cover_start_as_reversed > minor_cover_start_as_reversed)
1146 							overlapped = minor_cover_end_as_reversed - main_cover_start_as_reversed;
1147 						else
1148 							overlapped = main_cover_end_as_reversed - minor_cover_start_as_reversed;
1149 
1150 						if(overlapped > 14) continue;
1151 
1152 
1153 						int guess_start_as_reversed = (main_cover_start_as_reversed > minor_cover_start_as_reversed)?
1154 									 (minor_cover_end_as_reversed - 15): (main_cover_end_as_reversed - 15);
1155 
1156 						int guess_end_as_reversed = (main_cover_start_as_reversed > minor_cover_start_as_reversed)?
1157 									 (main_cover_start_as_reversed + 15): (minor_cover_start_as_reversed + 15);
1158 
1159 						int is_small_half_negative = 0 != ((current_vote -> pos[vote_i][vote_j]>current_vote -> pos[i][j]?current_vote -> masks[i][j]:current_vote -> masks[vote_i][vote_j])&IS_NEGATIVE_STRAND);
1160 						int is_large_half_negative = !is_small_half_negative;
1161 
1162 						int is_small_half_on_left_as_reversed = (main_cover_start_as_reversed > minor_cover_start_as_reversed) + (current_vote -> pos[vote_i][vote_j]> current_vote -> pos[i][j]) !=1;
1163 						// small half on left(as reversed)  ===  small half on right (as 'forward' form of the read, i.e., the raw FASTQ form for read_A and reversed FASTQ form for read_B)
1164 
1165 						unsigned int small_half_abs_offset = min(current_vote -> pos[i][j], current_vote -> pos[vote_i][vote_j]);
1166 						unsigned int large_half_abs_offset = max(current_vote -> pos[i][j], current_vote -> pos[vote_i][vote_j]);
1167 
1168 						// curr_read_text is the 'reversed' form of the read. I.e., the reversed FASTQ form for read_A and the raw FASTQ form for read_B.
1169 						replace_minor = donor_jumped_score(global_context, thread_context, small_half_abs_offset, large_half_abs_offset,
1170 									max(0, guess_start_as_reversed) , min( guess_end_as_reversed, curr_read_len),  curr_read_text,
1171 									curr_read_len, is_small_half_negative, is_large_half_negative, is_small_half_on_left_as_reversed,
1172 									& final_split_point, & is_GT_AG_donors, & is_donor_found_or_annotation, &small_side_increasing_coordinate, &large_side_increasing_coordinate);
1173 
1174 						if( 0 && 1018082 == pair_number)
1175 						{
1176 							print_votes(current_vote, global_context -> config.index_prefix);
1177 							SUBREADprintf("JUMP_001018082  NORMAL=%d  SMALL_NEG=%d  LARGE_NEG=%d,  SMALL_ABS=%u  LARGE_ABS=%u,  REPLACE=%d,   INCS=%d %d\n" ,  is_small_half_on_left_as_reversed, is_small_half_negative, is_large_half_negative, small_half_abs_offset, large_half_abs_offset, replace_minor, small_side_increasing_coordinate, large_side_increasing_coordinate);
1178 						}
1179 
1180 
1181 						// Now "final_split_point" is the read offset on the 'reversed' form of the read. It needs to be changed to (read_len - final_split_point) if the major half is on negative strand.
1182 
1183 						if(replace_minor>0) replace_minor += current_vote -> votes[i][j] * 100000;
1184 
1185 					}
1186 					else
1187 					{
1188 
1189 						int overlapped ;
1190 						if(current_vote -> coverage_start[vote_i][vote_j] > current_vote -> coverage_start[i][j])
1191 							overlapped = current_vote -> coverage_end[i][j] - current_vote -> coverage_start[vote_i][vote_j];
1192 						else
1193 							overlapped = current_vote -> coverage_end[vote_i][vote_j] - current_vote -> coverage_start[i][j];
1194 
1195 						if(0 && FIXLENstrcmp("R000404427", read_name) == 0)
1196 						{
1197 							SUBREADprintf("OVL=%d, DIST=%u\n", overlapped, (unsigned int)abs(dist));
1198 						}
1199 
1200 						if(overlapped > 14) continue;
1201 						if(abs(dist)<6) continue;
1202 
1203 						int guess_start = (current_vote -> coverage_start[vote_i][vote_j] > current_vote -> coverage_start[i][j])?
1204 									 (current_vote -> coverage_end[i][j] - 8): (current_vote -> coverage_end[vote_i][vote_j] - 8);
1205 
1206 						int guess_end = (current_vote -> coverage_start[vote_i][vote_j] < current_vote -> coverage_start[i][j])?
1207 									 (current_vote -> coverage_start[i][j] + 8): (current_vote -> coverage_start[vote_i][vote_j] + 8);
1208 
1209 						if((global_context ->  config.do_fusion_detection || global_context ->  config.do_long_del_detection) && !(current_vote -> masks[vote_i][vote_j] & IS_NEGATIVE_STRAND))
1210 							// if for fusion, the current read must have been reversed.
1211 							// hence, it is now changed to "main half" view.
1212 							reverse_read(curr_read_text, curr_read_len, global_context -> config.space_type);
1213 
1214 						int left_indel_offset=0,  right_indel_offset=0;
1215 						int kx2;
1216 
1217 						int normally_arranged = 1!=(current_vote -> coverage_start[vote_i][vote_j] > current_vote -> coverage_start[i][j]) + (current_vote -> pos[vote_i][vote_j] > current_vote -> pos[i][j]);
1218 
1219 						if(curr_read_len > EXON_LONG_READ_LENGTH){
1220 
1221 							int kx1;
1222 							gene_vote_number_t * indel_recorder = current_vote -> indel_recorder[vote_i][vote_j];
1223 							for(kx1=0; kx1<MAX_INDEL_SECTIONS; kx1++)
1224 							{
1225 								if(!indel_recorder[kx1*3]) break;
1226 								major_indels += indel_recorder[kx1*3+2];
1227 							}
1228 
1229 							for(kx2=0; kx2<MAX_INDEL_SECTIONS; kx2++)
1230 							{
1231 								if(!current_vote -> indel_recorder[i][j][kx2*3]) break;
1232 								minor_indel_offset += (current_vote -> indel_recorder[i][j][kx2*3+2]);
1233 							}
1234 
1235 							if(current_vote -> pos[vote_i][vote_j]  <  current_vote -> pos[i][j])
1236 							{
1237 								left_indel_offset=major_indels;
1238 								right_indel_offset=minor_indel_offset;
1239 							}
1240 							else
1241 							{
1242 								right_indel_offset=major_indels;
1243 								left_indel_offset=minor_indel_offset;
1244 
1245 							}
1246 
1247 							// the section having a smaller coordinate will have indel_offset !=0
1248 							// the section having a larger coordiname MUST HAVE indel_offset == 0
1249 							right_indel_offset=0;
1250 						}
1251 
1252 						if(is_long_del_high_quality( global_context, thread_context, current_vote -> coverage_start[i][j], current_vote -> coverage_end[i][j], current_vote -> coverage_start[vote_i][vote_j], current_vote -> coverage_end[vote_i][vote_j], curr_read_len, current_vote -> votes[i][j], current_vote -> votes[vote_i][vote_j])|| ! global_context -> config.do_long_del_detection)
1253 						replace_minor = donor_score(global_context, thread_context, min(current_vote -> pos[vote_i][vote_j],
1254 									current_vote -> pos[i][j]),max(current_vote -> pos[vote_i][vote_j] ,
1255 									current_vote -> pos[i][j]), left_indel_offset, right_indel_offset, normally_arranged,
1256 									max(0, guess_start), min( guess_end, curr_read_len), curr_read_text, curr_read_len,
1257 									& final_split_point, & is_GT_AG_donors, & is_donor_found_or_annotation, & inserted_bases, &small_side_increasing_coordinate, &large_side_increasing_coordinate, read_name);
1258 						else replace_minor = 0;
1259 
1260 						// Now "final_split_point" is the read offset on the 'reversed' form of the read (I.e., the reversed FASTQ form for read_A and the raw FASTQ form for read_B.) if do_fusion_detection AND if the main half is on negative strand.
1261 						// However, because the final_split_point is ALWAYS on the form where the major half can be mapped, final_split_point will never be changed.
1262 
1263 						if(replace_minor>0) replace_minor += current_vote -> votes[i][j] * 100000;
1264 
1265 						if(0 && ( FIXLENstrcmp("R006232475", read_name) == 0 ) )
1266 							SUBREADprintf("NOJUMP_DONORs=%d   LOC=%u\n", replace_minor , current_vote -> pos[i][j]);
1267 						if((global_context ->  config.do_fusion_detection || global_context ->  config.do_long_del_detection) && !(current_vote -> masks[vote_i][vote_j] & IS_NEGATIVE_STRAND))
1268 							// changed back.
1269 							reverse_read(curr_read_text, curr_read_len, global_context -> config.space_type);
1270 					}
1271 				}
1272 
1273 				if(0 && ( FIXLENstrcmp("R006232475", read_name) == 0 ) )
1274 				{
1275 					char posout[100];
1276 					absoffset_to_posstr(global_context, current_vote -> pos[i][j], posout);
1277 					SUBREADprintf("TEST MINOR: POS=%s, REPLACE=%d\n", posout, replace_minor);
1278 				}
1279 
1280 				if(replace_minor){// && (replace_minor > current_piece_minor_score)){
1281 					junc_res -> minor_position = current_vote -> pos[i][j];
1282 					junc_res -> minor_votes = current_vote -> votes[i][j];
1283 
1284 					junc_res -> minor_coverage_start = current_vote -> coverage_start[i][j];
1285 					junc_res -> minor_coverage_end   = current_vote -> coverage_end  [i][j];
1286 
1287 					junc_res -> double_indel_offset = (minor_indel_offset & 0xf)|((major_indels & 0xf)<<4);
1288 					junc_res -> split_point = final_split_point;
1289 
1290 
1291 					if(0 && 1018082 == pair_number)
1292 					{
1293 						SUBREADprintf("REPLACED: LOC %u, INCS=%d %d\n", junc_res -> minor_position, small_side_increasing_coordinate, large_side_increasing_coordinate);
1294 					}
1295 
1296 					junc_res -> small_side_increasing_coordinate = small_side_increasing_coordinate;
1297 					junc_res -> large_side_increasing_coordinate = large_side_increasing_coordinate;
1298 					junc_res -> indel_at_junction = inserted_bases;
1299 
1300 					align_res -> result_flags &=~0x3;
1301 					if( (!is_donor_found_or_annotation) || is_GT_AG_donors > 2) align_res -> result_flags |= 3;
1302 					else	align_res -> result_flags = is_GT_AG_donors? (align_res -> result_flags|CORE_IS_GT_AG_DONORS):(align_res  -> result_flags &~CORE_IS_GT_AG_DONORS);
1303 
1304 					align_res -> result_flags = is_strand_jumpped? (align_res -> result_flags|CORE_IS_STRAND_JUMPED):(align_res -> result_flags &~CORE_IS_STRAND_JUMPED);
1305 				}
1306 			}
1307 		}
1308 
1309 		if(0 && memcmp("V0112_0155:7:1101:1173:2204", read_name, 26) == 0)
1310 		{
1311 			char leftpos[100], rightpos[100];
1312 			absoffset_to_posstr(global_context, current_vote -> pos[vote_i][vote_j]  , leftpos);
1313 			absoffset_to_posstr(global_context, junc_res -> minor_position, rightpos);
1314 			SUBREADprintf("READ=%s, MAJOR=%s, MINOR=%s\n", read_name, leftpos, rightpos);
1315 		}
1316 
1317 
1318 		// This block runs after the minor half of this anchor is fully determined.
1319 		// If the minor half is a fusion and there is a strand jump, move the minor half coverage to the major half strand.
1320 		if(align_res -> result_flags & CORE_IS_STRAND_JUMPED)
1321 		{
1322 			// If "is_strand_jumped" is true, all coordinates so far are on the best voted strands (must be differnet strands, namely they're very likely to be overlapped).
1323 			int tmpv = junc_res -> minor_coverage_start;
1324 			junc_res -> minor_coverage_start = curr_read_len - junc_res -> minor_coverage_end;
1325 			junc_res -> minor_coverage_end = curr_read_len - tmpv;
1326 
1327 			// Split_point is now the "negative strand read" view. It has to be changed to "main piece" view
1328 			junc_res -> split_point = (align_res -> result_flags & CORE_IS_NEGATIVE_STRAND)?
1329 							junc_res -> split_point :
1330 							(curr_read_len - junc_res -> split_point);
1331 		}
1332 	}
1333 }
1334 
1335 
/*
 * Convenience wrapper around test_PE_and_same_chro() for two simple_mapping_t
 * records: only the mapping_position of each record is forwarded; the two
 * output flags and the two read lengths are passed through untouched.
 */
void simple_PE_and_same_chro(global_context_t * global_context , simple_mapping_t * r1, simple_mapping_t * r2 , int * is_PE_distance, int * is_same_chromosome , int rlen1, int rlen2){
	test_PE_and_same_chro(
		global_context,
		r1 -> mapping_position,
		r2 -> mapping_position,
		is_PE_distance,
		is_same_chromosome,
		rlen1,
		rlen2
	);
}
1339 
1340 
// Capacity of the per-cluster member arrays below; add_cluster_member() stops
// storing new members once a cluster holds this many.
#define MAX_CLUSTER_ELEMENTS 7

// A group of vote-table entries gathered by process_voting_junction_PE_juncs().
struct cluster_element{
	// Position of the entry that created the cluster; is_same_cluster()
	// measures the distance of a candidate position from this value.
	unsigned int initial_position;
	// Number of used slots in the three arrays below.
	char cluster_members;
	// Per member: non-zero when the entry was taken from the second read's votes.
	char from_second_read[MAX_CLUSTER_ELEMENTS];
	// Per member: the two indices of the entry inside its vote table.
	int i_list[MAX_CLUSTER_ELEMENTS], j_list[MAX_CLUSTER_ELEMENTS];
};
1349 
add_cluster_member(struct cluster_element * cl,int i,int j,int is_second_read)1350 int add_cluster_member(struct cluster_element * cl , int i, int j, int is_second_read){
1351 	if(cl->cluster_members < MAX_CLUSTER_ELEMENTS){
1352 		cl->i_list[(int)cl->cluster_members] = i;
1353 		cl->j_list[(int)cl->cluster_members] = j;
1354 		cl->from_second_read[(int)cl->cluster_members] = is_second_read;
1355 		cl->cluster_members++;
1356 	}
1357 	return cl->cluster_members;
1358 }
1359 
/*
 * Tells whether `pos` is close enough to the founding position of `cl` to be
 * put into that cluster.
 *
 * Returns 1 when the absolute distance between `pos` and
 * cl->initial_position is below config.maximum_intron_length, 0 otherwise.
 *
 * The distance between two unsigned 32-bit positions needs more than 32 bits,
 * so it is computed and made non-negative in 64-bit arithmetic here instead of
 * going through abs(), whose standard C form takes an int and would wrap
 * distances of 2^31 or more into small values.
 */
int is_same_cluster( global_context_t * global_context, struct cluster_element * cl , unsigned int pos){
	long long int distance = (long long int)pos - (long long int)cl -> initial_position;
	if(distance < 0) distance = -distance;

	if(distance < global_context -> config.maximum_intron_length)
		return 1;
	return 0;
}
1367 
1368 int process_voting_junction_PE_topK(global_context_t * global_context, thread_context_t * thread_context, subread_read_number_t pair_number, gene_vote_t * vote_1, gene_vote_t * vote_2, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, int read_len_1, int read_len_2, int is_negative_strand, gene_vote_number_t v1_all_subreads, gene_vote_number_t v2_all_subreads);
1369 int align_cluster(global_context_t * global_context, thread_context_t * thread_context, struct cluster_element * this_cluster, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, int read_len_1, int read_len_2, int is_negative_strand,  gene_vote_t * vote_1, gene_vote_t * vote_2, int * this_score, int * ii_path, int * jj_path, int * masks, int * path_len, int * R1R2_mapped);
1370 
/*
 * Fills `align_res` from the vote-table entry current_vote[vote_i][vote_j].
 *
 * The position, coverage range and indel record are taken from the entry;
 * `score` is stored as the selected votes and the two subread counters are
 * stored as given. result_flags is overwritten: it ends up as
 * CORE_IS_NEGATIVE_STRAND when the entry's mask has IS_NEGATIVE_STRAND set,
 * and 0 otherwise.
 */
void simple_copy_vote_to_result( mapping_result_t * align_res, gene_vote_t * current_vote, int vote_i, int vote_j, int used_subreads_in_vote, int noninformative_subreads_in_vote, int score){
	// Caller-supplied values.
	align_res -> selected_votes = score;
	align_res -> used_subreads_in_vote = used_subreads_in_vote;
	align_res -> noninformative_subreads_in_vote = noninformative_subreads_in_vote;

	// Values copied from the chosen vote-table entry.
	align_res -> selected_position = current_vote -> pos[vote_i][vote_j];
	align_res -> confident_coverage_start = current_vote -> coverage_start[vote_i][vote_j];
	align_res -> confident_coverage_end = current_vote -> coverage_end[vote_i][vote_j];
	align_res -> indels_in_confident_coverage = indel_recorder_copy(align_res -> selected_indel_record, current_vote -> indel_recorder[vote_i][vote_j]);

	// Every previous flag is dropped; only the strand bit is kept.
	if(current_vote -> masks[vote_i][vote_j] & IS_NEGATIVE_STRAND)
		align_res -> result_flags = CORE_IS_NEGATIVE_STRAND;
	else
		align_res -> result_flags = 0;
}
1381 
/*
 * Opens slot `slot` in the stored alignment list of one mate: the results in
 * slots [slot, multi_best_reads - 2] are each copied one slot down, so the
 * result that was in the last slot is overwritten.
 */
static void juncs_open_result_slot(global_context_t * global_context, subread_read_number_t pair_number, int is_second_read, int slot){
	int k;
	for(k = global_context -> config.multi_best_reads - 2; k >= slot; k--){
		mapping_result_t * from_result = _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, k);
		mapping_result_t * to_result = _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, k + 1);
		memcpy(to_result, from_result, sizeof(mapping_result_t));
	}
}

/*
 * Groups the vote-table entries of both reads into clusters, aligns every
 * cluster with align_cluster() and inserts each positively scored cluster
 * into the stored alignment list of the read pair.
 *
 * Step 1: entries are visited from the highest vote count downwards. An entry
 *         joins the first cluster accepted by is_same_cluster(); otherwise it
 *         founds a new cluster while fewer than 2 * config.max_vote_simples
 *         clusters exist.
 * Step 2: for every cluster with a positive score, the first stored alignment
 *         with a lower score is located, the alignments from there on are
 *         shifted down by one slot, and the slot is filled from the path
 *         element with the most votes (per mate when both mates are mapped).
 *
 * Returns 0 after the clusters are processed, -1 when the cluster buffer
 * cannot be allocated, or the result of process_voting_junction_PE_topK() on
 * the (currently disabled) fallback path.
 */
int process_voting_junction_PE_juncs( global_context_t * global_context, thread_context_t * thread_context, subread_read_number_t pair_number, gene_vote_t * vote_1, gene_vote_t * vote_2, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, int read_len_1, int read_len_2, int is_negative_strand, gene_vote_number_t v1_all_subreads, gene_vote_number_t v2_all_subreads ){
	int current_cluster_number = 0,max_clusters = global_context -> config.max_vote_simples * 2;
	int i,j, is_second_read, tested_votes, x1;
	int max_cluster_size_r1 = 0, max_cluster_size_r2 = 0;

	struct cluster_element * cluster_buffer = malloc(max_clusters * sizeof(struct cluster_element));
	// A NULL buffer is only harmless when no cluster can ever be created.
	if(max_clusters > 0 && !cluster_buffer){
		SUBREADprintf("ERROR: unable to allocate memory for %d vote clusters.\n", max_clusters);
		return -1;
	}

	for( tested_votes = max(vote_1 -> max_vote, vote_2 -> max_vote); tested_votes > 0; tested_votes--) {
		for(is_second_read = 0 ; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read ++) {
			gene_vote_t * current_vote = is_second_read?vote_2:vote_1;
			int * max_cluster_size = is_second_read?(&max_cluster_size_r2):(&max_cluster_size_r1);
			for (i=0; i<GENE_VOTE_TABLE_SIZE; i++) {
				for (j=0; j< current_vote->items[i]; j++) {
					if(current_vote->votes[i][j]!=tested_votes) continue;
					int is_added = 0;

					// Only the first accepting cluster receives the entry.
					for(x1 = 0; x1 < current_cluster_number ; x1++){
						if(!is_same_cluster(global_context, cluster_buffer+x1, current_vote->pos[i][j])) continue;

						int new_size = add_cluster_member(cluster_buffer+x1, i, j, is_second_read);
						(*max_cluster_size) = max(*max_cluster_size, new_size);
						is_added = 1;
						break;
					}

					if(current_cluster_number < max_clusters && !is_added){
						struct cluster_element * new_cluster = cluster_buffer + current_cluster_number;
						new_cluster -> initial_position = current_vote->pos[i][j];
						new_cluster -> i_list[0] = i;
						new_cluster -> j_list[0] = j;
						new_cluster -> from_second_read[0] = is_second_read;
						new_cluster -> cluster_members = 1;
						current_cluster_number++;
					}
				}
			}
		}
	}

	if(1 || max_cluster_size_r1 == 3 || max_cluster_size_r2 == 3 ) // if there are 3-section clusters then parse it, else go to the regular procedure. There is a upper-limit to the sections to avoid fragile mapping.
	{
		for(x1 = 0 ; x1 < current_cluster_number ; x1++){
			int this_score = -1, path_len = -1, R1R2_mapped = 0;
			int this_ii_path[ MAX_CLUSTER_ELEMENTS ], this_jj_path[ MAX_CLUSTER_ELEMENTS ], this_masks [ MAX_CLUSTER_ELEMENTS ];
			align_cluster(global_context, thread_context, cluster_buffer + x1, read_name_1, read_name_2, read_text_1, read_text_2, read_len_1, read_len_2, is_negative_strand, vote_1, vote_2, &this_score, this_ii_path, this_jj_path, this_masks, &path_len, &R1R2_mapped);

			if(0 && FIXLENstrcmp("R00000003493", read_name_1)==0)
				SUBREADprintf("REAE_TEST : R12MAP=%d, PATHLEN=%d, SCORE=%d\n", R1R2_mapped, path_len, this_score);

			if(this_score <= 0) continue;

			int is_R1_in_path = (R1R2_mapped & CLUSTER_ALIGNMENT_DONOR_R1_MAPPED) != 0;
			int is_R2_in_path = (R1R2_mapped & CLUSTER_ALIGNMENT_DONOR_R2_MAPPED) != 0;

			if(is_R1_in_path && is_R2_in_path){
				for(i = 0; i < global_context -> config.multi_best_reads; i++) {
					mapping_result_t * old_result_R1 = _global_retrieve_alignment_ptr(global_context, pair_number, 0, i);
					mapping_result_t * old_result_R2 = _global_retrieve_alignment_ptr(global_context, pair_number, 1, i);
					short old_score_R1 = old_result_R1 -> selected_votes;
					short old_score_R2 = old_result_R2 -> selected_votes;

					if( old_score_R1 >= this_score && old_score_R2 >= this_score ) continue;

					// Path elements carrying the R1 mask belong to read 1, all others to read 2.
					int best_R1_i = -1, best_R1_j = -1, highest_vote_R1 = -1;
					int best_R2_i = -1, best_R2_j = -1, highest_vote_R2 = -1;
					for(j = 0; j < path_len ; j++){
						int path_i = this_ii_path[j], path_j = this_jj_path[j];
						if( this_masks[j] & CLUSTER_ALIGNMENT_DONOR_R1_MAPPED ){
							if( highest_vote_R1 < vote_1 -> votes[path_i][path_j] ){
								best_R1_i = path_i;
								best_R1_j = path_j;
								highest_vote_R1 = vote_1 -> votes[path_i][path_j];
							}
						}else if( highest_vote_R2 < vote_2 -> votes[path_i][path_j] ){
							best_R2_i = path_i;
							best_R2_j = path_j;
							highest_vote_R2 = vote_2 -> votes[path_i][path_j];
						}
					}

					// The stored list is only touched when both mates have a usable
					// path element; a negative index must never reach the vote tables.
					if(best_R1_i >= 0 && best_R2_i >= 0){
						juncs_open_result_slot(global_context, pair_number, 0, i);
						juncs_open_result_slot(global_context, pair_number, 1, i);
						simple_copy_vote_to_result( old_result_R1, vote_1, best_R1_i, best_R1_j , v1_all_subreads, vote_1 -> noninformative_subreads, this_score);
						simple_copy_vote_to_result( old_result_R2, vote_2, best_R2_i, best_R2_j , v2_all_subreads, vote_2 -> noninformative_subreads, this_score);
					}
					break;
				}
			} else if( is_R1_in_path || is_R2_in_path ) {
				int is_R2_mapped = is_R2_in_path?1:0;
				gene_vote_t * this_vote = is_R2_mapped?vote_2:vote_1;
				// The subread total has to come from the same read as the votes.
				gene_vote_number_t this_all_subreads = is_R2_mapped?v2_all_subreads:v1_all_subreads;

				for(i = 0; i < global_context -> config.multi_best_reads; i++) {
					mapping_result_t * old_result_R = _global_retrieve_alignment_ptr(global_context, pair_number, is_R2_mapped, i);
					short old_score_R = old_result_R -> selected_votes;

					if( old_score_R >= this_score ) continue;

					int best_R_i = -1, best_R_j = -1, highest_vote_R = -1;
					for(j = 0; j < path_len ; j++){
						int path_i = this_ii_path[j], path_j = this_jj_path[j];
						if( highest_vote_R < this_vote -> votes[path_i][path_j] ){
							best_R_i = path_i;
							best_R_j = path_j;
							highest_vote_R = this_vote -> votes[path_i][path_j];
						}
					}

					if(best_R_i >= 0){
						juncs_open_result_slot(global_context, pair_number, is_R2_mapped, i);
						simple_copy_vote_to_result( old_result_R, this_vote, best_R_i, best_R_j , this_all_subreads, this_vote -> noninformative_subreads, this_score);
					}
					break;
				}
			}
		}

		// call new junctions from the path
		// then put the alignment into the best list.

	}else{
		// The cluster buffer is not needed by the regular procedure.
		free(cluster_buffer);
		return process_voting_junction_PE_topK(global_context, thread_context, pair_number, vote_1, vote_2, read_name_1, read_name_2, read_text_1, read_text_2, read_len_1, read_len_2, is_negative_strand, v1_all_subreads, v2_all_subreads);
	}

	free(cluster_buffer);
	return 0;
}
1540 
1541 
compare_cluster_elements(void * arr,int l,int r)1542 int compare_cluster_elements (void * arr, int l, int r){
1543 	int * ii_array = ((void **)arr)[0];
1544 	int * jj_array = ((void **)arr)[1];
1545 	int * second_vote = ((void **)arr)[2];
1546 
1547 	if(second_vote[l] != second_vote[r])
1548 		return second_vote[l] - second_vote[r];
1549 
1550 	gene_vote_t * vote_1 = ((void **)arr)[3];
1551 	gene_vote_t * vote_2 = ((void **)arr)[4];
1552 
1553 
1554 	gene_vote_t * this_vote_L = second_vote[l]?vote_2:vote_1;
1555 	gene_vote_t * this_vote_R = second_vote[r]?vote_2:vote_1;
1556 
1557 	return this_vote_L->coverage_start[ii_array[l]][jj_array[l]] - this_vote_R -> coverage_start[ii_array[r]][jj_array[r]];
1558 }
1559 
/*
 * Swap callback matching compare_cluster_elements(): exchanges elements `l`
 * and `r` in each of the three int arrays found at arr[0], arr[1] and arr[2]
 * (i indices, j indices and second-read flags). Further entries of `arr` are
 * not touched.
 */
void exchange_cluster_elements (void * arr, int l, int r){
	void ** columns = (void **)arr;
	int col;

	for(col = 0; col < 3; col++){
		int * values = columns[col];
		int held = values[l];
		values[l] = values[r];
		values[r] = held;
	}
}
1578 
// Number of reference positions extend_uncovered_region_juncs() steps through,
// one base at a time starting at scan_chro_start, while looking for its 8-base tag.
#define NEW_EXTEND_SCAN_INTRON_LONGEST 6000
// Margin asserted by extend_uncovered_region_juncs(): scan_read_start must be
// below (rlen - this value) when scanning to the tail, and at least this value otherwise.
#define NEW_EXTEND_SCAN_EXON_SHORTEST 14
1581 
1582 int find_path(global_context_t * global_context, thread_context_t * thread_context, int start_element_i, int target_element_i, int * ii_array, int * jj_array, int * is_second_vote_array,  gene_vote_t * vote_1, gene_vote_t * vote_2, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, int read_len_1, int read_len_2, int is_negative_strand, int * this_mask , int * exon_last_base);
1583 int find_donor_receptor(global_context_t * global_context, thread_context_t * thread_context, char * rname, char * rtext, int rlen, int start_coverage, int end_coverage, unsigned int start_pos, unsigned int end_pos, int indels_in_start, int v1, int v2, int * misma_bases, int * matched_bases, int * is_negative_donor);
extend_uncovered_region_juncs(global_context_t * global_context,thread_context_t * thread_context,char * rname,char * rtext,int rlen,int scan_to_tail,unsigned int scan_chro_start,int scan_read_start,unsigned short expect_donor,int * mismatched_bases_after_start,int * first_exon_last_base,unsigned int * first_exon_first_base,int * ret_mismatched_bases,int * is_negative_donor)1584 int extend_uncovered_region_juncs(global_context_t * global_context, thread_context_t * thread_context, char * rname, char * rtext, int rlen, int scan_to_tail, unsigned int scan_chro_start, int scan_read_start, unsigned short expect_donor, int * mismatched_bases_after_start, int * first_exon_last_base, unsigned int * first_exon_first_base, int * ret_mismatched_bases, int * is_negative_donor){
1585 
1586 	if(  scan_to_tail  ) assert( scan_read_start < rlen - NEW_EXTEND_SCAN_EXON_SHORTEST );
1587 	else	assert( scan_read_start >= NEW_EXTEND_SCAN_EXON_SHORTEST);
1588 
1589 	gene_value_index_t * value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
1590 	int x1, best_best_score = -1, best_best_occurance = 0;
1591 
1592 	unsigned long long matching_target = 0, rolling_bases = 0;
1593 
1594 	for(x1 = 0 ; x1 < 8 ; x1++){
1595 		int nch = scan_to_tail? rtext[ rlen - 2 - x1 ] :  rtext[ 10 - x1 ] ;
1596 		matching_target = ( matching_target << 8 ) | nch;
1597 	}
1598 		if(0 && FIXLENstrcmp("V0112_0155:7:1101:13762:2349#ACTTGA", rname ) == 0 )
1599 			SUBREADprintf("TAG=%016llX\n",matching_target);
1600 
1601 	for(x1 = 0; x1 < NEW_EXTEND_SCAN_INTRON_LONGEST ; x1++){
1602 		int best_last_exon_base = -1, matched_in_the_uncovered_gap = -1, mismatched_bases = -1, extended_should_mismatch = -1;
1603 		unsigned int scan_cursor = scan_chro_start ;
1604 		if(scan_to_tail) scan_cursor+=x1;else scan_cursor-=x1;
1605 		unsigned long long nch = gvindex_get( value_index, scan_cursor );
1606 		if(scan_to_tail)
1607 			rolling_bases = (rolling_bases >>  8) | nch << 56;
1608 		else
1609 			rolling_bases = nch | ( rolling_bases << 8 );
1610 
1611 		//SUBREADprintf("MATCH:%016llX,%016llX\n", rolling_bases, matching_target);
1612 		if(rolling_bases == matching_target){
1613 			//SUBREADprintf("PNTT-M\n");
1614 			if(scan_to_tail) {
1615 				best_last_exon_base = find_donor_receptor(global_context, thread_context, rname, rtext, rlen, scan_read_start, rlen - 2 - 7,  scan_chro_start, scan_cursor - (rlen - 2) , 0, 0,0, &mismatched_bases , &matched_in_the_uncovered_gap, is_negative_donor);
1616 				if(best_last_exon_base>0)
1617 					extended_should_mismatch = match_chro( rtext + best_last_exon_base , value_index, scan_chro_start + best_last_exon_base, rlen - best_last_exon_base, 0, global_context->config.space_type);
1618 			} else {
1619 				best_last_exon_base = find_donor_receptor(global_context, thread_context, rname, rtext, rlen, 10, scan_read_start, scan_cursor - 3 , scan_chro_start, 0, 0,0, &mismatched_bases , &matched_in_the_uncovered_gap,is_negative_donor);
1620 				if(best_last_exon_base>0)
1621 					extended_should_mismatch = match_chro( rtext, value_index, scan_chro_start, best_last_exon_base , 0, global_context->config.space_type);
1622 			}
1623 
1624 		}
1625 
1626 		if(best_last_exon_base >0 && extended_should_mismatch < ( scan_to_tail?( rlen - best_last_exon_base - 4  ):(  best_last_exon_base - 4  )) && mismatched_bases < 2 ){
1627 			int this_score;
1628 			if(scan_to_tail) this_score = rlen - scan_read_start - mismatched_bases;
1629 			else	this_score = scan_read_start - mismatched_bases;
1630 			if(best_best_score < this_score){
1631 				best_best_score = this_score;
1632 				(*mismatched_bases_after_start) = mismatched_bases;
1633 				(*first_exon_last_base) = best_last_exon_base;
1634 				(*first_exon_first_base) = scan_to_tail?( scan_cursor - (rlen - 2) ) : ( scan_cursor - 3 );
1635 				(*ret_mismatched_bases) = mismatched_bases;
1636 				best_best_occurance = 1;
1637 
1638 			}else if( best_best_score == this_score ) best_best_occurance ++;
1639 		}
1640 
1641 
1642 		if(0 && (!scan_to_tail) && best_last_exon_base >0 && extended_should_mismatch < best_last_exon_base - 4 && mismatched_bases < 2){
1643 			char out1pos[100], out2pos[100];
1644 			absoffset_to_posstr(global_context, scan_chro_start + best_last_exon_base, out1pos);
1645 			absoffset_to_posstr(global_context, scan_cursor - (rlen - 2) + best_last_exon_base, out2pos);
1646 			SUBREADprintf("LIMMISMA: %d < %d - 4\t\tfor %s\n" , extended_should_mismatch,   best_last_exon_base ,rname);
1647 
1648 			SUBREADprintf("HEAD MATCH: %s - %s : MM=%d ; SPLIT=%d\t%s\n",out1pos, out2pos, mismatched_bases, best_last_exon_base, rname);
1649 
1650 			SUBREADprintf("R =%s\nS1=", rtext);
1651 			int x2;
1652 			for(x2 = 0; x2 <  rlen; x2++){
1653 				if(x2 > best_last_exon_base + 16) SUBREADprintf(" ");
1654 				else{
1655 					int nch =  gvindex_get( value_index, scan_cursor - 3 + x2 );
1656 					SUBREADprintf("%c", nch);
1657 				}
1658 			}
1659 			SUBREADprintf("\nS2=");
1660 
1661 			for(x2 = 0; x2 <  rlen; x2++){
1662 				if(x2 > best_last_exon_base + 16 ) SUBREADprintf(" ");
1663 				else{
1664 					int nch =  gvindex_get( value_index, scan_chro_start +x2);
1665 					SUBREADprintf("%c", nch);
1666 				}
1667 			}
1668 			SUBREADprintf("\n   ");
1669 
1670 			for(x2 = 0; x2 <  rlen; x2++){
1671 				if(x2 < best_last_exon_base ) SUBREADprintf(" ");
1672 				else if( x2 > best_last_exon_base + 1 ) SUBREADprintf(" ");
1673 				else SUBREADprintf("|");
1674 			}
1675 			SUBREADprintf("\n\n");
1676 		}
1677 		if(0 && scan_to_tail && best_last_exon_base >0 && extended_should_mismatch < rlen - best_last_exon_base - 4 && mismatched_bases < 2){
1678 
1679 			SUBREADprintf("LIMMISMA: %d < %d - 4\t\tfor %s\n" , extended_should_mismatch,  (rlen - best_last_exon_base ),rname);
1680 			char out1pos[100], out2pos[100];
1681 			absoffset_to_posstr(global_context, scan_chro_start + best_last_exon_base, out1pos);
1682 			absoffset_to_posstr(global_context, scan_cursor - (rlen - 2) + best_last_exon_base, out2pos);
1683 			SUBREADprintf("TAIL MATCH: %s - %s : MM=%d ; SPLIT=%d\t%s\n",out1pos, out2pos, mismatched_bases, best_last_exon_base, rname);
1684 
1685 			SUBREADprintf("R =%s\nS1=", rtext);
1686 			int x2;
1687 			for(x2 = 0; x2 <  rlen; x2++){
1688 				if(x2 < scan_read_start - 16) SUBREADprintf(" ");
1689 				else{
1690 					int nch =  gvindex_get( value_index, x2 + scan_chro_start);
1691 					SUBREADprintf("%c", nch);
1692 				}
1693 			}
1694 			SUBREADprintf("\nS2=");
1695 
1696 			for(x2 = 0; x2 <  rlen; x2++){
1697 				if(x2 < best_last_exon_base - 16 ) SUBREADprintf(" ");
1698 				else{
1699 					int nch =  gvindex_get( value_index, scan_cursor - (rlen - 2)+x2);
1700 					SUBREADprintf("%c", nch);
1701 				}
1702 			}
1703 			SUBREADprintf("\n   ");
1704 
1705 			for(x2 = 0; x2 <  rlen; x2++){
1706 				if(x2 < best_last_exon_base ) SUBREADprintf(" ");
1707 				else if( x2 > best_last_exon_base + 1 ) SUBREADprintf(" ");
1708 				else SUBREADprintf("|");
1709 			}
1710 			SUBREADprintf("\n\n");
1711 		}
1712 	}
1713 	if(0&&best_best_occurance>0 && best_best_score>0)
1714 		SUBREADprintf("OCCR=%d : SCR=%d\n", best_best_occurance, best_best_score);
1715 	if(best_best_occurance == 1) return best_best_score;
1716 	return -1;
1717 }
1718 
/*
 * simple_add_junction: record one supporting read for the junction whose
 * small side is left_edge_wanted and whose large side is right_edge_wanted.
 *
 * The event table / event space of the per-thread indel module are used when
 * thread_context is non-NULL; otherwise those of the global indel module.
 *
 * If an event starting at left_edge_wanted and ending at right_edge_wanted is
 * already known, its supporting_reads counter is incremented. Otherwise a new
 * CHRO_EVENT_TYPE_JUNCTION event is appended to the event space and inserted
 * into the table.
 *
 * Nothing is recorded when the two edges resolve to different chromosome-name
 * pointers and neither fusion detection nor long-deletion detection is on.
 *
 * NOTE(review): no check of the new event number against the capacity of the
 * event space is visible here -- confirm that room is guaranteed elsewhere.
 */
void simple_add_junction( global_context_t * global_context, thread_context_t * thread_context, unsigned int left_edge_wanted, unsigned int right_edge_wanted, int indel_at_junction, int is_negative_donors ){
	char * left_chro_name, * right_chro_name;
	int left_chro_pos, right_chro_pos;

	locate_gene_position( left_edge_wanted , &global_context -> chromosome_table, &left_chro_name, &left_chro_pos);
	locate_gene_position( right_edge_wanted , &global_context -> chromosome_table, &right_chro_name, &right_chro_pos);

	// Cross-chromosome junctions are only kept in fusion / long-deletion mode.
	if( !global_context-> config.do_fusion_detection && !global_context-> config.do_long_del_detection && right_chro_name != left_chro_name )
		return;

	// Pick the event storage: per-thread when available, global otherwise.
	indel_thread_context_t * per_thread_indel = NULL;
	indel_context_t * shared_indel = NULL;
	HashTable * event_table = NULL;
	chromosome_event_t * event_space = NULL;

	if(thread_context){
		per_thread_indel = (indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID];
		event_table = per_thread_indel -> event_entry_table;
		event_space = per_thread_indel -> event_space_dynamic;
	}else{
		shared_indel = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
		event_table = shared_indel -> event_entry_table;
		event_space = shared_indel -> event_space_dynamic;
	}

	// Look for an already-recorded event with the same two edges.
	chromosome_event_t * candidates [MAX_EVENT_ENTRIES_PER_SITE];
	int candidate_count = search_event(global_context, event_table, event_space, left_edge_wanted , EVENT_SEARCH_BY_SMALL_SIDE,  CHRO_EVENT_TYPE_INDEL | CHRO_EVENT_TYPE_JUNCTION | CHRO_EVENT_TYPE_FUSION, candidates);

	int candidate_i;
	for(candidate_i = 0; candidate_i < candidate_count; candidate_i++){
		if(candidates[candidate_i] -> event_large_side != right_edge_wanted) continue;

		// Known junction: one more read supports it.
		candidates[candidate_i] -> supporting_reads ++;
		return;
	}

	// Unknown junction: take the next free slot of the event space.
	int event_no;
	if(thread_context)
		event_no = per_thread_indel -> total_events ++;
	else
		event_no = shared_indel -> total_events ++;

	chromosome_event_t * fresh_event = event_space + event_no;
	memset(fresh_event, 0, sizeof(chromosome_event_t));

	fresh_event -> event_type = CHRO_EVENT_TYPE_JUNCTION;
	fresh_event -> event_small_side = left_edge_wanted;
	fresh_event -> event_large_side = right_edge_wanted;
	fresh_event -> small_side_increasing_coordinate = 0;
	fresh_event -> large_side_increasing_coordinate = 1;
	fresh_event -> is_negative_strand = is_negative_donors;
	fresh_event -> is_donor_found_or_annotation = 1;
	fresh_event -> indel_length = 0;
	fresh_event -> indel_at_junction = indel_at_junction;
	fresh_event -> supporting_reads = 1;

	put_new_event(event_table, fresh_event , event_no);
}
1784 
/*
 * align_cluster: chain the vote sections of one cluster into the best-scoring
 * path and register a junction for every adjacent pair on that path for which
 * find_path() located a split point.
 *
 * Steps:
 *   1. Copy the (i, j, from_second_read) triples of the cluster and sort them
 *      with compare_cluster_elements / exchange_cluster_elements.
 *   2. Dynamic programming: every element starts with a score equal to its own
 *      coverage length; find_path() gives the score gained by linking an
 *      earlier element (in sorted order) to a later one.
 *   3. If a linked path scoring more than 1 exists, walk it backwards from its
 *      last element and write it to best_ii_path / best_jj_path / best_masks.
 *      The output is therefore ordered from the LAST section of the path to
 *      the FIRST one.
 *   4. If the score reaches the read-length dependent cut-off, call
 *      simple_add_junction() for each adjacent pair that has a split point.
 *
 * Outputs (only written when a path is found in step 3):
 *   *this_score        score of the best path.
 *   best_ii_path[],
 *   best_jj_path[]     vote-table indices of the sections on the path.
 *   best_masks[]       per-section mask: CLUSTER_ALIGNMENT_DONOR_NEGATIVE_STRAND
 *                      as reported by find_path(), plus the R1/R2 "donor mapped"
 *                      bit when the link into this section has a split point.
 *   *best_path_length  number of sections written.
 *   *R1R2_mapped       R1/R2 bits are OR-ed into the existing value; it is not
 *                      reset here.
 *
 * Returns 0 in all cases.
 *
 * The "extend into the uncovered head / tail" steps are kept but disabled by
 * the constant 0 in their conditions, so front_score and tail_score stay 0.
 */
int align_cluster(global_context_t * global_context, thread_context_t * thread_context, struct cluster_element * this_cluster, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, int read_len_1, int read_len_2, int is_negative_strand,  gene_vote_t * vote_1, gene_vote_t * vote_2, int * this_score, int * best_ii_path, int * best_jj_path, int * best_masks, int * best_path_length, int * R1R2_mapped){
	int ii_array[MAX_CLUSTER_ELEMENTS], jj_array[MAX_CLUSTER_ELEMENTS], is_second_vote_array[MAX_CLUSTER_ELEMENTS], dynamic_highest_mask[MAX_CLUSTER_ELEMENTS], x1;
	void * sort_pointers [5];

	for(x1 = 0 ; x1 < this_cluster->cluster_members; x1++){
		ii_array[x1] = this_cluster -> i_list[x1];
		jj_array[x1] = this_cluster -> j_list[x1];
		is_second_vote_array[x1] = this_cluster -> from_second_read[x1];
	}

	sort_pointers[0] = ii_array;
	sort_pointers[1] = jj_array;
	sort_pointers[2] = is_second_vote_array;
	sort_pointers[3] = vote_1;
	sort_pointers[4] = vote_2;

	basic_sort(sort_pointers, this_cluster->cluster_members, compare_cluster_elements, exchange_cluster_elements);

	int dynamic_highest_scores[MAX_CLUSTER_ELEMENTS], dynamic_last_exon[MAX_CLUSTER_ELEMENTS];
	// Predecessor index on the best path, or -1 for "no predecessor". This must
	// be a signed type: plain char may be unsigned, in which case the -1
	// sentinel could never be recognised when walking the path back.
	int dynamic_highest_path[MAX_CLUSTER_ELEMENTS];
	memset(dynamic_highest_scores,0,sizeof(int)*MAX_CLUSTER_ELEMENTS);

	int target_element_i;

	for(target_element_i = 0; target_element_i < this_cluster->cluster_members; target_element_i++){
		gene_vote_t * this_vote = is_second_vote_array[target_element_i]?vote_2:vote_1;
		int ii = ii_array[target_element_i];
		int jj = jj_array[target_element_i];
		dynamic_highest_scores[target_element_i] =  this_vote->coverage_end[ii][jj] - this_vote->coverage_start[ii][jj];
		dynamic_highest_path[ target_element_i ] = -1;
		// The first element of a path never receives a link, so its mask and
		// split point must have defined "nothing found" values.
		dynamic_highest_mask[ target_element_i ] = 0;
		dynamic_last_exon[ target_element_i ] = -1;
	}

	int highest_score = -1;
	int highest_target_end = -1;
	for(target_element_i = 0; target_element_i < this_cluster->cluster_members; target_element_i++){
		int start_element_i;
		// Only links from an earlier element to a later one are considered.
		for(start_element_i = 0; start_element_i < target_element_i; start_element_i++){
			int this_mask = -1, breakpount_last_exon = -1;
			int increasing_score = find_path(global_context, thread_context, start_element_i, target_element_i, ii_array, jj_array, is_second_vote_array, vote_1, vote_2, read_name_1, read_name_2, read_text_1, read_text_2, read_len_1, read_len_2, is_negative_strand, &this_mask, &breakpount_last_exon);
			if(increasing_score >= 0 && increasing_score + dynamic_highest_scores[start_element_i] > dynamic_highest_scores[target_element_i]){
				dynamic_highest_path[ target_element_i ] = start_element_i;
				dynamic_highest_scores[target_element_i] = increasing_score + dynamic_highest_scores[start_element_i];
				dynamic_highest_mask[ target_element_i ] = this_mask;
				dynamic_last_exon[ target_element_i ] = breakpount_last_exon;
				if(  dynamic_highest_scores[target_element_i] > highest_score ){
					highest_score =  dynamic_highest_scores[target_element_i] ;
					highest_target_end = target_element_i;
				}
			}
		}
	}


	// highest_score is only raised by an accepted link, so this also requires
	// that the path has at least two sections.
	if(highest_target_end >=0 && highest_score > 1){
		int is_on_path [MAX_CLUSTER_ELEMENTS];
		memset(is_on_path,0,sizeof(int)*MAX_CLUSTER_ELEMENTS);

		gene_vote_t * last_vote = is_second_vote_array[ highest_target_end ]?vote_2:vote_1;
		int last_section_read_end = last_vote -> coverage_end[ ii_array [highest_target_end]  ] [ jj_array [highest_target_end]   ];
		int this_rlen = is_second_vote_array[ highest_target_end ]?read_len_2 : read_len_1;
		int this_votes = last_vote -> votes [ ii_array [highest_target_end]  ] [ jj_array [highest_target_end]   ];
		int tail_first_exon_last_base_in_read=-1, tail_mismatched_bases=-1;
		unsigned int tail_first_exon_first_base_on_chro = 0, tail_mapped_section_pos = 0;
		int front_first_exon_last_base_in_read=-1, front_mismatched_bases=-1;
		unsigned int front_first_exon_first_base_on_chro = 0, front_mapped_section_pos = 0;
		int front_score = 0, tail_score = 0, front_negative_donor = 0, tail_negative_donor = 0;

		// Disabled: extend the last section into the uncovered tail of the read.
		if(0 && last_section_read_end < this_rlen - NEW_EXTEND_SCAN_EXON_SHORTEST && this_votes > 4)
		{
			char * this_rname = is_second_vote_array[ highest_target_end ]?read_name_2:read_name_1;
			char * this_rtext = is_second_vote_array[ highest_target_end ]?read_text_2:read_text_1;
			int scan_to_tail = 1, mismatched_bases_after_start;
			tail_mapped_section_pos  =	last_vote -> pos[ ii_array [highest_target_end]  ] [ jj_array [highest_target_end]   ] +
						last_vote -> current_indel_cursor[ ii_array [highest_target_end]  ] [ jj_array [highest_target_end]   ] ;

			tail_score = extend_uncovered_region_juncs(global_context, thread_context, this_rname, this_rtext , this_rlen, scan_to_tail, tail_mapped_section_pos, last_section_read_end , -1, & mismatched_bases_after_start, &tail_first_exon_last_base_in_read, &tail_first_exon_first_base_on_chro, &tail_mismatched_bases, &tail_negative_donor);
		}

		// Walk the best path backwards (last section first) and export it.
		(*best_path_length) = 0;
		while(1){
			best_ii_path[(*best_path_length)] = ii_array[highest_target_end];
			best_jj_path[(*best_path_length)] = jj_array[highest_target_end];
			best_masks[ (*best_path_length) ] = dynamic_highest_mask[highest_target_end];

			if( dynamic_last_exon [ highest_target_end ] > 0 ) best_masks[ (*best_path_length) ] |= ( is_second_vote_array[ highest_target_end ]?CLUSTER_ALIGNMENT_DONOR_R2_MAPPED:CLUSTER_ALIGNMENT_DONOR_R1_MAPPED);

			if(  is_second_vote_array[ highest_target_end ] ) (*R1R2_mapped) |= CLUSTER_ALIGNMENT_DONOR_R2_MAPPED;
			else  (*R1R2_mapped) |= CLUSTER_ALIGNMENT_DONOR_R1_MAPPED;

			(*best_path_length)++;

			is_on_path[highest_target_end] = 1;
			if(  dynamic_highest_path[highest_target_end] == -1 ) break;
			highest_target_end = dynamic_highest_path[highest_target_end];
		}

		// highest_target_end now indexes the FIRST section of the path.
		gene_vote_t * first_vote = is_second_vote_array[ highest_target_end ]?vote_2:vote_1;
		int first_section_read_start = first_vote -> coverage_start [ ii_array [highest_target_end]  ] [ jj_array [highest_target_end] ] ;
		this_votes = first_vote  -> votes [ ii_array [highest_target_end]  ] [ jj_array [highest_target_end]   ];

		// Disabled: extend the first section into the uncovered head of the read.
		// NOTE(review): this_rlen still holds the length of the read of the LAST
		// section -- confirm before re-enabling.
		if(0 && first_section_read_start > NEW_EXTEND_SCAN_EXON_SHORTEST && this_votes > 4){
			char * this_rname = is_second_vote_array[ highest_target_end ]?read_name_2:read_name_1;
			char * this_rtext = is_second_vote_array[ highest_target_end ]?read_text_2:read_text_1;
			int scan_to_tail = 0, mismatched_bases_after_start;
			front_mapped_section_pos =  first_vote -> pos[ ii_array [highest_target_end]  ] [ jj_array [highest_target_end]   ];

			front_score = extend_uncovered_region_juncs(global_context, thread_context, this_rname, this_rtext , this_rlen, scan_to_tail, front_mapped_section_pos, first_section_read_start , -1, & mismatched_bases_after_start, &front_first_exon_last_base_in_read, &front_first_exon_first_base_on_chro, &front_mismatched_bases, &front_negative_donor);

		}

		(*this_score) = highest_score + max(0, front_score) + max(0, tail_score);

		// Minimum score required before junctions are reported; it depends on
		// which of the two reads contributed sections to the path.
		int applied_score_cut=0;
		if(((*R1R2_mapped) & CLUSTER_ALIGNMENT_DONOR_R1_MAPPED)&&( (*R1R2_mapped) & CLUSTER_ALIGNMENT_DONOR_R2_MAPPED ) )
			applied_score_cut = read_len_2 + read_len_1 - 70;
		else if((*R1R2_mapped) & CLUSTER_ALIGNMENT_DONOR_R1_MAPPED)
			applied_score_cut = read_len_1 - 30;
		else if((*R1R2_mapped) & CLUSTER_ALIGNMENT_DONOR_R2_MAPPED)
			applied_score_cut = read_len_2 - 30;

		if( (*this_score) >= applied_score_cut){
			for( x1 = 0;  x1 < MAX_CLUSTER_ELEMENTS; x1++){
				if(!is_on_path[x1]) continue;

				// Next element on the path after x1, if any.
				int x2, second_end = -1;
				for(x2 = x1 + 1; x2 < MAX_CLUSTER_ELEMENTS; x2++){
					if(is_on_path[x2]){
						second_end = x2;
						break;
					}
				}

				// A positive split point is only stored for links that
				// find_path() resolved with find_donor_receptor(), i.e. both
				// sections belong to the same read, so one vote table serves
				// both position lookups.
				if(second_end > 0 && dynamic_last_exon[second_end] >0 ){
					gene_vote_t * this_vote = is_second_vote_array[ x1 ]?vote_2:vote_1;
					unsigned int junction_small_side = this_vote -> pos[ ii_array[ x1 ]][ jj_array[ x1 ]] +
										this_vote ->  current_indel_cursor[ ii_array[ x1 ]][ jj_array[ x1 ]] + dynamic_last_exon[second_end];

					unsigned int junction_large_side = this_vote -> pos[ ii_array[ second_end ]][ jj_array[ second_end ]] + dynamic_last_exon[second_end] + 1;

					// TODO: indel_at_junction is always passed as 0; the number
					// of inserted bases at the junction is not forwarded yet.
					simple_add_junction(global_context, thread_context, junction_small_side, junction_large_side, 0, (dynamic_highest_mask[ second_end ] & CLUSTER_ALIGNMENT_DONOR_NEGATIVE_STRAND)?1:0);
				}
			}

			// Disabled: junction found by the head extension.
			if(0 && front_mismatched_bases <1 && front_score >14){
				unsigned int junction_small_side = front_first_exon_first_base_on_chro + front_first_exon_last_base_in_read;
				unsigned int junction_large_side = front_mapped_section_pos + front_first_exon_last_base_in_read + 1;

				simple_add_junction(global_context, thread_context, junction_small_side, junction_large_side, 0, front_negative_donor);
			}

			// Disabled: junction found by the tail extension.
			if(0 && tail_mismatched_bases <1 && tail_score >14){
				unsigned int junction_small_side = tail_mapped_section_pos + tail_first_exon_last_base_in_read;
				unsigned int junction_large_side = tail_first_exon_first_base_on_chro + tail_first_exon_last_base_in_read;

				simple_add_junction(global_context, thread_context, junction_small_side, junction_large_side, 0, tail_negative_donor);
			}
		}
	}
	return 0;
}
2014 
#define paired_donor_receptor_m2(s, c1, c2 ) ( (s)[0] == (c1) && (s)[1] == (c2) )

/*
 * is_paired_donor_receptor: classify the two-base motifs at the two sides of
 * a candidate junction.
 *
 * small_bases / large_bases each point to (at least) two bases; only the
 * first two of each are examined and the comparison is case-sensitive.
 *
 * Returns 0 when the pair is not a recognised donor/receptor pair, otherwise
 * a code 1..6. Odd codes are the forward-strand motifs and even codes the
 * matching reverse-strand motifs:
 *
 *   1  GT-AG          2  CT-AC
 *   3  GC-AG          4  CT-GC      (non-canonical)
 *   5  AT-AC          6  GT-AT      (non-canonical)
 *
 * Background (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC113136/): 99.24% of
 * splice site pairs are GT-AG, 0.69% GC-AG, 0.05% AT-AC, and only 0.02% are
 * other non-canonical types.
 */
int is_paired_donor_receptor( char * small_bases, char * large_bases ){
	// Row k holds the motif pair that yields return code k + 1.
	static const struct {
		char small_side[2];
		char large_side[2];
	} known_pairs[] = {
		{ {'G', 'T'}, {'A', 'G'} },
		{ {'C', 'T'}, {'A', 'C'} },
		{ {'G', 'C'}, {'A', 'G'} },
		{ {'C', 'T'}, {'G', 'C'} },
		{ {'A', 'T'}, {'A', 'C'} },
		{ {'G', 'T'}, {'A', 'T'} },
	};
	const int known_pair_count = (int)( sizeof(known_pairs) / sizeof(known_pairs[0]) );
	int pair_i;

	for(pair_i = 0; pair_i < known_pair_count; pair_i++){
		if( !paired_donor_receptor_m2( small_bases, known_pairs[pair_i].small_side[0], known_pairs[pair_i].small_side[1] ) )
			continue;
		if( !paired_donor_receptor_m2( large_bases, known_pairs[pair_i].large_side[0], known_pairs[pair_i].large_side[1] ) )
			continue;
		return pair_i + 1;
	}

	return 0;
}
2057 
/*
 * find_donor_receptor: locate the split point of a read between two mapped
 * sections so that the split is flanked by a recognised donor/receptor pair.
 *
 * The read window [start_coverage - 8, end_coverage + 8), clipped to
 * [0, rlen), is compared base by base against two chromosome copies:
 *   - the "start side", read from start_pos + indels_in_start + read offset;
 *   - the "end side",   read from end_pos + read offset.
 * For every allowed number of inserted bases (0 .. config.max_insertion_at_junctions)
 * and every split offset in the window, the mismatches left of the split on
 * the start side plus the mismatches right of the split (after the inserted
 * bases) on the end side are counted. The candidate with the smallest
 * mismatches * 500 + inserted_bases that also satisfies
 * is_paired_donor_receptor() is kept; earlier candidates win ties.
 *
 * Parameters rname, v1 and v2 are not used.
 *
 * Outputs, written only when a candidate is accepted:
 *   *misma_bases        mismatches of the accepted candidate.
 *   *matched_bases      end_coverage - start_coverage - inserted bases - mismatches.
 *   *is_negative_donor  (optional, may be NULL) 0 for odd motif codes,
 *                       1 for even motif codes of is_paired_donor_receptor().
 *
 * Returns the read offset of the accepted split (window start + split
 * offset), or -1 when no candidate was accepted or the window is empty.
 */
int find_donor_receptor(global_context_t * global_context, thread_context_t * thread_context, char * rname, char * rtext, int rlen, int start_coverage, int end_coverage, unsigned int start_pos, unsigned int end_pos, int indels_in_start, int v1, int v2, int * misma_bases, int * matched_bases, int * is_negative_donor){

	gene_value_index_t * value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
	int search_in_read_start = start_coverage - 8, search_in_read_end = end_coverage + 8;
	search_in_read_start = max(0, search_in_read_start);
	search_in_read_end = min( rlen, search_in_read_end );

	// An empty or inverted window has nothing to scan; it would also make the
	// variable-length arrays below have a non-positive size, which is undefined.
	int search_len = search_in_read_end - search_in_read_start;
	if(search_len <= 0) return -1;

	unsigned int search_in_chro_start = start_pos + indels_in_start + search_in_read_start;

	char chro_bases_startside[ search_len ], chro_bases_endside[ search_len ];
	// Per-position match flags; they do not depend on the insertion length,
	// so they are computed once.
	int start_site_match [ search_len ], end_site_match[ search_len ];

	int x1;

	for(x1 = 0; x1 < search_len; x1++){
		chro_bases_startside[x1] = gvindex_get( value_index, search_in_chro_start + x1 );
		chro_bases_endside[x1] = gvindex_get( value_index , end_pos + search_in_read_start + x1);
		start_site_match[x1] = ( rtext[ search_in_read_start + x1 ] == chro_bases_startside[x1] );
		end_site_match[x1] = ( rtext[ search_in_read_start + x1 ] == chro_bases_endside[x1] );
	}

	int insertion_in_between_i, best_testing_score = 500 * 1000;
	int best_last_exon_base_in_start = -1;
	int applied_insertion_limit = global_context->config.max_insertion_at_junctions;
	for(insertion_in_between_i = 0; insertion_in_between_i <= applied_insertion_limit; insertion_in_between_i ++){
		int start_last_exon_base,  end_site_mismatches = 0, start_site_mismatches = 0;

		// Start with every end-side mismatch at or after the inserted bases;
		// the scan below removes them one by one as the split moves right.
		for(x1 = insertion_in_between_i; x1 < search_len; x1++)
			end_site_mismatches += !end_site_match[x1];

		for(start_last_exon_base = 0 ; start_last_exon_base < search_len - insertion_in_between_i ; start_last_exon_base++){
			end_site_mismatches -= (! end_site_match[start_last_exon_base + insertion_in_between_i] );
			start_site_mismatches += (! start_site_match[start_last_exon_base] );

			// Keep two bases of margin on both ends so that the motif
			// look-ups below stay inside the two chromosome buffers.
			if(start_last_exon_base < 2 || start_last_exon_base >= search_len - insertion_in_between_i - 2)
				continue;

			int testing_score = (end_site_mismatches + start_site_mismatches) * 500 + insertion_in_between_i;
			if(testing_score >= best_testing_score)
				continue;

			// Start side: the two bases right after the split.
			// End side: the two bases ending at (split + inserted bases).
			int donor_paired_ret=is_paired_donor_receptor( chro_bases_startside + start_last_exon_base + 1, chro_bases_endside + insertion_in_between_i + start_last_exon_base - 1 );
			if( !donor_paired_ret )
				continue;

			best_last_exon_base_in_start = start_last_exon_base;
			best_testing_score = testing_score;
			(*misma_bases) = (end_site_mismatches + start_site_mismatches);
			if(is_negative_donor) (*is_negative_donor) =(donor_paired_ret -1)%2;
			(*matched_bases) = end_coverage - start_coverage - insertion_in_between_i - (end_site_mismatches + start_site_mismatches);
		}
	}

	if(best_last_exon_base_in_start>=0)
		return best_last_exon_base_in_start + search_in_read_start ;
	else return -1;
}
2148 
/*
 * find_path: score the link from cluster element start_element_i to cluster
 * element target_element_i.
 *
 * The two elements are looked up in vote_1 or vote_2 according to
 * is_second_vote_array[]. A link is only possible when the two mapping
 * positions are less than 50000 bases apart.
 *
 *   - Elements from different vote tables (different reads): the link score
 *     is the coverage length of the target element.
 *   - Elements from the same vote table: the start section must end before
 *     (target coverage start + 9) in the read and must map before the target
 *     on the chromosome. find_donor_receptor() must then find a split point
 *     at a positive read offset with zero mismatches. The link score is the
 *     number of matched bases in the gap plus the coverage length of the
 *     target element.
 *
 * Outputs:
 *   *this_mask       always reset to 0; set to
 *                    CLUSTER_ALIGNMENT_DONOR_NEGATIVE_STRAND when a split
 *                    point with a negative-strand motif was found.
 *   *exon_last_base  written only when a split point was accepted: its read
 *                    offset as returned by find_donor_receptor().
 *
 * Returns the link score, or -1 when the two elements cannot be linked.
 */
int find_path(global_context_t * global_context, thread_context_t * thread_context, int start_element_i, int target_element_i, int * ii_array, int * jj_array, int * is_second_vote_array,  gene_vote_t * vote_1, gene_vote_t * vote_2, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, int read_len_1, int read_len_2, int is_negative_strand, int * this_mask , int * exon_last_base){
	gene_vote_t * start_vote = is_second_vote_array[start_element_i]?vote_2:vote_1;
	gene_vote_t * end_vote = is_second_vote_array[target_element_i]?vote_2:vote_1;

	int start_coverage = start_vote->coverage_end[ ii_array[start_element_i] ][ jj_array[start_element_i] ];
	int end_coverage = end_vote->coverage_start[ ii_array[target_element_i] ][ jj_array[target_element_i] ];
	unsigned int start_pos =  start_vote->pos[ ii_array[start_element_i] ][ jj_array[start_element_i] ];
	unsigned int end_pos   =  end_vote->pos[ ii_array[target_element_i] ][ jj_array[target_element_i] ];

	long long dist = start_pos;
	dist -= end_pos;
	(*this_mask)=0;

	// llabs(), not abs(): dist is a 64-bit difference of two 32-bit unsigned
	// positions, and abs() would first truncate it to int, letting distances
	// close to +/-2^32 look like small ones.
	if( llabs(dist) >= 50000 ) return -1;

	if(start_vote != end_vote){
		// The two sections are on different reads: no split point to search for.
		return end_vote->coverage_end[ ii_array[target_element_i] ][ jj_array[target_element_i] ] - end_vote->coverage_start[ ii_array[target_element_i] ][ jj_array[target_element_i] ] ;
	}

	if(start_coverage >= end_coverage + 9) return -1;
	if(start_pos >= end_pos) return -1;

	char * this_read_name = is_second_vote_array[start_element_i]?read_name_2:read_name_1;
	char * this_read_text = is_second_vote_array[start_element_i]?read_text_2:read_text_1;
	int this_read_len = is_second_vote_array[start_element_i]?read_len_2:read_len_1, mismatched_bases = 0, matched_in_the_uncovered_gap = 0;
	int indels_in_start =  start_vote -> current_indel_cursor [ ii_array[start_element_i]] [ jj_array[start_element_i] ] , donor_receptor_neg_strand = -1;

	int best_last_base_in_start_exon = find_donor_receptor(global_context, thread_context, this_read_name, this_read_text, this_read_len, start_coverage, end_coverage, start_pos, end_pos, indels_in_start,  start_vote -> votes[  ii_array[start_element_i]] [ jj_array[start_element_i] ],   start_vote -> votes[  ii_array[target_element_i]] [ jj_array[target_element_i] ], &mismatched_bases , &matched_in_the_uncovered_gap, &donor_receptor_neg_strand);

	if(best_last_base_in_start_exon <= 0 || mismatched_bases >= 1) return -1;

	(*this_mask) = donor_receptor_neg_strand? CLUSTER_ALIGNMENT_DONOR_NEGATIVE_STRAND : 0 ;
	*exon_last_base = best_last_base_in_start_exon;

	// # of matched bases, from the end of the "start" section to the end of the end section.
	return matched_in_the_uncovered_gap  +  end_vote->coverage_end[ ii_array[target_element_i] ][ jj_array[target_element_i] ] - end_coverage;
}
2195 
2196 
/*
 * Select the best candidate locations for one read (single-end) or one read
 * pair (paired-end) from the voting tables and store them in the global
 * alignment-result table.
 *
 * Steps:
 *   1. For each read, collect the highest `config.top_scores` distinct vote
 *      values, looking at both the vote table and the results already stored
 *      for this read.
 *   2. For each read, build a list of "simple" candidates (vote-table items or
 *      previously stored results) whose vote value is among those top values.
 *   3. Paired-end only: score every R1 x R2 candidate combination as
 *      (votes_1 + votes_2) * weight, where the weight depends on whether the
 *      two candidates are within paired-end distance, on the same chromosome
 *      and/or both inside annotated exons. The best
 *      `config.max_vote_combinations` combinations are kept.
 *   4. Copy the chosen candidates (de-duplicated by mapping position, at most
 *      `config.multi_best_reads` per read) into temporary result arrays; when
 *      no combination exists, the simple candidates are copied directly.
 *   5. Write the temporary arrays back into the global result table; unused
 *      slots get `selected_votes = 0` (and `minor_votes = 0` for junctions).
 *
 * `read_name_2` and `is_negative_strand` are accepted for interface
 * compatibility but are not used here; `read_name_1` is passed on for both
 * reads.
 *
 * Returns 0 on success, or -1 if a result cursor ran past
 * `config.multi_best_reads`.
 */
int process_voting_junction_PE_topK(global_context_t * global_context, thread_context_t * thread_context, subread_read_number_t pair_number, gene_vote_t * vote_1, gene_vote_t * vote_2, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, int read_len_1, int read_len_2, int is_negative_strand, gene_vote_number_t v1_all_subreads, gene_vote_number_t v2_all_subreads)
{
	// Scratch buffers come from the thread context when there is one, otherwise from the global context.
	topK_buffer_t * topbuf = thread_context?&thread_context->topKbuff:&global_context ->topKbuff ;

	vote_combination_t * comb_buffer = (vote_combination_t *) topbuf -> comb_buffer;
	simple_mapping_t * vote_simple_1_buffer, * vote_simple_2_buffer;
	vote_simple_1_buffer =(simple_mapping_t *) topbuf -> vote_simple_1_buffer;
	vote_simple_2_buffer =(simple_mapping_t *) topbuf -> vote_simple_2_buffer;

	int is_second_read,i,j;
	// One row per read. NOTE(review): each row holds 9 values, so this assumes config.top_scores <= 9 -- TODO confirm against option parsing.
	int third_highest_votes[2][9];
	int is_fully_covered_1 = 0;
	int is_fully_covered_2 = 0;

	/* Step 1: find the top vote values of each read. */
	for(is_second_read = 0 ; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read ++)
	{
		gene_vote_t * current_vote = is_second_read?vote_2:vote_1;
		int * top_three_buff = third_highest_votes[is_second_read];
		int * is_fully_covered = is_second_read?&is_fully_covered_2:&is_fully_covered_1;
		int current_read_len = is_second_read?read_len_2:read_len_1;

		memset(top_three_buff, 0 , global_context -> config.top_scores * sizeof(int));

		// The full-coverage flag is only evaluated in fusion / long-deletion mode; otherwise it stays 0.
		if((global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection)){
			*is_fully_covered = test_fully_covered(global_context , current_vote, current_read_len);
		}

		for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
		{
			for (j=0; j< current_vote->items[i]; j++){
				int vv = current_vote -> votes[i][j];
				// Single-end scRNA mode: candidates inside annotated exons get a vote bonus.
				if(global_context->config.scRNA_input_mode && !global_context -> input_reads.is_paired_end_reads)vv += SE_READ_IN_KNOWN_EXON_REWARD*is_pos_in_annotated_exon_regions(global_context, current_vote -> pos[i][j]);
				update_top_three(global_context, top_three_buff, vv);
			}
		}

		if(0 && FIXLENstrcmp("R00000003493",read_name_1)==0)SUBREADprintf("3N [R %d] =%d,%d,%d\n", 1+is_second_read, top_three_buff[0], top_three_buff[1], top_three_buff[2]);

		// Results already stored for this read compete with the new votes.
		for(i = 0; i < global_context -> config.multi_best_reads; i++)
		{
			mapping_result_t * old_result = _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, i);
			if(old_result -> selected_votes>0)
			{
				update_top_three(global_context, top_three_buff, old_result -> selected_votes);
			}
		}
		if(0 && FIXLENstrcmp("R00000003493",read_name_1)==0)SUBREADprintf("3Q [R %d] =%d,%d,%d\n", 1+is_second_read, top_three_buff[0], top_three_buff[1], top_three_buff[2]);
	}


	/* Step 2: build the per-read lists of simple candidates. */
	// Zero-initialised: for single-end input the loop below never writes element [1],
	// yet it is read further down when deciding whether any candidate exists.
	int simple_record_numbers[2] = {0, 0};
	int third_k;

	for(is_second_read = 0 ; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read ++)
	{
		int current_simple_number = 0;
		int current_read_len = is_second_read?read_len_2:read_len_1;
		// populate the two simple read lists
		for(third_k = 0 ; third_k < global_context -> config.top_scores; third_k ++)
		{
			if(current_simple_number >= global_context -> config.max_vote_simples)break;
			int this_vote_N = third_highest_votes [is_second_read][third_k];
			// stop at empty slots and at vote values too far below the best one
			if(this_vote_N<1 || (third_highest_votes[is_second_read][0] - this_vote_N > global_context -> config.max_vote_number_cutoff )) break;

			simple_mapping_t * current_simple = is_second_read ? vote_simple_2_buffer: vote_simple_1_buffer;
			gene_vote_t * current_vote = is_second_read?vote_2:vote_1;
			for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
			{
				if(current_simple_number >= global_context -> config.max_vote_simples)break;
				for (j=0; j< current_vote->items[i]; j++)
				{
					if(current_simple_number >= global_context -> config.max_vote_simples)break;
					// Big-margin records are inserted only during the first pass (third_k == 0) so each item is recorded once.
					if(global_context->config.do_big_margin_filtering_for_junctions && third_k == 0 && current_vote->votes[i][j] >= third_highest_votes [is_second_read][global_context -> config.top_scores - 1])
						insert_big_margin_record(global_context , _global_retrieve_big_margin_ptr(global_context,pair_number, is_second_read), current_vote -> votes[i][j], current_vote -> coverage_start[i][j], current_vote -> coverage_end[i][j] , current_read_len, (current_vote -> masks[i][j] & IS_NEGATIVE_STRAND)?1:0);

					int vv = current_vote->votes[i][j];
					if(global_context->config.scRNA_input_mode && !global_context -> input_reads.is_paired_end_reads)vv += SE_READ_IN_KNOWN_EXON_REWARD*is_pos_in_annotated_exon_regions(global_context,  current_vote -> pos[i][j]);
					// The (possibly rewarded) value must equal the current top value; the raw vote count must reach the second-read minimum.
					if(vv == this_vote_N && current_vote->votes[i][j] >= global_context->config.minimum_subread_for_second_read)
					{
						current_simple[current_simple_number].is_vote_t_item = 1;
						current_simple[current_simple_number].item_index_i = i;
						current_simple[current_simple_number].item_index_j = j;
						current_simple[current_simple_number].read_start_base = current_vote -> coverage_start[i][j];
						current_simple[current_simple_number].mapping_position = current_vote -> pos[i][j];
						current_simple[current_simple_number].major_half_votes = vv;

						current_simple_number ++;

					}
				}
			}

			// Previously stored results with the same vote value are candidates too (is_vote_t_item = 0).
			for(i = 0; i < global_context -> config.multi_best_reads; i++)
			{
				mapping_result_t * old_result = _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, i);
				if(current_simple_number >= global_context -> config.max_vote_simples)break;
				if(old_result -> selected_votes == this_vote_N)
				{
					current_simple[current_simple_number].is_vote_t_item = 0;
					current_simple[current_simple_number].item_index_i = i;
					current_simple[current_simple_number].mapping_position = old_result -> selected_position;
					current_simple[current_simple_number].major_half_votes = old_result -> selected_votes;
					current_simple[current_simple_number].read_start_base = old_result -> confident_coverage_start;

					current_simple_number ++;
				}
			}

		}
		simple_record_numbers[is_second_read] = current_simple_number;
	}

	/* Step 3: paired-end only -- score all R1 x R2 combinations. */
	int used_comb_buffer = 0;

	if(global_context -> input_reads.is_paired_end_reads){
		for(i = 0; i < simple_record_numbers[0]; i++){
			for(j = 0; j < simple_record_numbers[1]; j++){
				int target_index;
				int is_PE_distance = 0, is_same_chromosome = 0, is_both_exonic_regions = 0;

				// At least one mate must reach the first-read vote minimum.
				if(max(vote_simple_1_buffer[i].major_half_votes, vote_simple_2_buffer[j].major_half_votes) < global_context->config.minimum_subread_for_first_read)continue;

				simple_PE_and_same_chro(global_context , vote_simple_1_buffer+i, vote_simple_2_buffer+j , &is_PE_distance, &is_same_chromosome , read_len_1, read_len_2);

				// Outside paired-end distance, both mates must reach that minimum.
				if((!is_PE_distance) && min(vote_simple_1_buffer[i].major_half_votes, vote_simple_2_buffer[j].major_half_votes) < global_context->config.minimum_subread_for_first_read)continue;
				if( global_context ->  exonic_region_bitmap && is_same_chromosome)is_both_exonic_regions =  is_pos_in_annotated_exon_regions(global_context, vote_simple_1_buffer[i].mapping_position + vote_simple_1_buffer[i].read_start_base ) && is_pos_in_annotated_exon_regions(global_context, vote_simple_2_buffer[j].mapping_position + vote_simple_2_buffer[j].read_start_base  ) ;

				int adjusted_weight;

				if (is_both_exonic_regions && is_PE_distance) adjusted_weight = 1800;
				else if(is_both_exonic_regions) adjusted_weight = 1300;
				else if(is_PE_distance) adjusted_weight = 1300;
				else if(is_same_chromosome) adjusted_weight = 1000;
				else adjusted_weight = 800;

				int adjusted_votes = (vote_simple_1_buffer[i].major_half_votes + vote_simple_2_buffer[j].major_half_votes) * adjusted_weight;

				// Find the first stored combination with a strictly lower score; ties keep their arrival order.
				for(target_index=0; target_index<used_comb_buffer; target_index++){
					if(comb_buffer[target_index].score_adj < adjusted_votes) break;
				}


				if(target_index < global_context -> config.max_vote_combinations){
					int move_i;

					// Shift the tail down by one slot; when the buffer is full the last entry is dropped.
					for(move_i = min(used_comb_buffer, global_context -> config.max_vote_combinations - 1) ; move_i > target_index ; move_i --)
						//checked: memory boundary
						memcpy(comb_buffer + move_i, comb_buffer + move_i - 1 , sizeof(vote_combination_t) );

					comb_buffer[target_index].r1_loc = vote_simple_1_buffer+i;
					comb_buffer[target_index].r2_loc = vote_simple_2_buffer+j;
					comb_buffer[target_index].score_adj = adjusted_votes;

					if(used_comb_buffer < global_context -> config.max_vote_combinations)
						used_comb_buffer ++;
				}

			}
		}
	}

	/* Step 4: fill the temporary result arrays. */
	mapping_result_t * alignment_tmp_r1, * alignment_tmp_r2;
	alignment_tmp_r1 = (mapping_result_t *) topbuf -> alignment_tmp_r1;
	alignment_tmp_r2 = (mapping_result_t *) topbuf -> alignment_tmp_r2;

	subjunc_result_t * junction_tmp_r2 , * junction_tmp_r1;
	junction_tmp_r1 = (subjunc_result_t *) topbuf -> junction_tmp_r1;
	junction_tmp_r2 = (subjunc_result_t *) topbuf -> junction_tmp_r2;

	memset(junction_tmp_r1, 0, sizeof(subjunc_result_t) * global_context->config.multi_best_reads);
	memset(junction_tmp_r2, 0, sizeof(subjunc_result_t) * global_context->config.multi_best_reads);

	memset(alignment_tmp_r1, 0, sizeof(mapping_result_t) * global_context->config.multi_best_reads);
	memset(alignment_tmp_r2, 0, sizeof(mapping_result_t) * global_context->config.multi_best_reads);

	int alignment_res_r1_cursor = 0, alignment_res_r2_cursor = 0;

	if(used_comb_buffer > 0){
		merge_sort(comb_buffer, used_comb_buffer, comb_sort_compare, comb_sort_exchange, comb_sort_merge);
		for(is_second_read = 0; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read++){
			int current_read_len = is_second_read ? read_len_2:read_len_1;
			char * current_read_text = is_second_read ? read_text_2:read_text_1;
			int current_all_subreads = is_second_read ? v2_all_subreads:v1_all_subreads;
			mapping_result_t * current_alignment_tmp = is_second_read?alignment_tmp_r2:alignment_tmp_r1;
			int * current_r_cursor = is_second_read ? &alignment_res_r2_cursor:&alignment_res_r1_cursor;
			int * is_fully_covered = is_second_read?&is_fully_covered_2:&is_fully_covered_1;
			gene_vote_t * current_vote = is_second_read?vote_2:vote_1;

			// Junction results are only tracked when breakpoint detection is on.
			subjunc_result_t * current_junction_tmp = NULL;
			if(global_context -> config.do_breakpoint_detection) current_junction_tmp = is_second_read?junction_tmp_r2:junction_tmp_r1;

			// The sorted buffer is walked from its last entry to its first.
			for(i = used_comb_buffer - 1; i >=0; i--){
				if((* current_r_cursor) >= global_context->config.multi_best_reads)break;

				// add the combination of comb_buffer[i] into the two mapping_result_t arrays
				simple_mapping_t * current_loc = is_second_read?comb_buffer[i].r2_loc:comb_buffer[i].r1_loc;
				assert(current_loc);
				unsigned int current_pos = current_loc->mapping_position;

				// The same location can appear in several combinations; report it once.
				int is_exist = 0;
				for(j = 0; j < *current_r_cursor; j++)
				{
					if(current_alignment_tmp[j].selected_position == current_pos){
						is_exist = 1;
						break;
					}
				}

				if(!is_exist){
					if(current_loc -> is_vote_t_item)
						copy_vote_to_alignment_res(global_context, thread_context, current_alignment_tmp + (*current_r_cursor), current_junction_tmp ? current_junction_tmp + (*current_r_cursor) : NULL, current_vote, current_loc -> item_index_i, current_loc -> item_index_j, current_read_len, read_name_1, current_read_text, current_all_subreads , current_vote -> noninformative_subreads, pair_number, is_second_read, is_fully_covered);
					else{
						// Candidate came from a previously stored result: copy it over unchanged.
						//checked: memory boundary
						memcpy(current_alignment_tmp + (*current_r_cursor), _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, current_loc -> item_index_i), sizeof(mapping_result_t));
						if(current_junction_tmp)
							//checked: memory boundary
							memcpy(current_junction_tmp + (*current_r_cursor), _global_retrieve_subjunc_ptr(global_context, pair_number, is_second_read, current_loc -> item_index_i), sizeof(subjunc_result_t));
					}
					(*current_r_cursor)++;
				}
			}
		}
	}else{// no combination was built (always the case for single-end input)

		// A read with no candidate at all still records its non-informative subread count.
		if(0 ==  simple_record_numbers[0])
			_global_retrieve_alignment_ptr(global_context, pair_number, 0, 0) -> noninformative_subreads_in_vote = vote_1 -> noninformative_subreads;
		if(global_context -> input_reads.is_paired_end_reads && 0 ==  simple_record_numbers[1])
			_global_retrieve_alignment_ptr(global_context, pair_number, 1, 0) -> noninformative_subreads_in_vote = vote_2 -> noninformative_subreads;

		if(simple_record_numbers[0]>0 || simple_record_numbers[1]>0)
		{
			// copy all the simple into the mapping_result_t

			for(is_second_read = 0; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read++)
			{
				int * current_r_cursor = is_second_read ? &alignment_res_r2_cursor:&alignment_res_r1_cursor;

				int current_read_len = is_second_read ? read_len_2:read_len_1;
				char * current_read_text = is_second_read ? read_text_2:read_text_1;
				int current_all_subreads = is_second_read ? v2_all_subreads:v1_all_subreads;
				mapping_result_t * current_alignment_tmp = is_second_read?alignment_tmp_r2:alignment_tmp_r1;
				gene_vote_t * current_vote = is_second_read?vote_2:vote_1;
				int * is_fully_covered = is_second_read?&is_fully_covered_2:&is_fully_covered_1;

				subjunc_result_t * current_junction_tmp = NULL;
				if(global_context -> config.do_breakpoint_detection) current_junction_tmp = is_second_read?junction_tmp_r2:junction_tmp_r1;

				for(i = 0; i < simple_record_numbers[is_second_read]; i++){

					if((*current_r_cursor) >= global_context->config.multi_best_reads)break;

					simple_mapping_t * current_loc = is_second_read?vote_simple_2_buffer+i:vote_simple_1_buffer+i;

					// Without a supporting mate, the candidate must reach the first-read minimum by itself.
					if(current_loc -> major_half_votes < global_context->config.minimum_subread_for_first_read) continue;
					unsigned int current_pos = current_loc->mapping_position;

					int is_exist = 0;
					for(j = 0; j < *current_r_cursor; j++)
					{
						if(current_alignment_tmp[j].selected_position == current_pos){
							is_exist = 1;
							break;
						}
					}
					if(!is_exist){
						if(current_loc -> is_vote_t_item)
							copy_vote_to_alignment_res(global_context, thread_context, current_alignment_tmp + (*current_r_cursor), current_junction_tmp ? current_junction_tmp + (*current_r_cursor): NULL, current_vote, current_loc -> item_index_i, current_loc -> item_index_j, current_read_len, read_name_1, current_read_text, current_all_subreads , current_vote -> noninformative_subreads, pair_number, is_second_read, is_fully_covered);
						else{
							//checked:boundary
							memcpy(current_alignment_tmp + (*current_r_cursor), _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, current_loc -> item_index_i), sizeof(mapping_result_t));
							if(current_junction_tmp)
								//checked:boundary
								memcpy(current_junction_tmp + (*current_r_cursor), _global_retrieve_subjunc_ptr(global_context, pair_number, is_second_read, current_loc -> item_index_i), sizeof(subjunc_result_t));
						}

						(*current_r_cursor)++;
					}
				}
			}
		}
	}

	// Sanity check: the cursors must never exceed the number of result slots.
	for(is_second_read = 0; is_second_read < 1 +  global_context -> input_reads.is_paired_end_reads; is_second_read++){
		int * current_r_cursor = is_second_read ? &alignment_res_r2_cursor:&alignment_res_r1_cursor;
		if((*current_r_cursor) > global_context->config.multi_best_reads){
			SUBREADprintf("ERROR: multi_best_locations excessed the boundary: %d > %d\n", (*current_r_cursor), global_context->config.multi_best_reads);
			return -1;
		}
	}

	/* Step 5: publish the temporary arrays into the global result table. */
	for(is_second_read = 0; is_second_read < 1 +  global_context -> input_reads.is_paired_end_reads; is_second_read++)
	{
		int * current_r_cursor = is_second_read ? &alignment_res_r2_cursor:&alignment_res_r1_cursor;
		mapping_result_t * current_alignment_tmp = is_second_read?alignment_tmp_r2:alignment_tmp_r1;
		subjunc_result_t * current_junction_tmp = NULL;

		if(global_context -> config.do_breakpoint_detection) current_junction_tmp = is_second_read?junction_tmp_r2:junction_tmp_r1;

		for(i = 0; i < global_context->config.multi_best_reads ; i++){
			mapping_result_t * cur_res = _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, i);
			if( i < (*current_r_cursor))
				memcpy(cur_res, current_alignment_tmp + i, sizeof(mapping_result_t));
			else	cur_res -> selected_votes = 0;	// mark the slot as unused

			if(global_context -> config.do_breakpoint_detection) {
				subjunc_result_t * cur_junc =  _global_retrieve_subjunc_ptr(global_context, pair_number, is_second_read, i);
				if(i  < (*current_r_cursor))
					memcpy(cur_junc, current_junction_tmp + i , sizeof(subjunc_result_t));
				else	cur_junc -> minor_votes = 0;

			}
		}
	}

	return 0;
}
2525 
2526 
2527 // seq1 and seq2 must be on the same strand!
2528 // (seq2 is reversed)
2529 // The second half of seq1 MUST BE the same as the first half of seq2 if the two reads have an overlapping part.
/*
 * Decide whether a same-strand, same-chromosome pair should be treated as a
 * "gapped" funky fragment.
 *
 * Only `tlen_removed_intron` is consulted: the result is 1 when it is larger
 * than 600 and 0 otherwise. All other parameters are currently unused.
 *
 * An earlier, now disabled, heuristic tried every overlap length between the
 * tail of seq1 and the head of seq2, accepted the overlap implied by
 * (rlen1 + rlen2 - tlen_removed_intron) when it had at most one mismatch, and
 * otherwise looked for the best-matching overlap.
 */
int is_gapped_as_funky(global_context_t * global_context, char * rname1, char * chr1, unsigned int pos1, int rlen1, int is_1_negative, char * cigar1, char * seq1, char * rname2, char * chr2, unsigned int pos2, int rlen2, int is_2_negative, char * cigar2, char * seq2, int tlen_removed_intron)
{
	const int longest_ungapped_tlen = 600;

	if(tlen_removed_intron > longest_ungapped_tlen)
		return 1;
	return 0;
}
2568 
2569 // the positions are not offset by adding the first soft clipping length. I.e., pos1 and pos2 may be smaller than those in the SAM files.
2570 // seq1 and seq2 must be on the same strand!
2571 // (seq2 is reversed)
/*
 * Classify a read pair by the relative placement of its two mates.
 *
 * Returns
 *   FUNKY_FRAGMENT_BC  when the mates are on different chromosomes, or their
 *                      span exceeds config.maximum_translocation_length;
 *   FUNKY_FRAGMENT_DE  when they are close enough but on different strands;
 *   FUNKY_FRAGMENT_A   when they are close enough, on the same strand, and
 *                      is_gapped_as_funky() reports a gap;
 *   NOT_FUNKY          otherwise.
 *
 * The span is |pos1 - pos2| plus the longer of the two read lengths.
 * Chromosomes are compared by pointer, relying on each chromosome name having
 * a single distinct char * taken from the offset table.
 */
int is_funky_fragment(global_context_t * global_context, char * rname1, char * chr1, unsigned int pos1, int rlen1, int is_1_negative, char * cigar1, char * seq1, char * rname2, char * chr2, unsigned int pos2, int rlen2, int is_2_negative, char * cigar2, char * seq2, int tlen_removed_intron)
{
	// Mates on different chromosomes: nothing else matters.
	if(chr1 != chr2)
		return FUNKY_FRAGMENT_BC;

	// Distance is taken in 64-bit signed arithmetic so that pos1 < pos2 does not wrap.
	long long start_distance = (long long)pos1 - (long long)pos2;
	if(start_distance < 0)
		start_distance = -start_distance;

	unsigned int span = start_distance;
	span += max(rlen2, rlen1);

	if(span > global_context -> config.maximum_translocation_length)
		return FUNKY_FRAGMENT_BC;

	// Within range on one chromosome: the strands decide the class.
	if(is_2_negative != is_1_negative)
		return FUNKY_FRAGMENT_DE;

	if(is_gapped_as_funky(global_context, rname1, chr1, pos1, rlen1, is_1_negative, cigar1, seq1, rname2, chr2, pos2, rlen2, is_2_negative, cigar2, seq2, tlen_removed_intron))
		return FUNKY_FRAGMENT_A;

	return NOT_FUNKY;
}
2600 
/*
 * Entry point for turning the voting tables of a read (pair) into stored
 * candidate alignments. Every argument is forwarded unchanged to
 * process_voting_junction_PE_topK() and its return value is handed back.
 *
 * A cluster-based alternative (process_voting_junction_PE_juncs) was once
 * wired in here for testing only; it is not called.
 */
int process_voting_junction(global_context_t * global_context, thread_context_t * thread_context, subread_read_number_t pair_number, gene_vote_t * vote_1, gene_vote_t * vote_2, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, int read_len_1, int read_len_2, int is_negative_strand, gene_vote_number_t v1_all_subreads, gene_vote_number_t v2_all_subreads){
	int topk_status = process_voting_junction_PE_topK(
		global_context, thread_context, pair_number,
		vote_1, vote_2,
		read_name_1, read_name_2,
		read_text_1, read_text_2,
		read_len_1, read_len_2,
		is_negative_strand,
		v1_all_subreads, v2_all_subreads);

	return topk_status;
}
2609 
2610 
explain_read(global_context_t * global_context,thread_context_t * thread_context,realignment_result_t * final_realignments,subread_read_number_t pair_number,int read_len,char * read_name,char * read_text,char * qual_text,int is_second_read,int best_read_id,int is_negative_strand)2611 unsigned int explain_read(global_context_t * global_context, thread_context_t * thread_context, realignment_result_t * final_realignments, subread_read_number_t pair_number, int read_len, char * read_name , char *read_text, char *qual_text, int is_second_read, int best_read_id, int is_negative_strand)
2612 {
2613 	explain_context_t explain_context;
2614 	mapping_result_t *current_result = _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, best_read_id);
2615 
2616 	if(global_context -> config.do_big_margin_filtering_for_reads)
2617 	{
2618 		int current_repeated_times = is_ambiguous_voting(global_context, pair_number, is_second_read, current_result->selected_votes, current_result->confident_coverage_start, current_result->confident_coverage_end, read_len, (current_result->result_flags & CORE_IS_NEGATIVE_STRAND)?1:0);
2619 		if( global_context -> config.do_big_margin_filtering_for_reads && current_repeated_times>1) return 0;
2620 	}
2621 
2622 	memset(&explain_context,0, sizeof(explain_context_t));
2623 	explain_context.full_read_len = read_len;
2624 	explain_context.is_fully_covered = current_result -> is_fully_covered ;
2625 	explain_context.full_read_text = read_text;
2626 	explain_context.full_qual_text = qual_text;
2627 	explain_context.read_name = read_name;
2628 	explain_context.is_confirmed_section_negative_strand = is_negative_strand ;
2629 	explain_context.pair_number = pair_number;
2630 	explain_context.is_second_read = is_second_read ;
2631 	explain_context.best_read_id = best_read_id;
2632 	explain_context.total_tries = 0;
2633 
2634 	if(0 && FIXLENstrcmp("simulated.24700032", explain_context.read_name)==0)SUBREADprintf("BBFINAL %s SEL_POS=%u COV=%d - %d\n",  explain_context.read_name, current_result -> selected_position, current_result -> confident_coverage_start, current_result -> confident_coverage_end);
2635 
2636 	unsigned int back_search_tail_position,front_search_start_position;
2637 	unsigned short back_search_read_tail, front_search_read_start;
2638 
2639 
2640 	back_search_read_tail = min(explain_context.full_read_len , current_result -> confident_coverage_end );//- 5;
2641 	back_search_tail_position = current_result -> selected_position + back_search_read_tail +  current_result -> indels_in_confident_coverage;
2642 
2643 	//if( back_search_read_tail > 102)
2644 	//SUBREADprintf("MAX back_search_read_tail : MIN %d , %d\n", explain_context.full_read_len , current_result -> confident_coverage_end);
2645 
2646 	explain_context.tmp_search_junctions[0].read_pos_end = back_search_read_tail;
2647 	explain_context.tmp_search_junctions[0].abs_offset_for_start = back_search_tail_position;
2648 
2649 	explain_context.all_back_alignments = 0;
2650 	explain_context.tmp_search_sections = 0;
2651 	explain_context.best_indel_penalty =0;
2652 	explain_context.best_matching_bases = -9999;
2653 	explain_context.second_best_matching_bases = -9999;
2654 	explain_context.tmp_indel_penalty = 0;
2655 	explain_context.tmp_total_matched_bases = 0;
2656 	explain_context.is_currently_tie = 0;
2657 	explain_context.best_is_complex = 0;
2658 	explain_context.best_support_as_simple = 0;
2659 	explain_context.best_min_unsupport_as_simple = 0;
2660 	explain_context.tmp_support_as_simple = 0;
2661 	explain_context.tmp_min_support_as_complex = 999999;
2662 	explain_context.tmp_min_unsupport = 999999;
2663 	explain_context.tmp_is_pure_donor_found_explain = 1;
2664 	explain_context.best_is_pure_donor_found_explain = 0;
2665 
2666 	if(1) {
2667 		front_search_read_start = back_search_read_tail > 8? back_search_read_tail - 8:0;
2668 		front_search_start_position = back_search_tail_position>8?back_search_tail_position - 8:0;
2669 	} else {
2670 		//front_search_read_start = current_result -> confident_coverage_start + 5;
2671 		front_search_read_start = min(explain_context.full_read_len , current_result -> confident_coverage_end);
2672 		if(front_search_read_start > 2*global_context -> config.realignment_minimum_variant_distance) front_search_read_start -= 2*global_context -> config.realignment_minimum_variant_distance;
2673 		else front_search_read_start = 0;
2674 		front_search_start_position = current_result -> selected_position + front_search_read_start;
2675 	}
2676 
2677 	search_events_to_back(global_context, thread_context, &explain_context, read_text , qual_text, back_search_tail_position , back_search_read_tail, 0, 0, 1);
2678 	int back_penalty = explain_context.best_indel_penalty;
2679 
2680 	//int is_backsearch_tie = explain_context.is_currently_tie;
2681 	int back_search_matches_diff = -9999;
2682 
2683 	/*
2684 
2685 
2686 	if(explain_context.back_search_confirmed_sections>0)
2687 	{
2688 
2689 		short last_section_length = explain_context.back_search_junctions[0].read_pos_end - explain_context.back_search_junctions[0].read_pos_start;
2690 
2691 		front_search_read_start = explain_context.back_search_junctions[0].read_pos_start;
2692 		front_search_start_position = explain_context.back_search_junctions[0].abs_offset_for_start - last_section_length;
2693 
2694 		int last_sec = explain_context.back_search_confirmed_sections-1;
2695 
2696 		current_result -> selected_position = explain_context.back_search_junctions[last_sec].abs_offset_for_start - explain_context.back_search_junctions[last_sec].read_pos_end + explain_context.back_search_junctions[last_sec].read_pos_start;
2697 		back_search_matches_diff = explain_context.best_matching_bases - explain_context.second_best_matching_bases;
2698 
2699 		if(0 && memcmp(explain_context.read_name,  TTTSNAME, 26)==0)
2700 		{
2701 			int xk1;
2702 			for(xk1 = 0; xk1 < explain_context.back_search_confirmed_sections; xk1++)
2703 			{
2704 				short pr_section_length = explain_context.back_search_junctions[xk1].read_pos_end - explain_context.back_search_junctions[xk1].read_pos_start;
2705 				if(explain_context.back_search_junctions[xk1].event_after_section)
2706 					SUBREADprintf("BACK_SECTIONS [%d], START IS %u; RPSS=%d ; RPED=%d ; LEN=%d ; EVENT is %u %u INDEL=%d\n", xk1, explain_context.back_search_junctions[xk1].abs_offset_for_start, explain_context.back_search_junctions[xk1].read_pos_start, explain_context.back_search_junctions[last_sec].read_pos_end, pr_section_length, explain_context.back_search_junctions[xk1].event_after_section->event_small_side, explain_context.back_search_junctions[xk1].event_after_section->event_large_side, explain_context.back_search_junctions[xk1].event_after_section->indel_length);
2707 				else	SUBREADprintf("BACK_SECTIONS [%d], START IS %u; RPSS=%d ; RPED=%d ; LEN=%d\n", xk1, explain_context.back_search_junctions[xk1].abs_offset_for_start, explain_context.back_search_junctions[xk1].read_pos_start, explain_context.back_search_junctions[last_sec].read_pos_end, pr_section_length);
2708 			}
2709 		}
2710 
2711 		//SUBREADprintf("DBI:%d - %d;\n", explain_context.best_matching_bases , explain_context.second_best_matching_bases);
2712 	}
2713 	else
2714 	*/
2715 	explain_context.all_front_alignments = 0;
2716 	explain_context.tmp_search_sections = 0;
2717 	explain_context.best_indel_penalty = 0;
2718 	explain_context.best_matching_bases = -9999;
2719 	explain_context.second_best_matching_bases = -9999;
2720 	explain_context.tmp_total_matched_bases = 0;
2721 	explain_context.tmp_indel_penalty = 0;
2722 
2723 	explain_context.is_currently_tie = 0;
2724 	explain_context.best_is_complex = 0;
2725 	explain_context.best_support_as_simple = 0;
2726 	explain_context.best_min_unsupport_as_simple = 0;
2727 	explain_context.tmp_support_as_simple = 0;
2728 	explain_context.tmp_min_support_as_complex = 999999;
2729 	explain_context.tmp_min_unsupport = 999999;
2730 	explain_context.tmp_is_pure_donor_found_explain = 1;
2731 	explain_context.best_is_pure_donor_found_explain = 0;
2732 
2733 	memset(explain_context.tmp_search_junctions, 0, sizeof(perfect_section_in_read_t ) * MAX_EVENTS_IN_READ);
2734 
2735 	explain_context.tmp_search_junctions[0].read_pos_start = front_search_read_start;
2736 	explain_context.tmp_search_junctions[0].abs_offset_for_start = front_search_start_position;
2737 
2738 	if(0 && FIXLENstrcmp("R000002689",explain_context.read_name ) == 0)
2739 		SUBREADprintf("Enter F_SEARCH: start=%u  read_pos=%d  REMAIN=%d\n", front_search_start_position, front_search_read_start,  read_len - front_search_read_start );
2740 
2741 
2742 	short search_remain =  read_len - front_search_read_start;
2743 	//#warning "SUBREAD_151 REMOVE THE ASSERT! "
2744 	//if(search_remain >= 102)SUBREADprintf("FATAL: RLEN=%d, SEARCH=%d\n", read_len, front_search_read_start);
2745 	//assert( search_remain < 102 );
2746 
2747 	search_events_to_front(global_context, thread_context, &explain_context, read_text + front_search_read_start, qual_text + front_search_read_start, front_search_start_position, search_remain , 0, 0, 1);
2748 	if(0 && FIXLENstrcmp("R_chr901_932716_91M1D9M",explain_context.read_name ) == 0)
2749 		 SUBREADprintf("F_SEARCH has found %d result sets\n", explain_context.all_front_alignments);
2750 
2751 	explain_context.best_indel_penalty += back_penalty;
2752 	//int is_frontsearch_tie = explain_context.is_currently_tie;
2753 
2754 	//SUBREADprintf("DFI:%d - %d;\n", explain_context.best_matching_bases , explain_context.second_best_matching_bases);
2755 	int front_search_matches_diff = explain_context.best_matching_bases - explain_context.second_best_matching_bases;
2756 	explain_context.best_second_match_diff = front_search_matches_diff + back_search_matches_diff;
2757 
2758 	int realignment_number = finalise_explain_CIGAR(global_context, thread_context, &explain_context, final_realignments);
2759 
2760 	if(0 && FIXLENstrcmp("SRR3439488.572382", explain_context.read_name)==0)
2761 		SUBREADprintf("TRYING_REALIGN:%s:%u\n", explain_context.read_name, explain_context.total_tries);
2762 
2763 	return realignment_number;
2764 }
2765 
2766 
// Debugging aid: dumps a three-row picture of one read section to the log.
//   row 1: '-' where the reference base equals the read base, '#' otherwise;
//   row 2: a '>' (tail search) or '<' (head search) under column search_center;
//   row 3: 'R' under the last number_of_clipped columns (tail search) or
//          'L' under the first number_of_clipped columns (head search).
// global_context and thread_context are accepted but not used here.
void debug_clipping(global_context_t * global_context,  thread_context_t * thread_context, gene_value_index_t * current_value_index, char * read_text, unsigned int mapped_pos, int test_len,  int search_to_tail, int search_center, int number_of_clipped, char * read_name){
	int column;

	// header line: read name, search centre, clipped bases, section length, direction
	SUBREADprintf("\n %s CENTER=%d, CLIPPED=%d, TLEN=%d    %s\n", read_name, search_center, number_of_clipped, test_len, search_to_tail?">>>>":"<<<<");

	// row 1: per-base match / mismatch against the reference
	column = 0;
	while(column < test_len){
		char base_on_chro = gvindex_get(current_value_index, column + mapped_pos);
		if(base_on_chro == read_text[column])
			SUBREADprintf("%c", '-');
		else
			SUBREADprintf("%c", '#');
		column++;
	}
	SUBREADprintf("\n");

	// row 2: marker at the position where the clipping search was centred
	column = 0;
	while(column < test_len){
		if(column != search_center)
			SUBREADprintf(" ");
		else
			SUBREADprintf("%c", search_to_tail?'>':'<');
		column++;
	}
	SUBREADprintf("\n");

	// row 3: which columns were reported as clipped
	column = 0;
	while(column < test_len){
		int is_tail_clipped = search_to_tail && column >= test_len - number_of_clipped;
		int is_head_clipped = (!search_to_tail) && column < number_of_clipped;

		if(is_tail_clipped)
			SUBREADprintf("R");
		else if(is_head_clipped)
			SUBREADprintf("L");
		else
			SUBREADprintf(" ");
		column++;
	}
	SUBREADprintf("\n");

}
2802 
2803 #define SOFT_CLIPPING_WINDOW_SIZE 5
2804 #define SOFT_CLIPPING_MAX_ERROR   1
2805 
2806 // it returns the number of bases to be clipped off.
find_soft_clipping(global_context_t * global_context,thread_context_t * thread_context,gene_value_index_t * current_value_index,char * read_text,unsigned int mapped_pos,int test_len,int search_to_tail,int search_center)2807 int find_soft_clipping(global_context_t * global_context,  thread_context_t * thread_context, gene_value_index_t * current_value_index, char * read_text, unsigned int mapped_pos, int test_len,  int search_to_tail, int search_center)
2808 {
2809 	int base_in_window = 0;
2810 	int added_base_index = 0, removed_base_index = 0;
2811 	int search_start = 0;
2812 	int matched_in_window = SOFT_CLIPPING_WINDOW_SIZE;
2813 	int last_matched_base_index = -1, delta;
2814 
2815 	if(search_to_tail)
2816 	{
2817 		if(search_center < 0)
2818 			search_start = 0;
2819 		else if(search_center >= test_len)
2820 			// SHOULD NOT HAPPEN!!!
2821 			search_start = test_len - 1;
2822 		else	search_start = search_center - 1;
2823 
2824 		delta = 1;
2825 	}else{
2826 		if(search_center < 0)
2827 			// SHOULD NOT HAPPEN!!!
2828 			search_start = 0;
2829 		else if(search_center >= test_len)
2830 			search_start = test_len - 1;
2831 		else	search_start = search_center + 1;
2832 
2833 		delta = -1;
2834 	}
2835 
2836 	for(added_base_index = search_start; added_base_index >= 0 && added_base_index < test_len; added_base_index += delta)
2837 	{
2838 		// add the new base
2839 		char reference_base = gvindex_get(current_value_index, added_base_index + mapped_pos);
2840 
2841 		if(0){
2842 			char outpos1[100];
2843 			absoffset_to_posstr(global_context, added_base_index + mapped_pos, outpos1);
2844 			SUBREADprintf("CHMAT [%s] %s (%u) ref:read = %c:%c\n", search_to_tail?"T":"H" ,outpos1,  added_base_index + mapped_pos, reference_base,  read_text[added_base_index]);
2845 		}
2846 		int added_is_matched = (reference_base == read_text[added_base_index]);
2847 		matched_in_window += added_is_matched;
2848 		if(added_is_matched)
2849 			last_matched_base_index = added_base_index;
2850 
2851 		base_in_window ++;
2852 
2853 		if(base_in_window > SOFT_CLIPPING_WINDOW_SIZE){
2854 			removed_base_index = added_base_index - delta * SOFT_CLIPPING_WINDOW_SIZE;
2855 			char removing_ref_base = gvindex_get(current_value_index, removed_base_index + mapped_pos);
2856 			matched_in_window -= (removing_ref_base == read_text[removed_base_index]);
2857 		}else{
2858 			matched_in_window --;
2859 		}
2860 
2861 		if(matched_in_window < SOFT_CLIPPING_WINDOW_SIZE - SOFT_CLIPPING_MAX_ERROR){
2862 			// clip, bondary is the last matched base.
2863 			if(search_to_tail){
2864 				if(last_matched_base_index < 0) return test_len - search_start;
2865 				else return test_len - last_matched_base_index - 1;
2866 			}else{
2867 				if(last_matched_base_index >= 0) return last_matched_base_index;
2868 				else return search_start - 1;
2869 			}
2870 		}
2871 	}
2872 
2873 	if(last_matched_base_index < 0) return test_len;
2874 
2875 	if(search_to_tail){
2876 		if(last_matched_base_index < 0) return test_len - search_start;
2877 		else return test_len - last_matched_base_index - 1;
2878 	}else{
2879 		if(last_matched_base_index >= 0) return last_matched_base_index;
2880 		else return search_start - 1;
2881 	}
2882 }
2883 
2884 // read_head_abs_offset is the first WANTED base in read.
2885 // If the first section in read is reversed, read_head_abs_offset is the LAST WANTED bases in this section. (the abs offset of the first base in the section is actually larger than read_head_abs_offset)
// Scores a candidate alignment described by cigar_string and, when
// config.show_soft_cliping is set, soft-clips the first and last sections.
//
// The CIGAR is parsed once.  'M'/'S' sections are compared with the reference
// through match_base_quality / match_base_quality_cs; 'I' advances the read
// only; 'D' advances the reference (unless the current section is reversed);
// 'N'/'n' move the reference forward and 'B'/'b' backward, the lower-case
// forms also flipping the strand of the following sections.
//
// Inputs:
//   read_text, qual_text   read bases and qualities; an empty qual_text means
//                          that no quality string is passed to the scorers.
//   read_len               expected total of the M/S/I lengths in the CIGAR.
//   cigar_string           in/out.  Replaced by "<read_len>M" when the CIGAR is
//                          rejected, or rewritten with leading/trailing 'S'
//                          pieces when bases were clipped.
//   read_head_abs_offset, is_read_head_reversed
//                          reference offset / orientation of the first section.
//   covered_start, covered_end
//                          read positions used as search centres for clipping.
// Outputs:
//   *mismatched_bases      total mismatches, or 99999 when the CIGAR is rejected
//                          (malformed, rebuilt length != read_len, or fewer
//                          non-clipped bases than config.min_mapped_fraction).
//   *non_clipped_length    read_len minus the clipped head and tail bases.
//   *total_indel_length    (if not NULL) increased by the insertion lengths plus
//                          one for every D/N/B operation.
//                          NOTE(review): the original statements incremented the
//                          pointer itself, so the caller never saw a value; the
//                          per-operation increments are kept as written -
//                          confirm that "+1 per D/N/B" is the intended unit.
//   *matched_bases         non-clipped length minus mismatches and insertions.
//   *chromosomal_length    reference span covered, plus the insertion length.
//   *full_section_clipped  (if not NULL) set to 1 when a first/last section is
//                          entirely, or all but fewer than 3 bases, clipped.
// Returns max(0, all_matched_bases * 60 / non_clipped_length), or 0 when no
// base is left after clipping.
int final_CIGAR_quality(global_context_t * global_context, thread_context_t * thread_context, char * read_text, char * qual_text, int read_len, char * cigar_string, unsigned long read_head_abs_offset, int is_read_head_reversed, int * mismatched_bases, int covered_start, int covered_end, char * read_name, int * non_clipped_length, int *total_indel_length, int * matched_bases, int * chromosomal_length, int * full_section_clipped)
{
	int cigar_cursor = 0;
	int read_cursor = 0;
	unsigned int current_perfect_section_abs = read_head_abs_offset;
	int rebuilt_read_len = 0, total_insertion_length = 0;
	int indel_length_sum = 0;	// written back through total_indel_length at the end
	float all_matched_bases = 0;
	gene_value_index_t * current_value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
	int current_reversed = is_read_head_reversed;
	int all_mismatched = 0;
	int is_First_M = 1, is_wrong_cigar = 0;
	int head_soft_clipped = -1, tail_soft_clipped = -1;
	unsigned int tmp_int = 0;

	//SUBREADprintf("Coverage : %d ~ %d\n", covered_start, covered_end);

	if(0){
		char posout1[100];
		int chro_max = get_offset_maximum_chro_pos(global_context,thread_context,read_head_abs_offset);
		absoffset_to_posstr(global_context, read_head_abs_offset, posout1);
		SUBREADprintf("READ %s : mapped to %s ; max_pos=%d\n", read_name,  posout1, chro_max);
	}

	while(1)
	{
		char nch = cigar_string[cigar_cursor++];
		if(!nch)break;
		if(isdigit((unsigned char)nch))
			tmp_int = tmp_int*10+(nch-'0');
		else{
			if(tmp_int == 0)is_wrong_cigar = 1;
			if(is_wrong_cigar) break;

			// Validate the section BEFORE it is used: the reversed-section buffer
			// below holds MAX_READ_LENGTH bytes and tmp_int bytes are copied into it.
			// This is the same condition the function used to test only after the
			// section had already been processed.
			if((nch == 'M' || nch == 'S' || nch == 'I') && (tmp_int > MAX_READ_LENGTH || read_cursor + tmp_int > MAX_READ_LENGTH)){
				SUBREADprintf("ERROR: Cigar section longer than read length: %d >= %d, '%s'\n", tmp_int , MAX_READ_LENGTH, cigar_string);
				is_wrong_cigar = 1;
				break;
			}

			if(nch == 'M' || nch == 'S')
			{
				char *qual_text_cur;
				if(qual_text[0])qual_text_cur = qual_text+read_cursor;
				else	qual_text_cur = NULL;

				float section_qual;

				int is_Last_M = (cigar_string[cigar_cursor]==0);
				int has_clipping_this_section_head = 0, has_clipping_this_section_tail = 0;
				char * reversed_first_section_text = NULL;

				if(0){
					int is_head_in_chro = get_offset_maximum_chro_pos(global_context,thread_context, current_perfect_section_abs );
					int is_end_in_chro = get_offset_maximum_chro_pos(global_context,thread_context, current_perfect_section_abs + tmp_int );
					char posout1[100];
					char posout2[100];
					int chro_max = get_offset_maximum_chro_pos(global_context,thread_context, current_perfect_section_abs );
					absoffset_to_posstr(global_context, current_perfect_section_abs, posout1);
					absoffset_to_posstr(global_context, current_perfect_section_abs + tmp_int, posout2);
					SUBREADprintf("  %dM SECTION : mapped to %s ~ %s ; max_pos=%d ; Hin=%d, Ein=%d\n", tmp_int,  posout1, posout2, chro_max, is_head_in_chro, is_end_in_chro);
					SUBREADprintf("  %dM SECTION : Hin=%d, Ein=%d\n", tmp_int, is_head_in_chro, is_end_in_chro);
				}

				// find "J" sections if it is the first M
				if(is_First_M && global_context -> config.show_soft_cliping)
				{
					int adj_coverage_start = covered_start - read_cursor;

					if(current_reversed)
					{
						reversed_first_section_text = malloc(MAX_READ_LENGTH);
						if(!reversed_first_section_text){
							SUBREADprintf("ERROR: out of memory when testing soft clipping.\n");
							is_wrong_cigar = 1;
							break;
						}
						memcpy(reversed_first_section_text, read_text, tmp_int);
						reverse_read(reversed_first_section_text, tmp_int,  global_context->config.space_type);

						head_soft_clipped = find_soft_clipping(global_context, thread_context, current_value_index, reversed_first_section_text, current_perfect_section_abs, tmp_int, 1, 0);
					}
					else
						head_soft_clipped = find_soft_clipping(global_context, thread_context, current_value_index, read_text, current_perfect_section_abs, tmp_int, 0, adj_coverage_start);
					//SUBREADprintf("SSHEAD:%d\n", head_soft_clipped);

					if(head_soft_clipped == tmp_int){
						// the whole section would be clipped: report it, clip nothing.
						if(full_section_clipped)(*full_section_clipped) = 1;
						head_soft_clipped = 0;
					}
					else has_clipping_this_section_head = 1;

					if(has_clipping_this_section_head){
						if( tmp_int - head_soft_clipped < 3 && head_soft_clipped > 1 && full_section_clipped ) (*full_section_clipped) = 1;
					}

					free(reversed_first_section_text);
					reversed_first_section_text = NULL;
				}
				if(is_Last_M && global_context -> config.show_soft_cliping)
				{
					int adj_coverage_end = covered_end - read_cursor;

					if(current_reversed)
					{
						reversed_first_section_text = malloc(MAX_READ_LENGTH);
						if(!reversed_first_section_text){
							SUBREADprintf("ERROR: out of memory when testing soft clipping.\n");
							is_wrong_cigar = 1;
							break;
						}
						// checked: boundary
						memcpy(reversed_first_section_text, read_text + read_cursor, tmp_int);
						reverse_read(reversed_first_section_text, tmp_int,  global_context->config.space_type);
						tail_soft_clipped = find_soft_clipping(global_context, thread_context, current_value_index, reversed_first_section_text, current_perfect_section_abs, tmp_int, 0, tmp_int);
					}
					else
						tail_soft_clipped = find_soft_clipping(global_context, thread_context, current_value_index, read_text + read_cursor, current_perfect_section_abs, tmp_int, 1, adj_coverage_end);

					// debugging hook for one read; disabled like the other hooks in this file.
					if(0 && FIXLENstrcmp("NS500643:556:HGTMTBGXB:4:13403:18179:8012", read_name)==0)
						SUBREADprintf("SSTAIL:%d\n", tail_soft_clipped);

					if(tail_soft_clipped == tmp_int){
						tail_soft_clipped = 0;
						if(full_section_clipped)(*full_section_clipped) = 1;
					} else has_clipping_this_section_tail = 1;

					if( has_clipping_this_section_tail ){
						if(tmp_int - tail_soft_clipped < 3 && tail_soft_clipped > 1 && full_section_clipped) (*full_section_clipped) = 1;
					}

					free(reversed_first_section_text);
					reversed_first_section_text = NULL;
				}

				// a single-section read whose two clips (nearly) meet is not clipped at all.
				if(is_Last_M && is_First_M && tail_soft_clipped+head_soft_clipped >= tmp_int-1)
				{
					head_soft_clipped=0;
					tail_soft_clipped=0;
				}

				int mismatch_calculation_start = has_clipping_this_section_head?head_soft_clipped:0;
				int mismatch_calculation_end = has_clipping_this_section_tail?tail_soft_clipped:0;

				if(global_context -> config.space_type == GENE_SPACE_COLOR)
					section_qual =  match_base_quality_cs(current_value_index, read_text+read_cursor, current_perfect_section_abs, qual_text_cur, tmp_int, global_context->config.phred_score_format , mismatched_bases, &all_mismatched, global_context -> config.high_quality_base_threshold, mismatch_calculation_start, mismatch_calculation_end);
				else
					section_qual =  match_base_quality(current_value_index, read_text+read_cursor, current_perfect_section_abs, qual_text_cur, tmp_int, current_reversed, global_context->config.phred_score_format , mismatched_bases, &all_mismatched, global_context -> config.high_quality_base_threshold, mismatch_calculation_start, mismatch_calculation_end);
				all_matched_bases += section_qual;
				rebuilt_read_len += tmp_int;
				is_First_M=0;

				read_cursor += tmp_int;

				//move to the NEXT UNWANTED ABS OFFSET.
				if(current_reversed)
					current_perfect_section_abs --;
				else
					current_perfect_section_abs += tmp_int;


			}
			else if(nch == 'I')
			{
				rebuilt_read_len += tmp_int;
				read_cursor += tmp_int;

				all_matched_bases += tmp_int;
				indel_length_sum += tmp_int;
				total_insertion_length += tmp_int;
			}
			else if(nch == 'D')
			{
				indel_length_sum ++;
				if(!current_reversed)
					current_perfect_section_abs += tmp_int;
			}
			else if(tolower((unsigned char)nch) == 'n')
			{
				indel_length_sum ++;
				current_perfect_section_abs += tmp_int;
				if(nch == 'n') current_reversed = !current_reversed;
			}
			else if(tolower((unsigned char)nch) == 'b')
			{
				indel_length_sum ++;
				current_perfect_section_abs -= tmp_int;
				if(nch == 'b') current_reversed = !current_reversed;
			}

			tmp_int = 0;
		}
	}

	int my_non_clipped_length = read_len;
	my_non_clipped_length -= max(0,tail_soft_clipped);
	my_non_clipped_length -= max(0,head_soft_clipped);

	//#warning " ========== COMMENT THIS LINE !! ========="
	//printf("QCR ALL MM=%d, RBLEN=%d, MAPPED_LEN=%d ; CIGAR=%s\n", all_mismatched, rebuilt_read_len , my_non_clipped_length, cigar_string);

	if(is_wrong_cigar || rebuilt_read_len != read_len || my_non_clipped_length < global_context->config.min_mapped_fraction){
		// rejected: flag it through the mismatch counter and fall back to a plain CIGAR.
		(*mismatched_bases)=99999;
		all_matched_bases = 0;
		sprintf(cigar_string, "%dM", read_len);
	}
	else if((head_soft_clipped>0 || tail_soft_clipped>0))
	{
		// second pass: rebuild the CIGAR with the clipped bases turned into 'S' pieces.
		char new_cigar_tmp[120];
		is_First_M=1;
		new_cigar_tmp[0]=0;
		cigar_cursor = 0;
		tmp_int = 0;	// do not carry digits left over from the first pass
		while(1)
		{
			char nch = cigar_string[cigar_cursor++];

			if(!nch)break;
			if(isdigit((unsigned char)nch))
				tmp_int = tmp_int*10+(nch-'0');
			else{
				char cigar_piece [30];
				cigar_piece[0]=0;

				if(nch == 'M')
				{
					char cigar_tiny [12];
					int is_Last_M = (cigar_string[cigar_cursor]==0);
					if(is_First_M && head_soft_clipped>0)
					{
						tmp_int -= head_soft_clipped;
						snprintf(cigar_tiny, sizeof(cigar_tiny), "%dS",head_soft_clipped);
						strcat(cigar_piece, cigar_tiny);
					}
					if(is_Last_M && tail_soft_clipped>0)
					{
						tmp_int -= tail_soft_clipped;
					}
					snprintf(cigar_tiny, sizeof(cigar_tiny), "%dM",tmp_int);
					strcat(cigar_piece, cigar_tiny);
					if(is_Last_M && tail_soft_clipped>0)
					{
						snprintf(cigar_tiny, sizeof(cigar_tiny), "%dS",tail_soft_clipped);
						strcat(cigar_piece, cigar_tiny);
					}
					is_First_M = 0;
				}
				else
				{
					snprintf(cigar_piece, sizeof(cigar_piece), "%u%c", tmp_int, nch);
				}

				strcat(new_cigar_tmp, cigar_piece);
				tmp_int = 0;
			}
		}

		// debugging hook for one read; disabled like the other hooks in this file.
		if(0 && FIXLENstrcmp("NS500643:556:HGTMTBGXB:4:13403:18179:8012", read_name)==0)
			SUBREADprintf("NEW_CIGAR_2 : %s\n", new_cigar_tmp);
		strcpy(cigar_string, new_cigar_tmp);
	}

	if((*mismatched_bases) != 99999)
		(*mismatched_bases) = all_mismatched;

	if(total_indel_length)
		(*total_indel_length) += indel_length_sum;

	(*non_clipped_length) = my_non_clipped_length;
	(*matched_bases) = my_non_clipped_length - all_mismatched - total_insertion_length;
	(*chromosomal_length) = current_perfect_section_abs - read_head_abs_offset + total_insertion_length;

	// nothing left after clipping: avoid dividing by zero and casting a non-finite float.
	if(my_non_clipped_length == 0)
		return 0;

	return max(0, (int)(all_matched_bases*60/my_non_clipped_length));
}
3144 
3145 // this function also adds final_counting_reads in chromosome_events.
finalise_explain_CIGAR(global_context_t * global_context,thread_context_t * thread_context,explain_context_t * explain_context,realignment_result_t * final_realignments)3146 unsigned int finalise_explain_CIGAR(global_context_t * global_context, thread_context_t * thread_context, explain_context_t * explain_context, realignment_result_t * final_realignments)
3147 {
3148 	int xk1, front_i, back_i;
3149 	char tmp_cigar[120];
3150 	chromosome_event_t * to_be_supported [20];
3151 	short flanking_size_left[20], flanking_size_right[20];
3152 	int to_be_supported_count = 0;
3153 	int is_junction_read = 0;
3154 	int total_perfect_matched_sections = 0;
3155 
3156 	mapping_result_t * result = _global_retrieve_alignment_ptr(global_context, explain_context->pair_number, explain_context->is_second_read, explain_context-> best_read_id);
3157 	result -> result_flags &= ~CORE_IS_FULLY_EXPLAINED;
3158 	result -> result_flags &= ~CORE_IS_PAIRED_END;
3159 
3160 	//SUBREADprintf("FINAL_CIGAR R1 %d[%d] = %p, FLAGS=%d\n", explain_context -> pair_number , explain_context-> best_read_id , result , result -> result_flags);
3161 	// reverse the back_search result for every equally best alignment
3162 	//
3163 
3164 	for(back_i = 0; back_i < explain_context -> all_back_alignments; back_i++){
3165 		if( explain_context -> result_back_junction_numbers[back_i] > MAX_EVENTS_IN_READ ){
3166 			SUBREADprintf("ERROR: Too many cigar sections: %d > %d\n", explain_context -> result_back_junction_numbers[back_i] , MAX_EVENTS_IN_READ);
3167 			return 0;
3168 		}
3169 		for(xk1=0; xk1<explain_context -> result_back_junction_numbers[back_i]/2; xk1++)
3170 		{
3171 			perfect_section_in_read_t tmp_exp;
3172 			// checked: boundary
3173 			memcpy(&tmp_exp, &explain_context -> result_back_junctions[back_i][xk1], sizeof(perfect_section_in_read_t));
3174 			memcpy(&explain_context -> result_back_junctions[back_i][xk1],  &explain_context -> result_back_junctions[back_i][explain_context -> result_back_junction_numbers[back_i] - xk1 - 1] , sizeof(perfect_section_in_read_t));
3175 			memcpy(&explain_context -> result_back_junctions[back_i][explain_context -> result_back_junction_numbers[back_i] - xk1 - 1] , &tmp_exp , sizeof(perfect_section_in_read_t));
3176 		}
3177 	}
3178 
3179 	// adding indel lengths in read lengths and relocate sections
3180 	// note that the last section in back results has the same strand of the main piece.
3181 
3182 	int is_cigar_overflow = 0, fusions_in_read = 0, final_alignment_number = 0;
3183 	for(back_i = 0; back_i < explain_context -> all_back_alignments; back_i++){
3184 		if(final_alignment_number >= MAX_ALIGNMENT_PER_ANCHOR)break;
3185 
3186 		int is_first_section_negative = (result ->result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
3187 		for(xk1=0; xk1<explain_context -> result_back_junction_numbers[back_i]; xk1++)
3188 		{
3189 			int section_length = explain_context -> result_back_junctions[back_i][xk1].read_pos_end - explain_context -> result_back_junctions[back_i][xk1].read_pos_start;
3190 			if(0 && FIXLENstrcmp("simulated.11420793", explain_context->read_name)==0)SUBREADprintf("FINAL_EXPLAIN %s BACK_%d SEC_%d OLD_START=%d SEC_LENG=%d\n", explain_context->read_name, back_i, xk1,  explain_context -> result_back_junctions[back_i][xk1].abs_offset_for_start, section_length);
3191 			unsigned int new_start_pos;
3192 
3193 			if(explain_context -> result_back_junctions[back_i][xk1].is_strand_jumped)
3194 				// the "strand_jumped" section do not need to move
3195 				// however, the "abs_offset_for_start" is actually for the last base in this section.
3196 				// this does not metter if we compare the reversed read to the chromosome.
3197 				// "abs_offset_for_start" is the first UNWANTED base (smaller than the first WANTED base)
3198 				new_start_pos = explain_context -> result_back_junctions[back_i][xk1].abs_offset_for_start +1;
3199 			else
3200 				// "abs_offset_for_start" is the first UNWANTED base. By subtracting the length, it becomes the first WANTED base.
3201 				new_start_pos = explain_context -> result_back_junctions[back_i][xk1].abs_offset_for_start - section_length;
3202 
3203 			explain_context -> result_back_junctions[back_i][xk1].abs_offset_for_start = new_start_pos;
3204 			if(explain_context -> result_back_junctions[back_i][xk1].event_after_section
3205 				&& explain_context -> result_back_junctions[back_i][xk1].event_after_section->is_strand_jumped) is_first_section_negative=!is_first_section_negative;
3206 		}
3207 
3208 		// build CIGAR
3209 		for(front_i = 0; front_i < explain_context -> all_front_alignments; front_i++){
3210 			if(final_alignment_number >= MAX_ALIGNMENT_PER_ANCHOR)break;
3211 
3212 			to_be_supported_count = 0;
3213 			tmp_cigar[0]=0;
3214 			int known_junction_supp = 0;
3215 
3216 			for(xk1 = 0; xk1 < explain_context -> result_back_junction_numbers[back_i] + explain_context -> result_front_junction_numbers[front_i] -1; xk1++)
3217 			{
3218 				char piece_cigar[25];
3219 				int read_pos_start, read_pos_end;
3220 				perfect_section_in_read_t * current_section, *next_section = NULL;
3221 
3222 				int is_front_search = 0;
3223 				if(xk1 >= explain_context -> result_back_junction_numbers[back_i] - 1) {
3224 					current_section = &explain_context -> result_front_junctions[front_i][xk1 - explain_context -> result_back_junction_numbers[back_i] +1];
3225 					if(xk1 - explain_context -> result_back_junction_numbers[back_i] +2 < explain_context -> result_front_junction_numbers[front_i])
3226 						next_section = &explain_context -> result_front_junctions[front_i][xk1 - explain_context -> result_back_junction_numbers[back_i] +2];
3227 					is_front_search = 1;
3228 				} else {
3229 					current_section = &explain_context -> result_back_junctions[back_i][xk1];
3230 					if(xk1+1 <  explain_context ->  result_back_junction_numbers[back_i])
3231 						next_section = &explain_context -> result_back_junctions[back_i][xk1+1];
3232 				}
3233 
3234 
3235 				if(xk1 == explain_context -> result_back_junction_numbers[back_i] - 1)
3236 				     read_pos_start = explain_context -> result_back_junctions[back_i][xk1].read_pos_start;
3237 				else read_pos_start = current_section -> read_pos_start;
3238 
3239 				read_pos_end = current_section -> read_pos_end;
3240 				chromosome_event_t *event_after = current_section -> event_after_section;
3241 
3242 				sprintf(piece_cigar, "%dM", (read_pos_end - read_pos_start));
3243 				total_perfect_matched_sections += (read_pos_end - read_pos_start);
3244 				flanking_size_left[xk1] = (read_pos_end - read_pos_start);
3245 
3246 				if(xk1<explain_context ->  result_back_junction_numbers[back_i] + explain_context ->  result_front_junction_numbers[front_i]  -2)
3247 					assert(event_after);
3248 
3249 				if(xk1>0)
3250 					flanking_size_right[xk1-1] = (read_pos_end - read_pos_start);
3251 
3252 				if(event_after)
3253 				{
3254 					if(event_after -> event_type == CHRO_EVENT_TYPE_INDEL)
3255 						sprintf(piece_cigar+strlen(piece_cigar), "%d%c", abs(event_after->indel_length), event_after->indel_length>0?'D':'I');
3256 					else if(event_after -> event_type == CHRO_EVENT_TYPE_JUNCTION||event_after -> event_type == CHRO_EVENT_TYPE_FUSION) {
3257 						// the distance in CIGAR is the NEXT UNWANTED BASE of piece#1 to the FIRST WANTED BASE in piece#2
3258 						int delta_one ;
3259 						if(current_section -> is_strand_jumped + current_section -> is_connected_to_large_side == 1) delta_one = 1;
3260 						else delta_one = -1;
3261 
3262 						// if it is from front_search, the event side points to the first WANTED base of the next section; it should be moved to the last WANTED base the next section if the next section is jumped.
3263 						if(next_section && (event_after -> is_strand_jumped + current_section -> is_strand_jumped==1))
3264 						{
3265 							if(is_front_search)
3266 							{
3267 								if(current_section -> is_connected_to_large_side)
3268 									delta_one += (next_section->read_pos_end - next_section-> read_pos_start - 1);
3269 								else
3270 									delta_one -= (next_section->read_pos_end - next_section-> read_pos_start - 1);
3271 							}
3272 							else
3273 							{
3274 								if(current_section -> is_connected_to_large_side)
3275 									delta_one += (next_section->read_pos_end - next_section-> read_pos_start - 1);
3276 								else
3277 									delta_one -= (next_section->read_pos_end - next_section-> read_pos_start - 1);
3278 							}
3279 						}
3280 
3281 						char jump_mode = current_section -> is_connected_to_large_side?'B':'N';
3282 						long long int movement = event_after -> event_large_side;
3283 						movement -= event_after -> event_small_side - delta_one;
3284 						if(1){
3285 							if(jump_mode == 'B' && movement < 0){
3286 								movement = - movement;
3287 								jump_mode = 'N';
3288 							}else if(jump_mode == 'N' && movement < 0){
3289 								movement = - movement;
3290 								jump_mode = 'B';
3291 							}
3292 						}
3293 
3294 						if(event_after -> is_strand_jumped) jump_mode = tolower(jump_mode);
3295 						fusions_in_read += (event_after -> event_type == CHRO_EVENT_TYPE_FUSION);
3296 						sprintf(piece_cigar+strlen(piece_cigar), "%u%c", (int)movement, jump_mode);
3297 
3298 						if(event_after -> indel_at_junction) sprintf(piece_cigar+strlen(piece_cigar), "%dI", event_after -> indel_at_junction);
3299 						is_junction_read ++;
3300 						if(event_after -> is_donor_found_or_annotation & 64 ) known_junction_supp ++;
3301 					}
3302 					to_be_supported[to_be_supported_count++] = event_after;
3303 				}
3304 				strcat(tmp_cigar, piece_cigar);
3305 				if(strlen(tmp_cigar) > CORE_MAX_CIGAR_STR_LEN - 14){
3306 					is_cigar_overflow=1;
3307 					break;
3308 				}
3309 			}
3310 
3311 			int mismatch_bases = 0;
3312 
3313 			//#warning ">>>>>>>>>>>>>>>> COMMENT NEXT LINE <<<<<<<<<<<<<<<<<<<<<<<"
3314 			//SUBREADprintf("ReadDebug:%s\t%s\n", explain_context -> read_name , tmp_cigar);
3315 			if(is_cigar_overflow) sprintf(tmp_cigar, "%dM",  explain_context -> full_read_len);
3316 
3317 			unsigned int final_position;
3318 
3319 //			#warning "'0 &&' is because there could be indels in the high-confident region but this indel is finally disused."
3320 			if( 0 && explain_context -> result_back_junction_numbers[back_i] + explain_context -> result_front_junction_numbers[front_i] <= 2) final_position = result -> selected_position;
3321 			else final_position = explain_context -> result_back_junctions[back_i][0].abs_offset_for_start;
3322 
3323 			if(0 && FIXLENstrcmp("simulated.11420793", explain_context->read_name)==0)SUBREADprintf("FFFINAL %s : POS=%u, ABS=%u\n", explain_context->read_name, final_position, explain_context -> result_back_junctions[back_i][0].abs_offset_for_start);
3324 
3325 			int is_exonic_read_fraction_OK = 1;
3326 
3327 			if( global_context -> config.minimum_exonic_subread_fraction > 0.0000001 && (!is_junction_read) && result -> used_subreads_in_vote>0)
3328 			{
3329 				int min_subreads = global_context -> config.minimum_exonic_subread_fraction * result-> used_subreads_in_vote;
3330 				if( result -> selected_votes < min_subreads )
3331 					is_exonic_read_fraction_OK = 0 ;
3332 			}
3333 
3334 
3335 
3336 			int final_qual = 0, applied_mismatch = 0, non_clipped_length = 0, total_indel_length = 0, final_MATCH = 0, chromosomal_length = 0, full_section_clipped = 0;
3337 
3338 			if(is_exonic_read_fraction_OK)
3339 			{
3340 				final_qual  = final_CIGAR_quality(global_context, thread_context, explain_context -> full_read_text, explain_context -> full_qual_text, explain_context -> full_read_len , tmp_cigar, final_position, is_first_section_negative != ((result->result_flags & CORE_IS_NEGATIVE_STRAND)?1:0), &mismatch_bases, result -> confident_coverage_start, result -> confident_coverage_end,  explain_context -> read_name, &non_clipped_length, &total_indel_length, & final_MATCH, & chromosomal_length, & full_section_clipped);
3341 				//#warning ">>>>>>> COMMENT THIS <<<<<<<"
3342 				//printf("OCT27-STEP2-%s:%d-POS%u-VOT%d-CIG-%s [ %d ]-INDELs=%llu; M/MM=%d,%d\n", explain_context -> read_name, explain_context  -> is_second_read + 1,  result -> selected_position, result -> selected_votes, tmp_cigar, is_cigar_overflow, ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_entry_table -> numOfElements, final_MATCH, mismatch_bases);
3343 
3344 
3345 				applied_mismatch = is_junction_read? global_context->config.max_mismatch_junction_reads:global_context->config.max_mismatch_exonic_reads ;
3346 				if(explain_context->full_read_len > EXON_LONG_READ_LENGTH)
3347 					applied_mismatch = ((((explain_context->full_read_len+1)<<16) / 100) * applied_mismatch)>>16;
3348 
3349 				if(global_context -> config.space_type == GENE_SPACE_COLOR) applied_mismatch += to_be_supported_count*2;
3350 			}
3351 
3352 
3353 			//#warning " ========== COMMENT THIS LINE !! ========="
3354 			if(0 && FIXLENstrcmp("HWI-ST945:119:D0J2JACXX:1:1303:17374:199067", explain_context -> read_name) ==0){
3355 				char outpos1[100];
3356 				absoffset_to_posstr(global_context, final_position, outpos1);
3357 				SUBREADprintf("FINALQUAL %s : FINAL_POS=%s ( %u )\tCIGAR=%s\tMM=%d / MAPLEN=%d > %d?\tVOTE=%d > %0.2f x %d ?  MASK=%d\tQUAL=%d\tBRNO=%d\nKNOWN_JUNCS=%d PENALTY=%d\n\n", explain_context -> read_name, outpos1 , final_position , tmp_cigar, mismatch_bases, non_clipped_length, applied_mismatch,  result -> selected_votes, global_context -> config.minimum_exonic_subread_fraction,result-> used_subreads_in_vote, result->result_flags, final_qual, explain_context -> best_read_id, known_junction_supp, explain_context -> best_indel_penalty);
3358 				//exit(0);
3359 			}
3360 
3361 
3362 			if(mismatch_bases <= applied_mismatch && is_exonic_read_fraction_OK && fusions_in_read < 2 ){// && (0 == full_section_clipped || 0 == global_context -> config.do_breakpoint_detection)) {
3363 				realignment_result_t * realign_res = final_realignments+final_alignment_number;
3364 				final_alignment_number ++;
3365 
3366 				realign_res -> realign_flags = result->result_flags;
3367 				realign_res -> first_base_is_jumpped = 0;
3368 				realign_res -> mapping_result = result;
3369 				realign_res -> chromosomal_length = chromosomal_length;
3370 				realign_res -> known_junction_supp = known_junction_supp;
3371 				realign_res -> final_penalty = explain_context -> best_indel_penalty;
3372 
3373 				if(mismatch_bases >  applied_mismatch ) realign_res -> realign_flags |= CORE_TOO_MANY_MISMATCHES;
3374 				else realign_res -> realign_flags &= ~CORE_TOO_MANY_MISMATCHES;
3375 
3376 				if(((result -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0) != is_first_section_negative)
3377 				{
3378 					assert((global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection));
3379 					realign_res -> first_base_is_jumpped = 1;
3380 				}
3381 				strcpy(realign_res -> cigar_string, tmp_cigar);
3382 
3383 				if(1)
3384 				{
3385 					int is_RNA_from_positive = -1;
3386 					unsigned long long read_id = 2llu * explain_context ->  pair_number + explain_context->is_second_read;
3387 
3388 					for(xk1= 0; xk1 < to_be_supported_count; xk1++)
3389 					{
3390 						if(xk1 >= MAX_EVENTS_IN_READ) break;
3391 
3392 						if(to_be_supported [xk1] -> event_type !=CHRO_EVENT_TYPE_INDEL && is_junction_read){
3393 							if(to_be_supported [xk1] -> event_type == CHRO_EVENT_TYPE_JUNCTION && to_be_supported [xk1] -> is_donor_found_or_annotation && is_RNA_from_positive == -1)
3394 								is_RNA_from_positive = !(to_be_supported [xk1] -> is_negative_strand);
3395 						}
3396 
3397 						//final counts are added in function "add_realignment_event_support" in core.c
3398 
3399 						realign_res -> supporting_chromosome_events[xk1] = to_be_supported[xk1];
3400 						realign_res -> flanking_size_left[xk1] = flanking_size_left[xk1];
3401 						realign_res -> flanking_size_right[xk1] = flanking_size_right[xk1];
3402 						realign_res -> crirical_support[xk1] += (read_id == to_be_supported [xk1] -> critical_read_id);
3403 					}
3404 					if(to_be_supported_count < MAX_EVENTS_IN_READ )
3405 						realign_res -> supporting_chromosome_events[to_be_supported_count] = NULL;
3406 
3407 					result -> result_flags |= CORE_IS_FULLY_EXPLAINED;
3408 					result -> read_length = explain_context->full_read_len;
3409 
3410 					if(is_RNA_from_positive == -1)
3411 					{
3412 						realign_res -> realign_flags |= CORE_NOTFOUND_DONORS ;
3413 						realign_res -> realign_flags &= ~(CORE_IS_GT_AG_DONORS);
3414 					}
3415 					else
3416 					{
3417 						realign_res -> realign_flags &= ~ (CORE_NOTFOUND_DONORS | CORE_IS_GT_AG_DONORS);
3418 
3419 						if(is_RNA_from_positive)
3420 							realign_res -> realign_flags |= CORE_IS_GT_AG_DONORS;
3421 					}
3422 				}
3423 
3424 				realign_res -> first_base_position = final_position;
3425 				realign_res -> final_quality = final_qual;
3426 				realign_res -> final_mismatched_bases = mismatch_bases;
3427 				realign_res -> final_matched_bases = (unsigned short)final_MATCH;
3428 				realign_res -> best_second_diff_bases = (9<explain_context -> best_second_match_diff)?-1:explain_context -> best_second_match_diff;
3429 
3430 			}
3431 		}
3432 	}
3433 
3434 	return final_alignment_number;
3435 }
3436 
3437 
3438 
3439 
// ceq: the first two characters of c equal the first two characters of t.
#define ceq(c,t) ((c)[0]==(t)[0] && (c)[1]==(t)[1])
// c2eq: {ch1, ch2} is the unordered pair {tg1, tg2}.
#define c2eq(ch1, ch2, tg1, tg2) ((ceq(ch1, tg1) && ceq(ch2, tg2)) || (ceq(ch1, tg2) && ceq(ch2, tg1)) )

// Classify a pair of dinucleotides (only the first two characters of ch1 and ch2 are read).
//
// Returns 2 when {ch1, ch2} is {GT, AG} or {CT, AC} and ch1 is the member expected for the
// orientation: AG/AC when is_reverse is non-zero, GT/CT otherwise.
// Returns 1 when {ch1, ch2} is one of {GC, AG}, {GC, CT}, {AT, AC}, {GT, AT} and ch1 is GC or AT,
// or is AG/AC (is_reverse non-zero) or GT/CT (is_reverse zero).
// Returns 0 in every other case.
int paired_chars_full_core(char * ch1, char * ch2, int is_reverse)
{
	const int is_major_pair = c2eq(ch1, ch2, "GT", "AG") || c2eq(ch1, ch2, "CT", "AC");

	if (is_major_pair)
	{
		// A major pair in the wrong order scores nothing; it never falls through to the minor pairs.
		const int first_in_place = is_reverse ? (ceq(ch1, "AG") || ceq(ch1, "AC"))
						      : (ceq(ch1, "CT") || ceq(ch1, "GT"));
		return first_in_place ? 2 : 0;
	}

	const int is_minor_pair = c2eq(ch1, ch2, "GC", "AG") || c2eq(ch1, ch2, "GC", "CT")
				|| c2eq(ch1, ch2, "AT", "AC") || c2eq(ch1, ch2, "GT", "AT");
	if (!is_minor_pair)
		return 0;

	// GC and AT are accepted as the first dinucleotide in both orientations.
	if (ceq(ch1, "GC") || ceq(ch1, "AT"))
		return 1;

	if (is_reverse)
		return (ceq(ch1, "AG") || ceq(ch1, "AC")) ? 1 : 0;
	return (ceq(ch1, "GT") || ceq(ch1, "CT")) ? 1 : 0;
}
3457 
// Test whether (ch1, ch2) is an accepted ordered dinucleotide pair; only the first two
// characters of each argument are read.
//
// With is_reverse == 0 the accepted pairs are (GT, AG) and (CT, AC);
// with is_reverse != 0 they are (AG, GT) and (AC, CT).
// Returns 1 for an accepted pair and 0 otherwise.
int paired_chars_part_core(char * ch1, char * ch2, int is_reverse)
{
	static const char * const forward_order[2][2]  = { {"GT", "AG"}, {"CT", "AC"} };
	static const char * const reversed_order[2][2] = { {"AG", "GT"}, {"AC", "CT"} };
	const char * const (*wanted)[2] = is_reverse ? reversed_order : forward_order;
	int pair_i;

	for (pair_i = 0; pair_i < 2; pair_i++)
	{
		const char * first = wanted[pair_i][0];
		const char * second = wanted[pair_i][1];

		if (ch1[0] == first[0] && ch1[1] == first[1] &&
		    ch2[0] == second[0] && ch2[1] == second[1])
			return 1;
	}
	return 0;
}
3469 
// Dinucleotide membership tests on the first two characters of cc.
// Note that cc is evaluated several times, so it must not have side effects.

// Partial set: GT, AG, AC, CT.
#define is_donor_chars_part(cc) ( \
			    ((cc)[0]=='G' && (cc)[1]=='T') || \
			    ((cc)[0]=='A' && ((cc)[1]=='G' || (cc)[1]=='C')) || \
			    ((cc)[0]=='C' && (cc)[1]=='T') )

// Full set: the partial set plus GC and AT.
#define is_donor_chars_full(cc) ( is_donor_chars_part(cc) || \
			    ((cc)[0]=='G' && (cc)[1]=='C') || \
			    ((cc)[0]=='A' && (cc)[1]=='T') )

// Selection of the variants used by the rest of the file.
// To switch the donor/receptor requirement off for an experiment, define
// is_donor_chars(x) and paired_chars(x,y,z) as 1 instead.
#define is_donor_chars is_donor_chars_part
#define  paired_chars paired_chars_part_core
3490 
3491 
3492 
3493 
// Debugging aid: print the big-margin record of one read through SUBREADprintf.
//
// The record is fetched with _global_retrieve_big_margin_ptr() and printed as
// config.big_margin_record_size/3 triples of unsigned shorts in the form "a b~c".
//   pair_number    : number of the read (pair); printed truncated to unsigned int.
//   is_second_read : selects which read of the pair is looked up.
void print_big_margin(global_context_t * global_context, subread_read_number_t pair_number, int is_second_read){
	unsigned short * big_margin_record = _global_retrieve_big_margin_ptr(global_context,pair_number, is_second_read);
	int triple_i;

	// "%p" requires a pointer to void, hence the explicit cast.
	SUBREADprintf("\n  >>> READ_NO=%u,  SECOND=%d, MEM=%p <<< \n", (unsigned int)pair_number, is_second_read, (void *)big_margin_record);
	for(triple_i = 0; triple_i < global_context->config.big_margin_record_size/3 ; triple_i++)
	{
		const unsigned short * triple = big_margin_record + triple_i*3;
		SUBREADprintf("%d %d~%d   ", triple[0] , triple[1] , triple[2]);
	}
	SUBREADputs("");
}
3505 
3506 #define ABGIGUOUS_TOLERANCE 3
3507 
is_ambiguous_voting(global_context_t * global_context,subread_read_number_t pair_number,int is_second_read,int selected_vote,int max_start,int max_end,int read_len,int is_negative)3508 int is_ambiguous_voting(global_context_t * global_context, subread_read_number_t pair_number, int is_second_read, int selected_vote, int max_start,int max_end, int read_len, int is_negative)
3509 {
3510 //	#warning "=========== THE NEXT LINE IS ONLY FOR COMPARING WITH STAR!! ============== "
3511 //	return 0;
3512 	if( global_context->config.big_margin_record_size<3) return 0;
3513 	int xk1;
3514 	int encounter = 0;
3515 
3516 	if(is_negative)
3517 	{
3518 		int tmp = max_start;
3519 		max_start = read_len - max_end;
3520 		max_end = read_len - tmp;
3521 	}
3522 
3523 	unsigned short * big_margin_record = _global_retrieve_big_margin_ptr(global_context,pair_number, is_second_read);
3524 
3525 	for(xk1 = 0; xk1 < global_context->config.big_margin_record_size/3 ; xk1++)
3526 	{
3527 		if(!big_margin_record[xk1*3])break;
3528 
3529 		if(big_margin_record[xk1*3] >= selected_vote - 1)	// actually, max-1
3530 		{
3531 			if(0) {
3532 				if ( max_start >= big_margin_record[xk1*3+1] - ABGIGUOUS_TOLERANCE && max_end <= big_margin_record[xk1*3+2] + ABGIGUOUS_TOLERANCE )
3533 					encounter++;
3534 				else if ( big_margin_record[xk1*3+1] >= max_start - ABGIGUOUS_TOLERANCE && big_margin_record[xk1*3+2] <= max_end + ABGIGUOUS_TOLERANCE )
3535 					encounter++;
3536 
3537 			} else {
3538 			// 4 and 4 are the best setting for indel and fusion simulation.
3539 				if(selected_vote >= big_margin_record[xk1*3]) {
3540 					if(big_margin_record[xk1*3+1] >= max_start - 4 && big_margin_record[xk1*3+2] <= max_end + 4)
3541 						encounter++;
3542 				} else {
3543 					if(big_margin_record[xk1*3+1] <= max_start + 4 && big_margin_record[xk1*3+2] >= max_end - 4)
3544 						encounter++;
3545 				}
3546 			}
3547 		}
3548 
3549 	}
3550 
3551 	if(encounter>1) return encounter;
3552 	return 0;
3553 }
3554 
3555 #define JUNCTION_CONFIRM_WINDOW 17
3556 // This function implements the same function of donor_score, except that the two halves are from different strands.
3557 // Both halves are forced to positive strand and the split point is found.
3558 // Note that the donor/receptor sides are still expected for distinguishing between Fusion Breaks and Fusion Junctions.
3559 
3560 // Note that the read_text is on reversed mode. The guess points are on reversed mode too.
3561 // "Left" and "Right" means the left/right half in the "reversed" read.
donor_jumped_score(global_context_t * global_context,thread_context_t * thread_context,unsigned int small_virtualHead_abs_offset,unsigned int large_virtualHead_abs_offset,int guess_start,int guess_end,char * read_text,int read_len,int is_small_half_negative,int is_large_half_negative,int small_half_on_left_reversed,int * final_split_point,int * is_GT_AG_strand,int * is_donor_found_or_annotation,int * small_side_increasing_coordinate,int * large_side_increasing_coordinate)3562 int donor_jumped_score(global_context_t * global_context, thread_context_t * thread_context, unsigned int small_virtualHead_abs_offset, unsigned int large_virtualHead_abs_offset, int guess_start, int guess_end,  char * read_text, int read_len, int is_small_half_negative, int is_large_half_negative, int small_half_on_left_reversed, int * final_split_point, int * is_GT_AG_strand, int * is_donor_found_or_annotation, int * small_side_increasing_coordinate, int * large_side_increasing_coordinate)
3563 {
3564 	gene_value_index_t * value_index = thread_context?thread_context->current_value_index:global_context->current_value_index ;
3565 	// guess_end is the index of the first UNWANTED BASE.
3566 	int most_likely_point_as_reversed = (guess_start+guess_end)/2;
3567 
3568 	int selected_real_split_point = -1, selected_junction_strand = -1;
3569 	//char donor_left[2], donor_right[2];
3570 
3571 	int best_score = -111111;
3572 
3573 	int real_split_point_i;
3574 	int real_split_point_numbers = guess_end - guess_start;
3575 
3576 	char positive_read[MAX_READ_LENGTH+1];
3577 	strcpy(positive_read, read_text) ;
3578 	reverse_read(positive_read, read_len, global_context->config.space_type);
3579 
3580 	//printf("TEST_JUMPED: %u - %u\n", small_virtualHead_abs_offset, large_virtualHead_abs_offset);
3581 
3582 
3583 	(*small_side_increasing_coordinate) = (small_half_on_left_reversed != is_small_half_negative);
3584 	(*large_side_increasing_coordinate) = (small_half_on_left_reversed == is_large_half_negative);
3585 
3586 
3587 	for(real_split_point_i = 0 ; real_split_point_i < real_split_point_numbers; real_split_point_i++)
3588 	{
3589 		int left_should_match, right_should_match;
3590 		int left_should_not_match, right_should_not_match;
3591 		int real_split_point_as_reversed = (real_split_point_i % 2)?-((real_split_point_i+1)/2):((1+real_split_point_i)/2);
3592 		real_split_point_as_reversed += most_likely_point_as_reversed;
3593 
3594 		if(real_split_point_as_reversed > read_len-JUNCTION_CONFIRM_WINDOW)continue;
3595 		if(real_split_point_as_reversed < JUNCTION_CONFIRM_WINDOW)continue;
3596 
3597 		int is_donor_test_ok=0;
3598 
3599 		if(small_half_on_left_reversed)
3600 		{
3601 			unsigned int small_pos_test_begin = small_virtualHead_abs_offset + (is_small_half_negative?real_split_point_as_reversed - JUNCTION_CONFIRM_WINDOW:(read_len - real_split_point_as_reversed));
3602 			char * small_pos_read_begin = (is_small_half_negative?read_text:positive_read) + (is_small_half_negative?
3603 						(real_split_point_as_reversed - JUNCTION_CONFIRM_WINDOW)           :
3604 						(read_len - real_split_point_as_reversed)
3605   						);
3606 
3607 			unsigned int large_pos_test_begin = large_virtualHead_abs_offset + (is_large_half_negative?real_split_point_as_reversed:(read_len - real_split_point_as_reversed - JUNCTION_CONFIRM_WINDOW));
3608 			char * large_pos_read_begin = (is_large_half_negative?read_text:positive_read) + (is_large_half_negative?
3609 						(real_split_point_as_reversed)     :
3610 						(read_len - real_split_point_as_reversed - JUNCTION_CONFIRM_WINDOW));
3611 
3612 			left_should_match = match_chro(small_pos_read_begin , value_index , small_pos_test_begin , JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
3613 			right_should_match = match_chro(large_pos_read_begin , value_index , large_pos_test_begin , JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
3614 			left_should_not_match = right_should_not_match = 0;
3615 		//match_chro(read_text + real_split_point - JUNCTION_CONFIRM_WINDOW, value_index, small_virtualHead_abs_offset + real_split_point - JUNCTION_CONFIRM_WINDOW , JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
3616 
3617 		}
3618 		else
3619 		{
3620 			unsigned int small_pos_test_begin = small_virtualHead_abs_offset + (is_small_half_negative?real_split_point_as_reversed:(read_len - real_split_point_as_reversed - JUNCTION_CONFIRM_WINDOW));
3621 			char * small_pos_read_begin = (is_small_half_negative?read_text:positive_read) + (is_small_half_negative?
3622 							(real_split_point_as_reversed):(read_len - real_split_point_as_reversed - JUNCTION_CONFIRM_WINDOW));
3623 
3624 			unsigned int large_pos_test_begin = large_virtualHead_abs_offset + (is_large_half_negative?(real_split_point_as_reversed - JUNCTION_CONFIRM_WINDOW):(read_len - real_split_point_as_reversed));
3625 			char * large_pos_read_begin = (is_large_half_negative?read_text:positive_read) + (is_large_half_negative?
3626 							  (real_split_point_as_reversed - JUNCTION_CONFIRM_WINDOW):(read_len - real_split_point_as_reversed));
3627 
3628 			left_should_match = match_chro(small_pos_read_begin , value_index , small_pos_test_begin , JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
3629 			right_should_match = match_chro(large_pos_read_begin , value_index , large_pos_test_begin , JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
3630 			left_should_not_match = right_should_not_match = 0;
3631 
3632 		}
3633 
3634 		//#warning "============ REMOVE THE TWO '+ 1' FROM THE NEXT LINE ================="
3635 		//#warning "============ ADD THE TWO '+ 1's IN THE BLANKETS FOR SVs GRANT APP ================="
3636 		int mismatch_in_between_allowd = (global_context -> config.more_accurate_fusions)?(0):(1);
3637 		if(left_should_match + right_should_match  >= JUNCTION_CONFIRM_WINDOW*2 - mismatch_in_between_allowd  &&
3638 			left_should_not_match <= JUNCTION_CONFIRM_WINDOW -3 && right_should_not_match <= JUNCTION_CONFIRM_WINDOW -3)
3639 		{
3640 			int test_score = is_donor_test_ok*500+left_should_match + right_should_match - left_should_not_match - right_should_not_match;
3641 			if(test_score > best_score)
3642 			{
3643 				selected_real_split_point = real_split_point_as_reversed;
3644 				best_score = test_score;
3645 			}
3646 		}
3647 	}
3648 
3649 	if(best_score>0)
3650 	{
3651 		//printf("TEST_JUMPED: BSCORE=%d  SPLT=%d\n", best_score , selected_real_split_point);
3652 		*final_split_point = selected_real_split_point;
3653 		*is_donor_found_or_annotation = best_score>=500;
3654 		*is_GT_AG_strand = selected_junction_strand;
3655 		return best_score;
3656 	}
3657 	return 0;
3658 }
3659 
3660 
// Find the split point between the two pieces of a junction read that lie on the same
// strand, and score it.
//
// Candidate split points are visited outwards from the middle of [guess_start, guess_end)
// (guess_end is the index of the first UNWANTED base): 0, -1, +1, -2, +2, ...
// "split point" is the first base NOT IN piece 1; it is also the first base IN piece 2.
// Candidates closer than JUNCTION_CONFIRM_WINDOW to either end of the read are skipped.
//
// For every candidate:
//   * if config.prefer_donor_receptor_junctions is set, the two chromosome bases next to each
//     side of the split are fetched and tested with is_donor_chars()/paired_chars();
//   * if the donor test is mandatory (breakpoint detection with check_donor_at_junctions,
//     and neither fusion nor long-deletion detection) and it failed, the candidate is dropped;
//   * a JUNCTION_CONFIRM_WINDOW-base window on each side of the split is compared with the
//     chromosome where it should match and where it should not match. When normally_arranged
//     is set, up to config.max_insertion_at_junctions inserted bases are tried at the split.
//
// Outputs:
//   *small_side_increasing_coordinate = !normally_arranged and
//   *large_side_increasing_coordinate =  normally_arranged are always written.
//   *final_split_point, *is_GT_AG_strand, *is_donor_found_or_annotation and
//   *final_inserted_bases are written only when a positive score is returned.
//   *is_donor_found_or_annotation is set when the raw score is at least 290000; the donor
//   test contributes 300000 to it (presumably match counts alone never reach that value --
//   this depends on match_chro, which is not visible here).
//   *is_GT_AG_strand is only meaningful when the donor bases were fetched; otherwise it is
//   derived from zeroed buffers and is 0.
//
// read_name is not used by the function.
// Returns (1 + raw score) / 100 of the best candidate, or 0 when no candidate qualifies or
// when an insertion was selected although a candidate without insertion matched nearly as well.
int donor_score(global_context_t * global_context, thread_context_t * thread_context, unsigned int left_virtualHead_abs_offset, unsigned int right_virtualHead_abs_offset, int left_indel_offset, int right_indel_offset, int normally_arranged, int guess_start, int guess_end,  char * read_text, int read_len, int * final_split_point, int * is_GT_AG_strand, int * is_donor_found_or_annotation, int * final_inserted_bases, int * small_side_increasing_coordinate, int * large_side_increasing_coordinate, char * read_name)
{
	gene_value_index_t * value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
	int need_donor_test = global_context->config.do_breakpoint_detection && global_context -> config.check_donor_at_junctions && (!(  global_context-> config.do_fusion_detection ||   global_context-> config.do_long_del_detection));

	(void) read_name;

	(*small_side_increasing_coordinate)=!normally_arranged;
	(*large_side_increasing_coordinate)= normally_arranged;

	// guess_end is the index of the first UNWANTED BASE.
	int most_likely_point = (guess_start+guess_end)/2;

	int selected_real_split_point = -1, selected_junction_strand = -1 , selected_inserted_bases = 0;

	// Zero-initialised: the buffers are inspected when a split point is selected, which can
	// happen without them having been filled (donor preference switched off, or the left
	// dinucleotide already failed so the right one was never fetched).
	char donor_left[3] = {0}, donor_right[3] = {0};

	int best_score = -111111;
	int non_insertion_preferred = 0;

	int real_split_point_i;
	int real_split_point_numbers = guess_end - guess_start;

	for(real_split_point_i = 0 ; real_split_point_i < real_split_point_numbers; real_split_point_i++)
	{
		int left_should_match, right_should_match = 0;
		int left_should_not_match = 0, right_should_not_match = 0;
		int real_split_point = (real_split_point_i % 2)?-((real_split_point_i+1)/2):((1+real_split_point_i)/2);
		real_split_point += most_likely_point;
		int is_donor_test_ok = 0;

		if(real_split_point > read_len-JUNCTION_CONFIRM_WINDOW)continue;
		if(real_split_point < JUNCTION_CONFIRM_WINDOW)continue;

		if(global_context->config.prefer_donor_receptor_junctions)
		{
			if(normally_arranged)
			{
				gvindex_get_string (donor_left, value_index, left_virtualHead_abs_offset + real_split_point + left_indel_offset, 2, 0);
				if(is_donor_chars(donor_left))
				{
					gvindex_get_string (donor_right, value_index, right_virtualHead_abs_offset + real_split_point + right_indel_offset - 2, 2, 0);
					if(is_donor_chars(donor_right))
					{
						is_donor_test_ok = paired_chars(donor_left, donor_right,0);
					}
				}
			}
			else
			{
				// NOTE(review): the right head is combined with left_indel_offset and the
				// left head with right_indel_offset here -- confirm this is intended.
				gvindex_get_string (donor_left, value_index, right_virtualHead_abs_offset + real_split_point + left_indel_offset, 2, 0);
				gvindex_get_string (donor_right, value_index, left_virtualHead_abs_offset + real_split_point + right_indel_offset - 2, 2, 0);
				is_donor_test_ok = is_donor_chars(donor_left) && is_donor_chars(donor_right) && paired_chars(donor_left, donor_right,0);
			}
		}

		// a failed donor test only disqualifies the candidate when the test is mandatory
		if(need_donor_test && !is_donor_test_ok) continue;

		// number of mismatched bases tolerated over the two confirmation windows
		int mismatch_in_between_allowd = (global_context -> config.more_accurate_fusions)?(0) : (1);

		if(normally_arranged)
		{
			int inserted_bases=0;

			left_should_match = match_chro(read_text + real_split_point - JUNCTION_CONFIRM_WINDOW, value_index, left_virtualHead_abs_offset + real_split_point - JUNCTION_CONFIRM_WINDOW + left_indel_offset , JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);

			// the left window must be nearly perfect before the right side is tried at all
			if(left_should_match > JUNCTION_CONFIRM_WINDOW- (global_context->config.max_insertion_at_junctions?5:2))
			{
				for(inserted_bases = 0; inserted_bases <= global_context->config.max_insertion_at_junctions; inserted_bases++)
				{
					right_should_match = match_chro(read_text + real_split_point + inserted_bases, value_index, right_virtualHead_abs_offset + real_split_point + right_indel_offset + inserted_bases, JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);

					if(right_should_match >= 2*JUNCTION_CONFIRM_WINDOW - left_should_match - mismatch_in_between_allowd)
					{
						// each window is also compared with the chromosome of the other
						// piece; too good a match there makes the split ambiguous
						left_should_not_match = match_chro(read_text + real_split_point + inserted_bases, value_index, left_virtualHead_abs_offset + real_split_point + left_indel_offset, JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
						right_should_not_match = match_chro(read_text + real_split_point - JUNCTION_CONFIRM_WINDOW, value_index, right_virtualHead_abs_offset  + real_split_point + right_indel_offset - JUNCTION_CONFIRM_WINDOW + inserted_bases, JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);

						if(left_should_not_match <= JUNCTION_CONFIRM_WINDOW -5 && right_should_not_match <= JUNCTION_CONFIRM_WINDOW -5)
						{
							int test_score ;
							if(global_context->config.max_insertion_at_junctions)
								test_score = 100*(is_donor_test_ok*3000+left_should_match + right_should_match) - (left_should_not_match + right_should_not_match) - 20*inserted_bases;
							else
								test_score = 100*(is_donor_test_ok*3000+left_should_match + right_should_match - left_should_not_match - right_should_not_match);

							if(test_score > best_score)
							{
								selected_junction_strand = (donor_left[0]=='G' || donor_right[1]=='G');
								selected_inserted_bases = inserted_bases;
								selected_real_split_point = real_split_point;
								best_score = test_score;
							}
						}
					}

					// remember that some candidate matched almost as well without any insertion
					if(global_context->config.max_insertion_at_junctions && 0 == inserted_bases && right_should_match >= 2*JUNCTION_CONFIRM_WINDOW - left_should_match - 5)
						non_insertion_preferred = 1;
				}
			}
		}
		else
		{
			right_should_match = match_chro(read_text + real_split_point - JUNCTION_CONFIRM_WINDOW, value_index, right_virtualHead_abs_offset + right_indel_offset + real_split_point - JUNCTION_CONFIRM_WINDOW , JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
			left_should_match = match_chro(read_text + real_split_point, value_index, left_virtualHead_abs_offset + real_split_point + left_indel_offset, JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);

			right_should_not_match = match_chro(read_text + real_split_point, value_index, right_virtualHead_abs_offset + real_split_point + right_indel_offset, JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);
			left_should_not_match = match_chro(read_text + real_split_point - JUNCTION_CONFIRM_WINDOW, value_index, left_virtualHead_abs_offset + left_indel_offset + real_split_point - JUNCTION_CONFIRM_WINDOW, JUNCTION_CONFIRM_WINDOW , 0, global_context -> config.space_type);

			if(left_should_match +right_should_match >= 2*JUNCTION_CONFIRM_WINDOW - mismatch_in_between_allowd &&
				left_should_not_match <= JUNCTION_CONFIRM_WINDOW -5 && right_should_not_match <= JUNCTION_CONFIRM_WINDOW -5)
			{
				int test_score = 100*(is_donor_test_ok*3000+left_should_match + right_should_match - left_should_not_match - right_should_not_match);
				if(test_score > best_score)
				{
					selected_junction_strand = (donor_left[0]=='G' || donor_right[1]=='G');
					selected_real_split_point = real_split_point;
					best_score = test_score;
				}
			}
		}
	}

	if(best_score>0 && (0==non_insertion_preferred || 0==selected_inserted_bases))
	{
		*final_split_point = selected_real_split_point;
		*is_donor_found_or_annotation = best_score>=290000;
		*is_GT_AG_strand = selected_junction_strand;
		*final_inserted_bases = selected_inserted_bases;
		return (1+best_score)/100;
	}
	return 0;
}
3821 
find_new_junctions(global_context_t * global_context,thread_context_t * thread_context,subread_read_number_t pair_number,char * read_name,char * read_text,char * qual_text,int read_len,int is_second_read,int best_read_id)3822 void find_new_junctions(global_context_t * global_context, thread_context_t * thread_context, subread_read_number_t pair_number, char * read_name, char * read_text, char * qual_text, int read_len, int is_second_read, int best_read_id)
3823 {
3824 	mapping_result_t * result =_global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, best_read_id);
3825 	subjunc_result_t * subjunc_result =_global_retrieve_subjunc_ptr(global_context, pair_number, is_second_read, best_read_id);
3826 
3827 	if(read_len > EXON_LONG_READ_LENGTH)
3828 	{
3829 		assert(result -> selected_position <= 0xffff0000);
3830 		core_search_short_exons(global_context, thread_context,  read_text, qual_text, read_len, result -> selected_position, (subjunc_result -> minor_votes < 1)? result -> selected_position:subjunc_result -> minor_position, result -> confident_coverage_start, result -> confident_coverage_end);
3831 	}
3832 
3833 	int selected_real_split_point = subjunc_result->split_point;
3834 
3835 	//#warning " =============== remove "+ 2" FROM THE NEXT LINE (FOR A HIGHER ACCURACY FROM SubFusion on 19 JAN 2015)  =================="
3836 	if((global_context ->  config.do_fusion_detection || global_context ->  config.do_long_del_detection) && subjunc_result -> minor_votes < 1)return;
3837 	if((!(global_context ->  config.do_fusion_detection || global_context ->  config.do_long_del_detection)) && subjunc_result -> minor_votes < 1)return;
3838 
3839 	//if(result -> selected_votes < global_context->config.minimum_subread_for_first_read)return;
3840 
3841 	if(global_context->config.do_big_margin_filtering_for_junctions)
3842 		if(is_ambiguous_voting(global_context, pair_number, is_second_read, result->selected_votes, result -> confident_coverage_start, result -> confident_coverage_end, read_len, (result->result_flags & CORE_IS_NEGATIVE_STRAND)?1:0))return;
3843 
3844 	unsigned int left_virtualHead_abs_offset = min(result -> selected_position, subjunc_result -> minor_position);
3845 	unsigned int right_virtualHead_abs_offset = max(result -> selected_position, subjunc_result -> minor_position);
3846 
3847 	int is_GT_AG_donors = result->result_flags & 0x3;
3848 	int is_donor_found_or_annotation = is_GT_AG_donors<3;
3849 	int is_strand_jumped = (result->result_flags & CORE_IS_STRAND_JUMPED)?1:0;
3850 
3851 	if(selected_real_split_point>0)
3852 	{
3853 		unsigned int left_edge_wanted, right_edge_wanted;
3854 
3855 		if(is_strand_jumped)
3856 		{
3857 			if(0){
3858 
3859 				// note that splicing point and the coverage coordinates are "major negative" view.
3860 				// recover the "negative view" splicing point location
3861 				int S = (result->result_flags & CORE_IS_NEGATIVE_STRAND) ? selected_real_split_point : (read_len - selected_real_split_point);
3862 				int Sbar = read_len - S;
3863 
3864 				int is_abnormal_as_reversed = (subjunc_result->minor_coverage_start > result->confident_coverage_start) + (subjunc_result -> minor_position >  result -> selected_position) == 1;
3865 				if(!(result->result_flags & CORE_IS_NEGATIVE_STRAND)) is_abnormal_as_reversed = !is_abnormal_as_reversed;
3866 				int is_small_half_negative = ((result->result_flags & CORE_IS_NEGATIVE_STRAND)?1:0) + (subjunc_result->minor_position < result->selected_position) ==1;
3867 
3868 				if(is_abnormal_as_reversed && is_small_half_negative)
3869 				{
3870 					left_edge_wanted = left_virtualHead_abs_offset + S;
3871 					right_edge_wanted = right_virtualHead_abs_offset + Sbar;
3872 				}
3873 				else if(is_abnormal_as_reversed && !is_small_half_negative)
3874 				{
3875 					left_edge_wanted = left_virtualHead_abs_offset + Sbar - 1;
3876 					right_edge_wanted = right_virtualHead_abs_offset + S - 1;
3877 				}
3878 				else if(!is_abnormal_as_reversed && is_small_half_negative)
3879 				{
3880 					left_edge_wanted = left_virtualHead_abs_offset + S - 1;
3881 					right_edge_wanted = right_virtualHead_abs_offset + Sbar - 1;
3882 				}
3883 				else // if(!is_abnormal_as_reversed && !is_small_half_negative)
3884 				{
3885 					left_edge_wanted = left_virtualHead_abs_offset + Sbar;
3886 					right_edge_wanted = right_virtualHead_abs_offset + S;
3887 				}
3888 
3889 				if(left_edge_wanted >= right_edge_wanted){
3890 					SUBREADprintf("REVERSED NEW JUNC: %u ~ %u : ABN_REV=%d , SMALL_NEG=%d, LEFT_VH=%u, RIGHT_VH=%u, S/~S=%d/%d\n", left_edge_wanted, right_edge_wanted, is_abnormal_as_reversed, is_small_half_negative, left_virtualHead_abs_offset, right_virtualHead_abs_offset, S, Sbar);
3891 				}
3892 
3893 			}else{
3894 				unsigned int major_half_smallest_coordinate, minor_half_smallest_coordinate;
3895 				major_half_smallest_coordinate = result -> selected_position + selected_real_split_point;
3896 				minor_half_smallest_coordinate = subjunc_result->minor_position + read_len - selected_real_split_point;
3897 				left_edge_wanted = min(major_half_smallest_coordinate, minor_half_smallest_coordinate);
3898 				right_edge_wanted = max(major_half_smallest_coordinate, minor_half_smallest_coordinate);
3899 				int is_abnormal_as_reversed = (subjunc_result->minor_coverage_start > result->confident_coverage_start) + (minor_half_smallest_coordinate > major_half_smallest_coordinate) == 1;
3900 				int is_small_half_negative = ((result->result_flags & CORE_IS_NEGATIVE_STRAND)?1:0) + (minor_half_smallest_coordinate < major_half_smallest_coordinate) ==1;
3901 				if(!(result->result_flags & CORE_IS_NEGATIVE_STRAND)) is_abnormal_as_reversed = !is_abnormal_as_reversed;
3902 				if(is_small_half_negative != is_abnormal_as_reversed)
3903 				{
3904 					left_edge_wanted -=1;
3905 					right_edge_wanted -=1;
3906 				}
3907 			}
3908 		}
3909 		else
3910 		{
3911 			int selected_real_split_point_for_left = selected_real_split_point;
3912 			int selected_real_split_point_for_right = selected_real_split_point;
3913 			if((subjunc_result->minor_coverage_start > result->confident_coverage_start) + (subjunc_result -> minor_position >  result -> selected_position) == 1) //abnormally arranged halves
3914 				selected_real_split_point_for_right --;
3915 			else	// normally arranged halves
3916 				selected_real_split_point_for_left --;
3917 
3918 
3919 
3920 			int minor_indel_offset = (subjunc_result->double_indel_offset & 0xf);
3921 			int major_indel_offset = (subjunc_result->double_indel_offset >> 4) & 0xf;
3922 			if(major_indel_offset>=8)major_indel_offset=-(16-major_indel_offset);
3923 			//assert(minor_indel_offset==0);
3924 			//assert(major_indel_offset==0);
3925 
3926 			left_edge_wanted = left_virtualHead_abs_offset + selected_real_split_point_for_left + ((result -> selected_position > subjunc_result -> minor_position)?minor_indel_offset: major_indel_offset);
3927 			right_edge_wanted = right_virtualHead_abs_offset + selected_real_split_point_for_right;
3928 		}
3929 
3930 		char * chro_name_left, *chro_name_right;
3931 		int chro_pos_left,chro_pos_right;
3932 
3933 		locate_gene_position( left_edge_wanted , &global_context -> chromosome_table, &chro_name_left, &chro_pos_left);
3934 		locate_gene_position( right_edge_wanted , &global_context -> chromosome_table, &chro_name_right, &chro_pos_right);
3935 		if((!( global_context-> config.do_fusion_detection ||  global_context-> config.do_long_del_detection) ) && chro_name_right!=chro_name_left) return;
3936 
3937 		//insert event
3938 		HashTable * event_table = NULL;
3939 		chromosome_event_t * event_space = NULL;
3940 		if(thread_context)
3941 		{
3942 			event_table = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_entry_table;
3943 			event_space = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
3944 		}
3945 		else
3946 		{
3947 			event_table = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_entry_table;
3948 			event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
3949 		}
3950 
3951 		// note that selected_real_split_point is the first UNWANTED base after left half.
3952 
3953 		//if(abs(left_edge_wanted-27286396) < 250 || abs(right_edge_wanted - 27286396)<250)
3954 		if(0 && FIXLENstrcmp("R003738400", read_name) == 0)
3955 		{
3956 			char leftpos[100], rightpos[100];
3957 			absoffset_to_posstr(global_context, left_edge_wanted, leftpos);
3958 			absoffset_to_posstr(global_context, right_edge_wanted, rightpos);
3959 			SUBREADprintf("READ=%s, LEFT=%s, RIGHT=%s\n", read_name, leftpos, rightpos);
3960 		}
3961 
3962 		chromosome_event_t * found = NULL;
3963 		chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
3964 		int found_events = search_event(global_context, event_table, event_space, left_edge_wanted , EVENT_SEARCH_BY_SMALL_SIDE,  CHRO_EVENT_TYPE_INDEL | CHRO_EVENT_TYPE_JUNCTION | CHRO_EVENT_TYPE_FUSION, search_return);
3965 
3966 		mark_gapped_read(result);
3967 		if(found_events)
3968 		{
3969 			int kx1;
3970 			for(kx1 = 0; kx1 < found_events ; kx1++)
3971 			{
3972 				if(search_return[kx1] -> event_large_side == right_edge_wanted)
3973 				{
3974 					found = search_return[kx1];
3975 					break;
3976 				}
3977 			}
3978 		}
3979 
3980 		//if( 1018082 == pair_number)
3981 		//		SUBREADprintf("NEW_CHIMERISM_HERE [%u:%d: R_%d] : %s , %s , %u , %u, %c ; INC=%d %d\n", pair_number, best_read_id, is_second_read+1, chro_name_left, chro_name_right, chro_pos_left, chro_pos_right, is_strand_jumped?'X':'=', subjunc_result -> small_side_increasing_coordinate, subjunc_result -> large_side_increasing_coordinate);
3982 
3983 
3984 		int is_key_fusion = 0;
3985 		if(0)if(
3986 			( 9566365 + 1210 - 200 <= left_edge_wanted && 9566365 + 1210 + 200 >= left_edge_wanted) &&
3987 		    ( 36859887 + 1210 - 200  <= right_edge_wanted &&  36859887 + 1210 + 200 >= right_edge_wanted)
3988 		){
3989 			SUBREADprintf("Read = %s, FOUND = %p in %d , %s:%u , %s:%u, INCs= %d, %d, JUMP=%d\n", read_name, found, found_events, chro_name_left, chro_pos_left, chro_name_right, chro_pos_right, subjunc_result -> small_side_increasing_coordinate, subjunc_result -> large_side_increasing_coordinate, is_strand_jumped);
3990 			is_key_fusion = 1;
3991 		}
3992 
3993 		if(found) found -> supporting_reads ++;
3994 		else
3995 		{
3996 			int event_no;
3997 
3998 
3999 			if(thread_context)
4000 				event_no = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> total_events ++;
4001 			else
4002 				event_no = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) ->  total_events ++;
4003 
4004 
4005 			event_space = reallocate_event_space(global_context, thread_context, event_no);
4006 
4007 			chromosome_event_t * new_event = event_space+event_no;
4008 			memset(new_event,0,sizeof(chromosome_event_t));
4009 			new_event -> event_small_side = left_edge_wanted;
4010 			new_event -> event_large_side = right_edge_wanted + subjunc_result->indel_at_junction;
4011 			new_event -> critical_read_id = 2llu * pair_number + is_second_read;
4012 
4013 			int new_event_type =(((global_context -> config.entry_program_name == CORE_PROGRAM_SUBJUNC && global_context ->  config.do_fusion_detection)||(global_context -> config.entry_program_name == CORE_PROGRAM_SUBJUNC && global_context ->  config.do_long_del_detection))&& !global_context -> config.prefer_donor_receptor_junctions)?CHRO_EVENT_TYPE_FUSION:CHRO_EVENT_TYPE_JUNCTION;
4014 
4015 			//#warning "=========================== DELETE NEXT LINE !!! =================================="
4016 			//new_event_type = CHRO_EVENT_TYPE_REMOVED;
4017 
4018 			if(is_strand_jumped) new_event_type = CHRO_EVENT_TYPE_FUSION;
4019 			if((subjunc_result->minor_coverage_start > result->confident_coverage_start) + (subjunc_result -> minor_position >  result -> selected_position) ==1)
4020 				new_event_type = CHRO_EVENT_TYPE_FUSION;
4021 			if(chro_name_right!=chro_name_left)
4022 				new_event_type = CHRO_EVENT_TYPE_FUSION;
4023 			if(right_edge_wanted - left_edge_wanted > global_context -> config.maximum_intron_length)
4024 				if(!(global_context ->  config.do_fusion_detection || global_context ->  config.do_long_del_detection))
4025 					new_event_type = CHRO_EVENT_TYPE_REMOVED;
4026 
4027 
4028 			if(1)
4029 			{
4030 				unsigned int dist = new_event -> event_large_side -  new_event -> event_small_side;
4031 				int origin_type = new_event_type;
4032 				int fusion_cover_len = -1;
4033 
4034 				if(dist > MAX_INSERTION_LENGTH && new_event_type == CHRO_EVENT_TYPE_FUSION)
4035 				{
4036 					int cov_end, cover_start, major_cov;
4037 					cov_end = max(subjunc_result->minor_coverage_end, result->confident_coverage_end );
4038 					cover_start = min(subjunc_result->minor_coverage_start, result->confident_coverage_start);
4039 
4040 					major_cov =  result->confident_coverage_end  -  result->confident_coverage_start;
4041 
4042 					fusion_cover_len = cov_end - cover_start ;
4043 
4044 					if(fusion_cover_len < read_len - 15 || major_cov > read_len - 15)
4045 						new_event_type = CHRO_EVENT_TYPE_REMOVED;
4046 				}
4047 
4048 				if(dist > MAX_INSERTION_LENGTH && new_event_type == CHRO_EVENT_TYPE_FUSION && subjunc_result -> minor_votes < 2)
4049 					new_event_type = CHRO_EVENT_TYPE_REMOVED;
4050 				else if(new_event_type == CHRO_EVENT_TYPE_FUSION && subjunc_result -> minor_votes < 1)
4051 					new_event_type = CHRO_EVENT_TYPE_REMOVED;
4052 
4053 
4054 				if(dist > MAX_INSERTION_LENGTH && new_event_type == CHRO_EVENT_TYPE_FUSION && result -> selected_votes < 2)
4055 					new_event_type = CHRO_EVENT_TYPE_REMOVED;
4056 				else if(new_event_type == CHRO_EVENT_TYPE_FUSION && result -> selected_votes < 1)
4057 					new_event_type = CHRO_EVENT_TYPE_REMOVED;
4058 
4059 				if(0 && origin_type == CHRO_EVENT_TYPE_FUSION)
4060 				{
4061 					char leftpos[100], rightpos[100];
4062 					absoffset_to_posstr(global_context, new_event -> event_small_side, leftpos);
4063 					absoffset_to_posstr(global_context, new_event -> event_large_side, rightpos);
4064 
4065 					if(new_event_type == CHRO_EVENT_TYPE_REMOVED)
4066 						SUBREADprintf("NEW_FUSION REMOVED %s SUGGEST %s ~ %s MAJOR COV=%d ~ %d, MINOR COV=%d ~ %d, RLEN=%d, COVED=%d, VOTES=%d, %d, %s, SPLIT=%d\n", read_name, leftpos, rightpos, result->confident_coverage_start, result->confident_coverage_end, subjunc_result->minor_coverage_start, subjunc_result->minor_coverage_end, read_len, fusion_cover_len, result -> selected_votes, subjunc_result -> minor_votes, is_strand_jumped?"JUMPED":"======", selected_real_split_point);
4067 					else
4068 						SUBREADprintf("NEW_FUSION WANTED %s SUGGEST %s ~ %s  MAJOR COV=%d ~ %d, MINOR COV=%d ~ %d, RLEN=%d, COVED=%d, VOTES=%d, %d, %s, SPLIT=%d\n", read_name, leftpos, rightpos, result->confident_coverage_start, result->confident_coverage_end, subjunc_result->minor_coverage_start, subjunc_result->minor_coverage_end, read_len, fusion_cover_len, result -> selected_votes, subjunc_result -> minor_votes, is_strand_jumped?"JUMPED":"======", selected_real_split_point);
4069 				}
4070 
4071 				if(dist > MAX_INSERTION_LENGTH && new_event_type == CHRO_EVENT_TYPE_FUSION && (selected_real_split_point < read_len * 0.2 || selected_real_split_point >= read_len *0.8000) )
4072 					new_event_type = CHRO_EVENT_TYPE_REMOVED;
4073 			}
4074 		//if(pair_number == 13)
4075 		//printf("MMMMX %d %u -- %u : TYPE %d\n" , event_no, left_edge_wanted, right_edge_wanted, new_event_type);
4076 
4077 
4078 //			if((is_donor_found_or_annotation || !global_context -> config.check_donor_at_junctions) &&(!is_strand_jumped) && right_edge_wanted - left_edge_wanted <= global_context -> config.maximum_intron_length
4079 //				&& (subjunc_result->minor_coverage_start > result->confident_coverage_start) + (subjunc_result -> minor_position >  result -> selected_position) !=1)
4080 
4081 
4082 			if(is_key_fusion) SUBREADprintf("   INSERTED AS %d ( in %d or %d )\n", new_event_type , CHRO_EVENT_TYPE_JUNCTION, CHRO_EVENT_TYPE_FUSION);
4083 
4084 			if(new_event_type == CHRO_EVENT_TYPE_JUNCTION)
4085 			{
4086 				new_event -> is_negative_strand= !is_GT_AG_donors;
4087 				new_event -> event_type = CHRO_EVENT_TYPE_JUNCTION;
4088 
4089 				new_event -> supporting_reads = 1;
4090 				new_event -> indel_length = 0;
4091 				new_event -> indel_at_junction = subjunc_result->indel_at_junction;
4092 				new_event -> is_donor_found_or_annotation = is_donor_found_or_annotation;
4093 
4094 				new_event -> small_side_increasing_coordinate = subjunc_result -> small_side_increasing_coordinate;
4095 				new_event -> large_side_increasing_coordinate = subjunc_result -> large_side_increasing_coordinate;
4096 
4097 				put_new_event(event_table, new_event , event_no);
4098 
4099 				if(0 && FIXLENstrcmp("R000000052", read_name) == 0)
4100 					SUBREADprintf("NEW_JUNCTION_HERE : %s , %u , %u  (%u, %u)\n", chro_name_right, chro_pos_left, chro_pos_right,  new_event -> event_small_side, new_event -> event_large_side);
4101 			}
4102 			else if(new_event_type == CHRO_EVENT_TYPE_FUSION)
4103 			{
4104 				if((global_context ->  config.do_fusion_detection || global_context ->  config.do_long_del_detection))
4105 				{
4106 					new_event -> event_type = CHRO_EVENT_TYPE_FUSION;
4107 					new_event -> is_strand_jumped = is_strand_jumped;
4108 
4109 
4110 					new_event -> supporting_reads = 1;
4111 					new_event -> indel_length = 0;
4112 
4113 					new_event -> small_side_increasing_coordinate = subjunc_result -> small_side_increasing_coordinate;
4114 					new_event -> large_side_increasing_coordinate = subjunc_result -> large_side_increasing_coordinate;
4115 
4116 					put_new_event(event_table, new_event , event_no);
4117 					//if( 1018082 == pair_number)
4118 					//	SUBREADprintf("NEW_CHIMERISM_HERE_FULL [%u:%d: R_%d] : %s , %s , %u , %u, %c ; INC=%d %d\n", pair_number, best_read_id, is_second_read+1, chro_name_left, chro_name_right, chro_pos_left, chro_pos_right, is_strand_jumped?'X':'=', new_event -> small_side_increasing_coordinate, new_event -> large_side_increasing_coordinate);
4119 				}
4120 			}
4121 		}
4122 	}
4123 }
4124 
4125 void write_translocation_results_final(void * key, void * buckv, HashTable * tab);
4126 void write_inversion_results_final(void * key, void * buckv, HashTable * tab);
4127 
// Formats the ALT column of one VCF breakend (BND) record into `alt`.
//   alt, alt_size   : destination buffer and its capacity; the text is truncated to fit.
//   ref_base        : reference base printed for this breakend.
//   mate_chro/pos   : chromosome name and position of the mate breakend, as printed.
//   self_increasing : non-zero puts the bracketed mate locus before ref_base, zero puts it after.
//   mate_increasing : non-zero selects '[' as the bracket character, zero selects ']'.
static void fusion_format_breakend_alt(char * alt, size_t alt_size, char ref_base, const char * mate_chro, unsigned int mate_pos, int self_increasing, int mate_increasing)
{
	char bkt = mate_increasing?'[':']';

	if(self_increasing)
		snprintf(alt, alt_size, "%c%s:%u%c%c", bkt, mate_chro, mate_pos, bkt, ref_base);
	else
		snprintf(alt, alt_size, "%c%c%s:%u%c", ref_base, bkt, mate_chro, mate_pos, bkt);
}

// Writes "<output_prefix>.breakpoints.vcf".
//
// Every event in the indel module's event space that is a CHRO_EVENT_TYPE_FUSION
// (or a CHRO_EVENT_TYPE_JUNCTION when the entry program is CORE_PROGRAM_SUBREAD)
// and has at least one final counted read is written as two mated breakend
// records, one for the small side and one for the large side. When structural
// variance detection is enabled, the translocation and inversion tables are
// appended to the same file. The number of reported events is stored in
// global_context -> all_fusions.
//
// If a pair of records comes out implausibly short the file is removed and an
// error is printed (treated as "disk is full").
//
// Returns 0 once the file has been processed, or -1 when the output file
// cannot be created.
int write_fusion_final_results(global_context_t * global_context)
{
	indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
	char fn2 [MAX_FILE_NAME_LENGTH+30];

	snprintf(fn2, sizeof(fn2), "%s.breakpoints.vcf", global_context->config.output_prefix);
	FILE * ofp = f_subr_open(fn2, "wb");
	if(!ofp){
		// Without this check every fprintf below would dereference a NULL stream.
		SUBREADprintf("ERROR: unable to create the breakpoint file '%s'.\n", fn2);
		return -1;
	}

	fprintf(ofp,"##fileformat=VCFv4.1\n");
	fprintf(ofp,"##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n");
	fprintf(ofp,"##INFO=<ID=MATEID,Number=1,Type=String,Description=\"Paired breakend id\">\n");
	fprintf(ofp,"##INFO=<ID=SR,Number=1,Type=Integer,Description=\"Supporting read number\">\n");
	fprintf(ofp,"#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO\n");

	int xk1, disk_is_full = 0;
	unsigned int all_junctions = 0;

	for(xk1 = 0; xk1 < indel_context -> total_events ; xk1++)
	{
		char * chro_name_left,* chro_name_right;
		int chro_pos_left, chro_pos_right;
		chromosome_event_t * event_body = indel_context -> event_space_dynamic +xk1;

		// Fusions are always reported; junctions only when running as CORE_PROGRAM_SUBREAD.
		if(event_body -> event_type != CHRO_EVENT_TYPE_FUSION && (global_context->config.entry_program_name != CORE_PROGRAM_SUBREAD || event_body -> event_type != CHRO_EVENT_TYPE_JUNCTION))
			continue;

		// "1 - 1" keeps the critical-read threshold at zero, exactly as before.
		if(event_body->final_counted_reads<1|| event_body->critical_supporting_reads < 1 - 1)
			continue;

		locate_gene_position( event_body -> event_small_side , &global_context -> chromosome_table, &chro_name_left, &chro_pos_left);
		locate_gene_position( event_body -> event_large_side , &global_context -> chromosome_table, &chro_name_right, &chro_pos_right);

		// The printed coordinates are one larger than what locate_gene_position reports.
		unsigned int out_pos_left = (unsigned int)(chro_pos_left + 1);
		unsigned int out_pos_right = (unsigned int)(chro_pos_right + 1);
		all_junctions ++;

		// The two records of event k are named bnd_(2k-1) and bnd_(2k) and reference each other.
		int bnd_small_id = (int)(all_junctions * 2 - 1);
		int bnd_large_id = (int)(all_junctions * 2);

		int wlen;
		char alt_base[500];
		char ref_base;

		// Record for the small side; its mate is the large side.
		gene_value_index_t * current_index = find_current_value_index(global_context , event_body -> event_small_side , 1);
		ref_base = gvindex_get( current_index, event_body -> event_small_side);
		fusion_format_breakend_alt(alt_base, sizeof(alt_base), ref_base, chro_name_right, out_pos_right,
					event_body -> small_side_increasing_coordinate, event_body -> large_side_increasing_coordinate);

		wlen = fprintf(ofp,"%s\t%u\tbnd_%d\t%c\t%s\t.\tPASS\tSVTYPE=BND;MATEID=bnd_%d;SR=%d\n", chro_name_left, out_pos_left, bnd_small_id, ref_base, alt_base, bnd_large_id, event_body -> final_counted_reads);

		// Record for the large side; its mate is the small side.
		current_index = find_current_value_index(global_context , event_body -> event_large_side , 1);
		ref_base = gvindex_get( current_index, event_body -> event_large_side );
		fusion_format_breakend_alt(alt_base, sizeof(alt_base), ref_base, chro_name_left, out_pos_left,
					event_body -> large_side_increasing_coordinate, event_body -> small_side_increasing_coordinate);

		wlen += fprintf(ofp,"%s\t%u\tbnd_%d\t%c\t%s\t.\tPASS\tSVTYPE=BND;MATEID=bnd_%d;SR=%d\n", chro_name_right, out_pos_right, bnd_large_id, ref_base, alt_base, bnd_small_id, event_body -> final_counted_reads);

		// A failed or truncated write shows up as an implausibly small byte count.
		if(wlen <18) disk_is_full = 1;
	}

	global_context -> all_fusions = all_junctions;

	if(global_context->config.do_structural_variance_detection){
		// The iteration callbacks receive the stream and the context through the table appendices.
		global_context -> translocation_result_table.entry_table -> appendix1 = ofp;
		global_context -> translocation_result_table.entry_table -> appendix2 = global_context;
		HashTableIteration(global_context -> translocation_result_table.entry_table, write_translocation_results_final);
		global_context -> inversion_result_table.entry_table -> appendix1 = ofp;
		global_context -> inversion_result_table.entry_table -> appendix2 = global_context;
		HashTableIteration(global_context -> inversion_result_table.entry_table, write_inversion_results_final);
	}

	fclose(ofp);

	if(disk_is_full){
		unlink(fn2);
		SUBREADprintf("ERROR: disk is full. No fusion table is generated.\n");
	}
	return 0;
}
4214 
// HashTableIteration callback that prints the inversion results held in one bucket.
//   bukey : bucket key supplied by the iteration; not used here.
//   buckv : the bucket (bucketed_table_bucket_t *) whose items are examined.
//   tab   : the iterated table; appendix1 carries the output FILE * and
//           appendix2 the global context.
// Two "INV" lines are written per reported inversion.
void write_inversion_results_final(void * bukey, void * buckv, HashTable * tab){
	bucketed_table_bucket_t * bucket = buckv;
	FILE * out_fp = (FILE *)tab -> appendix1;
	global_context_t * global_context = (global_context_t * )tab -> appendix2;
	int item_no;

	(void)bukey;

	for(item_no = 0; item_no < bucket -> items; item_no++)
	{
		// Only report an item from the bucket whose key equals the item position
		// rounded down to a multiple of maximum_interval_length.
		// NOTE(review): presumably this avoids printing an item more than once -- confirm.
		if(bucket->positions[item_no] - bucket->positions[item_no] % bucket -> maximum_interval_length != bucket -> keyed_bucket)
			continue;

		inversion_result_t * inv_res = bucket -> details[item_no];
		char * src_chr;
		int src_pos;

		locate_gene_position(inv_res -> small_side,  &global_context -> chromosome_table, &src_chr , &src_pos);

		const char * precision_tag = inv_res -> is_precisely_called ? "PRECISE":"IMPRECISE";

		fprintf(out_fp, "INV\t%s\t%d\t%s\t%u\t%s\n",  src_chr, src_pos + 1, src_chr, src_pos + 1 + inv_res -> length,  precision_tag);
		fprintf(out_fp, "INV\t%s\t%d\t%s\t%u\t%s\n",  src_chr, src_pos + 2, src_chr, src_pos + inv_res -> length,  precision_tag);
	}
}
4239 
// HashTableIteration callback that prints the translocation results held in one bucket.
//   bukey : bucket key supplied by the iteration; not used here.
//   buckv : the bucket (bucketed_table_bucket_t *) whose items are examined.
//   tab   : the iterated table; appendix1 carries the output FILE * and
//           appendix2 the global context.
// Each reported translocation produces three lines: two "ITX"/"CTX" lines
// (source start and source start + length, both pointing at the target) and
// one "DEL" line for the source segment.
void write_translocation_results_final(void * bukey, void * buckv, HashTable * tab){
	bucketed_table_bucket_t * bucket = buckv;
	FILE * out_fp = (FILE *)tab -> appendix1;
	global_context_t * global_context = (global_context_t * )tab -> appendix2;
	int item_no;

	(void)bukey;

	for(item_no = 0; item_no < bucket -> items; item_no++)
	{
		// Only report an item from the bucket whose key equals the item position
		// rounded down to a multiple of maximum_interval_length.
		// NOTE(review): presumably this avoids printing an item more than once -- confirm.
		if(bucket->positions[item_no] - bucket->positions[item_no] % bucket -> maximum_interval_length != bucket -> keyed_bucket)
			continue;

		translocation_result_t * trans_res = bucket -> details[item_no];
		char * src_chr, *targ_chr;
		int src_pos, targ_pos;

		locate_gene_position(trans_res -> source_left_side,  &global_context -> chromosome_table, &src_chr , &src_pos);
		locate_gene_position(trans_res -> target_left_side,  &global_context -> chromosome_table, &targ_chr , &targ_pos);

		// "ITX" when both loci resolve to the same chromosome name pointer, "CTX" otherwise.
		const char * tra_kind = (src_chr == targ_chr)?"ITX":"CTX";
		const char * inv_tag = trans_res -> is_inv?"X":"=";
		const char * precision_tag = trans_res -> is_precisely_called ? "PRECISE":"IMPRECISE";

		fprintf(out_fp, "%s\t%s\t%u\t%s\t%d\t%s\t%s\n", tra_kind, src_chr, src_pos + 1, targ_chr, targ_pos + 1, inv_tag,  precision_tag);
		fprintf(out_fp, "%s\t%s\t%u\t%s\t%d\t%s\t%s\n", tra_kind, src_chr, src_pos + trans_res -> length + 1, targ_chr, targ_pos + 1, inv_tag, precision_tag);
		fprintf(out_fp, "DEL\t%s\t%d\t%u\t%s\n", src_chr, src_pos + 1, trans_res -> length ,  precision_tag);
	}
}
4271 
// Writes "<output_prefix>.junction.bed".
//
// After a header line, one BED-style line is written for every event of type
// CHRO_EVENT_TYPE_JUNCTION that has at least one final counted read. The
// junction name is "JUNC" plus a running eight-digit number, followed by
// "INS<n>" when the event has an indel at the junction and by "ANNO" when bit
// 64 of is_donor_found_or_annotation is set.
//
// The number of written junctions is stored in global_context -> all_junctions.
// junction_flanking_left of an event is shrunk in place when the left block
// would otherwise start before position 1.
//
// If a line comes out implausibly short the file is removed and an error is
// printed (treated as "disk is full").
//
// Returns 0 once the file has been processed, or -1 when the output file
// cannot be created.
int write_junction_final_results(global_context_t * global_context)
{
	int disk_is_full = 0;

	indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
	char fn2 [MAX_FILE_NAME_LENGTH+30];

	snprintf(fn2, sizeof(fn2), "%s.junction.bed", global_context->config.output_prefix);
	FILE * ofp = f_subr_open(fn2, "wb");
	if(!ofp){
		// Without this check every fprintf below would dereference a NULL stream.
		SUBREADprintf("ERROR: unable to create the junction table '%s'.\n", fn2);
		return -1;
	}

	fprintf(ofp, "#Chr, StartLeftBlock, EndRightBlock, Junction_Name, nSupport, Strand, StartLeftBlock, EndRightBlock, Color, nBlocks, BlockSizes, BlockStarts\n");

	int xk1;
	unsigned int all_junctions = 0;

	for(xk1 = 0; xk1 < indel_context -> total_events ; xk1++)
	{
		char * chro_name_left,* chro_name_right;
		// Large enough for "INS" + any int + "ANNO"; the former 10-byte buffer
		// overflowed as soon as the indel length needed three characters.
		char indel_sect[32];
		int chro_pos_left, chro_pos_right;
		chromosome_event_t * event_body = indel_context -> event_space_dynamic +xk1;
		if(event_body -> event_type != CHRO_EVENT_TYPE_JUNCTION)
			continue;

		// "1 - 1" keeps the critical-read threshold at zero, exactly as before.
		if(event_body->final_counted_reads <  1 || ( event_body->critical_supporting_reads < 1 - 1&& event_body->indel_at_junction))
			continue;

		locate_gene_position( event_body -> event_small_side , &global_context -> chromosome_table, &chro_name_left, &chro_pos_left);
		locate_gene_position( event_body -> event_large_side , &global_context -> chromosome_table, &chro_name_right, &chro_pos_right);

		chro_pos_left++;

		// Clamp the left block so that it never starts before position 1.
		unsigned int feature_start = chro_pos_left - event_body -> junction_flanking_left;
		if(chro_pos_left <= event_body -> junction_flanking_left){
			feature_start = 1;
			event_body -> junction_flanking_left = chro_pos_left - 1;
		}

		unsigned int feature_end = chro_pos_right + event_body -> junction_flanking_right;

		all_junctions ++;

		// Build the optional name suffix with bounded writes.
		indel_sect[0]=0;
		if(event_body->indel_at_junction)
			snprintf(indel_sect, sizeof(indel_sect), "INS%d", event_body->indel_at_junction);
		if(event_body-> is_donor_found_or_annotation &64){
			size_t used = strlen(indel_sect);
			snprintf(indel_sect + used, sizeof(indel_sect) - used, "ANNO");
		}

		int wlen = fprintf(ofp,"%s\t%u\t%u\tJUNC%08u%s\t%d\t%c\t%u\t%u\t%d,%d,%d\t2\t%d,%d\t0,%u\n", chro_name_left, feature_start,  feature_end,
												all_junctions, indel_sect,  event_body -> final_counted_reads, event_body->is_negative_strand?'-':'+',
												feature_start,  feature_end, event_body->is_negative_strand?0:255, event_body->is_negative_strand?255:0, event_body->is_negative_strand?255:0,
												 event_body -> junction_flanking_left, event_body -> junction_flanking_right, feature_end-feature_start-event_body -> junction_flanking_right);

		// A failed or truncated write shows up as an implausibly small byte count.
		if(wlen < 10) disk_is_full = 1;
	}

	fclose(ofp);
	if(disk_is_full){
		unlink(fn2);
		SUBREADprintf("ERROR: disk is full; no junction table is created.\n");
	}
	global_context -> all_junctions = all_junctions;
	return 0;
}
4343 
4344 
4345 
// Copies the two reference bases starting at `pos` of `index` into `buf`
// through gvindex_get_string; `is_negative_strand` is forwarded unchanged.
// Callers in this file pass a 3-byte buffer and set buf[2] = 0 themselves.
void get_chro_2base(char *buf, gene_value_index_t * index, unsigned int pos, int is_negative_strand)
{
	const int bases_wanted = 2;

	gvindex_get_string(buf, index, pos, bases_wanted, is_negative_strand);
}
4350 
4351 
// Tests whether the two-base strings ch1 and ch2 form one of the pairs
// GT/AG or CT/AC (as decided by c2eq) with ch1 being the member expected for
// the requested orientation:
//   is_reverse != 0 : ch1 must be "AG" or "AC";
//   is_reverse == 0 : ch1 must be "CT" or "GT".
// Returns 1 when both conditions hold and 0 otherwise.
int paired_chars_part(char * ch1, char * ch2, int is_reverse)
{
	int is_known_pair = c2eq(ch1, ch2, "GT", "AG") || c2eq(ch1, ch2, "CT", "AC");

	if (!is_known_pair)
		return 0;

	if (is_reverse)
		return (ceq(ch1, "AG") || ceq(ch1, "AC")) ? 1 : 0;

	return (ceq(ch1, "CT") || ceq(ch1, "GT")) ? 1 : 0;
}
// Non-zero when the first two characters of cc are one of GT, AG, AC or CT.
#define is_donar_chars_part(cc) ( \
			    ((cc)[0]=='A' && ((cc)[1]=='G' || (cc)[1]=='C')) || \
			    ((cc)[0]=='G' && (cc)[1]=='T') || \
			    ((cc)[0]=='C' && (cc)[1]=='T'))


// Tuning constants used by core_search_short_exons.
// Length of the read head/tail segment that is quality-checked and matched.
#define SHORT_EXON_MIN_LENGTH 18
// When zero, reads no longer than EXON_LONG_READ_LENGTH are not searched at all.
#define EXON_EXTENDING_SCAN 0
// Number of bases matched on the already-mapped side of a candidate splice point.
#define SHORT_EXON_WINDOW 6
// How far before the mapped position the head search may look.
#define SHORT_EXON_EXTEND 5000
4371 
core_search_short_exons(global_context_t * global_context,thread_context_t * thread_context,char * read_text,char * qualityb0,int rl,unsigned int P1_Pos,unsigned int P2_Pos,short read_coverage_start,short read_coverage_end)4372 void core_search_short_exons(global_context_t * global_context, thread_context_t * thread_context, char * read_text, char * qualityb0, int rl, unsigned int P1_Pos, unsigned int P2_Pos, short read_coverage_start, short read_coverage_end)
4373 {
4374 	char inb[MAX_READ_LENGTH], qualityb[MAX_READ_LENGTH];
4375 	if ( (rl <= EXON_LONG_READ_LENGTH ) && (!EXON_EXTENDING_SCAN)) return;
4376 	//return;
4377 	gene_value_index_t * base_index = thread_context?thread_context->current_value_index:global_context->current_value_index ;
4378 	//insert event
4379 	HashTable * event_table = NULL;
4380 	chromosome_event_t * event_space = NULL;
4381 	if(thread_context)
4382 	{
4383 		event_table = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_entry_table;
4384 		event_space = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
4385 	}
4386 	else
4387 	{
4388 		event_table = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_entry_table;
4389 		event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
4390 	}
4391 
4392 	strcpy(inb, read_text);
4393 	strcpy(qualityb, qualityb0);
4394 
4395 	unsigned int pos_small=min(P1_Pos, P2_Pos), pos_big = max(P1_Pos, P2_Pos);
4396 
4397 	int max_score , test_score;
4398 	unsigned int best_j1_edge=0 , best_j2_edge=0;
4399 	int need_to_test = 0;
4400 
4401 //////////////////////////////////////////////////////////////////////////////////////////////
4402 //////////////////////////////////////////////////////////////////////////////////////////////
4403 // SCAN TO THE HEAD  /////////////////////////////////////////////////////////////////////////
4404 //////////////////////////////////////////////////////////////////////////////////////////////
4405 //////////////////////////////////////////////////////////////////////////////////////////////
4406 
4407 	if (read_coverage_start  > SHORT_EXON_MIN_LENGTH)
4408 	{
4409 		max_score = -1;
4410 
4411 		int need_check2 = 1;
4412 		if(qualityb[0])
4413 		{
4414 			float head_quality = read_quality_score(qualityb , SHORT_EXON_MIN_LENGTH , global_context->config.phred_score_format);
4415 			if(head_quality < 6 )
4416 				need_check2 = 0;
4417 		}
4418 
4419 
4420 		if(need_check2)
4421 			if(SHORT_EXON_MIN_LENGTH *0.6 < match_chro(inb, base_index, pos_small, SHORT_EXON_MIN_LENGTH , 0, global_context->config.space_type))
4422 				need_check2 = 0;
4423 
4424 
4425 		if(need_check2)
4426 		{
4427 
4428 			int delta_pos, is_indel = 0;
4429 			for(delta_pos=-3; delta_pos <=3; delta_pos ++)
4430 			{
4431 				if(match_chro(inb, base_index, pos_small + delta_pos, SHORT_EXON_MIN_LENGTH , 0, global_context->config.space_type) >= SHORT_EXON_MIN_LENGTH*.7)
4432 				{
4433 					is_indel = 1;
4434 					break;
4435 				}
4436 			}
4437 			// The head of the read is incorrect. Do we need to search a long way?
4438 			// See if there is a donor in the head area.
4439 			int test_donor_pos;
4440 			char cc[3];
4441 			cc[2]=0;
4442 
4443 			if(!is_indel)
4444 				for(test_donor_pos = SHORT_EXON_MIN_LENGTH ; test_donor_pos < read_coverage_start ; test_donor_pos ++)
4445 				{
4446 					get_chro_2base(cc, base_index, pos_small + test_donor_pos, 0);
4447 					if(is_donar_chars_part(cc))
4448 					{
4449 						need_to_test = 1;
4450 						break;
4451 					}
4452 				}
4453 		}
4454 	}
4455 
4456 	max_score = -999;
4457 	int max_is_GTAG = 0;
4458 
4459 	if(need_to_test && pos_small >= SHORT_EXON_MIN_LENGTH)
4460 	{
4461 		unsigned int test_end = pos_small - SHORT_EXON_EXTEND;
4462 		if(SHORT_EXON_EXTEND > pos_small) test_end = 0;
4463 
4464 		unsigned int new_pos = pos_small-SHORT_EXON_MIN_LENGTH;
4465 		while(1)
4466 		{
4467 			new_pos = match_chro_range(inb,  base_index, new_pos, 7 , new_pos - test_end , SEARCH_BACK);
4468 			if(new_pos==0xffffffff) break;
4469 			// There is an exact match. See if the donor/receptors are matched.
4470 			// new_pos is the new head position of the read.
4471 			int splice_point;
4472 			for(splice_point = SHORT_EXON_MIN_LENGTH; splice_point < read_coverage_start ; splice_point ++)
4473 			{
4474 				char cc[3];
4475 				cc[2]=0;
4476 				char cc2[3];
4477 				cc2[2]=0;
4478 
4479 				get_chro_2base(cc, base_index, pos_small + splice_point -2, 0);
4480 				if(is_donar_chars_part(cc))
4481 				{
4482 					// <<< EXON---|CC2---INTRON---CC|---EXON
4483 					get_chro_2base(cc2, base_index, new_pos + splice_point, 0);
4484 					if(is_donar_chars_part(cc2) && paired_chars_part(cc2 , cc, 0))
4485 					{
4486 						int matched_in_exon_old = match_chro(inb + splice_point, base_index, pos_small + splice_point , SHORT_EXON_WINDOW , 0, global_context->config.space_type);
4487 						int matched_in_exon_new = match_chro(inb, base_index, new_pos , splice_point, 0, global_context->config.space_type);
4488 
4489 
4490 						test_score = 1000000+ (matched_in_exon_new )*10000  + matched_in_exon_old * 1000 + new_pos - test_end;
4491 						if(test_score <= max_score) continue;
4492 						max_score = test_score;
4493 
4494 						if(matched_in_exon_new < splice_point || matched_in_exon_old < SHORT_EXON_WINDOW )
4495 							continue;
4496 
4497 						max_is_GTAG = (cc2[0]=='G' || cc2[1]=='G');
4498 						//printf("EX CC=%s\tCC2=%s\tis_GTAG=%d\n",cc,cc2,max_is_GTAG);
4499 						best_j1_edge = new_pos + splice_point - 1;
4500 						best_j2_edge = pos_small + splice_point;
4501 					}
4502 				}
4503 			}
4504 		}
4505 	}
4506 
4507 
4508 	if(best_j1_edge>0)
4509 	{
4510 		int event_no;
4511 		chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
4512 		chromosome_event_t * found = NULL;
4513 
4514 		int found_events = search_event(global_context, event_table, event_space, best_j1_edge , EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_JUNCTION|CHRO_EVENT_TYPE_FUSION, search_return);
4515 
4516 		if(found_events)
4517 		{
4518 			int kx1;
4519 			for(kx1 = 0; kx1 < found_events ; kx1++)
4520 			{
4521 				if(search_return[kx1] -> event_large_side == best_j2_edge)
4522 				{
4523 					found = search_return[kx1];
4524 					break;
4525 				}
4526 			}
4527 		}
4528 
4529 		if(found) found -> supporting_reads ++;
4530 		else
4531 		{
4532 			if(thread_context)
4533 				event_no = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> total_events ++;
4534 			else
4535 				event_no = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) ->  total_events ++;
4536 
4537 			event_space = reallocate_event_space(global_context, thread_context, event_no);
4538 
4539 			chromosome_event_t * new_event = event_space+event_no;
4540 			memset(new_event,0,sizeof(chromosome_event_t));
4541 			new_event -> event_small_side = best_j1_edge;
4542 			new_event -> event_large_side = best_j2_edge;
4543 
4544 			new_event -> is_negative_strand= !max_is_GTAG;
4545 			new_event -> event_type = CHRO_EVENT_TYPE_JUNCTION;
4546 
4547 			new_event -> supporting_reads = 1;
4548 			new_event -> indel_length = 0;
4549 
4550 			put_new_event(event_table, new_event , event_no);
4551 		}
4552 		//printf("FOUND NEW JUNCTION HEAD: %u - %u\n", best_j1_edge, best_j2_edge);
4553 	}
4554 
4555 
4556 //////////////////////////////////////////////////////////////////////////////////////////////
4557 //////////////////////////////////////////////////////////////////////////////////////////////
4558 // SCAN TO THE TAIL  /////////////////////////////////////////////////////////////////////////
4559 //////////////////////////////////////////////////////////////////////////////////////////////
4560 //////////////////////////////////////////////////////////////////////////////////////////////
4561 
4562 	need_to_test = 0;
4563 	max_score = -999;
4564 
4565 
4566 	if (read_coverage_end< rl - SHORT_EXON_MIN_LENGTH)
4567 	{
4568 		int need_check2 = 1;
4569 		if(qualityb[0])
4570 		{
4571 			float head_quality = read_quality_score(qualityb + rl - SHORT_EXON_MIN_LENGTH , SHORT_EXON_MIN_LENGTH , global_context->config.phred_score_format);
4572 			if(head_quality < 6 )
4573 				need_check2 = 0;
4574 		}
4575 
4576 
4577 		if(SHORT_EXON_MIN_LENGTH *0.6 < match_chro(inb + rl - SHORT_EXON_MIN_LENGTH, base_index, pos_big + rl - SHORT_EXON_MIN_LENGTH , SHORT_EXON_MIN_LENGTH , 0, global_context->config.space_type))
4578 			need_check2 = 0;
4579 		if(need_check2)
4580 		{
4581 			int delta_pos, is_indel = 0;
4582 			for(delta_pos=-3; delta_pos <=3; delta_pos ++)
4583 			{
4584 				if(match_chro(inb + rl - SHORT_EXON_MIN_LENGTH, base_index, pos_big + rl - SHORT_EXON_MIN_LENGTH + delta_pos, SHORT_EXON_MIN_LENGTH , 0, global_context->config.space_type) >= SHORT_EXON_MIN_LENGTH*.7)
4585 				{
4586 					is_indel = 1;
4587 					break;
4588 				}
4589 			}
4590 			// The head of the read is incorrect. Do we need to search a long way?
4591 			// See if there is a donor in the head area.
4592 			int test_donor_pos;
4593 			char cc[3];
4594 			cc[2]=0;
4595 
4596 			if(!is_indel)
4597 				for(test_donor_pos = read_coverage_end  ; test_donor_pos < rl ; test_donor_pos ++)
4598 				{
4599 					get_chro_2base(cc, base_index, pos_big + test_donor_pos, 0);
4600 					if(is_donar_chars_part(cc))
4601 					{
4602 						need_to_test = 1;
4603 						break;
4604 					}
4605 				}
4606 		}
4607 	}
4608 
4609 	best_j1_edge = 0;
4610 	max_is_GTAG = 0;
4611 
4612 	if(need_to_test)
4613 	{
4614 		unsigned int test_end = pos_big + SHORT_EXON_EXTEND;
4615 		if(test_end > base_index -> length + base_index -> start_point) test_end = base_index -> length + base_index -> start_point;
4616 
4617 		unsigned int new_pos = pos_big +rl - SHORT_EXON_MIN_LENGTH +16;
4618 
4619 		while(1)
4620 		{
4621 			if(new_pos +  test_end - new_pos < base_index-> start_base_offset + base_index->length)
4622 			{
4623 				assert(new_pos<0xffff0000);
4624 				new_pos = match_chro_range(inb + rl - SHORT_EXON_MIN_LENGTH,  base_index, new_pos, 7 , test_end - new_pos , SEARCH_FRONT);
4625 			}
4626 			else break;
4627 
4628 			if(new_pos==0xffffffff) break;
4629 			// There is an exact match. See if the donor/receptors are matched.
4630 			// (new_pos + SHORT_EXON_MIN_LENGTH -rl + splice_point) is the new exon start.
4631 
4632 			int splice_point;
4633 			for(splice_point = read_coverage_end ; splice_point < rl -  SHORT_EXON_MIN_LENGTH; splice_point ++)
4634 			{
4635 				char cc[3];
4636 				cc[2]=0;
4637 				char cc2[3];
4638 				cc2[2]=0;
4639 
4640 				unsigned int new_pos_tail = (new_pos + SHORT_EXON_MIN_LENGTH -rl + splice_point);
4641 
4642 				get_chro_2base(cc, base_index, pos_big + splice_point, 0);
4643 				if(is_donar_chars_part(cc))
4644 				{
4645 					get_chro_2base(cc2, base_index, new_pos_tail -2, 0);
4646 					if(is_donar_chars_part(cc2) && paired_chars_part(cc , cc2, 0))
4647 					{
4648 						int matched_in_exon_new = match_chro(inb + splice_point, base_index, new_pos_tail , rl - splice_point , 0, global_context->config.space_type);
4649 						int matched_in_exon_old = match_chro(inb + splice_point - SHORT_EXON_WINDOW , base_index, pos_big + splice_point - SHORT_EXON_WINDOW , SHORT_EXON_WINDOW, 0, global_context->config.space_type);
4650 
4651 						test_score = 1000000+ (matched_in_exon_new)*10000 + matched_in_exon_old * 1000  + test_end - new_pos;
4652 						if(test_score <= max_score) continue;
4653 						max_score = test_score;
4654 
4655 						if(matched_in_exon_new < (rl - splice_point) || matched_in_exon_old < SHORT_EXON_WINDOW)
4656 							continue;
4657 
4658 						// EXON ---|CC---INTRON---CC2|--- EXON >>>
4659 						max_is_GTAG = (cc[0]=='G'|| cc[1]=='G');
4660 						best_j1_edge = pos_big + splice_point - 1;
4661 						best_j2_edge = new_pos_tail;
4662 					}
4663 				}
4664 			}
4665 
4666 		}
4667 	}
4668 
4669 
4670 	if(best_j1_edge>0)
4671 	{
4672 		int event_no;
4673 		chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
4674 		chromosome_event_t * found = NULL;
4675 
4676 		int found_events = search_event(global_context, event_table, event_space, best_j1_edge , EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_JUNCTION|CHRO_EVENT_TYPE_FUSION, search_return);
4677 
4678 		if(found_events)
4679 		{
4680 			int kx1;
4681 			for(kx1 = 0; kx1 < found_events ; kx1++)
4682 			{
4683 				if(search_return[kx1] -> event_large_side == best_j2_edge)
4684 				{
4685 					found = search_return[kx1];
4686 					break;
4687 				}
4688 			}
4689 		}
4690 
4691 		if(found) found -> supporting_reads ++;
4692 		else
4693 		{
4694 			if(thread_context)
4695 				event_no = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> total_events ++;
4696 			else
4697 				event_no = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) ->  total_events ++;
4698 
4699 
4700 			event_space = reallocate_event_space(global_context, thread_context, event_no);
4701 
4702 			chromosome_event_t * new_event = event_space+event_no;
4703 			memset(new_event,0,sizeof(chromosome_event_t));
4704 			new_event -> event_small_side = best_j1_edge;
4705 			new_event -> event_large_side = best_j2_edge;
4706 
4707 			new_event -> is_negative_strand= !max_is_GTAG;
4708 			new_event -> event_type = CHRO_EVENT_TYPE_JUNCTION;
4709 
4710 			new_event -> supporting_reads = 1;
4711 			new_event -> indel_length = 0;
4712 
4713 			put_new_event(event_table, new_event , event_no);
4714 			//printf("FOUND NEW JUNCTION TAIL: %u - %u\n", best_j1_edge, best_j2_edge);
4715 		}
4716 	}
4717 }
4718 
4719 
4720 
4721 
4722 
4723 
4724 
4725 
4726 
/*
 * Pick the best partner ("minor half") for one anchor candidate of a read.
 *
 * The anchor is described by max_pos / max_votes / max_start / max_end /
 * max_mask / max_indel_recorder.  Every item in the vote table is tested as a
 * partner and is rejected when
 *   - its read coverage overlaps the anchor coverage by 14 bases or more,
 *   - it lies fewer than 6 or more than 500000 bases away from the anchor,
 *   - the anchor or the candidate has fewer than one vote, or
 *   - locate_gene_position() reports a different chromosome-name pointer for
 *     the two locations.
 * Surviving candidates are scored as
 *   8888888 + votes * 1000000 - |distance|
 * plus 100 when the candidate is within 100000 bases of hint_pos and another
 * 100 when it is within 5000 bases (only when hint_pos >= 0).  A candidate
 * replaces the current selection when its score is not lower than the
 * selected one, so among equal scores the last one visited wins.
 *
 * Outputs (written only when at least one candidate is selected):
 *   best_pos1/best_vote1        anchor position and votes
 *   best_pos2/best_vote2        partner position and votes
 *   read_coverage_start/end     coverage of the anchor (max_start / max_end)
 *   indel_in_p1/indel_in_p2     value read from the indel recorders
 *   is_reversed_halves          strand flag of the anchor
 *   half_marks                  IS_REVERSED_HALVES, IS_R1_CLOSE_TO_5,
 *                               IS_NEGATIVE_STRAND_R1/R2, IS_PAIRED_HINTED
 *                               and IS_FUSION bits are updated
 * best_select_max_votes always receives the winning score (-1 if none).
 *
 * is_abnormal, accept_rate, tolerable_bases and rl are not used here.
 *
 * Returns the midpoint, in read coordinates, between the two coverage
 * regions of the selected pair, or -1 when no partner qualified.
 */
int core_select_best_matching_halves_maxone(global_context_t * global_context, gene_vote_t * vote, unsigned int * best_pos1, unsigned int * best_pos2, int * best_vote1, int * best_vote2, char * is_abnormal, short * half_marks, int * is_reversed_halves, float accept_rate, int read_len, long long int hint_pos, int tolerable_bases, short * read_coverage_start, short * read_coverage_end, gene_vote_number_t * indel_in_p1, gene_vote_number_t * indel_in_p2, gehash_data_t max_pos, gene_vote_number_t max_votes, short max_start, short max_end, short max_mask, gene_vote_number_t * max_indel_recorder, int* best_select_max_votes, int rl)
{
	int best_splicing_point = -1, i,j;
	char * best_chro_name, is_reversed;
	int best_chro_pos;
	int selected_max_votes = -1;


	is_reversed = (max_mask & IS_NEGATIVE_STRAND)?1:0;
	for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
		for(j=0; j< vote->items[i]; j++)
		{
			char * chro_name;
			char is_partner_reversed;
			int chro_pos;

			int overlapped_len, overlap_start, overlap_end;

			is_partner_reversed = (vote->masks [i][j] & IS_NEGATIVE_STRAND) ? 1:0;
			overlap_start = max(max_start , vote->coverage_start[i][j]);
			overlap_end   = min(max_end , vote->coverage_end[i][j]);
			overlapped_len =overlap_end - overlap_start;

			// The two halves must cover (mostly) different parts of the read.
			if(overlapped_len >=14)
				continue;

			// The distance is kept in 64 bits and its magnitude is taken with
			// llabs(): abs() takes an int, so a long long argument would be
			// narrowed first and a large distance could look like a small one.
			long long int dist = vote->pos[i][j];
			dist -= max_pos;
			long long int abs_dist = llabs(dist);

			if (abs_dist < 6)
				continue;

			int support_r1 = 1;
			int support_r2 = 1;

			if (max_votes < support_r1 || vote->votes[i][j]<support_r2)
				continue;

			// Same chromosome
			if ((vote->coverage_start[i][j] < max_start) + is_reversed == 1)
			{
				locate_gene_position(max_pos + read_len, &(global_context -> chromosome_table) , &best_chro_name, &best_chro_pos);
				locate_gene_position(vote->pos[i][j] , &(global_context -> chromosome_table), &chro_name, &chro_pos);
			}else
			{
				locate_gene_position(max_pos , &(global_context -> chromosome_table), &best_chro_name, &best_chro_pos);
				locate_gene_position(vote->pos[i][j] +read_len, &(global_context -> chromosome_table), &chro_name, &chro_pos);
			}

			if (chro_name != best_chro_name)	// The pointers can be compared because they can be the same.
				continue;

			int is_fusion = 0;

			// Different strands, or an order on the chromosome that does not
			// agree with the order on the read, marks the pair as a fusion.
			if(is_reversed != is_partner_reversed) is_fusion = 1;

			if( is_reversed && ((max_pos > vote->pos[i][j]) + (vote->coverage_start[i][j] < max_start) != 1))is_fusion = 1;
			if((! is_reversed) && ((max_pos > vote->pos[i][j]) + (vote->coverage_start[i][j] > max_start) != 1)) is_fusion = 1;

			if(abs_dist > 500000) continue;

			// abs_dist is at most 500000 here, so it fits in an int.
			int test_vote_value ;
			test_vote_value = 8888888 +  vote->votes[i][j]* 1000000 - (int)abs_dist;
			if (hint_pos>=0)
			{
				long long int hint_dist = hint_pos;
				hint_dist -= vote->pos[i][j];
				long long int abs_hint_dist = llabs(hint_dist);
				if (abs_hint_dist < 100000)
					test_vote_value += 100;
				if (abs_hint_dist < 5000)
					test_vote_value += 100;
			}

			if (test_vote_value<selected_max_votes)continue;
			// Conditions of order of R3 and R5
			*half_marks &= ~IS_REVERSED_HALVES;
			if (vote->coverage_start[i][j] < max_start && (((max_pos < vote->pos[i][j]) && !is_reversed) || ((max_pos > vote->pos[i][j]) && is_reversed) ) )
				*half_marks |= IS_REVERSED_HALVES;
			if (vote->coverage_start[i][j] >= max_end  &&  (((max_pos > vote->pos[i][j]) && !is_reversed) || ((max_pos < vote->pos[i][j]) && is_reversed) ) )
				*half_marks |= IS_REVERSED_HALVES;

			if (vote->coverage_start[i][j] < max_start)
			{
				(*half_marks) = (*half_marks) & ~IS_R1_CLOSE_TO_5;
			}
			else
			{
				(*half_marks) |= IS_R1_CLOSE_TO_5;
			}

			if(max_mask & IS_NEGATIVE_STRAND)
				*half_marks = (*half_marks) |   IS_NEGATIVE_STRAND_R1;
			else
				*half_marks = (*half_marks) &  ~IS_NEGATIVE_STRAND_R1;

			if(vote->masks[i][j] & IS_NEGATIVE_STRAND)
				*half_marks = (*half_marks) |   IS_NEGATIVE_STRAND_R2;
			else
				*half_marks = (*half_marks) &  ~IS_NEGATIVE_STRAND_R2;


			// Midpoint between the end of the half that comes first on the read
			// and the start of the half that comes second.
			best_splicing_point = ((vote->coverage_start[i][j] < max_start)? (vote->coverage_end[i][j]):(max_end)) + ((vote->coverage_start[i][j] < max_start)? (max_start):(vote->coverage_start[i][j]));


			best_splicing_point /=2;

			* best_pos1 = max_pos ;
			* best_pos2 = vote->pos[i][j] ;
			* best_vote1 = max_votes ;
			* best_vote2 = vote->votes[i][j] ;

			// Only the coverage of the anchor is reported.
			* read_coverage_start = max_start;
			* read_coverage_end = max_end;

			// Step through each recorder three entries at a time until the
			// entry at k+3 is zero (or the limit is reached), then report the
			// value stored at k+2.
			// NOTE(review): assumes the recorders hold at least
			// MAX_INDEL_TOLERANCE + 3 entries -- confirm against their declaration.
			int k;
			for(k=0; k<MAX_INDEL_TOLERANCE ; k+=3)
				if(!max_indel_recorder[k+3])break;
			* indel_in_p1 = max_indel_recorder[k+2];

			for(k=0; k<MAX_INDEL_TOLERANCE ; k+=3)
				if(!vote->indel_recorder[i][j][k+3])break;
			* indel_in_p2 = vote->indel_recorder[i][j][k+2];


			* is_reversed_halves = is_reversed;

			// NOTE(review): the score is at least 8888888 + 1000000 - 500000 at
			// this point, so this test is always true and IS_PAIRED_HINTED is
			// always set.  Kept as is to preserve the existing behaviour.
			if (test_vote_value >=100)
				*half_marks = (*half_marks) | IS_PAIRED_HINTED;
			else
				*half_marks = (*half_marks) & ~(IS_PAIRED_HINTED);

			if (is_fusion)
				*half_marks = (*half_marks)    | IS_FUSION;
			else
				*half_marks = (*half_marks) & ~( IS_FUSION);


			selected_max_votes = test_vote_value;

		}
	*best_select_max_votes = selected_max_votes ;
	return best_splicing_point;
}
4883 
4884 
4885 
/*
 * Record the coverage of the top-voted candidates into repeat_record and then
 * ask core_select_best_matching_halves_maxone() for the best partner of the
 * top-voted location of this vote table.
 *
 * Recording (only when repeated_pos_base >= 0): candidates whose position is
 * not above index_valid_range are written as three bytes each -- coverage
 * start, coverage end (both shifted right by 4 when rl > 220) and the vote
 * count in the low 7 bits with bit 0x80 set when is_negative is non-zero.
 * Candidates with votes >= max_vote are written first, then the ones with
 * exactly max_vote - 1 votes; writing stops once the cursor reaches
 * repeated_pos_base + 12.
 *
 * Selection: the score reported by the helper is increased by
 * max_vote * 1000000.  Only when the sum is above 1000000 are the outputs
 * (best_pos1/2, best_vote1/2, is_abnormal, half_marks, is_reversed_halves,
 * read_coverage_start/end, indel_in_p1/2, max_cover_start/end) filled in and
 * the helper's return value passed on; otherwise the outputs are left
 * untouched and 0 is returned.
 */
int core_select_best_matching_halves(global_context_t * global_context , gene_vote_t * vote, unsigned int * best_pos1, unsigned int * best_pos2, int * best_vote1, int * best_vote2, char * is_abnormal, short * half_marks, int * is_reversed_halves, float accept_rate, int read_len, long long int hint_pos, int tolerable_bases, short * read_coverage_start, short * read_coverage_end, char * indel_in_p1, char * indel_in_p2 , int * max_cover_start, int * max_cover_end, int rl, int repeated_pos_base, int is_negative, char * repeat_record, unsigned int index_valid_range)
{
	unsigned int anchor_pos = 0, partner_pos = 0;
	int anchor_votes = 0, partner_votes = 0, reversed_flag = 0;
	char abnormal_flag = 0;
	gene_vote_number_t anchor_indels = 0, partner_indels = 0;
	short marks = 0, cov_start = 0, cov_end = 0;
	int combined_score = -1;
	int splice_point;

	if(repeated_pos_base >= 0)
	{
		const int shift = (rl > 220)?4:0;
		const int write_limit = repeated_pos_base + 12;
		int write_at = repeated_pos_base;
		int pass, row, col;

		// Pass 0 records candidates with the maximum vote (or more),
		// pass 1 the candidates that are exactly one vote short of it.
		for(pass = 0; pass < 2; pass++)
			for(row = 0; row < GENE_VOTE_TABLE_SIZE; row++)
				for(col = 0; col < vote->items[row]; col++)
				{
					int wanted;

					if(vote->pos[row][col] > index_valid_range) continue;
					if(write_at >= write_limit) continue;

					if(pass == 0)
						wanted = (vote->votes[row][col] >= vote->max_vote);
					else
						wanted = (vote->votes[row][col] == vote->max_vote - 1);
					if(!wanted) continue;

					repeat_record[write_at] = (vote->coverage_start[row][col] >> shift);
					repeat_record[write_at + 1] = (vote->coverage_end[row][col] >> shift);
					repeat_record[write_at + 2] = (is_negative?0x80:0) | (vote->votes[row][col]&0x7f);
					write_at += 3;
				}
	}

	splice_point = core_select_best_matching_halves_maxone(global_context, vote, &anchor_pos, &partner_pos, &anchor_votes, &partner_votes, &abnormal_flag, &marks, &reversed_flag, accept_rate, read_len, hint_pos, tolerable_bases, &cov_start, &cov_end, &anchor_indels, &partner_indels, vote->max_position, vote->max_vote, vote->max_coverage_start, vote->max_coverage_end, vote->max_mask, vote->max_indel_recorder, &combined_score, rl);
	combined_score += vote->max_vote*1000000;

	// Nothing is reported unless the combined score is above the floor.
	if(combined_score <= 1000000)
		return 0;

	*best_pos1 = anchor_pos;
	*best_pos2 = partner_pos;
	*best_vote1 = anchor_votes;
	*best_vote2 = partner_votes;

	*is_reversed_halves = reversed_flag;
	*is_abnormal = abnormal_flag;
	*half_marks = marks;

	*indel_in_p1 = anchor_indels;
	*indel_in_p2 = partner_indels;

	*read_coverage_start = cov_start;
	*read_coverage_end = cov_end;

	*max_cover_start = vote->max_coverage_start;
	*max_cover_end = vote->max_coverage_end;

	return splice_point;
}
5006 
5007 
5008 
5009 #define EXON_DONOR_TEST_WINDOW 17
5010 
5011 
5012 // pos1 must be small than pos2.
core13_test_donor(char * read,int read_len,unsigned int pos1,unsigned int pos2,int guess_break_point,char negative_strand,int test_range,char is_soft_condition,int EXON_INDEL_TOLERANCE,int * real_break_point,gene_value_index_t * my_value_array_index,int indel_offset1,int indel_offset2,int is_reversed,int space_type,int * best_donor_score,int * is_GTAG)5013 int core13_test_donor(char *read, int read_len, unsigned int pos1, unsigned int pos2, int guess_break_point, char negative_strand, int test_range, char is_soft_condition, int EXON_INDEL_TOLERANCE, int* real_break_point, gene_value_index_t * my_value_array_index, int indel_offset1, int indel_offset2, int is_reversed, int space_type, int * best_donor_score, int * is_GTAG)
5014 {
5015 	int bps_pos_x;
5016 	int search_start = guess_break_point - test_range ;
5017 	int search_end   = guess_break_point + test_range ;
5018 	char h1_2ch[3], h2_2ch[3];
5019 
5020 	h1_2ch[2] = h2_2ch[2]=0;
5021 	search_start=max(10, search_start);
5022 	search_end = min(read_len-10, search_end);
5023 	int best_break = -1;
5024 	int min_x = -9099;
5025 
5026 	for (bps_pos_x = search_start; bps_pos_x < search_end ; bps_pos_x ++)
5027 	{
5028 		int paired_score = 0;
5029 		get_chro_2base(h1_2ch, my_value_array_index, pos1 - indel_offset1+ bps_pos_x , is_reversed);
5030 		get_chro_2base(h2_2ch, my_value_array_index, pos2 - 2 - indel_offset2 + bps_pos_x, is_reversed);
5031 
5032 
5033 		//if(!is_reversed)
5034 		//SUBREADprintf("C1=%s @%u, C2=%s @%u\n",h1_2ch, pos1 + bps_pos_x, h2_2ch,pos2 - 2 + indel_offset + bps_pos_x);
5035 		if(h1_2ch[0]==h2_2ch[0] && h1_2ch[1]==h2_2ch[1]) continue;
5036 
5037 		if(is_donar_chars_part(h1_2ch) && is_donar_chars_part(h2_2ch))
5038 		{
5039 
5040 			paired_score = paired_chars_part(h1_2ch, h2_2ch, is_reversed);
5041 
5042 			if(paired_score)
5043 			{
5044 				int m1, m2, x1, x2;
5045 				int break_point_half = is_reversed?(read_len - bps_pos_x):bps_pos_x;
5046 				int first_exon_end,second_half_start;
5047 				int donar_conf_len = 0;
5048 
5049 				donar_conf_len = min(break_point_half , EXON_DONOR_TEST_WINDOW);
5050 				donar_conf_len = min(read_len - break_point_half, donar_conf_len);
5051 				//SUBREADprintf("DONOR_CONF_LEN=%d\n", donar_conf_len);
5052 
5053 				if (is_reversed)
5054 				{
5055 					first_exon_end = pos2 + bps_pos_x - indel_offset2;
5056 					second_half_start = pos1 + bps_pos_x- indel_offset1;
5057 
5058 					m1 = match_chro(read + break_point_half - donar_conf_len , my_value_array_index, first_exon_end, donar_conf_len, is_reversed, space_type);
5059 					m2 = match_chro(read + break_point_half , my_value_array_index, second_half_start-donar_conf_len , donar_conf_len, is_reversed, space_type);
5060 
5061 					x1 = match_chro(read + break_point_half ,  my_value_array_index, first_exon_end - donar_conf_len, donar_conf_len , is_reversed, space_type);
5062 					x2 = match_chro(read + break_point_half - donar_conf_len ,  my_value_array_index, second_half_start , donar_conf_len, is_reversed, space_type);
5063 				}
5064 				else
5065 				{
5066 					first_exon_end = pos1 + bps_pos_x - indel_offset1;
5067 					second_half_start = pos2 + bps_pos_x - indel_offset2;
5068 
5069 					m1 = match_chro(read + break_point_half - donar_conf_len, my_value_array_index, first_exon_end-donar_conf_len , donar_conf_len, is_reversed, space_type);
5070 					m2 = match_chro(read + break_point_half , my_value_array_index, second_half_start, donar_conf_len, is_reversed, space_type);
5071 
5072 					x1 = match_chro(read + break_point_half ,  my_value_array_index, first_exon_end, donar_conf_len , is_reversed,space_type);
5073 					x2 = match_chro(read + break_point_half - donar_conf_len,  my_value_array_index, second_half_start - donar_conf_len, donar_conf_len , is_reversed,space_type);
5074 				}
5075 
5076 				#ifdef TEST_TARGET
5077 				if(memcmp(read, TEST_TARGET, 15)==0)
5078 				{
5079 					SUBREADprintf("DONOR TEST STR=%s, %s ; pos=%d    %d %d ; M=%d %d ; X=%d %d\n", h1_2ch, h2_2ch, bps_pos_x, indel_offset1, indel_offset2, m1, m2, x1, x2);
5080 				}
5081 				#endif
5082 
5083 				int threshold = 3;
5084 				if (paired_score == 1)
5085 					threshold = 3;
5086 
5087 				#ifdef QUALITY_KILL
5088 				if (m1 >= donar_conf_len-1    && m2>=donar_conf_len-1 )
5089 					if(x1<donar_conf_len - threshold  && x2<donar_conf_len- threshold )
5090 				#else
5091 				if (m1 >= donar_conf_len-1    && m2>=donar_conf_len -1)
5092 					if(x1<donar_conf_len - threshold  && x2<donar_conf_len - threshold)
5093 				#endif
5094 					{
5095 						int score =  3000-(x1 + x2) + (m1+ m2) ;
5096 						if (min_x < score)
5097 						{
5098 							min_x = score;
5099 							best_break = bps_pos_x;
5100 							*is_GTAG = 1==((is_reversed) + (h1_2ch[0]=='G' || h1_2ch[1]=='G'));	//"GT" or "AG"
5101 							//printf("FL CC=%s\tCC2=%s\tis_GTAG=%d\tREV=%d\n",h1_2ch,h2_2ch,*is_GTAG, is_reversed);
5102 							*best_donor_score = score;
5103 						}
5104 					}
5105 			}
5106 		}
5107 	}
5108 
5109 	if (best_break>0)
5110 	{
5111 				#ifdef TEST_TARGET
5112 				if(memcmp(read, TEST_TARGET, 15)==0)
5113 					SUBREADprintf("SELECRED!!!_BREAKPOINT=%d, RAW POS=%u,%u, R=%s\n",  best_break, pos1 , pos2, read);
5114 				#endif
5115 		//SUBREADprintf ("FINAL BREAK: %d   ; REV = %d\n ", best_break, is_reversed);
5116 		*real_break_point = best_break;
5117 		return 1;
5118 	}
5119 	else
5120 	{
5121 				#ifdef TEST_TARGET
5122 				if(memcmp(read, TEST_TARGET, 15)==0)
5123 					SUBREADprintf("KILLED!!!_BREAKPOINT=%d, R=%s\n",  best_break+ pos1, read);
5124 				#endif
5125 	}
5126 	return 0;
5127 }
5128 
5129 
5130 
5131 
5132 
5133 
5134 #define EXON_LARGE_WINDOW 60
5135 #define ACCEPTED_SUPPORT_RATE 0.3
5136 
core_fragile_junction_voting(global_context_t * global_context,thread_context_t * thread_context,char * rname,char * read,char * qual,unsigned int full_rl,int negative_strand,int color_space,unsigned int low_border,unsigned int high_border,gene_vote_t * vote_p1)5137 void core_fragile_junction_voting(global_context_t * global_context, thread_context_t * thread_context, char * rname, char * read, char * qual, unsigned int full_rl, int negative_strand, int color_space, unsigned int low_border, unsigned int high_border, gene_vote_t *vote_p1)
5138 {
5139 	int windows = full_rl / EXON_LARGE_WINDOW +1;
5140 	float overlap = (1.0*windows * EXON_LARGE_WINDOW - full_rl) / (windows-1);
5141 
5142 	int ww;
5143 	int window_cursor = 0;
5144 
5145 	HashTable * event_table = NULL;
5146 	chromosome_event_t * event_space = NULL;
5147 	if(thread_context)
5148 	{
5149 		event_table = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_entry_table;
5150 		event_space = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
5151 	}
5152 	else
5153 	{
5154 		event_table = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_entry_table;
5155 		event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
5156 	}
5157 
5158 	int GENE_SLIDING_STEP = global_context->current_index -> index_gap;
5159 
5160 
5161 	for(ww=0; ww<windows;ww++)
5162 	{
5163 		window_cursor = (int)(ww * EXON_LARGE_WINDOW - ww * overlap);
5164 		int read_len = EXON_LARGE_WINDOW;
5165 		if(ww == windows-1)
5166 			read_len = full_rl -window_cursor;
5167 
5168 		float subread_step = 3.00001;
5169 		int i;
5170 		int subread_no;
5171 		char * InBuff;
5172 		InBuff = read + window_cursor;
5173 		char tmp_char = InBuff[read_len];
5174 		InBuff[read_len] = 0;
5175 
5176 		init_gene_vote(vote_p1);
5177 		for(subread_no=0; ; subread_no++)
5178 		{
5179 			int subread_offset1 = (int)(subread_step * (subread_no+1));
5180 			subread_offset1 -= subread_offset1%GENE_SLIDING_STEP;
5181 			subread_offset1 += GENE_SLIDING_STEP-1;
5182 
5183 			for(i=0; i<GENE_SLIDING_STEP ; i++)
5184 			{
5185 				int subread_offset = (int)(subread_step * subread_no);
5186 				subread_offset -= subread_offset%GENE_SLIDING_STEP -i;
5187 
5188 				char * subread_string = InBuff + subread_offset;
5189 				gehash_key_t subread_integer = genekey2int(subread_string, color_space);
5190 
5191 				gehash_go_q(global_context->current_index, subread_integer , subread_offset, read_len,negative_strand, vote_p1, 5, subread_no,  low_border, high_border - read_len);
5192 			}
5193 			if(subread_offset1 >= read_len -16)
5194 				break;
5195 		}
5196 
5197 		int ii, jj, kk;
5198 		for(ii = 0; ii < GENE_VOTE_TABLE_SIZE; ii++) {
5199 			for(jj = 0; jj < vote_p1 -> items[ii] ; jj++) {
5200 				if(vote_p1 -> votes[ii][jj] < vote_p1 -> max_vote) continue;
5201 
5202 				gene_vote_number_t * indel_recorder = vote_p1 -> indel_recorder[ii][jj];
5203 				unsigned int voting_position =  vote_p1 -> pos[ii][jj];
5204 				int last_indel = 0, last_correct_subread=0;
5205 
5206 				for(kk =0; indel_recorder[kk]  && (kk < MAX_INDEL_SECTIONS); kk+=3){
5207 					char movement_buffer[MAX_READ_LENGTH * 10 / 7];
5208 					//chromosome_event_t * last_event = NULL;
5209 					int last_event_id = -1;
5210 
5211 					int indels = indel_recorder[kk+2] - last_indel;
5212 					if(indels==0) continue;
5213 
5214 					int next_correct_subread = indel_recorder[kk] -1;
5215 
5216 					int last_correct_base = find_subread_end(read_len, global_context->config.total_subreads , last_correct_subread) - 9;
5217 					int first_correct_base = find_subread_end(read_len, global_context->config.total_subreads , next_correct_subread) - 16 + 9;
5218 					first_correct_base = min(first_correct_base+10, read_len);
5219 					last_correct_base = max(0, last_correct_base);
5220 					last_correct_base = min(read_len-1, last_correct_base);
5221 
5222 					int x1, dyna_steps;
5223 
5224 					dyna_steps = core_dynamic_align(global_context, thread_context, InBuff + last_correct_base, first_correct_base - last_correct_base, voting_position + last_correct_base + last_indel, movement_buffer, indels, rname);
5225 
5226 					movement_buffer[dyna_steps]=0;
5227 
5228 					if(0 && strcmp("MISEQ:13:000000000-A1H1M:1:1112:12194:5511", rname) == 0)
5229 					{
5230 						SUBREADprintf("IR= %d  %d~%d\n", dyna_steps, last_correct_base, first_correct_base);
5231 
5232 						for(x1=0; x1<dyna_steps;x1++)
5233 						{
5234 							int mc, mv=movement_buffer[x1];
5235 							if(mv==0)mc='=';
5236 							else if(mv==1)mc='D';
5237 							else if(mv==2)mc='I';
5238 							else mc='X';
5239 							SUBREADprintf("%c",mc);
5240 						}
5241 						SUBREADputs("");
5242 					}
5243 					unsigned int cursor_on_chromosome = voting_position + last_correct_base + last_indel, cursor_on_read = last_correct_base;
5244 					int last_mv = 0;
5245 					unsigned int indel_left_boundary = 0;
5246 					int is_in_indel = 0, current_indel_len = 0, total_mismatch = 0;
5247 
5248 					for(x1=0; x1<dyna_steps;x1++)
5249 					{
5250 						int mv=movement_buffer[x1];
5251 						if(mv==3) total_mismatch++;
5252 					}
5253 
5254 					if(total_mismatch<2 || (global_context->config.maximise_sensitivity_indel && total_mismatch <= 2 ))
5255 						for(x1=0; x1<dyna_steps;x1++)
5256 						{
5257 							int mv=movement_buffer[x1];
5258 
5259 							if(last_mv != mv)
5260 							{
5261 								if( ( mv==1 || mv==2 ) && ! is_in_indel)
5262 								{
5263 									indel_left_boundary = cursor_on_chromosome;
5264 									is_in_indel = 1;
5265 									current_indel_len = 0;
5266 								}
5267 								else if ( is_in_indel && (mv == 0 || mv == 3)  )
5268 								{
5269 									gene_value_index_t * current_value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
5270 									int ambiguous_i, ambiguous_count=0;
5271 									int best_matched_bases = match_chro(InBuff + cursor_on_read - 6, current_value_index, indel_left_boundary - 6, 6, 0, global_context->config.space_type)  +
5272 												 match_chro(InBuff + cursor_on_read - min(current_indel_len,0), current_value_index, indel_left_boundary + max(0, current_indel_len), 6, 0, global_context->config.space_type);
5273 									for(ambiguous_i=-5; ambiguous_i<=5; ambiguous_i++)
5274 									{
5275 										int left_match = match_chro(InBuff + cursor_on_read - 6, current_value_index, indel_left_boundary - 6, 6+ambiguous_i, 0, global_context->config.space_type);
5276 										int right_match = match_chro(InBuff + cursor_on_read + ambiguous_i - min(current_indel_len,0), current_value_index, indel_left_boundary + ambiguous_i + max(0, current_indel_len), 6-ambiguous_i, 0,global_context->config.space_type);
5277 										if(left_match+right_match == best_matched_bases) ambiguous_count ++;
5278 									}
5279 
5280 									if(0 && strcmp("MISEQ:13:000000000-A1H1M:1:1112:12194:5511", rname) == 0)
5281 										SUBREADprintf("INDEL_DDADD: abs(I=%d); INDELS=%d; LOC=%u\n",i, current_indel_len, indel_left_boundary-1);
5282 									if(abs(current_indel_len)<=global_context -> config.max_indel_length)
5283 									{
5284 										chromosome_event_t * new_event = local_add_indel_event(global_context, thread_context, event_table, InBuff + cursor_on_read + min(0,current_indel_len), indel_left_boundary - 1, current_indel_len, 1, ambiguous_count, 0, NULL);
5285 										if(last_event_id >=0 && new_event){
5286 											// the event space can be changed when the new event is added. the location is updated everytime.
5287 											chromosome_event_t * event_space = NULL;
5288 											if(thread_context)
5289 												event_space = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
5290 											else
5291 												event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
5292 											chromosome_event_t * last_event = event_space + last_event_id;
5293 
5294 											int dist = new_event -> event_small_side - last_event -> event_large_side +1;
5295 
5296 											new_event -> connected_previous_event_distance = dist;
5297 											last_event -> connected_next_event_distance = dist;
5298 										}
5299 
5300 										if (new_event)
5301 											last_event_id = new_event -> global_event_id;
5302 										else	last_event_id = -1;
5303 									}
5304 								}
5305 
5306 
5307 								if(mv == 0 || mv == 3)
5308 									is_in_indel = 0;
5309 							}
5310 
5311 							if(is_in_indel && mv == 1)
5312 								current_indel_len += 1;
5313 							if(is_in_indel && mv == 2)
5314 								current_indel_len -= 1;
5315 
5316 							if(mv == 1 || mv == 3 || mv == 0) cursor_on_chromosome++;
5317 							if(mv == 2 || mv == 3 || mv == 0) cursor_on_read++;
5318 
5319 							last_mv = mv;
5320 						}
5321 					 last_correct_subread = indel_recorder[i+1]-1;
5322 				}
5323 
5324 			}
5325 		}
5326 
5327 
5328 
5329 		if(1)
5330 		{
5331 			finalise_vote(vote_p1);
5332 			select_best_vote(vote_p1);
5333 			//print_votes(vote_p1, global_context -> config.index_prefix);
5334 			unsigned int best_pos1=0;
5335 			unsigned int best_pos2=0;
5336 			int best_vote1=0;
5337 			int best_vote2=0;
5338 			char is_abnormal=0;
5339 			short half_marks=0;
5340 			int is_reversed_halves=0, max_cover_start=0, max_cover_end=0;
5341 			char indel_in_p1=0, indel_in_p2=0;
5342 			short read_coverage_start =0, read_coverage_end=0;
5343 			gene_value_index_t * base_index = thread_context?thread_context->current_value_index:global_context->current_value_index ;
5344 
5345 			int splice_point = core_select_best_matching_halves(global_context, vote_p1, &best_pos1, &best_pos2, &best_vote1, &best_vote2, &is_abnormal ,&half_marks, &is_reversed_halves, ACCEPTED_SUPPORT_RATE, read_len, -1,  0, &read_coverage_start, &read_coverage_end, &indel_in_p1, &indel_in_p2, &max_cover_start, &max_cover_end, read_len, -1 , 0, NULL , 0xffffffff);
5346 
5347 			//SUBREADprintf("RN=%s , WINDOW = %d ~ %d , SP=%d;  BV=%d;  BV2=%d\n", rname , window_cursor , window_cursor + read_len , splice_point, best_vote1, best_vote2);
5348 			if (splice_point>0 && best_vote1 >= 1 && best_vote2>=1)
5349 			{
5350 				int test_real_break_point = -1, test_donor_score=-1;
5351 				int is_GTAG = 0;
5352 				int is_accepted = core13_test_donor(InBuff, read_len, min(best_pos1, best_pos2), max(best_pos1,best_pos2), splice_point, negative_strand, read_len/4, 0, 5, &test_real_break_point, base_index, 0, 0, negative_strand, color_space, &test_donor_score, &is_GTAG);
5353 
5354 				if (is_accepted ){
5355 					unsigned int pos_small = min(test_real_break_point+ best_pos1,  test_real_break_point+ best_pos2) - 1;
5356 					unsigned int pos_big = max(test_real_break_point+ best_pos1,  test_real_break_point+ best_pos2);
5357 
5358 					int event_no;
5359 					chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
5360 					chromosome_event_t * found = NULL;
5361 
5362 					int found_events = search_event(global_context, event_table, event_space, pos_small , EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_JUNCTION|CHRO_EVENT_TYPE_FUSION, search_return);
5363 
5364 					if(found_events)
5365 					{
5366 						int kx1;
5367 						for(kx1 = 0; kx1 < found_events ; kx1++)
5368 						{
5369 							if(search_return[kx1] -> event_large_side == pos_big)
5370 							{
5371 								found = search_return[kx1];
5372 								break;
5373 							}
5374 						}
5375 					}
5376 
5377 					if(found) found -> supporting_reads ++;
5378 					else
5379 					{
5380 						if(thread_context)
5381 							event_no = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> total_events ++;
5382 						else
5383 							event_no = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) ->  total_events ++;
5384 
5385 						event_space = reallocate_event_space(global_context, thread_context, event_no);
5386 
5387 						chromosome_event_t * new_event = event_space+event_no;
5388 						memset(new_event,0,sizeof(chromosome_event_t));
5389 						new_event -> event_small_side = pos_small;
5390 						new_event -> event_large_side = pos_big;
5391 
5392 						new_event -> is_negative_strand= !is_GTAG;
5393 						new_event -> event_type = CHRO_EVENT_TYPE_JUNCTION;
5394 
5395 						new_event -> supporting_reads = 1;
5396 						new_event -> indel_length = 0;
5397 
5398 						put_new_event(event_table, new_event , event_no);
5399 			//			SUBREADprintf("ADD JUNCTION BY FRAGILE, %d-%d\n", pos_small, pos_big);
5400 					}
5401 
5402 				}
5403 
5404 			}
5405 		}
5406 		InBuff[read_len] = tmp_char;
5407 	}
5408 }
5409 
5410 
/*
 * Walks every entry of a fragment list, looks up the alignment record of the
 * listed read and of its mate, and converts both positions to text.
 *
 * Each list entry is decoded as (fragment number = entry / 2, read bit = entry % 2).
 * The SUBREADprintf call that used to report the decoded values is disabled, so
 * the function currently produces no output.
 */
void print_frags(global_context_t * global_context, fragment_list_t * fls){
	int frag_i = 0;

	while(frag_i < fls -> fragments){
		int this_is_B = fls -> fragment_numbers[frag_i] % 2;
		subread_read_number_t frag_no = fls -> fragment_numbers[frag_i] / 2;
		char this_pos_text[100];
		char mate_pos_text[100];

		mapping_result_t * this_res = _global_retrieve_alignment_ptr(global_context, frag_no, this_is_B, 0);
		mapping_result_t * mate_res = _global_retrieve_alignment_ptr(global_context, frag_no, !this_is_B, 0);

		absoffset_to_posstr(global_context, this_res -> selected_position, this_pos_text);
		absoffset_to_posstr(global_context, mate_res -> selected_position, mate_pos_text);

		int this_negative = (this_res -> result_flags & CORE_IS_NEGATIVE_STRAND) != 0;
		int mate_negative = (mate_res -> result_flags & CORE_IS_NEGATIVE_STRAND) != 0;

		// Whichever of the two records belongs to the second read has its strand flag inverted.
		if(this_is_B)
			this_negative = !this_negative;
		else
			mate_negative = !mate_negative;

		// The "TRALOG: READ ... MATE ..." trace that consumed these values is switched off.
		(void) this_negative;
		(void) mate_negative;

		frag_i++;
	}
}
5435 
5436 // fragnos_paired_B = B_fragment_no * 2 + is_mate_b   (is_mate_b points the mate that has the location in locations_mate_B)
5437 // fragnos_paired_C = C_fragment_no * 2 + is_mate_c   (is_mate_c points the mate that has the location in locations_mate_C)
5438 //
5439 // locations_mate_B and locations_mate_C are the locations where the sequence is moved to. I.e., locations_mate_B and locations_mate_C are far far away from fragment A.
5440 //
/*
 * Pairs the fragments of listB with the fragments of listC.
 *
 * Every entry of listB / listC is decoded as (fragment number = entry / 2,
 * read bit = entry % 2). For each B entry the "other" read (read bit inverted)
 * is compared with the "other" read of every C entry that has not been paired
 * yet. A C entry is eligible when
 *   - the two "other" reads have different orientation flags (the strand flag
 *     is inverted when the read bit is 0),
 *   - C's "other" read lies to the right of B's "other" read, and
 *   - their distance is below config.maximum_translocation_length.
 * Among the eligible entries the closest one is chosen; each C entry is used
 * at most once.
 *
 * For the k-th pair (0-based) the function stores
 *   fragnos_paired_B[k] = B fragment number * 2 + !read_bit,
 *   locations_mate_B[k] = position of B's "other" read,
 *   fragnos_paired_C[k] = C fragment number * 2 + read_bit,
 *   locations_mate_C[k] = position of C's "other" read.
 * NOTE(review): the read bit is inverted for B but not for C; this mirrors the
 * original code -- confirm that the asymmetry is intended.
 *
 * When at least one pair was found, the three guessed_* outputs receive the
 * averages (sum / number of pairs) of the per-pair break-point estimates;
 * otherwise they are left untouched.
 *
 * Returns the number of pairs (0 when a list is empty or memory is exhausted).
 */
int find_translocation_BC_mates(global_context_t * global_context, mapping_result_t * res_A1, mapping_result_t * res_A2, fragment_list_t * listB, fragment_list_t * listC, int is_INV, unsigned long long * fragnos_paired_B, unsigned long long * fragnos_paired_C, unsigned int * locations_mate_B, unsigned int * locations_mate_C,unsigned int  * guessed_brkP_small_sum, unsigned int * guessed_moved_length_sum , unsigned int * guessed_brkQ_small_sum){

	int ret = 0, xk1, xk2;
	long long tmp_guessed_brkP_small_sum = 0, tmp_guessed_moved_length_sum = 0, tmp_guessed_brkQ_small_sum = 0;

	// Nothing can be paired with an empty list; this also avoids a zero-sized allocation.
	if(listB->fragments < 1 || listC->fragments < 1) return 0;

	char * is_C_used = calloc(listC->fragments, sizeof *is_C_used);
	if(!is_C_used) return 0;	// out of memory: report "no pairs" instead of writing through NULL

	for(xk1 = 0; xk1 < listB->fragments; xk1++)
	{
		long long minimum_mate_distance = 0x7fffffff;
		int minimum_xk2 = -1;

		// res_Ba: the read named by the list entry; res_Bb: the other read of the same fragment.
		mapping_result_t res_Ba_body, res_Bb_body;
		mapping_result_t * res_Ba = &res_Ba_body, * res_Bb = &res_Bb_body;

		// Scratch records, overwritten for every C candidate.
		mapping_result_t res_Ca_body, res_Cc_body;
		mapping_result_t * res_Ca = &res_Ca_body, * res_Cc = &res_Cc_body;

		// Snapshot of the records of the best candidate so far. The scratch records
		// above hold the LAST candidate examined once the scan finishes, so the gap
		// estimates below must not read them.
		mapping_result_t best_Ca_body, best_Cc_body;
		mapping_result_t * best_Ca = &best_Ca_body, * best_Cc = &best_Cc_body;

		subread_read_number_t B_read_no = listB->fragment_numbers[xk1]/2;
		int B_read_is_b = listB->fragment_numbers[xk1]%2;

		bigtable_readonly_result(global_context, NULL, B_read_no, 0, !B_read_is_b, res_Bb, NULL);
		bigtable_readonly_result(global_context, NULL, B_read_no, 0, B_read_is_b, res_Ba, NULL);

		// Orientation of B's other read; it does not depend on the C candidate.
		int is_meta_B_negative = (res_Bb -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
		if(!B_read_is_b) is_meta_B_negative = !is_meta_B_negative;

		for(xk2 = 0; xk2 < listC->fragments; xk2++)
		{
			if(is_C_used[xk2]) continue;

			subread_read_number_t C_read_no = listC->fragment_numbers[xk2]/2;
			int C_read_is_b = listC->fragment_numbers[xk2]%2;

			bigtable_readonly_result(global_context, NULL, C_read_no, 0, !C_read_is_b, res_Cc, NULL);
			bigtable_readonly_result(global_context, NULL, C_read_no, 0, C_read_is_b, res_Ca, NULL);

			int is_meta_C_negative = (res_Cc -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
			if(!C_read_is_b) is_meta_C_negative = !is_meta_C_negative;

			//SUBREADprintf("TRALOG: MATES : B[%d] = %u (%c); C[%d] = %u (%c)\n", xk1, res_Bb -> selected_position, is_meta_B_negative?'N':'P' , xk2, res_Cc  -> selected_position, is_meta_C_negative?'N':'P');

			// The length limit compares C against B. The previous code subtracted C from
			// itself, which made the limit a no-op.
			if(is_meta_B_negative != is_meta_C_negative &&
			   res_Bb -> selected_position < res_Cc -> selected_position &&
			   res_Cc -> selected_position - res_Bb -> selected_position < global_context -> config.maximum_translocation_length &&
			   res_Cc -> selected_position - res_Bb -> selected_position <  minimum_mate_distance)
			{
				minimum_mate_distance = res_Cc -> selected_position - res_Bb -> selected_position;
				minimum_xk2 = xk2;
				best_Ca_body = res_Ca_body;
				best_Cc_body = res_Cc_body;
			}
		}

		// read B has a mate of C[minimum xk2] if there is one.
		if(minimum_xk2>=0)
		{
			subread_read_number_t C_mate_fno = listC -> fragment_numbers[minimum_xk2] / 2;
			int C_mate_is_b = listC -> fragment_numbers[minimum_xk2] % 2;

			fragnos_paired_B[ret] = (B_read_no*2)+(!B_read_is_b);
			locations_mate_B[ret] = res_Bb -> selected_position;

			fragnos_paired_C[ret] = (C_mate_fno*2)+(C_mate_is_b);
			locations_mate_C[ret] = best_Cc -> selected_position;

			is_C_used[minimum_xk2] = 1;

			int gapA, gapB, gapC;

			if(is_INV){
				gapA = best_Ca -> selected_position - res_A1 -> selected_position - res_A1 -> read_length;
				gapB = res_A2 -> selected_position - res_Ba -> selected_position - res_Ba -> read_length;
				gapC = best_Cc -> selected_position - res_Bb -> selected_position - res_Bb -> read_length;
			}else{
				gapA = res_Ba -> selected_position - res_A1 -> selected_position - res_A1 -> read_length;
				gapB = res_A2 -> selected_position - best_Ca -> selected_position - best_Ca -> read_length;
				gapC = best_Cc -> selected_position - res_Bb -> selected_position - res_Bb -> read_length;
			}

			// Each estimate places the break point in the middle of the uncovered gap.
			tmp_guessed_brkP_small_sum += res_A1 -> selected_position + res_A1 -> read_length + gapA/2;
			tmp_guessed_moved_length_sum += res_A2 -> selected_position - res_A1 -> selected_position - res_A1 -> read_length - gapB/2 + gapA/2;
			tmp_guessed_brkQ_small_sum += res_Bb -> selected_position + res_Bb -> read_length + gapC/2;

			ret ++;
		}
	}

	free(is_C_used);

	if(ret>0){
		*guessed_brkP_small_sum= tmp_guessed_brkP_small_sum / ret;
		*guessed_moved_length_sum = tmp_guessed_moved_length_sum/ ret;
		*guessed_brkQ_small_sum = tmp_guessed_brkQ_small_sum / ret;
	}

	return ret;
}
5546 
5547 
5548 // This function sees if all the mates of read B_x and C_y are at the same location.
5549 // If mates of B_x and C_y spread on a large region, it is usually unreliable.
// posesB and posesC are linear absolute positions of the mate reads.
/*
 * Tells whether the first PEmates positions of posesB and posesC, taken
 * together, fall inside a window narrower than twice
 * config.maximum_pair_distance.
 *
 * Returns 1 when they do, 0 when they do not or when PEmates is below 1.
 */
int find_translocation_BC_conformation(global_context_t * global_context, int PEmates, unsigned int  * posesB, unsigned int * posesC){

	unsigned int lowest = 0xffffffff, highest = 0;
	int mate_i;

	if(PEmates<1) return 0;

	for(mate_i = 0; mate_i < PEmates; mate_i++)
	{
		unsigned int both[2];
		int side;

		both[0] = posesB[mate_i];
		both[1] = posesC[mate_i];

		// Stretch the running [lowest, highest] range over both positions of this pair.
		for(side = 0; side < 2; side++)
		{
			if(both[side] < lowest) lowest = both[side];
			if(both[side] > highest) highest = both[side];
		}
	}

	return (highest - lowest< 2*global_context -> config.maximum_pair_distance) ? 1 : 0;
}
5568 
5569 
// fliB and fliC are : frag_[BC]_no * 2 + is_Read_b_close_to_BreakPoint_P
/*
 * Counts how many fragments of fliB and fliC agree with the three break-point
 * events brkPno (source), brkQno and brkRno (target).
 *
 * Every list entry is decoded as (fragment number = entry / 2, bit = entry % 2);
 * the bit selects the read that is compared with the source-side coordinate,
 * the other read of the fragment is compared with the target-side coordinate.
 * A fragment is a supporter when both reads start less than
 * config.maximum_pair_distance away from their expected coordinate. Which edge
 * of the source / target is expected depends on the list (B or C) and on isInv.
 *
 * Returns non-zero when both lists have at least one supporter and, for each
 * list, supporters + 2 is at least half of the list length.
 */
int breakpoint_PQR_supported(global_context_t * global_context , unsigned int brkPno , unsigned int brkQno, unsigned int brkRno, fragment_list_t * fliB, fragment_list_t * fliC, int isInv){
	int fli_i;
	int isFliB, nSupB=0, nSupC=0;
	unsigned int source_small, source_large, target_smallQ, target_largeQ, target_smallR, target_largeR, target_large, target_small;

	// The event coordinates do not depend on the fragment, so they are read once.
	get_event_two_coordinates(global_context, brkPno, NULL, NULL, &source_small, NULL, NULL, &source_large);
	get_event_two_coordinates(global_context, brkQno, NULL, NULL, &target_smallQ, NULL, NULL, &target_largeQ);
	get_event_two_coordinates(global_context, brkRno, NULL, NULL, &target_smallR, NULL, NULL, &target_largeR);

	// Signed 64-bit difference: the unsigned form "target_smallR - TOLERANCE" wraps
	// around when the coordinate is smaller than the tolerance.
	long long small_QR_gap = (long long)target_smallQ - (long long)target_smallR;

	if(llabs(small_QR_gap) <= BREAK_POINT_MAXIMUM_TOLERANCE)
	{
		//target_smallQ is target, target_smallR is target
		target_large = target_smallR;
		target_small = target_smallQ;
	}else{
		//target_largeQ is target, target_largeR is target
		target_large = target_largeQ;
		target_small = target_largeR;
	}

	for(isFliB = 0; isFliB < 2; isFliB++){
		fragment_list_t * fli = isFliB?fliB:fliC;
		int * nSup = isFliB?&nSupB:&nSupC;
		// fliB => support source_small ~ target_large if inv, or source_small ~ target_small if !inv
		// fliC => support source_large ~ target_small if inv, or source_large ~ target_large if !inv

		unsigned int P_pos = isInv?( isFliB?source_large:source_small ):( isFliB?source_small:source_large );
		unsigned int Q_pos = isInv?( isFliB?target_large:target_small ):( isFliB?target_small:target_large );

		// the read that is close to BreakPoint_P should support source, the other read should support target
		for(fli_i = 0; fli_i < fli -> fragments; fli_i ++){
			subread_read_number_t frag_BC_no = fli -> fragment_numbers[fli_i]/2;
			int is_Read_b_close_to_BreakPoint_P = fli -> fragment_numbers[fli_i]%2;

			mapping_result_t res_BC_close_P_body, res_BC_close_Q_body;
			mapping_result_t * res_BC_close_P = &res_BC_close_P_body, * res_BC_close_Q = & res_BC_close_Q_body;

			bigtable_readonly_result(global_context, NULL, frag_BC_no, 0, is_Read_b_close_to_BreakPoint_P, res_BC_close_P, NULL);
			bigtable_readonly_result(global_context, NULL, frag_BC_no, 0, !is_Read_b_close_to_BreakPoint_P, res_BC_close_Q, NULL);

			SUBREADprintf("TRALOG: PQR_TARGET P=%u~%u; Q=%u~%u, R=%u~%u ; Ppos=%u, Qpos=%u, Pread=%u, Qread=%u on %s\n", source_small, source_large, target_smallQ, target_largeQ, target_smallR, target_largeR,  P_pos, Q_pos, res_BC_close_P -> selected_position, res_BC_close_Q -> selected_position, isInv?"INV":"STR");

			// llabs, not abs: the distances are 64-bit and abs() would truncate them to int.
			long long dist;
			dist = res_BC_close_P -> selected_position;
			dist -= P_pos;
			if(llabs(dist) < global_context -> config.maximum_pair_distance){
				dist = res_BC_close_Q -> selected_position;
				dist -= Q_pos;
				if(llabs(dist) < global_context -> config.maximum_pair_distance)
					(*nSup)++;
			}
		}
	}
	//return nSupB + 1 >= fliB -> fragments/2 && nSupC + 1 >= fliC-> fragments/2 ;
	SUBREADprintf("TRALOG: PQR_NSUP: B=%d, C=%d on %s\n", nSupB, nSupC, isInv?"INV":"STR");
	return nSupB > 0 && nSupC > 0 && nSupB + 2 >= fliB->fragments / 2 && nSupC + 2 >= fliC->fragments / 2;
}
5632 
// fragnoD1_mates and fragnoD2_mates are potential E reads 1/2.
5634 // D1: D's small read; D2: D's large read
5635 // E2 ~ D2
5636 // E1 ~ D1
5637 // E2.start > Y.large
5638 // E1.start > Y.small
5639 
/*
 * Counts, separately for the D1 and the D2 mate lists, the fragments whose two
 * reads sit right after the two edges of event brkYno.
 *
 * Every list entry is decoded as (fragment number = entry / 2, flag = entry % 2).
 * The two reads of a fragment are ordered by position; the flag, XOR-ed with
 * "this is the D2 list", decides which of them is checked against the small
 * edge and which against the large edge. A fragment counts when both offsets
 * (read position minus edge) are above -8 and below
 * config.maximum_pair_distance.
 *
 * brkZno is not used by this function.
 *
 * Returns non-zero when both lists have at least one such fragment and, for
 * each list, count + 2 is at least half of the list length.
 */
int breakpoint_YZ_supported(global_context_t * global_context, unsigned int brkYno, unsigned int brkZno, unsigned long long * fragnoD1_mates, int fragnoD1len, unsigned long long * fragnoD2_mates, int fragnoD2len){
	unsigned int edge_small, edge_large;
	int supporters[2] = {0, 0};	// [0]: D1 mates, [1]: D2 mates
	int list_id;

	get_event_two_coordinates(global_context, brkYno, NULL, NULL, &edge_small, NULL, NULL, &edge_large);

	for(list_id = 0; list_id < 2; list_id++){
		unsigned long long * entries;
		int entry_count, entry_i;

		if(list_id){
			entries = fragnoD2_mates;
			entry_count = fragnoD2len;
		}else{
			entries = fragnoD1_mates;
			entry_count = fragnoD1len;
		}

		for(entry_i = 0; entry_i < entry_count; entry_i++){
			subread_read_number_t frag_no = entries[entry_i] / 2;
			int far_flag = entries[entry_i] % 2;

			mapping_result_t read_a, read_b;
			mapping_result_t * lower, * upper, * for_small_edge, * for_large_edge;

			bigtable_readonly_result(global_context, NULL, frag_no, 0, 0, &read_a, NULL);
			bigtable_readonly_result(global_context, NULL, frag_no, 0, 1, &read_b, NULL);

			// Order the two reads by position; on a tie read_a is the lower one.
			if(read_a.selected_position > read_b.selected_position){
				lower = &read_b;
				upper = &read_a;
			}else{
				lower = &read_a;
				upper = &read_b;
			}

			if(list_id ^ far_flag){
				for_small_edge = upper;
				for_large_edge = lower;
			}else{
				for_small_edge = lower;
				for_large_edge = upper;
			}

			long long off_small = (long long) for_small_edge -> selected_position - (long long) edge_small;
			long long off_large = (long long) for_large_edge -> selected_position - (long long) edge_large;

			//SUBREADprintf("INVLOG: Dist_SM=%lld, Dist_LA=%lld\n", off_small, off_large);

			if(off_small > -8 && off_small <  global_context -> config.maximum_pair_distance &&
			   off_large > -8 && off_large <  global_context -> config.maximum_pair_distance)
				supporters[list_id] ++;
		}
	}

	//SUBREADprintf("INVLOG: breakpoint_YZ_supported nSupD1=%d >= %d,  nSupD2=%d >= %d\n", supporters[0], fragnoD1len, supporters[1], fragnoD2len);
	return supporters[0] > 0 && supporters[1] > 0 && supporters[0] + 2 >= fragnoD1len / 2 && supporters[1] + 2 >= fragnoD2len / 2;
}
5692 
5693 #define _PQR_LIST_SIZE 48
5694 
/*
 * Searches the break-point tables for a triple of events (P, Q, R) that
 * describes one translocation (isInv == 0) or inverted translocation
 * (isInv != 0) around the read pair resA1 / resA2.
 *
 *   P: looked up in breakpoint_table_P within config.maximum_pair_distance of
 *      resA1; kept only when its small side is near resA1 and its large side
 *      is near resA2.
 *   Q: looked up in breakpoint_table_QR around one side of P (large side when
 *      isInv, small side otherwise); one of its sides must be within
 *      BREAK_POINT_MAXIMUM_TOLERANCE of that side of P.
 *   R: looked up in breakpoint_table_QR around the other side of P; its far
 *      side must be within BREAK_POINT_MAXIMUM_TOLERANCE of Q's far side.
 * Q and R are additionally filtered on their small/large
 * "increasing coordinate" flags.
 *
 * The lookup tables hand back event numbers stored in pointer-typed slots.
 *
 * Outputs: *is_cand_P_found receives the number of P candidates kept; on
 * success *brkPno, *brkQno and *brkRno receive the event numbers of the first
 * matching triple. fliB and fliC are not used.
 *
 * Returns 1 when a triple was found, 0 otherwise.
 */
int find_translocation_brk_PQR(global_context_t * global_context, mapping_result_t * resA1, mapping_result_t * resA2, fragment_list_t * fliB, fragment_list_t * fliC, unsigned int * brkPno,   unsigned int *  brkQno,  unsigned int *  brkRno, int isInv, unsigned int * is_cand_P_found)
{
	unsigned int event_pos_list_A1[_PQR_LIST_SIZE];
	void * event_ptr_list_A1[_PQR_LIST_SIZE];

	char * chroA=NULL;
	int posA1=0;

	locate_gene_position(resA1 -> selected_position,  &global_context -> chromosome_table, &chroA, &posA1);

	int candA1i;
	int candA1Number = bktable_lookup(&global_context -> breakpoint_table_P, chroA, posA1, global_context -> config.maximum_pair_distance , event_pos_list_A1, event_ptr_list_A1, _PQR_LIST_SIZE);
	indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
	int candBrkPi , candBrkPNumber=0;

	//SUBREADprintf("A FOUND %d P ", candA1Number);

	for(candA1i = 0; candA1i < candA1Number ; candA1i++){
		// The slot holds an event number, not an address; convert it through size_t
		// instead of relying on "void * - NULL" arithmetic (a GNU extension).
		unsigned int event_no = (unsigned int)(size_t)event_ptr_list_A1[candA1i];
		chromosome_event_t * event_body = indel_context -> event_space_dynamic + event_no;

		long long small_dist = event_body -> event_small_side, large_dist = event_body -> event_large_side;
		small_dist -= resA1 -> selected_position;
		large_dist -= resA2 -> selected_position;

		// Keep the candidate, compacting the list in place so that the first
		// candBrkPNumber slots are the usable ones. Without this step the counter
		// stayed 0 and the search below never ran.
		// NOTE(review): the acceptance window (both sides within maximum_pair_distance)
		// is reconstructed from the analogous test in breakpoint_PQR_supported --
		// confirm against the intended criterion.
		if(llabs(small_dist) < global_context -> config.maximum_pair_distance &&
		   llabs(large_dist) < global_context -> config.maximum_pair_distance)
			event_ptr_list_A1[candBrkPNumber++] = event_ptr_list_A1[candA1i];
	}

	//SUBREADprintf(", (%d may be used)\n", candBrkPNumber);

	*is_cand_P_found = candBrkPNumber;

	for(candBrkPi = 0; candBrkPi < candBrkPNumber; candBrkPi++){
		unsigned int event_no_P = (unsigned int)(size_t)event_ptr_list_A1[candBrkPi];
		chromosome_event_t * event_body_P = indel_context -> event_space_dynamic + event_no_P;

		unsigned int anchor_for_brkQ = isInv?event_body_P -> event_large_side:event_body_P -> event_small_side;
		unsigned int anchor_for_brkR = isInv?event_body_P -> event_small_side:event_body_P -> event_large_side;

		unsigned int event_pos_list_Q[_PQR_LIST_SIZE];
		void * event_ptr_list_Q[_PQR_LIST_SIZE];

		unsigned int event_pos_list_R[_PQR_LIST_SIZE];
		void * event_ptr_list_R[_PQR_LIST_SIZE];

		char * charAncQ = NULL, * charAncR = NULL;
		int posAncQ=0, posAncR = 0;
		locate_gene_position(anchor_for_brkQ, &global_context -> chromosome_table, &charAncQ, &posAncQ);
		locate_gene_position(anchor_for_brkR, &global_context -> chromosome_table, &charAncR, &posAncR);

		int candQi, candQnumber = bktable_lookup(&global_context -> breakpoint_table_QR, charAncQ, posAncQ - BREAK_POINT_MAXIMUM_TOLERANCE , 2* BREAK_POINT_MAXIMUM_TOLERANCE , event_pos_list_Q, event_ptr_list_Q, _PQR_LIST_SIZE);
		int candRi, candRnumber = bktable_lookup(&global_context -> breakpoint_table_QR, charAncR, posAncR - BREAK_POINT_MAXIMUM_TOLERANCE , 2* BREAK_POINT_MAXIMUM_TOLERANCE , event_pos_list_R, event_ptr_list_R, _PQR_LIST_SIZE);

		SUBREADprintf("P [%s] FOUND %d Q AT %s:%u and %d R AT %s:%u\n", isInv?"INV":"STR", candQnumber, charAncQ, posAncQ, candRnumber, charAncR, posAncR);

		for(candQi = 0 ; candQi < candQnumber ; candQi++){
			unsigned int event_no_Q = (unsigned int)(size_t)event_ptr_list_Q[candQi];
			chromosome_event_t * event_body_Q = indel_context -> event_space_dynamic + event_no_Q;

			long long cand_Q_small_dist = event_body_Q -> event_small_side;
			cand_Q_small_dist -= anchor_for_brkQ;

			// llabs, not abs: the distance is 64-bit and abs() would truncate it to int.
			int is_Q_small_side_close_to_P = llabs(cand_Q_small_dist) <= BREAK_POINT_MAXIMUM_TOLERANCE;

			SUBREADprintf("Q: SMALL_CLOSE_P = %d, DIR = %c %c\n", is_Q_small_side_close_to_P,  event_body_Q -> small_side_increasing_coordinate?'>':'<', event_body_Q -> large_side_increasing_coordinate?'>':'<');

			if(  is_Q_small_side_close_to_P  && event_body_Q -> large_side_increasing_coordinate == 1) continue;  // the large side is the target location.
			if((!is_Q_small_side_close_to_P) && event_body_Q -> small_side_increasing_coordinate == 1) continue;  // the small side is the target location.

			if(  isInv  && event_body_Q -> large_side_increasing_coordinate != event_body_Q -> small_side_increasing_coordinate) continue;
			if((!isInv) && event_body_Q -> large_side_increasing_coordinate == event_body_Q -> small_side_increasing_coordinate) continue;

			for(candRi = 0 ; candRi < candRnumber ; candRi++){
				unsigned int event_no_R = (unsigned int)(size_t)event_ptr_list_R[candRi];
				chromosome_event_t * event_body_R = indel_context -> event_space_dynamic + event_no_R;

				srInt_64 cand_R_dist_to_Q = is_Q_small_side_close_to_P?event_body_Q -> event_large_side:event_body_Q -> event_small_side;
				cand_R_dist_to_Q -= is_Q_small_side_close_to_P?event_body_R -> event_large_side:event_body_R-> event_small_side;

				SUBREADprintf("R: candDist=%lld, DIR = %c %c\n", (long long)cand_R_dist_to_Q,  event_body_Q -> small_side_increasing_coordinate?'>':'<', event_body_Q -> large_side_increasing_coordinate?'>':'<');

				if(llabs((long long)cand_R_dist_to_Q) > BREAK_POINT_MAXIMUM_TOLERANCE) continue;
				int is_R_small_side_close_to_P = is_Q_small_side_close_to_P;

				if(  is_R_small_side_close_to_P  && !event_body_R -> large_side_increasing_coordinate) continue;
				if(!(is_R_small_side_close_to_P) && !event_body_R -> small_side_increasing_coordinate) continue;

				if(  isInv  && event_body_R -> large_side_increasing_coordinate != event_body_R -> small_side_increasing_coordinate) continue;
				if(!(isInv) && event_body_R -> large_side_increasing_coordinate == event_body_R -> small_side_increasing_coordinate) continue;

				// First consistent triple wins.
				(*brkPno) = event_no_P;
				(*brkQno) = event_no_Q;
				(*brkRno) = event_no_R;
				return 1;
			}
		}
	}

	return 0;
}
5795 
5796 
/*
 * Reports the two sides of event number event_no, taken from the indel
 * module's event_space_dynamic array.
 *
 * small_abs / large_abs, when not NULL, receive the event's small / large side
 * as stored. A chromosome name and position are produced for a side, via
 * locate_gene_position, only when BOTH the name and the position pointer for
 * that side are not NULL.
 */
void get_event_two_coordinates(global_context_t * global_context, unsigned int event_no, char ** small_chro, int * small_pos, unsigned int * small_abs, char ** large_chro, int * large_pos, unsigned int * large_abs){

	chromosome_event_t * ev = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic + event_no;

	if(small_abs != NULL)
		*small_abs = ev -> event_small_side;

	if(large_abs != NULL)
		*large_abs = ev -> event_large_side;

	if(small_chro != NULL && small_pos != NULL){
		locate_gene_position(ev -> event_small_side,  &global_context -> chromosome_table, small_chro, small_pos);
	}

	if(large_chro != NULL && large_pos != NULL){
		locate_gene_position(ev -> event_large_side,  &global_context -> chromosome_table, large_chro, large_pos);
	}
}
5810 
5811 
/*
 * Records an imprecisely located translocation in translocation_result_table.
 *
 * The table is searched around guessed_P_small (BREAK_POINT_MAXIMUM_TOLERANCE
 * before it, window of twice that size). If an entry exists with the same
 * is_inv value whose target_left_side and length are both less than
 * BREAK_POINT_MAXIMUM_TOLERANCE away from guessed_Q_small / guessed_tra_len,
 * that entry is updated: all_sup_P is incremented and max_sup_QR is raised to
 * paired_BC_reads if that is larger. Otherwise a new entry with
 * is_precisely_called = 0 and all_sup_P = 1 is allocated and appended at the
 * chromosome position of guessed_P_small.
 */
void create_or_update_translocation_imprecise_result(global_context_t * global_context , unsigned int guessed_P_small, unsigned int guessed_tra_len, unsigned int guessed_Q_small , int paired_BC_reads, int isInv){

	char * brkPchr = NULL;
	int brkPsmall = 0;
	void * trans_old_ptrs [_PQR_LIST_SIZE];
	unsigned int trans_old_poses [_PQR_LIST_SIZE];

	locate_gene_position(guessed_P_small,  &global_context -> chromosome_table, &brkPchr, &brkPsmall);

	int old_res_i, old_res_number = bktable_lookup(&global_context -> translocation_result_table, brkPchr, brkPsmall - BREAK_POINT_MAXIMUM_TOLERANCE, 2*BREAK_POINT_MAXIMUM_TOLERANCE, trans_old_poses, trans_old_ptrs, _PQR_LIST_SIZE);
	for(old_res_i = 0; old_res_i < old_res_number; old_res_i++){
		translocation_result_t * old_res = (translocation_result_t * )trans_old_ptrs[old_res_i];

		// llabs, not abs: the differences are 64-bit and abs() would truncate them to int.
		long long target_dist = old_res -> target_left_side;
		target_dist -= guessed_Q_small;
		if(llabs(target_dist) >= BREAK_POINT_MAXIMUM_TOLERANCE || isInv != old_res -> is_inv) continue;

		long long length_dist = old_res -> length;
		length_dist -= guessed_tra_len;
		if(llabs(length_dist) >= BREAK_POINT_MAXIMUM_TOLERANCE) continue;

		// Same event seen again: add the support and stop.
		old_res -> all_sup_P ++;
		old_res -> max_sup_QR = max(old_res -> max_sup_QR , paired_BC_reads);
		return;
	}

	// No matching entry: create a zero-filled one.
	translocation_result_t * new_res = calloc(1, sizeof *new_res);
	if(!new_res){
		SUBREADprintf("ERROR: unable to allocate memory for a translocation result.\n");
		return;
	}

	new_res -> target_left_side = guessed_Q_small;
	new_res -> length = guessed_tra_len;
	new_res -> source_left_side = guessed_P_small;
	new_res -> is_precisely_called = 0;
	new_res -> all_sup_P = 1;
	new_res -> max_sup_QR = paired_BC_reads;
	new_res -> is_inv = isInv;

	bktable_append(&global_context -> translocation_result_table,brkPchr, brkPsmall, new_res);
}
5855 
/*
 * Records a precisely-called translocation whose three breakpoints (P, Q, R)
 * were all resolved, or adds support to a matching record that is already in
 * translocation_result_table.
 *
 * Parameters:
 *   global_context         - supplies the event coordinates and the result table.
 *   brkPno, brkQno, brkRno - event numbers handed to get_event_two_coordinates().
 *   paired_BC_reads        - number of supporting PE mates; a matching record
 *                            keeps the largest value seen in max_sup_QR.
 *   isInv                  - non-zero for an "INV" event, zero for "STR".
 *
 * A stored record matches when it is returned by the table lookup around the
 * small side of P, carries the same is_inv value, and both its
 * target_left_side and its length differ from the new values by less than
 * BREAK_POINT_MAXIMUM_TOLERANCE.
 */
void create_or_update_translocation_result(global_context_t * global_context , unsigned int brkPno, unsigned int brkQno, unsigned int brkRno , int paired_BC_reads, int isInv){

	char *brkPchr, *brkQchr, *tmpchr;
	int brkPsmall, brkPlarge, brkQsmall, tmpint;
	unsigned int brkPabs_small, brkQabs_small, brkRabs_small, brkRabs_large, brkQabs_large;

	SUBREADprintf("\nTRALOG: FINALLY_CONFIRMED: %s ; %d PE_MATES\n", isInv?"INV":"STR", paired_BC_reads);

	get_event_two_coordinates(global_context, brkPno, &brkPchr, &brkPsmall, &brkPabs_small,  &tmpchr, &brkPlarge, NULL);
	get_event_two_coordinates(global_context, brkQno, &brkQchr, &brkQsmall, &brkQabs_small,  &tmpchr, &tmpint, &brkQabs_large);
	get_event_two_coordinates(global_context, brkRno, NULL, NULL, &brkRabs_small,  NULL, NULL, &brkRabs_large);

	SUBREADprintf("TRARES: %s:%u (len=%d) => %s:%u   (Coor: last_base_before)\n", brkPchr, brkPsmall, brkPlarge - brkPsmall - 1, brkQchr, brkQsmall);

	void * trans_old_ptrs [_PQR_LIST_SIZE];
	unsigned int trans_old_poses [_PQR_LIST_SIZE];

	unsigned int new_target_left_side, new_length;

	// Compare the small sides of Q and R as a signed 64-bit distance: the
	// unsigned form "R - tolerance" wraps around when R is close to zero.
	long long qr_small_dist = brkQabs_small;
	qr_small_dist -= brkRabs_small;

	if(llabs(qr_small_dist) <= BREAK_POINT_MAXIMUM_TOLERANCE)
	{
		// Q small and R large are target
		new_target_left_side = brkQabs_small;
	} else{
		// Q large and R small are target
		new_target_left_side = brkQabs_large;
	}

	new_length = brkPlarge - brkPsmall - 1;

	int is_trans_found = 0, old_res_i, old_res_number = bktable_lookup(&global_context -> translocation_result_table, brkPchr, brkPsmall - BREAK_POINT_MAXIMUM_TOLERANCE, 2*BREAK_POINT_MAXIMUM_TOLERANCE, trans_old_poses, trans_old_ptrs, _PQR_LIST_SIZE);
	for(old_res_i = 0; old_res_i < old_res_number; old_res_i++){
		translocation_result_t * old_res = (translocation_result_t * )trans_old_ptrs[old_res_i];

		// The distances are long long, so llabs() is required: abs() would
		// first truncate them to int and could accept far-apart targets.
		long long target_dist = old_res -> target_left_side;
		target_dist -= new_target_left_side;

		if(llabs(target_dist) >= BREAK_POINT_MAXIMUM_TOLERANCE || isInv != old_res -> is_inv)
			continue;

		long long length_dist = old_res -> length;
		length_dist -= new_length;

		if(llabs(length_dist) < BREAK_POINT_MAXIMUM_TOLERANCE){
			old_res -> all_sup_P ++;
			old_res -> max_sup_QR = max(old_res -> max_sup_QR , paired_BC_reads);
			is_trans_found = 1;
			break;
		}
	}

	if(is_trans_found) return;

	// calloc() gives the zero-filled record that malloc() + memset() produced.
	translocation_result_t * new_res = calloc(1, sizeof *new_res);
	if(NULL == new_res){
		SUBREADprintf("ERROR: out of memory when recording a translocation result.\n");
		return;
	}

	new_res -> target_left_side = new_target_left_side;
	new_res -> length = new_length;
	new_res -> source_left_side = brkPabs_small;
	new_res -> is_precisely_called = 1;
	new_res -> event_P_number = brkPno;
	new_res -> event_Q_number = brkQno;
	new_res -> event_R_number = brkRno;
	new_res -> all_sup_P = 1;
	new_res -> max_sup_QR = paired_BC_reads;
	new_res -> is_inv = isInv;

	bktable_append(&global_context -> translocation_result_table,brkPchr, brkPsmall, new_res);
}
5924 
5925 
/*
 * Walks every fragment in funky_list_A and tries to call a translocation
 * from it.
 *
 * For each "A" fragment:
 *   1. both reads are fetched and ordered by selected_position (q_res_1 is
 *      the smaller one);
 *   2. funky_table_BC is searched around each read, and every hit is sorted
 *      into one of four lists (INV_B / INV_C / STR_B / STR_C) according to the
 *      strands of the hit and of its mate;
 *   3. for the INV lists and then for the STR lists, PE mates are counted,
 *      breakpoints P/Q/R are searched and checked for support, and either a
 *      precise or an imprecise translocation result is created or updated.
 */
void finalise_translocations(global_context_t * global_context){

	void ** s1_ptrs = malloc(sizeof(void *) * S12_LIST_CAPACITY);
	void ** s2_ptrs = malloc(sizeof(void *) * S12_LIST_CAPACITY);

	unsigned int * s1_poses = malloc(sizeof(unsigned int) * S12_LIST_CAPACITY);
	unsigned int * s2_poses = malloc(sizeof(unsigned int) * S12_LIST_CAPACITY);

	unsigned long long * s1_selected_list = malloc(sizeof(unsigned long long) * S12_LIST_CAPACITY);	// fragment_no * 2 + is_second_read
	unsigned long long * s2_selected_list = malloc(sizeof(unsigned long long) * S12_LIST_CAPACITY);

	if(!s1_ptrs || !s2_ptrs || !s1_poses || !s2_poses || !s1_selected_list || !s2_selected_list){
		SUBREADprintf("ERROR: out of memory when finalising translocations.\n");
		free(s1_ptrs);
		free(s2_ptrs);
		free(s1_poses);
		free(s2_poses);
		free(s1_selected_list);
		free(s2_selected_list);
		return;
	}

	int frag_Q_larger_read;
	subread_read_number_t frag_A_i;

	for(frag_A_i = 0; frag_A_i < global_context -> funky_list_A.fragments; frag_A_i ++){
		fragment_list_t fli_STR_B, fli_STR_C, fli_INV_B, fli_INV_C;

		fraglist_init(&fli_STR_B);
		fraglist_init(&fli_STR_C);
		fraglist_init(&fli_INV_B);
		fraglist_init(&fli_INV_C);

		subread_read_number_t frag_A_no = global_context -> funky_list_A.fragment_numbers[frag_A_i];

		mapping_result_t q_res_A_body, q_res_B_body;

		mapping_result_t * q_res_A = &q_res_A_body;
		mapping_result_t * q_res_B = &q_res_B_body;

		bigtable_readonly_result(global_context, NULL, frag_A_no, 0, 0, q_res_A, NULL);
		bigtable_readonly_result(global_context, NULL, frag_A_no, 0, 1, q_res_B, NULL);

		// q_res_1 is the read with the smaller coordinate; ties keep A first.
		mapping_result_t * q_res_1 = q_res_A -> selected_position >  q_res_B -> selected_position?q_res_B:q_res_A;
		mapping_result_t * q_res_2 = q_res_A -> selected_position <= q_res_B -> selected_position?q_res_B:q_res_A;

		/***************************************************************************************************
		 *
		 *  is_q1_negative and is_q2_negative describe the strandness of the original FASTQ read sequence.
		 *
		 *  For the very normal mappings, is_q1_negative must be 0 and is_q2_negative must be 1.
		 *
		 *  If is_q1_negative != is_q2_negative, then there is a strand-jumping fusion between the two reads.
		 */

		int is_q1_negative = (q_res_1 -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
		int is_q2_negative = (q_res_2 -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;

		if(q_res_B == q_res_1)is_q1_negative=!is_q1_negative;
		if(q_res_B == q_res_2)is_q2_negative=!is_q2_negative;

		long long dist = q_res_A ->selected_position;
		dist -= q_res_B->selected_position;

		// llabs(): dist is long long, abs() would truncate it to int first.
		if( llabs(dist) < 1000 && !(is_q1_negative == 0 && is_q2_negative == 1))
		{
			SUBREADprintf("TRALOG: STRANDNESS_BUG %08llu\n", frag_A_no);
		}


		for(frag_Q_larger_read = 0; frag_Q_larger_read < 2; frag_Q_larger_read++){
			void ** s_ptrs = frag_Q_larger_read?s2_ptrs:s1_ptrs;
			unsigned int * s_poses = frag_Q_larger_read?s2_poses:s1_poses;
			int q_res_offset = 0;
			mapping_result_t * q_res = frag_Q_larger_read?q_res_2:q_res_1;

			char * q_res_chro = NULL;
			locate_gene_position(q_res -> selected_position,  &global_context -> chromosome_table, &q_res_chro, &q_res_offset);
			q_res_offset +=1 ; // all tables are one-based.

			// Clamp the window start at zero instead of letting it wrap.
			unsigned int q_search_start = q_res_offset;
			if(q_search_start > FUNKY_COLOCATION_TOLERANCE) q_search_start -= FUNKY_COLOCATION_TOLERANCE;
			else q_search_start = 0;

			int cand_i, candidate_s_items = bktable_lookup(&global_context -> funky_table_BC, q_res_chro, q_search_start, 2*FUNKY_COLOCATION_TOLERANCE, s_poses, s_ptrs, S12_LIST_CAPACITY);

			// scan if candidate is reversed.
			// s_ptrs - NULL is fragment_no * 2 + is_read_B.
			for(cand_i = 0; cand_i < candidate_s_items; cand_i ++){
				subread_read_number_t frag_S_no = (s_ptrs[cand_i] - NULL)/ 2;
				int frag_S_is_read_B = (s_ptrs[cand_i] - NULL) % 2;

				mapping_result_t read_S_res_body, mate_S_res_body;
				mapping_result_t * read_S_res = &read_S_res_body;
				mapping_result_t * mate_S_res = &mate_S_res_body;

				bigtable_readonly_result(global_context, NULL, frag_S_no, 0, frag_S_is_read_B, read_S_res, NULL);
				bigtable_readonly_result(global_context, NULL, frag_S_no, 0, !frag_S_is_read_B, mate_S_res, NULL);

				int is_read_S_negative = (read_S_res -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
				int is_mate_S_negative = (mate_S_res -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;

				// Whichever of the two is the second read in the pair gets its flag flipped.
				if(frag_S_is_read_B) is_read_S_negative = !is_read_S_negative;
				else is_mate_S_negative = !is_mate_S_negative;

				int is_INV_TRA = is_mate_S_negative == is_read_S_negative;

				// Only hits whose strand is the opposite of frag_Q_larger_read are kept.
				if(is_read_S_negative != !frag_Q_larger_read) continue;

				if(is_INV_TRA){
					if(frag_Q_larger_read)
						fraglist_append(&fli_INV_B, frag_S_no * 2 + frag_S_is_read_B);
					else
						fraglist_append(&fli_INV_C, frag_S_no * 2 + frag_S_is_read_B);
				}else{
					if(frag_Q_larger_read)
						fraglist_append(&fli_STR_C, frag_S_no * 2 + frag_S_is_read_B);
					else
						fraglist_append(&fli_STR_B, frag_S_no * 2 + frag_S_is_read_B);
				}
			}
		}

		// Zero-initialised so no indeterminate value can reach the imprecise-result call.
		unsigned int guesed_p_small = 0, guessed_tra_length = 0, guessed_q_small = 0, is_brkP_cand_found = 0;

		if(fli_INV_B.fragments >= 1 && fli_INV_C.fragments >= 1){
			int PEmates = find_translocation_BC_mates(global_context, q_res_1, q_res_2, &fli_INV_B, &fli_INV_C, 1, s1_selected_list, s2_selected_list, s1_poses, s2_poses, &guesed_p_small, &guessed_tra_length, &guessed_q_small);
			int ConformPE = find_translocation_BC_conformation(global_context, PEmates, s1_poses, s2_poses);
			int brkPQR_are_found = 0;
			unsigned int brkPno, brkQno, brkRno;

			char out1pos[100], out2pos[100];
			absoffset_to_posstr(global_context, q_res_1 -> selected_position, out1pos);
			absoffset_to_posstr(global_context, q_res_2 -> selected_position, out2pos);
			SUBREADprintf("TRALOG: A_READ: %09llu: INV : %s ~ %s ; %d PE_MATES (%s)\n", frag_A_no, out1pos, out2pos, PEmates, ConformPE?"CONFORMABLE":"INCONSISTENT");

			if(PEmates)
				brkPQR_are_found = find_translocation_brk_PQR(global_context, q_res_1, q_res_2, &fli_INV_B, &fli_INV_C, &brkPno, &brkQno, &brkRno, 1, &is_brkP_cand_found);

			if(brkPQR_are_found){
				brkPQR_are_found = breakpoint_PQR_supported(global_context , brkPno , brkQno, brkRno, &fli_INV_B, &fli_INV_C, 1);
				SUBREADprintf("TRALOG: A_READ: INV BRK_PQR_SUPPED=%d\n", brkPQR_are_found);
			}
			if(brkPQR_are_found)
				create_or_update_translocation_result( global_context , brkPno, brkQno, brkRno , PEmates, 1);
			else if(ConformPE && fli_INV_B.fragments > 2 && fli_INV_C.fragments > 2 && is_brkP_cand_found)
				create_or_update_translocation_imprecise_result(global_context, guesed_p_small, guessed_tra_length, guessed_q_small, PEmates, 1);
		}

		if(fli_STR_B.fragments >= 1 && fli_STR_C.fragments >= 1){
			// The flag is only written through find_translocation_brk_PQR(), which is
			// skipped when PEmates is zero; reset it so a candidate flagged during the
			// INV pass above cannot leak into the STR decision.
			is_brkP_cand_found = 0;

			int PEmates = find_translocation_BC_mates(global_context, q_res_1, q_res_2, &fli_STR_B, &fli_STR_C, 0, s1_selected_list, s2_selected_list, s1_poses, s2_poses, &guesed_p_small, &guessed_tra_length, &guessed_q_small);
			int ConformPE = find_translocation_BC_conformation(global_context, PEmates, s1_poses, s2_poses);

			char out1pos[100], out2pos[100];
			absoffset_to_posstr(global_context, q_res_1 -> selected_position, out1pos);
			absoffset_to_posstr(global_context, q_res_2 -> selected_position, out2pos);

			SUBREADprintf("TRALOG: A_READ: %09llu: TRA : %s ~ %s ; %d PE_MATES (%s)\n", frag_A_no, out1pos, out2pos, PEmates, ConformPE?"CONFORMABLE":"INCONSISTENT");

			int brkPQR_are_found = 0;
			unsigned int brkPno, brkQno, brkRno;

			if(PEmates)
				brkPQR_are_found = find_translocation_brk_PQR(global_context, q_res_1, q_res_2, &fli_STR_B, &fli_STR_C, &brkPno, &brkQno, &brkRno, 0, &is_brkP_cand_found);

			if(brkPQR_are_found){
				brkPQR_are_found = breakpoint_PQR_supported(global_context , brkPno , brkQno, brkRno, &fli_STR_B, &fli_STR_C, 0);
			}

			if(brkPQR_are_found)
				create_or_update_translocation_result( global_context , brkPno, brkQno, brkRno , PEmates, 0);
			// The imprecise STR call must be gated on the STR lists; the previous
			// code tested the INV lists here (copy-paste from the block above).
			else if(ConformPE && fli_STR_B.fragments > 2 && fli_STR_C.fragments > 2 && is_brkP_cand_found)
				create_or_update_translocation_imprecise_result(global_context, guesed_p_small, guessed_tra_length, guessed_q_small, PEmates, 0);
		}

		fraglist_destroy(&fli_STR_B);
		fraglist_destroy(&fli_STR_C);
		fraglist_destroy(&fli_INV_B);
		fraglist_destroy(&fli_INV_C);
	}

	free(s1_ptrs);
	free(s2_ptrs);
	free(s1_poses);
	free(s2_poses);
	free(s1_selected_list);
	free(s2_selected_list);

}
6124 
/*
 * Walks every fragment in funky_list_DE and tries to call an inversion from it.
 *
 * For each fragment whose two reads are both on the non-negative strand
 * (a "D" fragment):
 *   1. funky_table_DE is searched around each read for other fragments whose
 *      two reads are both negative ("E" fragments); hits near the smaller
 *      read go to the s1 lists, hits near the larger read go to the s2 lists;
 *   2. E fragments present in both lists are used to guess the inversion
 *      boundaries from the gaps between the D and E reads;
 *   3. breakpoint_table_YZ is searched for a Y/Z breakpoint pair, which is
 *      then checked with breakpoint_YZ_supported();
 *   4. the call is merged into, or appended to, inversion_result_table.
 */
void finalise_inversions(global_context_t * global_context){
	subread_read_number_t frag_A_i;
	int frag_Q_larger_read, xk1, xk2;

	void ** s1_ptrs = malloc(sizeof(void *) * S12_LIST_CAPACITY);
	void ** s2_ptrs = malloc(sizeof(void *) * S12_LIST_CAPACITY);

	unsigned int * s1_poses = malloc(sizeof(unsigned int) * S12_LIST_CAPACITY);
	unsigned int * s2_poses = malloc(sizeof(unsigned int) * S12_LIST_CAPACITY);

	unsigned long long * s1_selected_list = malloc(sizeof(unsigned long long) * S12_LIST_CAPACITY);	// fragment_no * 2 + is_second_read
	unsigned long long * s2_selected_list = malloc(sizeof(unsigned long long) * S12_LIST_CAPACITY);

	// The mapping results of the selected E reads are stored BY VALUE. They are
	// read into block-scoped locals below, so keeping pointers to those locals
	// (as was done before) left dangling pointers once the block was left.
	mapping_result_t * s1_result_list = malloc(sizeof(mapping_result_t) * S12_LIST_CAPACITY);
	mapping_result_t * s2_result_list = malloc(sizeof(mapping_result_t) * S12_LIST_CAPACITY);

	if(!s1_ptrs || !s2_ptrs || !s1_poses || !s2_poses || !s1_selected_list || !s2_selected_list || !s1_result_list || !s2_result_list){
		SUBREADprintf("ERROR: out of memory when finalising inversions.\n");
		free(s1_result_list);
		free(s2_result_list);
		free(s1_ptrs);
		free(s2_ptrs);
		free(s1_poses);
		free(s2_poses);
		free(s1_selected_list);
		free(s2_selected_list);
		return;
	}

	for(frag_A_i = 0; frag_A_i < global_context -> funky_list_DE.fragments; frag_A_i ++){
		int s1_list_items = 0, s2_list_items = 0;

		subread_read_number_t frag_A_no = global_context -> funky_list_DE.fragment_numbers[frag_A_i];

		mapping_result_t q_res_A_body, q_res_B_body;

		mapping_result_t * q_res_A = &q_res_A_body, * q_res_B = &q_res_B_body;

		bigtable_readonly_result(global_context, NULL, frag_A_no, 0, 0, q_res_A, NULL);
		bigtable_readonly_result(global_context, NULL, frag_A_no, 0, 1, q_res_B, NULL);

		// q_res_1 is the read with the smaller coordinate; ties keep A first.
		mapping_result_t * q_res_1 = q_res_A -> selected_position >  q_res_B -> selected_position?q_res_B:q_res_A;
		mapping_result_t * q_res_2 = q_res_A -> selected_position <= q_res_B -> selected_position?q_res_B:q_res_A;


		/***************************************************************************************************
		 *
		 *  is_q1_negative and is_q2_negative describe the strandness of the original FASTQ read sequence.
		 *
		 *  For the very normal mappings, is_q1_negative must be 0 and is_q2_negative must be 1.
		 *
		 *  If is_q1_negative != is_q2_negative, then there is a strand-jumping fusion between the two reads.
		 */

		int is_q1_negative = (q_res_1 -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
		int is_q2_negative = (q_res_2 -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;

		if(q_res_B == q_res_1)is_q1_negative=!is_q1_negative;
		if(q_res_B == q_res_2)is_q2_negative=!is_q2_negative;

		if(is_q1_negative == 0 && is_q2_negative == 0)	// D READ
		{
			for(frag_Q_larger_read = 0; frag_Q_larger_read < 2; frag_Q_larger_read++){
				int * s_list_items = frag_Q_larger_read?&s2_list_items:&s1_list_items;
				void ** s_ptrs = frag_Q_larger_read?s2_ptrs:s1_ptrs;
				unsigned int * s_poses = frag_Q_larger_read?s2_poses:s1_poses;
				int q_res_offset = 0;
				mapping_result_t * q_res = frag_Q_larger_read?q_res_2:q_res_1;
				unsigned long long * s_selected_list = frag_Q_larger_read?s2_selected_list:s1_selected_list;
				mapping_result_t * s_result_list = frag_Q_larger_read?s2_result_list:s1_result_list;


				char * q_res_chro = NULL;
				locate_gene_position(q_res -> selected_position,  &global_context -> chromosome_table, &q_res_chro, &q_res_offset);
				q_res_offset +=1 ; // all tables are one-based.

				// Clamp the window start at zero instead of letting it wrap.
				unsigned int q_search_start = q_res_offset;
				if(q_search_start > FUNKY_COLOCATION_TOLERANCE) q_search_start -= FUNKY_COLOCATION_TOLERANCE;
				else q_search_start = 0;

				int cand_i, candidate_s_items = bktable_lookup(&global_context -> funky_table_DE, q_res_chro, q_search_start, 2*FUNKY_COLOCATION_TOLERANCE, s_poses, s_ptrs, S12_LIST_CAPACITY);
				// scan if candidate is reversed.
				// s_ptrs - NULL is fragment_no * 2 + is_larger_read.
				for(cand_i = 0; cand_i < candidate_s_items; cand_i ++){
					subread_read_number_t frag_S_no = (s_ptrs[cand_i] - NULL)/2;
					int frag_S_larger_read = (s_ptrs[cand_i] - NULL)%2;

					if(frag_S_no == frag_A_no) continue;
					if(frag_S_larger_read != frag_Q_larger_read) continue;

					mapping_result_t res_S_A_body, res_S_B_body;
					mapping_result_t * res_S_A = &res_S_A_body , * res_S_B = &res_S_B_body;

					bigtable_readonly_result(global_context, NULL, frag_S_no, 0, 0, res_S_A, NULL);
					bigtable_readonly_result(global_context, NULL, frag_S_no, 0, 1, res_S_B, NULL);

					mapping_result_t * res_S_1 = res_S_A -> selected_position >  res_S_B -> selected_position?res_S_B:res_S_A;
					mapping_result_t * res_S_2 = res_S_A -> selected_position <= res_S_B -> selected_position?res_S_B:res_S_A;

					mapping_result_t * co_located_S_res = frag_S_larger_read?res_S_2:res_S_1;

					int is_s1_negative = (res_S_1 -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
					int is_s2_negative = (res_S_2 -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;

					if(res_S_B == res_S_1) is_s1_negative = !is_s1_negative;
					if(res_S_B == res_S_2) is_s2_negative = !is_s2_negative;


					if( is_s1_negative != 0 && is_s2_negative != 0 ){	// E READ
						s_selected_list[*s_list_items] = frag_S_no * 2 + frag_S_larger_read;
						// Copy the struct: res_S_?_body goes out of scope at the end of this iteration.
						s_result_list[*s_list_items] = *co_located_S_res;
						(*s_list_items)++;
					}
				}
			}
		}

		int found_INV_frags = 0;
		srInt_64 guessed_Z_large_abs_sum = 0, guessed_Y_small_abs_sum = 0;

		for(xk1 = 0; xk1 < s1_list_items; xk1++){
			for(xk2 = 0; xk2 < s2_list_items ; xk2 ++){
				if(s1_selected_list[xk1]/2 == s2_selected_list[xk2]/2)
				{
					found_INV_frags ++;
					// now there is only one D fragment. here we found the E fragment for it (E fragment is in s1[xk1] and s2[xk2])
					// s1 is the E read that is close to D_1;  s2 is the E read that is close to D_2;   D_1 is the D read with smaller coordinate.
					// res_E1 is the read that is close to D_2; mapping location of E_1 should be larger than D_2

					mapping_result_t * res_D1 = q_res_1;
					mapping_result_t * res_D2 = q_res_2;

					mapping_result_t * res_E1 = s2_result_list + xk2;
					mapping_result_t * res_E2 = s1_result_list + xk1;

					int Gap_a_length = res_E2 -> selected_position - res_D1 -> selected_position - res_D1 -> read_length;
					int Gap_b_length = res_E1 -> selected_position - res_D2 -> selected_position - res_D2 -> read_length;
					int average_gap_len = (Gap_b_length + Gap_a_length)/2;
					guessed_Y_small_abs_sum += res_D1 -> selected_position + res_D1 -> read_length - average_gap_len / 2;
					guessed_Z_large_abs_sum += res_E1 -> selected_position - average_gap_len / 2;
					SUBREADprintf("INVLOG: GUESSED_LEN = %d + %d / 2 = %d\n", Gap_a_length, Gap_b_length, average_gap_len);
				}
			}
		}

		// 0xffffffff marks "no breakpoint chosen yet".
		unsigned int brkYno=0xffffffff, brkZno=0xffffffff;
		int cand_YZ_breakpoints = 0;
		if(found_INV_frags > 0)
		{
			char * q_small_chro = NULL;
			int q_small_pos = 0;

			// Turn the sums into averages over the matched E fragments.
			guessed_Y_small_abs_sum /= found_INV_frags;
			guessed_Z_large_abs_sum /= found_INV_frags;
			SUBREADprintf("INVLOG: GUESSED_YZ=%lld, %lld\n", guessed_Y_small_abs_sum, guessed_Z_large_abs_sum);

			locate_gene_position(q_res_1 -> selected_position,  &global_context -> chromosome_table, &q_small_chro, &q_small_pos);
			int cand_Y, cand_Z;
			// s1_poses / s1_ptrs are reused here as scratch space for the breakpoint hits.
			cand_YZ_breakpoints = bktable_lookup(&global_context -> breakpoint_table_YZ, q_small_chro, q_small_pos, global_context -> config.maximum_pair_distance , s1_poses, s1_ptrs, S12_LIST_CAPACITY);

			indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];

			for(cand_Y = 0; cand_Y < cand_YZ_breakpoints ; cand_Y ++){
				// Stop as soon as a Y/Z pair has been chosen.
				if(brkYno < 0xffffffff) break;

				int event_no_Y = s1_ptrs[cand_Y] - NULL;
				chromosome_event_t * event_body_Y = indel_context -> event_space_dynamic + event_no_Y;

				// Y must have both sides flagged as non-increasing.
				if(event_body_Y -> small_side_increasing_coordinate) continue;
				if(event_body_Y -> small_side_increasing_coordinate != event_body_Y -> large_side_increasing_coordinate)
					assert(0);

				// Signed 64-bit distance; abs() on the raw difference would work on a truncated int.
				long long dist_Y_large_to_q2 = event_body_Y -> event_large_side;
				dist_Y_large_to_q2 -= q_res_2 -> selected_position;

				if(llabs(dist_Y_large_to_q2) < global_context -> config.maximum_pair_distance){

					for(cand_Z = 0; cand_Z < cand_YZ_breakpoints ; cand_Z ++){
						int event_no_Z = s1_ptrs[cand_Z] - NULL;
						chromosome_event_t * event_body_Z = indel_context -> event_space_dynamic + event_no_Z;

						// Z must have both sides flagged as increasing.
						if(!event_body_Z -> small_side_increasing_coordinate) continue;
						if(event_body_Z -> small_side_increasing_coordinate != event_body_Z -> large_side_increasing_coordinate)
							assert(0);

						long long dist_small = event_body_Z -> event_small_side , dist_large = event_body_Z -> event_large_side;
						dist_small -= event_body_Y -> event_small_side;
						dist_large -= event_body_Y -> event_large_side;

						long long dist_small_large_diff = dist_small;
						dist_small_large_diff -= dist_large;

						if(llabs(dist_small_large_diff)  <= BREAK_POINT_MAXIMUM_TOLERANCE && llabs(dist_large) <= BREAK_POINT_MAXIMUM_TOLERANCE && event_body_Z -> small_side_increasing_coordinate != event_body_Y -> small_side_increasing_coordinate){

							brkYno = event_no_Y;
							brkZno = event_no_Z;

							break;
						}
					}

					{
						char outpos1[100], outpos2[100];
						absoffset_to_posstr(global_context, event_body_Y -> event_small_side, outpos1);
						absoffset_to_posstr(global_context, event_body_Y -> event_large_side, outpos2);

						SUBREADprintf("INVLOG: %09llu FOUND BREAKPOINT YZ: %s ~ %s, INC_COR: %c %c , nSUP=%d\n", frag_A_no, outpos1, outpos2, event_body_Y -> small_side_increasing_coordinate?'>':'<', event_body_Y -> large_side_increasing_coordinate?'>':'<' , event_body_Y -> final_counted_reads);

					}

				}
			}
		}


		char *brkYchr = "NULL";
		unsigned int brkYabs_small = 0, brkYabs_large = 0;
		int brkYsmall = 0, brkYlarge = 0;
		int is_precisely_called = 0, is_roughly_called = 0;
		if(brkYno < 0xffffffff){
			// s1_selected_list : 2 * fragment_S_no + frag_S_larger_read
			int is_passed_YZ = breakpoint_YZ_supported(global_context, brkYno, brkZno, s1_selected_list, s1_list_items, s2_selected_list, s2_list_items);
			if(is_passed_YZ)
			{
				is_precisely_called = 1;

				get_event_two_coordinates(global_context, brkYno, &brkYchr, &brkYsmall, &brkYabs_small, &brkYchr, &brkYlarge, &brkYabs_large);

			}
			else is_roughly_called = 1;
		}

		// NOTE(review): for a rough call brkYchr / brkYsmall / brkYlarge keep their
		// initial values ("NULL", 0, 0), so such results are looked up and stored
		// under that placeholder key with length 0 -- confirm this is intended.
		if( is_precisely_called || is_roughly_called )
		{
			void * old_ptrs[_PQR_LIST_SIZE];
			unsigned int old_poses[_PQR_LIST_SIZE];
			int old_found = 0, old_i, old_inversions = bktable_lookup(&global_context -> inversion_result_table, brkYchr, brkYsmall - BREAK_POINT_MAXIMUM_TOLERANCE, 2*BREAK_POINT_MAXIMUM_TOLERANCE, old_poses, old_ptrs, _PQR_LIST_SIZE);
			for(old_i = 0; old_i < old_inversions; old_i ++){
				inversion_result_t * inv_res_old = (inversion_result_t *) old_ptrs[old_i];
				long long old_dist = inv_res_old -> length;
				old_dist -= brkYlarge - brkYsmall;	// the difference on inversion length.
				if(llabs(old_dist) < BREAK_POINT_MAXIMUM_TOLERANCE){
					inv_res_old -> all_sup_D ++;
					inv_res_old -> max_sup_E = max(inv_res_old -> max_sup_E , found_INV_frags);
					old_found = 1;
					break;
				}
			}

			if(0 == old_found){
				// Size the record from the pointer's own type; the previous code used
				// sizeof(chromosome_event_t), which is a different structure.
				inversion_result_t * inv_res_new = calloc(1, sizeof *inv_res_new);
				if(NULL == inv_res_new){
					SUBREADprintf("ERROR: out of memory when recording an inversion result.\n");
					break;	// leave the fragment loop; the buffers are released below.
				}

				inv_res_new -> length = brkYlarge - brkYsmall;
				inv_res_new -> is_precisely_called = is_precisely_called;
				if(is_precisely_called){
					inv_res_new -> event_Y_number = brkYno;
					inv_res_new -> event_Z_number = brkZno;
					inv_res_new -> small_side = brkYabs_small;
				}else{
					inv_res_new -> event_Y_rough_small_abs = guessed_Y_small_abs_sum;
					inv_res_new -> event_Z_rough_large_abs = guessed_Z_large_abs_sum;
					inv_res_new -> small_side = guessed_Y_small_abs_sum;
				}
				inv_res_new -> all_sup_D = 1;
				inv_res_new -> max_sup_E = found_INV_frags;

				bktable_append(&global_context -> inversion_result_table, brkYchr, brkYsmall, inv_res_new);
			}
		}
	}

	free(s1_result_list);
	free(s2_result_list);
	free(s1_ptrs);
	free(s2_ptrs);
	free(s1_poses);
	free(s2_poses);
	free(s1_selected_list);
	free(s2_selected_list);
}
6416 
/*
 * Indexes every fusion / junction event by both of its sides.
 *
 * Each qualifying event is appended twice (once per side) to exactly one of
 * three tables:
 *   - breakpoint_table_QR : the two sides are on different chromosomes, or
 *                           further apart than maximum_translocation_length;
 *   - breakpoint_table_YZ : otherwise, when the event is strand-jumped;
 *   - breakpoint_table_P  : otherwise.
 * The stored payload is the event number encoded as an offset from NULL.
 */
void build_breakpoint_tables(global_context_t  * global_context){

	indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
	int event_no;

	for(event_no = 0; event_no < indel_context -> total_events ; event_no++)
	{
		chromosome_event_t * event_body = indel_context -> event_space_dynamic + event_no;

		// Only fusions and junctions carry breakpoints of interest.
		int is_fusion = event_body -> event_type == CHRO_EVENT_TYPE_FUSION;
		int is_junction = event_body -> event_type == CHRO_EVENT_TYPE_JUNCTION;
		if(!(is_fusion || is_junction))
			continue;

		char * left_chro = NULL;
		char * right_chro = NULL;
		int left_pos = 0;
		int right_pos = 0;

		locate_gene_position(event_body -> event_small_side,  &global_context -> chromosome_table, &left_chro, &left_pos);
		locate_gene_position(event_body -> event_large_side,  &global_context -> chromosome_table, &right_chro, &right_pos);

		// Absolute distance between the two in-chromosome positions.
		long long span = left_pos;
		span -= right_pos;
		if(span < 0) span = -span;

		// The chromosome names are compared by pointer, not by content.
		int is_far_apart = left_chro != right_chro || span > global_context -> config.maximum_translocation_length;

		bucketed_table_t * index_table;
		if(is_far_apart)
			index_table = &global_context -> breakpoint_table_QR;	// QR
		else if(event_body -> is_strand_jumped)
			index_table = &global_context -> breakpoint_table_YZ;	// YZ
		else
			index_table = &global_context -> breakpoint_table_P;	// P

		// One entry per side, both carrying the same event number.
		bktable_append(index_table, left_chro, left_pos, NULL + event_no);
		bktable_append(index_table, right_chro, right_pos, NULL + event_no);
	}
}
6472 
/*
 * Driver for the structural-variant finalisation: reports the sizes of the
 * funky tables, builds the breakpoint tables and reports their sizes, then
 * runs the translocation pass followed by the inversion pass.
 */
void finalise_structural_variances(global_context_t * global_context){
	unsigned int funky_A_count = (unsigned int) global_context -> funky_list_A.fragments;
	unsigned int funky_BC_count = (unsigned int) global_context -> funky_table_BC.fragments / 2;
	unsigned int funky_DE_count = (unsigned int) global_context -> funky_list_DE.fragments;

	SUBREADprintf("Funky Tables: A:%u, BC:%u, DE:%u\n", funky_A_count, funky_BC_count, funky_DE_count);

	build_breakpoint_tables(global_context);

	// These counts are read only after the tables have been built.
	unsigned int brk_P_count = (unsigned int) global_context -> breakpoint_table_P.fragments;
	unsigned int brk_QR_count = (unsigned int) global_context -> breakpoint_table_QR.fragments;
	unsigned int brk_YZ_count = (unsigned int) global_context -> breakpoint_table_YZ.fragments;

	SUBREADprintf("Breakpoint Tables: P:%u, QR:%u, YZ:%u\n", brk_P_count, brk_QR_count, brk_YZ_count);

	finalise_translocations(global_context);
	finalise_inversions(global_context);
}
6481