1 /***************************************************************
2 
3    The Subread software package is free software package:
4    you can redistribute it and/or modify it under the terms
5    of the GNU General Public License as published by the
6    Free Software Foundation, either version 3 of the License,
7    or (at your option) any later version.
8 
9    Subread is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty
11    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 
13    See the GNU General Public License for more details.
14 
15    Authors: Drs Yang Liao and Wei Shi
16 
17   ***************************************************************/
18 
19 
20 #include <stdlib.h>
21 #include <math.h>
22 #include <signal.h>
23 #include <dirent.h>
24 #include <string.h>
25 #include <assert.h>
26 #include <unistd.h>
27 #include "hashtable.h"
28 #include "HelperFunctions.h"
29 #include "gene-value-index.h"
30 #include "gene-algorithms.h"
31 #include "input-files.h"
32 #include "core.h"
33 #include "core-indel.h"
34 #include "core-junction.h"
35 #include "core-bigtable.h"
36 #include "sublog.h"
37 
38 
39 //#define indel_debug
40 //#define is_target_window_X(x) 0
41 
42 int DIGIN_COUNTER=0;
43 int FRESH_COUNTER=0;
44 extern unsigned int BASE_BLOCK_LENGTH;
45 
is_bad_read(char * r,int len,char * w)46 void is_bad_read(char * r, int len, char * w)
47 {
48 }
49 
merge_event_sides(void * arr,int start,int items,int items2)50 void merge_event_sides(void * arr, int start, int items, int items2)
51 {
52 	void ** arrl  =  (void **)arr;
53 	chromosome_event_t * event_space = (chromosome_event_t*) arrl[1];
54 	int * event_id_list = (int *)arrl[0];
55 	int is_small_side = (arrl[2]!=NULL);
56 
57 	int * merge_tmp = malloc(sizeof(int)* (items+items2));
58 	int x1, first_cursor = start, second_cursor = start+items;
59 
60 	for(x1=0;x1<items+items2;x1++)
61 	{
62 		int choose_2 = 0;
63 		if(first_cursor >= start+items)
64 			choose_2 = 1;
65 		else if(second_cursor < start+items+items2)
66 		{
67 
68 			chromosome_event_t * levent = event_space + event_id_list[first_cursor];
69 			chromosome_event_t * revent = event_space + event_id_list[second_cursor];
70 
71 			if(is_small_side){
72 				if(levent->event_small_side > revent ->event_small_side)
73 					choose_2 = 1;
74 			}else
75 			{
76 				if(levent->event_large_side > revent ->event_large_side)
77 					choose_2 = 1;
78 			}
79 
80 		}
81 
82 		if(choose_2)
83 			merge_tmp[x1] = event_id_list[second_cursor++];
84 		else
85 			merge_tmp[x1] = event_id_list[first_cursor++];
86 
87 	}
88 
89 	memcpy(event_id_list + start, merge_tmp, sizeof(int)* (items+items2));
90 
91 	free(merge_tmp);
92 }
93 
exchange_event_sides(void * arr,int l,int r)94 void exchange_event_sides(void * arr, int l, int r)
95 {
96 	void ** arrl  =  (void **)arr;
97 	int * event_id_list = (int *)arrl[0];
98 
99 	int tmp ;
100 	tmp = event_id_list[l];
101 	event_id_list[l] = event_id_list[r];
102 	event_id_list[r] = tmp;
103 
104 }
105 
compare_event_inner(chromosome_event_t * event_space,int l,int r,int is_small_side)106 int compare_event_inner(chromosome_event_t * event_space, int l, int r, int is_small_side)
107 {
108 
109 	chromosome_event_t * revent = event_space +r;
110 	chromosome_event_t * levent = event_space +l;
111 	if(is_small_side){
112 		if(levent->event_small_side <revent ->event_small_side) return -1;
113 		else if(levent->event_small_side ==revent ->event_small_side) return 0;
114 		else return 1;
115 	}else
116 	{
117 		if(levent->event_large_side <revent ->event_large_side) return -1;
118 		else if(levent->event_large_side ==revent ->event_large_side) return 0;
119 		else return 1;
120 	}
121 
122 
123 }
124 
compare_event_sides(void * arr,int l,int r)125 int compare_event_sides(void * arr, int l, int r)
126 {
127 	void ** arrl  =  (void **)arr;
128 	chromosome_event_t * event_space = (chromosome_event_t*) arrl[1];
129 	int * event_id_list = (int *)arrl[0];
130 	int is_small_side = (arrl[2]!=NULL);
131 
132 	return compare_event_inner(event_space, event_id_list[l], event_id_list[r], is_small_side);
133 }
134 
BINsearch_event(chromosome_event_t * event_space,int * event_ids,int is_small_side,unsigned int pos,int events)135 int BINsearch_event(chromosome_event_t * event_space, int * event_ids, int is_small_side, unsigned int pos, int events)
136 {
137 	int imax, imin;
138 	assert(events > 0);
139 	imax = events - 1; imin = 0;
140 
141 	while (1)
142 	{
143 		int imid = (imin+ imax)/2;
144 		unsigned tested_mid_pos;
145 		//assert(imax<events);
146 		//assert(imin<events);
147 		//assert(imid<events);
148 		assert(event_ids[imid]<events);
149 		chromosome_event_t * mid_event = event_space+event_ids[imid];
150 		tested_mid_pos = is_small_side?mid_event->event_small_side:mid_event->event_large_side;
151 
152 		if(tested_mid_pos == pos)
153 			return imid;
154 		else if(tested_mid_pos < pos)
155 			imin = imid +1;
156 		else	imax = imid -1;
157 
158 		if(imin > imax)
159 			return imax;
160 	}
161 }
162 
163 
164 typedef struct{
165 	int thread_id;
166 	int block_start;
167 	int block_end;
168 	HashTable * result_tab;
169 	int * small_side_ordered_event_ids, * large_side_ordered_event_ids;
170 	chromosome_event_t * event_space;
171 	global_context_t * global_context;
172 } AT_context_t;
173 
174 
175 #define ANTI_SUPPORTING_READ_LIMIT 100
176 
anti_support_thread_run(void * v_cont)177 void * anti_support_thread_run(void * v_cont){
178 	AT_context_t * atcont = v_cont;
179 
180 	int * cancelled_event_list = malloc(sizeof(int)*ANTI_SUPPORTING_READ_LIMIT), x2;
181 	int * small_side_ordered_event_ids = atcont -> small_side_ordered_event_ids, * large_side_ordered_event_ids = atcont -> large_side_ordered_event_ids;
182 	global_context_t * global_context = atcont -> global_context;
183 	chromosome_event_t * event_space = atcont -> event_space;
184 	indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
185 	if(indel_context->total_events<1)return NULL;
186 
187 	int current_read_number, cancelled_events, x1;
188 
189 	for(current_read_number = atcont -> block_start; current_read_number < atcont -> block_end; current_read_number++)
190 	{
191 		int is_second_read;
192 		for (is_second_read = 0; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read ++)
193 		{
194 			int best_read_id;
195 			for(best_read_id = 0; best_read_id < global_context -> config.multi_best_reads; best_read_id++)
196 			{
197 				mapping_result_t *current_result = _global_retrieve_alignment_ptr(global_context, current_read_number, is_second_read, best_read_id);
198 				if(current_result -> selected_votes<1) break;
199 				if(!global_context->config.report_multi_mapping_reads)if(current_result -> result_flags & CORE_IS_BREAKEVEN) continue;
200 				//if(current_result -> result_flags & CORE_IS_GAPPED_READ) continue;
201 				if(current_result->selected_votes < global_context->config.minimum_subread_for_first_read)
202 					continue;
203 
204 				cancelled_events=0;
205 				unsigned int mapping_head = current_result -> selected_position;
206 				unsigned int coverage_start = mapping_head + current_result -> confident_coverage_start;
207 				unsigned int coverage_end = mapping_head + current_result -> confident_coverage_end;
208 
209 				int small_side_left_event = BINsearch_event(event_space, small_side_ordered_event_ids,1, coverage_start - 1, indel_context->total_events) + 1;
210 				int large_side_left_event = BINsearch_event(event_space, large_side_ordered_event_ids,0, coverage_start - 1, indel_context->total_events) + 1;
211 
212 				int small_side_right_event = BINsearch_event(event_space, small_side_ordered_event_ids,1, coverage_end, indel_context->total_events)+20;
213 				int large_side_right_event = BINsearch_event(event_space, large_side_ordered_event_ids,0, coverage_end, indel_context->total_events)+20;
214 
215 				for(x1 = small_side_left_event ; x1 <= small_side_right_event ; x1++)
216 				{
217 					if(x1 >= indel_context->total_events) break;
218 					if(ANTI_SUPPORTING_READ_LIMIT <= cancelled_events) break;
219 					chromosome_event_t * event_body = event_space+ small_side_ordered_event_ids[x1];
220 					if(event_body -> event_small_side <= coverage_start + 5) continue;
221 					if(event_body -> event_small_side >= coverage_end - 5) continue;
222 
223 					long long cur_count = HashTableGet(atcont -> result_tab , NULL+small_side_ordered_event_ids[x1]+1 ) - NULL;
224 					cur_count++;
225 					HashTablePut( atcont -> result_tab , NULL+small_side_ordered_event_ids[x1]+1  , NULL+cur_count );
226 
227 					cancelled_event_list[cancelled_events++] = small_side_ordered_event_ids[x1];
228 				}
229 
230 				for(x1 = large_side_left_event ; x1 <= large_side_right_event ; x1++)
231 				{
232 					if(x1 >= indel_context->total_events) break;
233 					chromosome_event_t * event_body = event_space+ large_side_ordered_event_ids[x1];
234 					if(event_body -> event_large_side <= coverage_start + 5) continue;
235 					if(event_body -> event_large_side >= coverage_end- 5) continue;
236 
237 					int to_be_add = 1;
238 					for(x2=0; x2<cancelled_events; x2++)
239 						if(cancelled_event_list[x2] == large_side_ordered_event_ids[x1])
240 						{
241 							to_be_add = 0;
242 							break;
243 						}
244 
245 					if(to_be_add){
246 					//	printf("OCT27-ANTISUP-READ @ %u has LARGE @ %u~%u, INDELS = %d, ASUP = %d\n", coverage_end, event_body -> event_small_side,  event_body -> event_large_side, event_body -> indel_length, event_body -> anti_supporting_reads);
247 						long long cur_count = HashTableGet(atcont -> result_tab , NULL+large_side_ordered_event_ids[x1]+1 ) - NULL;
248 						cur_count++;
249 						HashTablePut( atcont -> result_tab , NULL+large_side_ordered_event_ids[x1]+1  , NULL+cur_count );
250 					}
251 				}
252 			}
253 		}
254 	}
255 
256 	free(cancelled_event_list);
257 	return NULL;
258 }
259 
260 
anti_support_add_count(void * ky,void * va,HashTable * tab)261 void anti_support_add_count(void * ky, void * va, HashTable * tab){
262 	chromosome_event_t * event_space = tab -> appendix1 ;
263 	int eno = ky-NULL-1;
264 	chromosome_event_t * event_body = event_space + eno;
265 	event_body -> anti_supporting_reads+= (va - NULL);
266 }
267 
anti_supporting_read_scan(global_context_t * global_context)268 int anti_supporting_read_scan(global_context_t * global_context)
269 {
270 
271 	indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
272 	if(indel_context->total_events<1)return 0;
273 
274 	chromosome_event_t * event_space = indel_context -> event_space_dynamic;
275 
276 	int x1, * small_side_ordered_event_ids, * large_side_ordered_event_ids;
277 
278 	small_side_ordered_event_ids = malloc(sizeof(int)*indel_context -> total_events);
279 	large_side_ordered_event_ids = malloc(sizeof(int)*indel_context -> total_events);
280 
281 	for(x1=0; x1<indel_context->total_events; x1++) {
282 		small_side_ordered_event_ids[x1]=x1;
283 		large_side_ordered_event_ids[x1]=x1;
284 	}
285 
286 	void * sort_data[3];
287 	sort_data[0] = small_side_ordered_event_ids;
288 	sort_data[1] = event_space;
289 	sort_data[2] = (void *)0xffff;	// small_side_ordering
290 
291 	merge_sort(sort_data, indel_context->total_events, compare_event_sides, exchange_event_sides, merge_event_sides);
292 
293 	sort_data[0] = large_side_ordered_event_ids;
294 	sort_data[2] = NULL;	// large_side_ordering
295 
296 	merge_sort(sort_data, indel_context->total_events, compare_event_sides, exchange_event_sides, merge_event_sides);
297 
298 
299 	AT_context_t ATconts[64];
300 	pthread_t AThreads[64];
301 
302 	int thread_no, last_end = 0;
303 	for(thread_no=0; thread_no< global_context -> config.all_threads; thread_no++){
304 		AT_context_t * atc = ATconts + thread_no;
305 		atc -> thread_id = thread_no;
306 		atc -> block_start = last_end;
307 		atc -> block_end = last_end = global_context -> processed_reads_in_chunk / global_context -> config.all_threads * thread_no;
308 		if(thread_no == global_context -> config.all_threads-1)  atc -> block_end = global_context -> processed_reads_in_chunk;
309 
310 		atc -> global_context = global_context;
311 		atc -> result_tab = HashTableCreate(200000);
312 		atc -> small_side_ordered_event_ids = small_side_ordered_event_ids;
313 		atc -> large_side_ordered_event_ids = large_side_ordered_event_ids;
314 		atc -> event_space = event_space;
315 
316 		pthread_create(AThreads + thread_no , NULL, anti_support_thread_run, atc);
317 	}
318 
319 	for(thread_no=0; thread_no< global_context -> config.all_threads; thread_no++){
320 		pthread_join(AThreads[thread_no], NULL);
321 		ATconts[thread_no].result_tab -> appendix1 = event_space;
322 		HashTableIteration( ATconts[thread_no].result_tab, anti_support_add_count );
323 		HashTableDestroy(ATconts[thread_no].result_tab);
324 	}
325 
326 	free(small_side_ordered_event_ids);
327 	free(large_side_ordered_event_ids);
328 
329 	return 0;
330 }
331 
reallocate_event_space(global_context_t * global_context,thread_context_t * thread_context,int event_no)332 chromosome_event_t * reallocate_event_space( global_context_t* global_context,thread_context_t * thread_context,int event_no)
333 {
334 	int max_event_no;
335 
336 	if(thread_context) {
337 		max_event_no = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> current_max_event_number;
338 		if(max_event_no<=event_no)
339 		{
340 			((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> current_max_event_number *= 1.6;
341 			 ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic =
342 				realloc(((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic, sizeof(chromosome_event_t) *
343 					((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> current_max_event_number);
344 		}
345 		return ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
346 	} else {
347 		max_event_no = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> current_max_event_number;
348 		if(max_event_no<=event_no)
349 		{
350 			((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> current_max_event_number *= 1.6;
351 			((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic =
352 				realloc(((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic, sizeof(chromosome_event_t) *
353 					((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> current_max_event_number);
354 		}
355 		return ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
356 	}
357 
358 }
359 
is_ambiguous_indel_score(chromosome_event_t * e)360 int is_ambiguous_indel_score(chromosome_event_t * e)
361 {
362 	return 0;
363 	//if(e -> indel_length == 1 && e->is_ambiguous>4)return 1;
364 	//if(e -> indel_length <= 3 && e->is_ambiguous>3 && (e-> event_small_side % 7 < 3))return 1;
365 	//if(e -> indel_length == 4) return e->is_ambiguous>4;
366 	//return 0;
367 }
368 
event_neighbour_sort_compare(void * arr,int l,int r)369 int event_neighbour_sort_compare(void * arr, int l, int r){
370 	unsigned int ** sort_data = (unsigned int **) arr;
371 	if(sort_data[1][l] > sort_data[1][r]) return 1;
372 	if(sort_data[1][l] < sort_data[1][r]) return -1;
373 	return 0;
374 }
375 
event_neighbour_sort_exchange(void * arr,int l,int r)376 void event_neighbour_sort_exchange(void * arr, int l, int r){
377 	unsigned int ** sort_data = (unsigned int **) arr;
378 	unsigned int tmpi;
379 
380 	tmpi = sort_data[0][l];
381 	sort_data[0][l] = sort_data[0][r];
382 	sort_data[0][r] = tmpi;
383 
384 	tmpi = sort_data[1][l];
385 	sort_data[1][l] = sort_data[1][r];
386 	sort_data[1][r] = tmpi;
387 }
388 
389 // smallFirst
event_neighbour_sort_merge(void * arr,int start,int items,int items2)390 void event_neighbour_sort_merge(void * arr, int start, int items, int items2){
391 	unsigned int ** sort_data = (unsigned int **) arr;
392 
393 	unsigned int * tmp_global_id_list = malloc(sizeof(int) * (items+items2));
394 	unsigned int * tmp_offset_list = malloc(sizeof(int) * (items+items2));
395 
396 	int i1_cursor = start, i2_cursor = items + start;
397 	int tmp_cursor = 0;
398 
399 	while(1){
400 		if(i1_cursor == items + start && i2_cursor == items + items2 + start )break;
401 		int select_items_1 = (i1_cursor < items + start && event_neighbour_sort_compare(arr, i1_cursor, i2_cursor) <= 0) || (i2_cursor == start + items + items2);
402 		if(select_items_1){
403 			tmp_global_id_list[tmp_cursor] = sort_data[0][i1_cursor];
404 			tmp_offset_list[tmp_cursor ++ ] = sort_data[1][i1_cursor ++];
405 		}else{
406 			tmp_global_id_list[tmp_cursor] = sort_data[0][i2_cursor];
407 			tmp_offset_list[tmp_cursor ++ ] = sort_data[1][i2_cursor ++];
408 		}
409 	}
410 
411 	memcpy( sort_data[0] + start, tmp_global_id_list, sizeof(int) * (items+items2) );
412 	memcpy( sort_data[1] + start, tmp_offset_list, sizeof(int) * (items+items2) );
413 	free(tmp_global_id_list);
414 	free(tmp_offset_list);
415 }
416 
test_redundant_event(global_context_t * global_context,chromosome_event_t * testee,chromosome_event_t * neighbour)417 int test_redundant_event( global_context_t * global_context, chromosome_event_t* testee, chromosome_event_t* neighbour ){
418 	int length_delta = 3, is_same = 0;
419 	if(testee -> event_type == CHRO_EVENT_TYPE_INDEL) length_delta = 0;
420 
421 	long long dist = testee -> event_large_side;
422 	dist -= neighbour -> event_large_side;
423 
424 	if(abs(dist) <= length_delta){
425 		if(testee -> event_type == CHRO_EVENT_TYPE_INDEL){
426 			int indel_diff = testee -> indel_length;
427 			indel_diff -= neighbour -> indel_length;
428 
429 			if(abs(indel_diff) <= length_delta)
430 				is_same = 1;
431 		}else is_same = 1;
432 	}
433 
434 	int is_redund = 0;
435 	if(is_same){
436 		if(testee -> indel_at_junction > neighbour -> indel_at_junction) is_redund = 1;
437 		else if(testee -> indel_at_junction == neighbour -> indel_at_junction){
438 			if(testee -> supporting_reads < neighbour -> supporting_reads) is_redund = 1;
439 			else if( testee -> supporting_reads == neighbour -> supporting_reads ){
440 				if(testee -> event_small_side < neighbour -> event_small_side) is_redund = 1;
441 			}
442 		}
443 	}
444 
445 	return is_redund;
446 }
447 
448 
449 #define reallocate_to_be_removed_ids		if(to_be_removed_number >= remove_id_size-1){\
450 							remove_id_size = remove_id_size*3/2;\
451 							to_be_removed_ids = realloc(to_be_removed_ids, remove_id_size);\
452 						}
453 
454 #define MIN_EVENT_SUPPORT_NO 2
455 
remove_neighbour(global_context_t * global_context)456 void remove_neighbour(global_context_t * global_context)
457 {
458 	indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
459 
460 	HashTable * event_table = indel_context -> event_entry_table;
461 	chromosome_event_t * event_space = indel_context -> event_space_dynamic;
462 	int xk1,xk2,xk3;
463 	int * to_be_removed_ids;
464 	int to_be_removed_number = 0, all_junctions = 0;
465 	int remove_id_size = 999999;
466 
467 	to_be_removed_ids = malloc(sizeof(int) * (1+remove_id_size));
468 	for(xk1=0; xk1<indel_context->total_events; xk1++)
469 	{
470 		chromosome_event_t * event_body = &event_space[xk1];
471 		if(CHRO_EVENT_TYPE_REMOVED == event_body->event_type) continue;
472 		if(global_context -> config.do_remove_neighbour_for_scRNA && global_context -> config.scRNA_input_mode && event_body -> supporting_reads < MIN_EVENT_SUPPORT_NO){
473 			reallocate_to_be_removed_ids;
474 			to_be_removed_ids[to_be_removed_number++] = event_body -> global_event_id;
475 			continue;
476 		}
477 		all_junctions++;
478 		int event_type;
479 		for(event_type = 0; event_type < 2; event_type++)
480 		{
481 			if(event_type) // for INDELs
482 			{
483 				int neighbour_range = 3;
484 				if(event_body->event_type != CHRO_EVENT_TYPE_INDEL) continue;
485 
486 				reallocate_to_be_removed_ids;
487 				if(is_ambiguous_indel_score(event_body))to_be_removed_ids[to_be_removed_number++] = event_body -> global_event_id;
488 				else for(xk2=-neighbour_range; xk2<=neighbour_range; xk2++)
489 				{
490 					//if(!xk2)continue;
491 					unsigned int test_pos_small = event_body -> event_small_side + xk2;
492 					chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
493 
494 					int found_events = search_event(global_context, event_table, event_space, test_pos_small  , EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_INDEL, search_return);
495 					//if(test_pos_small > 284136 && test_pos_small < 284146) printf("POS=%u\tTRES=%d\n", test_pos_small, found_events);
496 
497 					for(xk3 = 0; xk3<found_events; xk3++)
498 					{
499 						reallocate_to_be_removed_ids;
500 						chromosome_event_t * tested_neighbour = search_return[xk3];
501 						long long int length_diff = tested_neighbour -> indel_length;
502 						length_diff -=  event_body -> indel_length;
503 						if(length_diff == 0 && xk2==0) continue;
504 						if(is_ambiguous_indel_score(tested_neighbour))
505 							to_be_removed_ids[to_be_removed_number++] = event_body -> global_event_id;
506 						else
507 							if(length_diff==0)
508 							{
509 								if(event_body -> event_small_side - event_body->connected_previous_event_distance + 1 == tested_neighbour-> event_large_side) continue;
510 								if(tested_neighbour -> event_small_side - tested_neighbour->connected_previous_event_distance + 1 == event_body-> event_large_side) continue;
511 
512 								if(event_body -> event_large_side + event_body->connected_next_event_distance - 1 == tested_neighbour-> event_small_side) continue;
513 								if(tested_neighbour -> event_large_side + tested_neighbour->connected_next_event_distance - 1 == event_body-> event_small_side) continue;
514 
515 								if(event_body->event_quality < tested_neighbour -> event_quality)
516 									to_be_removed_ids[to_be_removed_number++] = event_body -> global_event_id;
517 								else if(event_body->event_quality ==  tested_neighbour -> event_quality &&
518 									(event_body -> supporting_reads < tested_neighbour -> supporting_reads||(event_body -> supporting_reads == tested_neighbour -> supporting_reads && xk2<0)))
519 								{
520 									to_be_removed_ids[to_be_removed_number++] = event_body -> global_event_id;
521 								}
522 							}
523 					}
524 				}
525 			}
526 			else // for junctions/fusions
527 			{
528 				int neighbour_range = 11;
529 				int indel_range = 4 , delta_small = 0;// max_range = 0;
530 				if(event_body -> event_type == CHRO_EVENT_TYPE_INDEL) continue;
531 				if(event_body -> is_donor_found_or_annotation & 64) continue; // do not remove known events
532 				//if(! global_context->config.check_donor_at_junctions )max_range = 10;
533 
534 				//for(delta_small = - max_range; delta_small <= max_range ; delta_small ++)
535 					for(xk2=-neighbour_range; xk2<=neighbour_range; xk2++)
536 					{
537 						if(!xk2)continue;
538 						unsigned int test_pos_small = event_body -> event_small_side + xk2;
539 						chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
540 
541 						int found_events = search_event(global_context,event_table, event_space, test_pos_small + delta_small , EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_JUNCTION|CHRO_EVENT_TYPE_FUSION, search_return);
542 						for(xk3 = 0; xk3<found_events; xk3++)
543 						{
544 
545 							reallocate_to_be_removed_ids;
546 							chromosome_event_t * tested_neighbour = search_return[xk3];
547 
548 							if(tested_neighbour -> indel_at_junction > event_body -> indel_at_junction) continue;
549 							if(event_body -> indel_at_junction > tested_neighbour -> indel_at_junction && abs(tested_neighbour -> event_large_side - tested_neighbour -> event_small_side + tested_neighbour -> indel_at_junction - event_body -> event_large_side + event_body -> event_small_side - event_body -> indel_at_junction) <= 16)
550 								to_be_removed_ids[to_be_removed_number++] = event_body -> global_event_id;
551 							else if(tested_neighbour -> event_large_side >=  event_body -> event_large_side -indel_range + xk2 && tested_neighbour -> event_large_side <= event_body -> event_large_side + indel_range + xk2 && (event_body -> supporting_reads < tested_neighbour -> supporting_reads || (event_body -> supporting_reads ==  tested_neighbour -> supporting_reads  && xk2<0)))
552 								to_be_removed_ids[to_be_removed_number++] = event_body -> global_event_id;
553 						}
554 					}
555 
556 				if((global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection) && event_body -> event_type == CHRO_EVENT_TYPE_FUSION)
557 				{
558 					for(xk2=-10 ; xk2 < 10 ; xk2++)
559 					{
560 						if(!xk2)continue;
561 
562 						reallocate_to_be_removed_ids;
563 						unsigned int test_pos_small = event_body -> event_small_side + xk2;
564 						chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
565 						int found_events = search_event(global_context,event_table, event_space, test_pos_small + delta_small , EVENT_SEARCH_BY_BOTH_SIDES, CHRO_EVENT_TYPE_JUNCTION|CHRO_EVENT_TYPE_FUSION, search_return);
566 
567 						for(xk3 = 0; xk3<found_events; xk3++)
568 						{
569 							chromosome_event_t * tested_neighbour = search_return[xk3];
570 							if(found_events && event_body -> supporting_reads < tested_neighbour -> supporting_reads)
571 							{
572 								to_be_removed_ids[to_be_removed_number++] = event_body -> global_event_id;
573 								break;
574 							}
575 						}
576 					}
577 				}
578 			}
579 		}
580 	}
581 
582 	for(xk1=0; xk1<to_be_removed_number; xk1++) {
583 		int event_no = to_be_removed_ids[xk1];
584 		chromosome_event_t * deleted_event =  &event_space[event_no];
585 
586 		for(xk2=0;xk2<2;xk2++){
587 			unsigned int pos = xk2?deleted_event->event_large_side:deleted_event->event_small_side;
588 			unsigned int * res = HashTableGet(event_table, NULL+pos);
589 			if(res){
590 				int current_size = res[0]&0x0fffffff;
591 				int wrt_ptr = 1;
592 				for(xk3 = 1 ; xk3< current_size+1 ; xk3++){
593 					if(!res[xk3])break;
594 					if(res[xk3] -1 == event_no) continue;
595 					if(wrt_ptr != xk3) res[wrt_ptr]=res[xk3];
596 					wrt_ptr++;
597 				}
598 				if(0 && wrt_ptr==1){
599 					HashTableRemove(event_table, NULL+pos);
600 					free(res);
601 				}else if(wrt_ptr < current_size +1)res[wrt_ptr]=0;
602 			}
603 		}
604 		if(deleted_event -> event_type == CHRO_EVENT_TYPE_INDEL && deleted_event -> inserted_bases)
605 			free(deleted_event -> inserted_bases);
606 		deleted_event -> event_type = CHRO_EVENT_TYPE_REMOVED;
607 	}
608 
609 	free(to_be_removed_ids);
610 }
611 
612 
613 
localPointerCmp_forEventEntry(const void * pointer1,const void * pointer2)614 int localPointerCmp_forEventEntry(const void *pointer1, const void *pointer2)
615 {
616 	unsigned int v1 = (unsigned int)(pointer1-NULL);
617 	unsigned int v2 = (unsigned int)(pointer2-NULL);
618 	return v1!=v2;
619 }
620 
localPointerHashFunction_forEventEntry(const void * pointer)621 srUInt_64 localPointerHashFunction_forEventEntry(const void *pointer)
622 {
623 	unsigned int v = (pointer-NULL);
624 	return (unsigned long) (v << 24) ^ (v>>8);
625 }
626 
627 
init_indel_tables(global_context_t * context)628 int init_indel_tables(global_context_t * context)
629 {
630 	indel_context_t * indel_context = (indel_context_t*)malloc(sizeof(indel_context_t));
631 
632 	// event_entry_table is a table from an absolute chromosome location to a list of integers.
633 	// each integer is the index of event in the event array + 1.
634 	// unused values are set to 0.
635 
636 	indel_context -> event_entry_table = NULL;
637 
638 	context -> module_contexts[MODULE_INDEL_ID] = indel_context;
639 
640 	indel_context -> total_events = 0;
641 	indel_context -> current_max_event_number = 0;
642 	//indel_context -> current_max_event_number = context->config.init_max_event_number;
643 	//indel_context -> event_space_dynamic = (chromosome_event_t *)malloc(sizeof(chromosome_event_t)*indel_context -> current_max_event_number);
644 
645 	indel_context -> event_space_dynamic = NULL;
646 
647 	if(context -> config.all_threads < 2) {
648 		indel_context -> event_entry_table = HashTableCreate(399997);
649 
650 		indel_context -> event_entry_table -> appendix1=malloc(1024 * 1024 * 64);
651 		indel_context -> event_entry_table -> appendix2=malloc(1024 * 1024 * 64);
652 		memset(indel_context -> event_entry_table -> appendix1, 0, 1024 * 1024 * 64);
653 		memset(indel_context -> event_entry_table -> appendix2, 0, 1024 * 1024 * 64);
654 
655 		HashTableSetKeyComparisonFunction(indel_context->event_entry_table, localPointerCmp_forEventEntry);
656 		HashTableSetHashFunction(indel_context->event_entry_table, localPointerHashFunction_forEventEntry);
657 
658 		indel_context -> total_events = 0;
659 		indel_context -> current_max_event_number = context->config.init_max_event_number;
660 		indel_context -> event_space_dynamic = malloc(sizeof(chromosome_event_t)*indel_context -> current_max_event_number);
661 		if(!indel_context -> event_space_dynamic)
662 		{
663 			sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_FATAL, "Cannot allocate memory for threads. Please try to reduce the thread number.");
664 			return 1;
665 		}
666 	}
667 	if(context->config.is_third_iteration_running) {
668 		char * fns = malloc(200);
669 		fns[0]=0;
670 		exec_cmd("ulimit -n", fns, 200);
671 		int max_open_file = atoi(fns);
672 		//SUBREADprintf("SYS FILE LIMIT=%d\n", max_open_file);
673 		free(fns);
674 		max_open_file = max(100, max_open_file);
675 		max_open_file = min(3000, max_open_file);
676 
677 
678 		indel_context -> local_reassembly_pileup_files = HashTableCreate(100);
679 		indel_context -> local_reassembly_pileup_files -> appendix1 = NULL + max_open_file  * 2/ 3;
680 		HashTableSetDeallocationFunctions(indel_context -> local_reassembly_pileup_files, NULL, NULL);
681 		HashTableSetKeyComparisonFunction(indel_context -> local_reassembly_pileup_files, my_strcmp);
682 		HashTableSetHashFunction(indel_context -> local_reassembly_pileup_files ,HashTableStringHashFunction);
683 	}
684 
685 	indel_context -> dynamic_align_table = malloc(sizeof(short*)*MAX_READ_LENGTH);
686 	indel_context -> dynamic_align_table_mask = malloc(sizeof(char *)*MAX_READ_LENGTH);
687 
688 	int xk1;
689 	for(xk1=0;xk1<MAX_READ_LENGTH; xk1++) {
690 		indel_context -> dynamic_align_table[xk1] = malloc(sizeof(short)*MAX_READ_LENGTH);
691 		indel_context -> dynamic_align_table_mask[xk1] = malloc(sizeof(char)*MAX_READ_LENGTH);
692 	}
693 
694 	for(xk1=0; xk1<EVENT_BODY_LOCK_BUCKETS; xk1++)
695 		subread_init_lock(indel_context -> event_body_locks+xk1);
696 
697 	return 0;
698 }
699 
init_indel_thread_contexts(global_context_t * global_context,thread_context_t * thread_context,int task)700 int init_indel_thread_contexts(global_context_t * global_context, thread_context_t * thread_context, int task)
701 {
702 	indel_thread_context_t * indel_thread_context = (indel_thread_context_t*)malloc(sizeof(indel_thread_context_t));
703 	indel_context_t * indel_context = (indel_context_t *) global_context -> module_contexts[MODULE_INDEL_ID];
704 
705 	if(task == STEP_VOTING) {
706 		indel_thread_context -> event_entry_table = HashTableCreate(399997 * ( global_context -> config.scRNA_input_mode?18:2 ));
707 		indel_thread_context -> event_entry_table -> appendix1=NULL;//indel_context -> event_entry_table-> appendix1;
708 		indel_thread_context -> event_entry_table -> appendix2=NULL;//indel_context -> event_entry_table-> appendix2;
709 		HashTableSetKeyComparisonFunction(indel_thread_context->event_entry_table, localPointerCmp_forEventEntry);
710 		HashTableSetHashFunction(indel_thread_context->event_entry_table, localPointerHashFunction_forEventEntry);
711 
712 		indel_thread_context -> total_events = 0;
713 		indel_thread_context -> current_max_event_number = global_context->config.init_max_event_number;
714 		indel_thread_context -> event_space_dynamic = malloc(sizeof(chromosome_event_t)*indel_thread_context -> current_max_event_number);
715 		if(!indel_thread_context -> event_space_dynamic)
716 		{
717 			sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_FATAL, "Cannot allocate memory for threads. Please try to reduce the thread number.");
718 			return 1;
719 		}
720 
721 		indel_thread_context -> dynamic_align_table = malloc(sizeof(short*)*MAX_READ_LENGTH);
722 		indel_thread_context -> dynamic_align_table_mask = malloc(sizeof(char *)*MAX_READ_LENGTH);
723 
724 		int xk1;
725 		for(xk1=0;xk1<MAX_READ_LENGTH; xk1++)
726 		{
727 			indel_thread_context -> dynamic_align_table[xk1] = malloc(sizeof(short)*MAX_READ_LENGTH);
728 			indel_thread_context -> dynamic_align_table_mask[xk1] = malloc(sizeof(char)*MAX_READ_LENGTH);
729 		}
730 	}
731 	else if(task == STEP_ITERATION_TWO)
732 	{
733 		indel_thread_context -> event_entry_table = indel_context -> event_entry_table;
734 		indel_thread_context -> total_events = indel_context -> total_events;
735 		indel_thread_context -> event_space_dynamic = indel_context -> event_space_dynamic;
736 		indel_thread_context -> final_reads_mismatches_array = (unsigned short*)malloc(sizeof(unsigned short)*indel_context -> total_events);
737 		indel_thread_context -> final_counted_reads_array = (unsigned short*)malloc(sizeof(unsigned short)*indel_context -> total_events);
738 		assert(indel_thread_context -> final_counted_reads_array);
739 
740 		memset(indel_thread_context -> final_reads_mismatches_array , 0, sizeof(unsigned short)*indel_context -> total_events);
741 		memset(indel_thread_context -> final_counted_reads_array , 0, sizeof(unsigned short)*indel_context -> total_events);
742 
743 		subread_init_lock(&thread_context -> output_lock);
744 
745 	}
746 
747 	thread_context -> module_thread_contexts[MODULE_INDEL_ID] = indel_thread_context;
748 	return 0;
749 }
750 
751 
destory_event_entry_table(HashTable * old)752 void destory_event_entry_table(HashTable * old)
753 {
754 	int bucket;
755 	KeyValuePair * cursor;
756 	for(bucket=0; bucket<old -> numOfBuckets; bucket++)
757 	{
758 		cursor = old -> bucketArray[bucket];
759 		while(1)
760 		{
761 			if (!cursor) break;
762 			int * id_list = (int *) cursor ->value;
763 //#warning "=============== UNCOMMENT THE NEXT LINE IN THE RELEASE!!! THIS IS FOR REMOVING A SEGFAULT ERROR =================="
764 			free(id_list);
765 
766 			cursor = cursor->next;
767 		}
768 	}
769 }
770 
771 typedef struct {
772 	unsigned int scanning_positons;
773 	unsigned int thread_bodytable_number;
774 } scanning_events_record_t;
775 
776 
777 
778 #define _get_global_body(iii) ( indel_context ->  event_space_dynamic + records[iii].thread_bodytable_number)
779 
scanning_events_compare(void * arr,int l,int r)780 int scanning_events_compare(void * arr, int l, int r){
781 	void ** arrr = (void **) arr;
782 	indel_context_t * indel_context = arrr[0];
783 	scanning_events_record_t * records = arrr[1];
784 	chromosome_event_t * body_l = _get_global_body(l);
785 	chromosome_event_t * body_r = _get_global_body(r);
786 
787 	if(records[l].scanning_positons > records[r].scanning_positons)return 1;
788 	if(records[l].scanning_positons < records[r].scanning_positons)return -1;
789 
790 	if((body_l -> is_donor_found_or_annotation & 64)!=0 && (body_r -> is_donor_found_or_annotation & 64)==0) return 1; // prefer known truth.
791 	if((body_l -> is_donor_found_or_annotation & 64)==0 && (body_r -> is_donor_found_or_annotation & 64)!=0) return -1;
792 	if(body_l -> supporting_reads > body_r -> supporting_reads) return -1;
793 	if(body_l -> supporting_reads < body_r -> supporting_reads) return 1;
794 	if(abs(body_l -> indel_length) < abs(body_r -> indel_length)) return 1;
795 	if(abs(body_l -> indel_length) > abs(body_r -> indel_length)) return -1;
796 	if(body_l -> indel_length > body_r -> indel_length) return -1; // same length, but L is del and R is ins -- prefer del than ins
797 	if(body_l -> indel_length < body_r -> indel_length) return 1;
798 
799 	if(body_l -> event_small_side > body_r -> event_small_side)return 1;
800 	if(body_l -> event_small_side < body_r -> event_small_side)return -1;
801 	if(body_l -> event_large_side > body_r -> event_large_side)return 1;
802 	return -1;
803 }
804 
null_scanning_events_merge(void * arr,int start,int items,int items2)805 void null_scanning_events_merge(void * arr,  int start, int items, int items2){}
scanning_events_merge(void * arr,int start,int items,int items2)806 void scanning_events_merge(void * arr,  int start, int items, int items2){
807 	void ** arrr = (void **) arr;
808 	scanning_events_record_t * records = arrr[1];
809 
810 	int read_1_ptr = start, read_2_ptr = start+items, write_ptr;
811 	scanning_events_record_t * merged_records = malloc(sizeof(scanning_events_record_t) * (items+items2));
812 
813 	for(write_ptr=0; write_ptr<items+items2; write_ptr++){
814 		if((read_1_ptr >= start+items)||(read_2_ptr < start+items+items2 && scanning_events_compare(arr, read_1_ptr, read_2_ptr) > 0))
815 			memcpy(merged_records+write_ptr, records+(read_2_ptr++), sizeof(scanning_events_record_t));
816 		else
817 			memcpy(merged_records+write_ptr, records+(read_1_ptr++), sizeof(scanning_events_record_t));
818 	}
819 	memcpy(records + start, merged_records, sizeof(scanning_events_record_t) * (items+items2));
820 	free(merged_records);
821 }
822 
scanning_events_exchange(void * arr,int l,int r)823 void scanning_events_exchange(void * arr, int l, int r){
824 	void ** arrr = (void **) arr;
825 	scanning_events_record_t * records = arrr[1];
826 
827 	unsigned long tmpi;
828 
829 	tmpi = records[l].scanning_positons;
830 	records[l].scanning_positons = records[r].scanning_positons;
831 	records[r].scanning_positons = tmpi;
832 
833 	tmpi = records[l].thread_bodytable_number;
834 	records[l].thread_bodytable_number = records[r].thread_bodytable_number;
835 	records[r].thread_bodytable_number = tmpi;
836 }
837 
838 
839 
840 #define _test_record_size	if(current_record_number >= current_record_size - 2){\
841 		current_record_size *= 1.5;\
842 		records=realloc(records, sizeof(scanning_events_record_t)*current_record_size);\
843 		if(NULL == records) return -1;\
844 	}\
845 
846 #define _add_record	records[current_record_number].scanning_positons = body -> event_small_side;\
847 	records[current_record_number].thread_bodytable_number = xx1;\
848 	current_record_number++;\
849 	records[current_record_number].scanning_positons = body -> event_large_side;\
850 	records[current_record_number].thread_bodytable_number = xx1;\
851 	current_record_number++;\
852 
853 
sort_junction_entry_table(global_context_t * global_context)854 int sort_junction_entry_table(global_context_t * global_context){
855 	indel_context_t * indel_context = (indel_context_t*)global_context -> module_contexts[MODULE_INDEL_ID];
856 	chromosome_event_t * event_space = indel_context -> event_space_dynamic;
857 
858 	if(indel_context -> event_entry_table){
859 		if(indel_context -> event_entry_table->appendix1)
860 		{
861 			free(indel_context -> event_entry_table -> appendix1);
862 			free(indel_context -> event_entry_table -> appendix2);
863 		}
864 
865 		// free the entry table: the pointers.
866 		destory_event_entry_table(indel_context -> event_entry_table);
867 		HashTableDestroy(indel_context -> event_entry_table);
868 	}
869 
870 	indel_context -> event_entry_table = HashTableCreate(399997);
871 	HashTableSetKeyComparisonFunction(indel_context->event_entry_table, localPointerCmp_forEventEntry);
872 	HashTableSetHashFunction(indel_context->event_entry_table, localPointerHashFunction_forEventEntry);
873 
874 	if(global_context -> config.use_bitmap_event_table) {
875 		indel_context -> event_entry_table -> appendix1=malloc(1024 * 1024 * 64);
876 		indel_context -> event_entry_table -> appendix2=malloc(1024 * 1024 * 64);
877 		memset(indel_context -> event_entry_table -> appendix1, 0, 1024 * 1024 * 64);
878 		memset(indel_context -> event_entry_table -> appendix2, 0, 1024 * 1024 * 64);
879 	} else {
880 		indel_context -> event_entry_table -> appendix1=NULL;
881 		indel_context -> event_entry_table -> appendix2=NULL;
882 	}
883 
884 	int xx1, current_record_number=0, current_record_size=10000;
885 
886 	scanning_events_record_t * records = malloc(sizeof(scanning_events_record_t)*current_record_size);
887 
888 	for(xx1 = 0; xx1 < indel_context -> total_events; xx1++){
889 		chromosome_event_t * body = event_space + xx1;
890 		_test_record_size;
891 		_add_record;
892 	}
893 
894 	void * sort_arr[2];
895 	sort_arr[0] = indel_context;
896 	sort_arr[1] = records;
897 
898 	// many repeated elements
899 	// do not use quick sort.
900 	merge_sort(sort_arr, current_record_number, scanning_events_compare, scanning_events_exchange, scanning_events_merge);
901 	unsigned int last_scannping_pos = records[0].scanning_positons;
902 	int merge_start = 0;
903 	HashTable * event_table = indel_context -> event_entry_table;
904 
905 	for(xx1 =0; xx1 <= current_record_number; xx1++){
906 		scanning_events_record_t * this_record = NULL;
907 		if(xx1<current_record_number) this_record = records+xx1;
908 
909 		if(xx1>0){
910 			if(xx1 == current_record_number || last_scannping_pos!= this_record -> scanning_positons){
911 				int merge_end = xx1, merge_i;
912 				if( merge_end-merge_start > MAX_EVENT_ENTRIES_PER_SITE )
913 					merge_end = merge_start + MAX_EVENT_ENTRIES_PER_SITE;
914 				unsigned int * id_list = malloc(sizeof(int) * (1+merge_end-merge_start));
915 				assert(id_list);
916 				id_list[0] = merge_end-merge_start;
917 				for(merge_i = merge_start; merge_i < merge_end; merge_i++){
918 					chromosome_event_t * body = _get_global_body(merge_i);
919 					id_list[merge_i - merge_start + 1] = records[merge_i].thread_bodytable_number + 1;
920 
921 					mark_event_bitmap(event_table->appendix1, body -> event_small_side);
922 					mark_event_bitmap(event_table->appendix2, body -> event_large_side);
923 				}
924 				merge_start = xx1;
925 
926 				//#warning "======= UNCOMMENT NEXT ======="
927 				HashTablePut(indel_context -> event_entry_table, NULL + last_scannping_pos, id_list);
928 			}
929 		}
930 		if(xx1 == current_record_number) break;
931 		last_scannping_pos = this_record -> scanning_positons;
932 	}
933 
934 	free(records);
935 	return 0;
936 }
937 
938 #define _get_event_body(iii) (records[iii].thread_no<0?( indel_context -> event_space_dynamic +  records[iii].thread_bodytable_number ):(((indel_thread_context_t*)(thread_contexts[records[iii].thread_no].module_thread_contexts[MODULE_INDEL_ID]))-> event_space_dynamic + records[iii].thread_bodytable_number ))
939 
940 typedef struct {
941 	unsigned int thread_bodytable_number;
942 	short thread_no;
943 } concatinating_events_record_t;
944 
945 #define _test_conc_size if(conc_rec_items >= conc_rec_size - 1){\
946 			conc_rec_size *= 1.5;\
947 			records = realloc(records, conc_rec_size * sizeof(concatinating_events_record_t));\
948 			if(NULL == records) return -1;\
949 		}
950 
conc_sort_compare(void * arr,int l,int r)951 int conc_sort_compare(void * arr, int l, int r){
952 	void ** arrr = (void **)arr;
953 	concatinating_events_record_t * records = arrr[0];
954 	indel_context_t * indel_context = arrr[1];
955 	thread_context_t * thread_contexts = arrr[2];
956 
957 	chromosome_event_t * body_l = _get_event_body(l);
958 	chromosome_event_t * body_r = _get_event_body(r);
959 
960 
961 	if(body_l -> event_small_side > body_r -> event_small_side)return 3;
962 	if(body_l -> event_small_side < body_r -> event_small_side)return -3;
963 	if(body_l -> event_large_side > body_r -> event_large_side)return 3;
964 	if(body_l -> event_large_side < body_r -> event_large_side)return -3;
965 
966 	if(abs(body_l -> indel_length) < abs(body_r -> indel_length)) return 2;
967 	if(abs(body_l -> indel_length) > abs(body_r -> indel_length)) return -2;
968 	if(body_l -> indel_length > body_r -> indel_length) return -2; // same length, but L is del and R is ins -- prefer del than ins
969 	if(body_l -> indel_length < body_r -> indel_length) return 2;
970 
971 	if((body_l -> is_donor_found_or_annotation & 64)!=0 && (body_r -> is_donor_found_or_annotation & 64)==0) return 1; // prefer known truth.
972 	if((body_l -> is_donor_found_or_annotation & 64)==0 && (body_r -> is_donor_found_or_annotation & 64)!=0) return -1;
973 	if(body_l -> supporting_reads > body_r -> supporting_reads) return -1;
974 	if(body_l -> supporting_reads < body_r -> supporting_reads) return 1;
975 	return 0;
976 
977 }
978 
979 
conc_sort_merge(void * arr,int start,int items,int items2)980 void conc_sort_merge(void * arr,  int start, int items, int items2){
981 	void ** arrr = (void **) arr;
982 	concatinating_events_record_t * records = arrr[0];
983 
984 	int read_1_ptr = start, read_2_ptr = start+items, write_ptr;
985 	concatinating_events_record_t * merged_records = malloc(sizeof(concatinating_events_record_t) * (items+items2));
986 
987 	for(write_ptr=0; write_ptr<items+items2; write_ptr++){
988 		if((read_1_ptr >= start+items)||(read_2_ptr < start+items+items2 && conc_sort_compare(arr, read_1_ptr, read_2_ptr) > 0))
989 			memcpy(merged_records+write_ptr, records+(read_2_ptr++), sizeof(concatinating_events_record_t));
990 		else
991 			memcpy(merged_records+write_ptr, records+(read_1_ptr++), sizeof(concatinating_events_record_t));
992 	}
993 	memcpy(records + start, merged_records, sizeof(concatinating_events_record_t) * (items+items2));
994 	free(merged_records);
995 }
996 
997 
998 
conc_sort_exchange(void * arr,int l,int r)999 void conc_sort_exchange(void * arr, int l, int r){
1000 
1001 	void ** arrr = (void **)arr;
1002 	concatinating_events_record_t * records = arrr[0];
1003 
1004 	unsigned int tmpi;
1005 	tmpi = records[l].thread_bodytable_number;
1006 	records[l].thread_bodytable_number =  records[r].thread_bodytable_number;
1007 	records[r].thread_bodytable_number = tmpi;
1008 
1009 	tmpi = records[l].thread_no;
1010 	records[l].thread_no =  records[r].thread_no;
1011 	records[r].thread_no = tmpi;
1012 }
1013 
1014 // sort and merge events from all threads and the global event space.
sort_global_event_table(global_context_t * global_context)1015 int sort_global_event_table(global_context_t * global_context){
1016 	return finalise_indel_and_junction_thread(global_context, NULL, STEP_VOTING);
1017 }
1018 
finalise_indel_and_junction_thread(global_context_t * global_context,thread_context_t * thread_contexts,int task)1019 int finalise_indel_and_junction_thread(global_context_t * global_context, thread_context_t * thread_contexts, int task)
1020 {
1021 	indel_context_t * indel_context = (indel_context_t*)global_context -> module_contexts[MODULE_INDEL_ID];
1022 	if(task == STEP_VOTING) {
1023 		concatinating_events_record_t * records;
1024 		int conc_rec_size = 10000, conc_rec_items = 0;
1025 		records = malloc(sizeof(concatinating_events_record_t) * conc_rec_size);
1026 
1027 		int xk1, thn;
1028 		if(thread_contexts)
1029 			for(thn =0; thn < global_context->config.all_threads; thn++){
1030 				thread_context_t * thread_context = thread_contexts+thn;
1031 				indel_thread_context_t * indel_thread_context = (indel_thread_context_t*)thread_context -> module_thread_contexts[MODULE_INDEL_ID];
1032 
1033 				for(xk1 = 0; xk1 < indel_thread_context -> total_events; xk1++){
1034 					chromosome_event_t * old_body = indel_thread_context  ->  event_space_dynamic + xk1;
1035 					if(old_body -> event_type == CHRO_EVENT_TYPE_REMOVED) continue;
1036 
1037 					_test_conc_size;
1038 					records[conc_rec_items].thread_no=thn;
1039 					records[conc_rec_items++].thread_bodytable_number=xk1;
1040 				}
1041 			}
1042 
1043 		for(xk1 = 0; xk1 < indel_context -> total_events; xk1++){
1044 			chromosome_event_t * old_body = indel_context ->  event_space_dynamic + xk1;
1045 			if(old_body -> event_type == CHRO_EVENT_TYPE_REMOVED) continue;
1046 			_test_conc_size;
1047 			records[conc_rec_items].thread_no=-1;
1048 			records[conc_rec_items++].thread_bodytable_number=xk1;
1049 		}
1050 
1051 		void * sort_arr[3];
1052 		sort_arr[0] = records;
1053 		sort_arr[1] = indel_context;
1054 		sort_arr[2] = thread_contexts;
1055 
1056 		// many repeated elements -- do not use quick sort.
1057 		merge_sort(sort_arr, conc_rec_items, conc_sort_compare, conc_sort_exchange, conc_sort_merge);
1058 
1059 		chromosome_event_t * prev_env = NULL;
1060 		int merge_target_size = 10000;
1061 		int merge_target_items = 0;
1062 		chromosome_event_t * merge_target = malloc(sizeof(chromosome_event_t) * merge_target_size);
1063 
1064 		int merge_start = 0;
1065 		for(xk1 = 0; xk1 <= conc_rec_items; xk1++){
1066 			chromosome_event_t * this_event = (xk1 == conc_rec_items)?NULL:_get_event_body(xk1);
1067 			if(xk1 > 0){
1068 				int compret = 0;
1069 				if(xk1 < conc_rec_items) compret = conc_sort_compare(sort_arr, xk1-1, xk1);
1070 				if(abs(compret)>1 || xk1 == conc_rec_items){// different events or last one -- merge [ prev_event_record_no , xk1 - 1 ]
1071 					int xk_merge;
1072 					// find a new slot in the target space
1073 					if(merge_target_items >= merge_target_size - 1){
1074 						merge_target_size *= 1.5;
1075 						merge_target = realloc(merge_target, sizeof(chromosome_event_t) * merge_target_size);
1076 					}
1077 
1078 					chromosome_event_t * merged_body = merge_target + merge_target_items;
1079 					memcpy( merged_body, prev_env, sizeof(chromosome_event_t) );
1080 					merged_body -> global_event_id = merge_target_items++;
1081 
1082 					for(xk_merge = merge_start; xk_merge < xk1 - 1; xk_merge++){
1083 						chromosome_event_t * old_body = _get_event_body(xk_merge);
1084 
1085 						assert(merged_body -> event_type == old_body -> event_type);
1086 						merged_body -> supporting_reads += old_body -> supporting_reads;
1087 						merged_body -> anti_supporting_reads += old_body -> anti_supporting_reads;
1088 						merged_body -> final_counted_reads += old_body -> final_counted_reads;
1089 						merged_body -> final_reads_mismatches += old_body -> final_reads_mismatches;
1090 						merged_body -> critical_supporting_reads += old_body -> critical_supporting_reads;
1091 						merged_body -> junction_flanking_left = max(merged_body -> junction_flanking_left, old_body -> junction_flanking_left);
1092 						merged_body -> junction_flanking_right = max(merged_body -> junction_flanking_right, old_body -> junction_flanking_right);
1093 						merged_body -> is_donor_found_or_annotation |= old_body -> is_donor_found_or_annotation;
1094 
1095 						if(merged_body -> connected_next_event_distance > 0 && old_body -> connected_next_event_distance > 0)
1096 							merged_body -> connected_next_event_distance = min(merged_body -> connected_next_event_distance, old_body -> connected_next_event_distance);
1097 						else	merged_body -> connected_next_event_distance = max(merged_body -> connected_next_event_distance, old_body -> connected_next_event_distance);
1098 
1099 						if(merged_body -> connected_previous_event_distance > 0 && old_body -> connected_previous_event_distance > 0)
1100 							merged_body -> connected_previous_event_distance = min(merged_body -> connected_previous_event_distance, old_body -> connected_previous_event_distance);
1101 						else    merged_body -> connected_previous_event_distance = max(merged_body -> connected_previous_event_distance, old_body -> connected_previous_event_distance);
1102 
1103 						if(old_body->inserted_bases && old_body -> event_type == CHRO_EVENT_TYPE_INDEL  && old_body -> indel_length<0){
1104 							//printf("OCT27-FREEMEM-INS [%d] : thread-%d + %d %u-%p\n", xk_merge, records[xk_merge].thread_no, records[xk_merge].thread_bodytable_number, old_body->event_small_side, old_body->inserted_bases );
1105 							if(merged_body -> inserted_bases && merged_body -> inserted_bases != old_body -> inserted_bases){
1106 					//			SUBREADprintf("FREE PTR=%p\n", old_body -> inserted_bases);
1107 								free(old_body -> inserted_bases);
1108 							}
1109 							else merged_body -> inserted_bases = old_body -> inserted_bases;
1110 							old_body -> inserted_bases = NULL;
1111 						}
1112 					}
1113 					merge_start = xk1;
1114 				}
1115 			}
1116 			if(xk1 == conc_rec_items) break;
1117 			prev_env = this_event;
1118 		}
1119 
1120 		free(records);
1121 
1122 		if(thread_contexts)
1123 			for(thn =0; thn < global_context->config.all_threads; thn++){
1124 				thread_context_t * thread_context = thread_contexts+thn;
1125 				indel_thread_context_t * indel_thread_context = (indel_thread_context_t*)thread_context -> module_thread_contexts[MODULE_INDEL_ID];
1126 
1127 				destory_event_entry_table(indel_thread_context -> event_entry_table);
1128 				HashTableDestroy(indel_thread_context -> event_entry_table);
1129 
1130 				free(indel_thread_context -> event_space_dynamic);
1131 
1132 				for(xk1=0;xk1<MAX_READ_LENGTH; xk1++)
1133 				{
1134 					free(indel_thread_context -> dynamic_align_table[xk1]);
1135 					free(indel_thread_context -> dynamic_align_table_mask[xk1]);
1136 				}
1137 
1138 				free(indel_thread_context->dynamic_align_table);
1139 				free(indel_thread_context->dynamic_align_table_mask);
1140 				free(indel_thread_context);
1141 			}
1142 
1143 		if(indel_context -> event_space_dynamic ) free(indel_context -> event_space_dynamic);
1144 		indel_context -> event_space_dynamic = NULL;
1145 
1146 		indel_context -> event_space_dynamic = merge_target;
1147 		indel_context -> current_max_event_number = merge_target_size;
1148 		indel_context -> total_events = merge_target_items;
1149 	} else if(task == STEP_ITERATION_TWO) {
1150 		int xk1, thn;
1151 		if(thread_contexts)
1152 			for(thn =0; thn < global_context->config.all_threads; thn++){
1153 				thread_context_t * thread_context = thread_contexts+thn;
1154 				indel_thread_context_t * indel_thread_context = (indel_thread_context_t*)thread_context -> module_thread_contexts[MODULE_INDEL_ID];
1155 
1156 				for(xk1 = 0; xk1 < indel_thread_context -> total_events; xk1++)
1157 				{
1158 					indel_context -> event_space_dynamic [xk1] . final_counted_reads +=indel_thread_context -> final_counted_reads_array[xk1];
1159 					indel_context -> event_space_dynamic [xk1] . final_reads_mismatches +=indel_thread_context -> final_reads_mismatches_array[xk1];
1160 				}
1161 				free(indel_thread_context -> final_counted_reads_array);
1162 				free(indel_thread_context -> final_reads_mismatches_array);
1163 				free(indel_thread_context);
1164 			}
1165 	}
1166 	return 0;
1167 }
1168 
1169 
add_annotation_to_junctions(void * key,void * val,HashTable * tab)1170 void add_annotation_to_junctions(void * key, void * val, HashTable * tab){
1171 	global_context_t * global_context = tab->appendix1;
1172 	indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
1173 	char * gene_chr = key;
1174 	int * pos_strands = val;
1175 	int current_lagre_end = -1;
1176 	char chroname[MAX_CHROMOSOME_NAME_LEN];
1177 
1178 	int x1, ww=0, wi=0;
1179 	for(x1 =0 ; ; x1++){
1180 		if(ww){
1181 			chroname[wi++]=gene_chr[x1];
1182 			chroname[wi]=0;
1183 		}
1184 		if(0==gene_chr[x1]) break;
1185 		if(gene_chr[x1]==':')
1186 			ww=1;
1187 	}
1188 
1189 	//#warning ">>>>>>> COMMENt NEXT <<<<<<<<<<<<<<"
1190 	//if(1) SUBREADprintf("SEE LOCATION: %s has %d\n",  gene_chr ,  pos_strands[0]+1 );
1191 
1192 	for(x1 = 1; x1 < pos_strands[0]+1; x1+=3){
1193 		if(pos_strands[x1]==0)break;
1194 		//#warning ">>>>>>> COMMENt NEXT <<<<<<<<<<<<<<"
1195 		//SUBREADprintf("SEE LOCATION ___ : %d >0 && %d < %d %d %d\n", current_lagre_end , current_lagre_end,  pos_strands[x1],  pos_strands[x1+1],  pos_strands[x1+2]);
1196 		if(current_lagre_end > 0 && current_lagre_end < pos_strands[x1]){
1197 			// add a junction between current_lagre_end and pos_strands[x1]
1198 			if(indel_context -> total_events >= indel_context -> current_max_event_number -1){
1199 				indel_context -> current_max_event_number*=1.2;
1200 				indel_context -> event_space_dynamic = realloc(indel_context -> event_space_dynamic , sizeof( chromosome_event_t )*indel_context -> current_max_event_number);
1201 			}
1202 			chromosome_event_t * newbody = indel_context -> event_space_dynamic  + indel_context -> total_events ;
1203 			memset(newbody, 0, sizeof(chromosome_event_t));
1204 			newbody -> global_event_id = indel_context -> total_events++;
1205 			newbody -> event_type = CHRO_EVENT_TYPE_JUNCTION;
1206 
1207 			newbody -> event_small_side = linear_gene_position(&global_context->chromosome_table , chroname, current_lagre_end);
1208 			newbody -> event_large_side = linear_gene_position(&global_context->chromosome_table , chroname, pos_strands[x1]);
1209 
1210 			//#warning ">>>>>>> COMMENt NEXT <<<<<<<<<<<<<<"
1211 			if(0){
1212 				if ( 1 || newbody -> event_small_side == 625683723){
1213 					SUBREADprintf("Loaded event :");
1214 					debug_show_event(global_context, newbody);
1215 				}
1216 			}
1217 			newbody -> is_negative_strand = pos_strands[x1+2];
1218 			newbody -> is_donor_found_or_annotation = 127;
1219 		//	printf("09NOV: add_annotation_to_junctions: %u -> %u '%s', %d,%d\n", newbody -> event_small_side ,  newbody -> event_large_side, chroname, current_lagre_end, pos_strands[x1]);
1220 		}
1221 
1222 		if(current_lagre_end < pos_strands[x1+1])
1223 			current_lagre_end = pos_strands[x1+1];
1224 	}
1225 }
1226 
1227 
1228 typedef struct {
1229 	global_context_t * global_context;
1230 	HashTable * feature_sorting_table;
1231 } do_load_juncs_context_t;
1232 
do_juncs_add_feature(char * gene_name,char * transcript_id,char * chro_name,unsigned int feature_start,unsigned int feature_end,int is_negative_strand,void * context)1233 int do_juncs_add_feature(char * gene_name, char * transcript_id, char * chro_name, unsigned int feature_start, unsigned int feature_end, int is_negative_strand, void * context){
1234 	//#warning ">>>>>>>>>>>>>>>>> COMMENT NEXT <<<<<<<<<<<<<<<<<<<<"
1235 	//return 0;
1236 	//#warning ">>>>>>>>>>>>>>>>> COMMENT NEXT <<<<<<<<<<<<<<<<<<<<"
1237 	//SUBREADprintf("INJ LOCS: %s : %u, %u\n", chro_name, feature_start, feature_end);
1238 	do_load_juncs_context_t * do_load_juncs_context = context;
1239 	HashTable * feature_sorting_table = do_load_juncs_context -> feature_sorting_table;
1240 
1241 	global_context_t * global_context = do_load_juncs_context -> global_context;
1242 	char tmp_chro_name[MAX_CHROMOSOME_NAME_LEN];
1243 	if(global_context -> sam_chro_to_anno_chr_alias){
1244 		char * sam_chro = get_sam_chro_name_from_alias(global_context -> sam_chro_to_anno_chr_alias, chro_name);
1245 		if(sam_chro!=NULL) chro_name = sam_chro;
1246 	}
1247 	int access_n = HashTableGet( global_context -> chromosome_table.read_name_to_index, chro_name ) - NULL;
1248 	if(access_n < 1){
1249 		if(chro_name[0]=='c' && chro_name[1]=='h' && chro_name[2]=='r'){
1250 			chro_name += 3;
1251 		}else{
1252 			strcpy(tmp_chro_name, "chr");
1253 			strcat(tmp_chro_name, chro_name);
1254 			chro_name = tmp_chro_name;
1255 		}
1256 	}
1257 
1258 	char sort_key[ MAX_CHROMOSOME_NAME_LEN * 3 + 2 ];
1259 	sprintf(sort_key, "%s:%s", gene_name, chro_name);
1260 	int * old_features = HashTableGet(feature_sorting_table, sort_key), x1;
1261 	int written_space = -1, written_last_item = 0;
1262 	if(old_features){
1263 		int has_space = 0;
1264 		for(x1= 1; x1 < old_features[0] + 1 ; x1+=3){
1265 			if(old_features[x1]==0){
1266 				has_space = 1;
1267 				break;
1268 			}
1269 		}
1270 		if(!has_space){
1271 			int old_size = old_features[0];
1272 			old_features[0] *= 1.5;
1273 			old_features[0] -= old_features[0]%3;
1274 
1275 			// I used malloc but not realloc here, because the old memory block will be freed by HashTablePutReplace.
1276 			int * old_features2 = malloc( sizeof(int) * (old_features[0] +1));
1277 			memcpy(old_features2, old_features, (old_size + 1) * sizeof(int));
1278 			old_features2[old_size + 1] = 0;
1279 
1280 			HashTablePutReplace(feature_sorting_table, sort_key, old_features2, 0);
1281 			old_features = old_features2;
1282 		}
1283 		for(x1= 1; x1 < old_features[0] + 1 ; x1+=3){
1284 			if(old_features[x1]> feature_start || old_features[x1]==0){
1285 				written_space = x1;
1286 				if(old_features[x1]> feature_start){
1287 					int x2;
1288 					for(x2 = x1+3; x2 <  old_features[0] + 1;x2+=3){
1289 						if(old_features[x2]==0)break;
1290 					}
1291 					x2 += 2;
1292 					assert(x2 < old_features[0] + 1);
1293 					if(x2 + 1 <  old_features[0] + 1) old_features[x2+1]=0;
1294 
1295 					for(; x2 >= x1 + 3; x2 --)
1296 						old_features[x2] = old_features[x2-3];
1297 				} else written_last_item = 1;
1298 				break;
1299 			}
1300 		}
1301 	}else{
1302 		int init_space_sort = 15;
1303 		old_features = malloc((init_space_sort + 1) * sizeof(int));
1304 		old_features[0]=init_space_sort;
1305 		char * mm_sort_key = malloc(strlen(sort_key)+1);
1306 		strcpy(mm_sort_key, sort_key);
1307 		HashTablePut(feature_sorting_table, mm_sort_key, old_features);
1308 		written_space = 1;
1309 		written_last_item = 1;
1310 	}
1311 	assert(written_space >0);
1312 	old_features[written_space]=feature_start - 1;
1313 	old_features[written_space+1]=feature_end - 1;
1314 	old_features[written_space+2]=is_negative_strand;
1315 	if(written_last_item && written_space + 3 < old_features[0]+1) old_features[written_space+3]=0;
1316 
1317 	return 0;
1318 }
1319 
load_known_junctions(global_context_t * global_context)1320 int load_known_junctions(global_context_t * global_context){
1321 
1322 	HashTable * feature_sorting_table = HashTableCreate(90239);
1323 	HashTableSetKeyComparisonFunction(feature_sorting_table, my_strcmp);
1324 	HashTableSetHashFunction(feature_sorting_table, HashTableStringHashFunction);
1325 	HashTableSetDeallocationFunctions(feature_sorting_table, free, free);
1326 
1327 	do_load_juncs_context_t do_load_juncs_context;
1328 	memset(&do_load_juncs_context, 0, sizeof(do_load_juncs_context));
1329 
1330 	do_load_juncs_context.global_context = global_context;
1331 	do_load_juncs_context.feature_sorting_table = feature_sorting_table;
1332 
1333 	int features = load_features_annotation(global_context->config.exon_annotation_file , global_context->config.exon_annotation_file_type, global_context->config.exon_annotation_gene_id_column, NULL, global_context->config.exon_annotation_feature_name_column, &do_load_juncs_context, do_juncs_add_feature);
1334 
1335 	feature_sorting_table -> appendix1 = global_context;
1336 	HashTableIteration(feature_sorting_table, add_annotation_to_junctions);
1337 	HashTableDestroy(feature_sorting_table);
1338 
1339 	if(features <0)return -1;
1340 	else{
1341 		return 0;
1342 	}
1343 }
1344 
1345 
there_are_events_in_range(char * bitmap,unsigned int pos,int sec_len)1346 int there_are_events_in_range(char * bitmap, unsigned int pos, int sec_len)
1347 {
1348 	int ret = 0;
1349 	if(!bitmap) return 1;
1350 	unsigned int offset = pos >> 3;
1351 	unsigned int offset_end = (pos + sec_len) >> 3;
1352 	unsigned int offset_byte = (offset >> 3)&0x3ffffff;
1353 	unsigned int offset_byte_end =1+((offset_end >> 3)&0x3ffffff);
1354 	for(; offset_byte < offset_byte_end ; offset_byte++)
1355 	{
1356 		if(bitmap[offset_byte]){
1357 			ret = 1;
1358 			break;
1359 		}
1360 	}
1361 //	printf("TEST_JUMP: THERE ARE %d at %u\n" , ret , pos);
1362 	return ret;
1363 
1364 }
1365 // bitmap has 64M bytes.
check_event_bitmap(unsigned char * bitmap,unsigned int pos)1366 int check_event_bitmap(unsigned char * bitmap, unsigned int pos)
1367 {
1368 	unsigned int offset = pos >> 3;
1369 	unsigned int offset_byte = (offset >> 3)&0x3ffffff;
1370 	char keybyte = bitmap[offset_byte];
1371 	if(!keybyte) return 0;
1372 
1373 	int offset_bit = offset & 7;
1374 	return (keybyte & (1<<offset_bit))!=0;
1375 }
1376 
mark_event_bitmap(unsigned char * bitmap,unsigned int pos)1377 void mark_event_bitmap(unsigned char * bitmap, unsigned int pos)
1378 {
1379 
1380 	// bitmap is as large as 64MBytes
1381 	// pos_all_bit = pos / 8
1382 	// pos_byte = pos_all_bit / 8
1383 	// pos_bit  = pos_all_bit % 8
1384 
1385 	unsigned int offset = pos >> 3;
1386 	unsigned int offset_byte = (offset >> 3)&0x3ffffff;
1387 	int offset_bit = offset & 7;
1388 	bitmap[offset_byte] |= (1<<offset_bit);
1389 }
1390 
1391 
put_new_event(HashTable * event_table,chromosome_event_t * new_event,int event_no)1392 void put_new_event(HashTable * event_table, chromosome_event_t * new_event , int event_no)
1393 {
1394 	unsigned int sides[2];
1395 	sides[0] = new_event->event_small_side;
1396 	sides[1] = new_event->event_large_side;
1397 
1398 	new_event -> global_event_id = event_no;
1399 	int xk1, xk2;
1400 
1401 	for(xk1=0; xk1<2; xk1++)
1402 	{
1403 		if(0 == sides[xk1])continue ;
1404 		unsigned int * id_list = HashTableGet(event_table, NULL+sides[xk1]);
1405 		if(!id_list)
1406 		{
1407 			id_list = malloc(sizeof(unsigned int)*(1+EVENT_ENTRIES_INIT_SIZE));
1408 			id_list[0]=EVENT_ENTRIES_INIT_SIZE;
1409 			id_list[1]=0;
1410 			HashTablePut(event_table , NULL+sides[xk1] , id_list);
1411 		}
1412 
1413 		for(xk2=1;xk2< id_list[0] ; xk2++)
1414 			if(id_list[xk2]==0) break;
1415 
1416 		if(xk2 <  id_list[0])
1417 			id_list[xk2] = event_no+1;
1418 		if(xk2 < id_list[0]) id_list[xk2+1] = 0;
1419 	}
1420 
1421 	if(event_table->appendix1)
1422 	{
1423 		mark_event_bitmap(event_table->appendix1, sides[0]);
1424 		mark_event_bitmap(event_table->appendix2, sides[1]);
1425 	}
1426 }
search_event(global_context_t * global_context,HashTable * event_table,chromosome_event_t * event_space,unsigned int pos,int search_type,unsigned char event_type,chromosome_event_t ** return_buffer)1427 int search_event(global_context_t * global_context, HashTable * event_table, chromosome_event_t * event_space, unsigned int pos, int search_type, unsigned char event_type, chromosome_event_t ** return_buffer)
1428 {
1429 	int ret = 0;
1430 
1431 	if(pos < 1 || pos > 0xffff0000)return 0;
1432 
1433 	if(event_table->appendix1)
1434 	{
1435 		if(search_type == EVENT_SEARCH_BY_SMALL_SIDE)
1436 			if(! check_event_bitmap(event_table -> appendix1, pos))return 0;
1437 
1438 		if(search_type == EVENT_SEARCH_BY_LARGE_SIDE)
1439 			if(! check_event_bitmap(event_table -> appendix2, pos))return 0;
1440 
1441 		if(search_type == EVENT_SEARCH_BY_BOTH_SIDES)
1442 			if(!(check_event_bitmap(event_table -> appendix1, pos) || check_event_bitmap(event_table -> appendix2, pos)))
1443 				return 0;
1444 	}
1445 
1446 	unsigned int * res = HashTableGet(event_table, NULL+pos);
1447 	if(res)
1448 	{
1449 		int xk2;
1450 		int current_size = res[0]&0x0fffffff;
1451 		for(xk2=1; xk2< current_size+1 ; xk2++)
1452 		{
1453 			if(!res[xk2])break;
1454 			//if(res[xk2] - 1>= ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> current_max_event_number ) { SUBREADprintf("FATAL ERROR: Event id out-of-boundary: %u > %u.\n", res[xk2],  ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> current_max_event_number ); continue;}
1455 			chromosome_event_t * event_body = &event_space[res[xk2]-1];
1456 
1457 			if((event_body -> event_type & event_type) == 0)continue;
1458 			//SUBREADprintf("VB1:%u\n", res[xk2]);
1459 			if(search_type == EVENT_SEARCH_BY_SMALL_SIDE && event_body -> event_small_side != pos)continue;
1460 			if(search_type == EVENT_SEARCH_BY_LARGE_SIDE && event_body -> event_large_side != pos)continue;
1461 			if(search_type == EVENT_SEARCH_BY_BOTH_SIDES && event_body -> event_small_side != pos && event_body -> event_large_side != pos)continue;
1462 
1463 			return_buffer[ret++] = event_body;
1464 		}
1465 	}
1466 
1467 	return ret;
1468 }
1469 
1470 
get_insertion_sequence(global_context_t * global_context,thread_context_t * thread_context,char * binary_bases,char * read_text,int insertions)1471 void get_insertion_sequence(global_context_t * global_context, thread_context_t * thread_context , char * binary_bases , char * read_text , int insertions)
1472 {
1473 	int xk1;
1474 	read_text[0]=0;
1475 
1476 	for(xk1=0; xk1<insertions; xk1++)
1477 	{
1478 		int byte_no = xk1/4;
1479 		int bit_no = 2*(xk1%4);
1480 
1481 		read_text[xk1] = int2base((3&(binary_bases[byte_no]>> bit_no)));
1482 	}
1483 	read_text[insertions]=0;
1484 }
1485 
set_insertion_sequence(global_context_t * global_context,thread_context_t * thread_context,char ** binary_bases,char * read_text,int insertions)1486 void set_insertion_sequence(global_context_t * global_context, thread_context_t * thread_context , char ** binary_bases , char * read_text , int insertions)
1487 {
1488 	int xk1;
1489 
1490 	(*binary_bases) = malloc((1+insertions)/4+2);
1491 	//SUBREADprintf("ALLOC PTR=%p\n", (*binary_bases) );
1492 
1493 	assert(insertions <= MAX_INSERTION_LENGTH);
1494 	memset((*binary_bases),0, (1+insertions)/4+2);
1495 
1496 	for(xk1=0; xk1<insertions;xk1++)
1497 	{
1498 		int byte_no = xk1/4;
1499 		int bit_no = 2*(xk1%4);
1500 
1501 		*((*binary_bases)+byte_no) |= (base2int(read_text[xk1]))<<bit_no;
1502 	}
1503 }
1504 
local_add_indel_event(global_context_t * global_context,thread_context_t * thread_context,HashTable * event_table,char * read_text,unsigned int left_edge,int indels,int score_supporting_read_added,int is_ambiguous,int mismatched_bases,int * old_event_id)1505 chromosome_event_t * local_add_indel_event(global_context_t * global_context, thread_context_t * thread_context, HashTable * event_table, char * read_text, unsigned int left_edge, int indels, int score_supporting_read_added, int is_ambiguous, int mismatched_bases,int * old_event_id)
1506 {
1507 		chromosome_event_t * found = NULL;
1508 		chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
1509 
1510 		chromosome_event_t * event_space = NULL;
1511 		if(thread_context)
1512 			event_space = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
1513 		else
1514 			event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
1515 
1516 
1517 		int found_events = search_event(global_context, event_table, event_space, left_edge, EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_INDEL|CHRO_EVENT_TYPE_LONG_INDEL, search_return);
1518 
1519 		//#warning ">>>>>>>>>>>>>>>>>> COMMENt THIS <<<<<<<<<<<<<<<<<<<<<<<"
1520 		//printf("OCT27-STEPRR-TST LEFT=%u, INDEL=%d; FOUND=%d\n", left_edge , indels , found_events);
1521 
1522 
1523 		if(found_events)
1524 		{
1525 			int kx1;
1526 			for(kx1 = 0; kx1 < found_events ; kx1++)
1527 			{
1528 				if(search_return[kx1] -> indel_length == indels)
1529 				{
1530 					found = search_return[kx1];
1531 					break;
1532 				}
1533 			}
1534 		}
1535 
1536 		if(found){
1537 			if(old_event_id) (*old_event_id)=found->global_event_id;
1538 			found -> supporting_reads += score_supporting_read_added;
1539 			//found -> is_ambiguous = max(is_ambiguous , found -> is_ambiguous );
1540 			return NULL;
1541 		}
1542 		else
1543 		{
1544 			int event_no;
1545 			//event_no =(thread_context)?((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> total_events: ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) ->  total_events;
1546 
1547 			if(thread_context)
1548 				event_no = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> total_events ++;
1549 			else
1550 				event_no = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) ->  total_events ++;
1551 
1552 			event_space = reallocate_event_space(global_context, thread_context, event_no);
1553 
1554 
1555 			chromosome_event_t * new_event = event_space+event_no;
1556 			memset(new_event,0,sizeof(chromosome_event_t));
1557 			if(indels <0)	// if it is an insertion:
1558 				set_insertion_sequence(global_context, thread_context , &new_event -> inserted_bases , read_text, -indels);
1559 
1560 			new_event -> event_small_side = left_edge;
1561 			new_event -> event_large_side = left_edge + 1 + max(0, indels);
1562 			//assert(new_event -> event_small_side && new_event -> event_large_side);
1563 			new_event -> event_type = CHRO_EVENT_TYPE_INDEL;
1564 			new_event -> indel_length = indels;
1565 			new_event -> supporting_reads = 1;
1566 			new_event -> event_quality = 1;//pow(0.5 , 3*mismatched_bases);
1567 			//new_event -> is_ambiguous = is_ambiguous;
1568 
1569 			//#warning ">>>>>>>>>>>>>>>>>> COMMENt THIS <<<<<<<<<<<<<<<<<<<<<<<"
1570 			//printf("OCT27-STEPRR-NEW LEFT=%u RIGHT=%u, INDEL=%d\n", new_event -> event_small_side ,  new_event -> event_large_side, new_event -> indel_length );
1571 
1572 			put_new_event(event_table, new_event , event_no);
1573 			return new_event;
1574 		}
1575 
1576 }
1577 
1578 float EXON_RECOVER_MATCHING_RATE = 0.9;
1579 float EXON_INDEL_MATCHING_RATE_TAIL = 0.8;
1580 float EXON_INDEL_MATCHING_RATE_HEAD = 0.8;
1581 
1582 
1583 
1584 
core_extend_covered_region_15(global_context_t * global_context,gene_value_index_t * array_index,unsigned int read_start_pos,char * read,int read_len,int cover_start,int cover_end,int window_size,int req_match_5end,int req_match_3end,int indel_tolerance,int space_type,int tail_indel,short * head_indel_pos,int * head_indel_movement,short * tail_indel_pos,int * tail_indel_movement,int is_head_high_quality,char * qual_txt,int qual_format,float head_matching_rate,float tail_matching_rate)1585 int core_extend_covered_region_15(global_context_t * global_context, gene_value_index_t *array_index, unsigned int read_start_pos, char * read, int read_len, int cover_start, int cover_end, int window_size, int req_match_5end , int req_match_3end, int indel_tolerance, int space_type, int tail_indel, short * head_indel_pos, int * head_indel_movement, short * tail_indel_pos, int * tail_indel_movement, int is_head_high_quality, char * qual_txt, int qual_format, float head_matching_rate, float tail_matching_rate){
1586 	int is_head;
1587 
1588 	if(0){
1589 		char posout[100];
1590 		absoffset_to_posstr(global_context, read_start_pos, posout);
1591 		SUBREADprintf("RTXT=%s, MAPP=%s\n", read, posout);
1592 	}
1593 
1594 	for(is_head = 0; is_head < 2; is_head ++){
1595 		int move_i;
1596 		int best_match_n = -1, best_movement = 0;
1597 		for(move_i = 0; move_i < 2* indel_tolerance-1; move_i ++){
1598 			int indel_move = (move_i+1)/2 * (move_i %2?1:-1);
1599 			int indel_movement = indel_move + (is_head?0:tail_indel);
1600 			int this_match_n;
1601 
1602 			if(is_head)
1603 				this_match_n = match_chro(read, array_index, read_start_pos - indel_movement, window_size, 0, space_type );
1604 			else
1605 				this_match_n = match_chro(read + read_len - window_size, array_index, read_start_pos + read_len - window_size + indel_movement, window_size, 0, space_type );
1606 
1607 			// indel_movement : negativ = insertion, positive = deletion before or after the covered region
1608 
1609 			//SUBREADprintf("MOVE: HEAD=%d, MATCH=%d, MOV=%d\n", is_head, this_match_n, indel_movement);
1610 			if(this_match_n > best_match_n){
1611 				best_match_n = this_match_n;
1612 				best_movement = indel_movement;
1613 			}
1614 		}
1615 
1616 		int max_score=-1, best_splice_on_read=0;
1617 		if(best_match_n >0 && best_movement!=0){
1618 			int test_start = is_head?window_size:(cover_end);
1619 			int test_end = is_head?cover_start - max(0, -best_movement):(read_len - window_size - max(0, -best_movement));
1620 			int indel_first_piece_end_on_read, indel_second_piece_start_on_read;
1621 
1622 			for(indel_first_piece_end_on_read = test_start ; indel_first_piece_end_on_read < test_end; indel_first_piece_end_on_read++){
1623 				unsigned int indel_first_piece_end_on_chro = read_start_pos + indel_first_piece_end_on_read + (is_head? -best_movement :tail_indel);
1624 				indel_second_piece_start_on_read = indel_first_piece_end_on_read - min(0, best_movement);
1625 				unsigned int indel_second_piece_start_on_chro = indel_first_piece_end_on_chro + max(0,best_movement);
1626 				int first_piece_match = match_chro(read + indel_first_piece_end_on_read - window_size , array_index , indel_first_piece_end_on_chro - window_size, window_size, 0, space_type);
1627 				int second_piece_match = match_chro(read + indel_second_piece_start_on_read , array_index , indel_second_piece_start_on_chro, window_size, 0, space_type);
1628 				int score = first_piece_match + second_piece_match;
1629 				if(score > max_score){
1630 					max_score = score;
1631 					best_splice_on_read = indel_first_piece_end_on_read;
1632 				}
1633 				if(score == 2*window_size)break;
1634 				//SUBREADprintf("FIND_SPLICE_POINT: IS_HEAD=%d, FIRST_PIECE_END=%d-%d, CHRO=%u-%u MATCHED=%d,%d\n", is_head, indel_first_piece_end_on_read, indel_second_piece_start_on_read, indel_first_piece_end_on_chro, indel_second_piece_start_on_chro, first_piece_match, second_piece_match);
1635 			}
1636 		}
1637 
1638 		if(max_score >=  2*window_size - 1){
1639 			if(is_head){
1640 				(*head_indel_pos) = best_splice_on_read;
1641 				(*head_indel_movement) = best_movement;
1642 			} else {
1643 				(*tail_indel_pos) = best_splice_on_read;
1644 				(*tail_indel_movement) = best_movement;
1645 			}
1646 		}
1647 	}
1648 
1649 	return 0;
1650 
1651 }
1652 
1653 
1654 
core_extend_covered_region_13(gene_value_index_t * array_index,unsigned int read_start_pos,char * read,int read_len,int cover_start,int cover_end,int window_size,int req_match_5end,int req_match_3end,int indel_tolerance,int space_type,int tail_indel,short * head_indel_pos,int * head_indel_movement,short * tail_indel_pos,int * tail_indel_movement,int is_head_high_quality,char * qual_txt,int qual_format,float head_matching_rate,float tail_matching_rate)1655 int core_extend_covered_region_13(gene_value_index_t *array_index, unsigned int read_start_pos, char * read, int read_len, int cover_start, int cover_end, int window_size, int req_match_5end , int req_match_3end, int indel_tolerance, int space_type, int tail_indel, short * head_indel_pos, int * head_indel_movement, short * tail_indel_pos, int * tail_indel_movement, int is_head_high_quality, char * qual_txt, int qual_format, float head_matching_rate, float tail_matching_rate)
1656 {
1657 	int ret = 0;
1658 	*head_indel_pos = -1;
1659 	*tail_indel_pos = -1;
1660 	if (cover_start >= window_size && head_matching_rate < 1.0001)
1661 	{
1662 		int head_test_len =  cover_start;
1663 		int roughly_mapped = match_chro(read, array_index, read_start_pos, head_test_len , 0, space_type);
1664 		if (roughly_mapped >= head_test_len  * EXON_RECOVER_MATCHING_RATE - 0.0001)
1665 		{
1666 			ret |= 1;
1667 		}
1668 		else
1669 		{
1670 			int window_end_pos = cover_start + window_size-1;
1671 			int not_too_bad = 1;
1672 			int right_match_number=0;
1673 
1674 			while (1)
1675 			{
1676 				int matched_bases_in_window = match_chro_wronglen(read+ window_end_pos - window_size, array_index, read_start_pos + window_end_pos - window_size, window_size, space_type, NULL, &right_match_number);
1677 				int best_indel_movement = -1000;
1678 				int best_indel_pos = -1;
1679 
1680 				if (matched_bases_in_window >= req_match_5end) // this window is not bad enough so that an indel is considered
1681 					window_end_pos--;
1682 				else
1683 				{
1684 					roughly_mapped = match_chro(read, array_index, read_start_pos, window_end_pos - right_match_number , 0, space_type);
1685 					if (roughly_mapped < (int)(0.5+ (window_end_pos - right_match_number)  * EXON_RECOVER_MATCHING_RATE))
1686 					{
1687 						// the quality of this window is too low (at most 1 base is matched). I must consider if it is an indel.
1688 						int indel_movement_i ;
1689 
1690 						int best_matched_bases_after_indel = -1;
1691 
1692 						not_too_bad = 0;
1693 						for (indel_movement_i = 0; indel_movement_i < 2* indel_tolerance-1 ; indel_movement_i ++)
1694 						{
1695 							int indel_movement = (indel_movement_i+1)/2 * (indel_movement_i %2?1:-1);
1696 							int test_length = window_end_pos /*- 1*/ - max(0, indel_movement) -  right_match_number;
1697 
1698 							if (test_length < window_size) continue;
1699 							//if (test_length <= 1+ abs(indel_movement)/4) continue;
1700 							//test_length = min(10, test_length);
1701 							if (abs(indel_movement) > indel_tolerance) continue;
1702 
1703 							int test_start =0;// window_end_pos - max(0, indel_movement) -  right_match_number - test_length;
1704 
1705 							float matched_bases_after_indel = match_chro_support(read +test_start, array_index, read_start_pos + indel_movement +test_start, test_length,0, space_type, qual_txt, qual_format);
1706 							SUBREADprintf("HEAD : MATCHED_AFTER_INDEL = %f ; MVMT=%d ; WINDOW_END=%d\n", matched_bases_after_indel, indel_movement, window_end_pos);
1707 
1708 							float test_rate = head_matching_rate;
1709 
1710 							if(test_length < 3) test_rate = 1;
1711 
1712 							if(best_matched_bases_after_indel <matched_bases_after_indel  && matched_bases_after_indel >= (int)(0.5+test_length * test_rate))
1713 							{
1714 								not_too_bad = 1;
1715 
1716 								best_matched_bases_after_indel = matched_bases_after_indel;
1717 								best_indel_movement = indel_movement;
1718 								best_indel_pos = window_end_pos - right_match_number;
1719 
1720 
1721 
1722 								*head_indel_pos = best_indel_pos;
1723 								*head_indel_movement = best_indel_movement;
1724 							}
1725 						}
1726 						if(best_indel_pos<0) *head_indel_pos =  window_end_pos - right_match_number;
1727 					//	break;
1728 					}
1729 					window_end_pos--;
1730 				}
1731 				if (window_end_pos - window_size <= 0) break;
1732 			}
1733 			if(not_too_bad)
1734 				ret |=1;
1735 			//else
1736 				//*head_indel_pos = -1;
1737 		}
1738 	}
1739 	else ret |= 1;
1740 
1741 
1742 
1743 	if (cover_end <= read_len - window_size && tail_matching_rate < 1.0001)
1744 	{
1745 		int tail_test_len = read_len - cover_end;
1746 		int roughly_mapped = match_chro(read + cover_end, array_index, read_start_pos + cover_end + tail_indel, tail_test_len , 0, space_type);
1747 		if (roughly_mapped >= tail_test_len  * EXON_RECOVER_MATCHING_RATE - 0.0001)
1748 		{
1749 			ret |= 2;
1750 		}
1751 		else
1752 		{
1753 			int window_start_pos = cover_end - window_size +1;
1754 			int not_too_bad = 1;
1755 
1756 			while (1)
1757 			{
1758 				int left_match_number = 0;
1759 				int matched_bases_in_window = match_chro_wronglen(read+ window_start_pos, array_index, read_start_pos + window_start_pos + tail_indel, window_size, space_type, &left_match_number, NULL);
1760 				int best_indel_movement = -1000;
1761 				int best_indel_pos = -1;
1762 
1763 				//printf("MOVE 4-bp WINDOW : %d ; matched = %d >=? %d\n", window_start_pos, matched_bases_in_window, req_match_3end);
1764 
1765 				if (matched_bases_in_window >= req_match_3end) // this window is not bad enough so that an indel is considered
1766 					window_start_pos++;
1767 				else
1768 				{
1769 					roughly_mapped = match_chro(read+window_start_pos + left_match_number, array_index, read_start_pos + window_start_pos + tail_indel + left_match_number, read_len - window_start_pos - left_match_number , 0, space_type);
1770 
1771 					if (roughly_mapped < (int)(0.5+(read_len - window_start_pos -left_match_number )  * EXON_RECOVER_MATCHING_RATE))
1772 					{
1773 						// the quality of this window is too low (at most 1 base is matched). I must consider if it is an indel.
1774 						int indel_movement_i ;
1775 						int best_matched_bases_after_indel = -1;
1776 						not_too_bad = 0;
1777 						for (indel_movement_i = 0; indel_movement_i < 2* indel_tolerance; indel_movement_i ++)
1778 						{
1779 
1780 							int indel_adjustment = (indel_movement_i+1)/2 * (indel_movement_i %2?1:-1);
1781 							int indel_movement = tail_indel + indel_adjustment;
1782 							int test_length = read_len - window_start_pos  - left_match_number + min(0, indel_adjustment);
1783 							//test_length = min(10, test_length);
1784 
1785 
1786 							if (test_length < window_size) continue;
1787 							//if (test_length <= 1 + abs(indel_movement)/4) continue;
1788 							if (abs(indel_movement) > indel_tolerance) continue;
1789 
1790 							char * qual_txt_moved = qual_txt;
1791 							if(qual_txt[0]) qual_txt_moved+=window_start_pos - min(0, indel_movement) + left_match_number;
1792 
1793 							float matched_bases_after_indel = match_chro_support(read + window_start_pos - min(0, indel_movement) + left_match_number, array_index, read_start_pos + window_start_pos  + max(0,indel_movement) +left_match_number , test_length,0, space_type, qual_txt_moved , qual_format);
1794 							SUBREADprintf("TAIL : MATCHED_AFTER_INDEL = %f ; MVMT=%d ; WINDOW_END=%d\n", matched_bases_after_indel, indel_movement, window_start_pos - min(0, indel_movement) + left_match_number);
1795 
1796 							float test_rate = tail_matching_rate;
1797 
1798 							if(test_length < 3) test_rate = 1;
1799 
1800 							//printf("%d <? %d && %d >=? %d\n", best_matched_bases_after_indel, matched_bases_after_indel, matched_bases_after_indel, (int)(0.5+test_length * test_rate));
1801 							if(best_matched_bases_after_indel <matched_bases_after_indel  && matched_bases_after_indel >= (int)(0.5+test_length * test_rate))
1802 							{
1803 								not_too_bad = 1;
1804 								best_matched_bases_after_indel = matched_bases_after_indel;
1805 								best_indel_movement = indel_movement;
1806 								best_indel_pos = window_start_pos + left_match_number ;//-1;
1807 								*tail_indel_movement = best_indel_movement ;
1808 							}
1809 						}
1810 
1811 						if(best_indel_pos<0)
1812 							*tail_indel_pos =  window_start_pos + left_match_number ;
1813 						else
1814 							*tail_indel_pos = best_indel_pos;
1815 					}
1816 					window_start_pos++;
1817 				}
1818 				if (window_start_pos + window_size >= read_len) break;
1819 			}
1820 			if(not_too_bad)
1821 				ret |=2;
1822 			//else
1823 			//	*tail_indel_pos =-1;
1824 		}
1825 	}
1826 	else ret |= 2;
1827 
1828 	return ret;
1829 }
1830 
1831 
1832 
1833 
1834 
1835 
1836 int core_dynamic_align(global_context_t * global_context, thread_context_t * thread_context, char * read, int read_len, unsigned int begin_position, char * movement_buffer, int expected_offset, char * read_name);
1837 
find_new_indels(global_context_t * global_context,thread_context_t * thread_context,int pair_number,char * read_name,char * read_text,char * qual_text,int read_len,int is_second_read,int best_read_id)1838 int find_new_indels(global_context_t * global_context, thread_context_t * thread_context, int pair_number, char * read_name, char * read_text, char * qual_text, int read_len, int is_second_read, int best_read_id)
1839 {
1840 	int i,j;
1841 
1842 	int last_correct_subread = 0;
1843 	int last_indel = 0;
1844 
1845 	gene_vote_number_t * indel_recorder;
1846 	unsigned int voting_position;
1847 
1848 	HashTable * event_table = NULL;
1849 
1850 	if(thread_context)
1851 		event_table = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_entry_table;
1852 	else
1853 		event_table = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_entry_table;
1854 
1855 
1856 	mapping_result_t *current_result = _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, best_read_id);
1857 
1858 	if(global_context->config.do_big_margin_filtering_for_reads)
1859 		if(is_ambiguous_voting(global_context, pair_number, is_second_read , current_result->selected_votes, current_result->confident_coverage_start, current_result->confident_coverage_end, read_len, (current_result->result_flags & CORE_IS_NEGATIVE_STRAND)?1:0))return 0;
1860 
1861 
1862 
1863 	indel_recorder = current_result -> selected_indel_record;
1864 	voting_position = current_result -> selected_position;
1865 
1866 
1867 	if(!indel_recorder[0])
1868 		return 0;
1869 
1870 	gene_value_index_t * current_value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
1871 
1872 	if(0){ // test "shifting indels" -- not needed when "NEW" go_Q used
1873 		int last_offset = 0;
1874 		for(i=0; indel_recorder[i]  && (i<MAX_INDEL_SECTIONS); i+=3){
1875 			last_offset = indel_recorder[i+2];
1876 		}
1877 
1878 		if(0==last_offset) return 0;
1879 	}
1880 
1881 	for(i=0; indel_recorder[i]  && (i<MAX_INDEL_SECTIONS); i+=3)
1882 	{
1883 		int indels = indel_recorder[i+2] - last_indel;
1884 			// -1 : 1 insert; 1: 1 delete
1885 		int next_correct_subread = indel_recorder[i] -1;
1886 		//int next_correct_subread_last = indel_recorder[i+1] -1;
1887 
1888 		if(indels)
1889 		{
1890 			int last_correct_base = find_subread_end(read_len, global_context->config.total_subreads , last_correct_subread) - 9;
1891 			int first_correct_base = find_subread_end(read_len, global_context->config.total_subreads , next_correct_subread) - 16 + 9;
1892 
1893 			last_correct_base = max(0, last_correct_base);
1894 			last_correct_base = min(read_len-1, last_correct_base);
1895 			first_correct_base = min(first_correct_base, read_len-1);
1896 			first_correct_base = max(0, first_correct_base);
1897 			first_correct_base = max(first_correct_base, last_correct_base);
1898 
1899 			if(first_correct_base < last_correct_base || first_correct_base > last_correct_base + 3000)
1900 				SUBREADprintf("WRONG ORDER: F=%u, L=%d\n", first_correct_base , last_correct_base);
1901 
1902 			//printf("OCT27-STEPRR %s POS=%u, I=%d; INDELS=%d; COVER=%d -- %d (vote_no : %d - %d)\n", read_name, current_result->selected_position, i, indels, last_correct_base, first_correct_base, last_correct_subread, next_correct_subread);
1903 
1904 			if(global_context -> config.use_dynamic_programming_indel || read_len > EXON_LONG_READ_LENGTH)
1905 			{
1906 				char movement_buffer[MAX_READ_LENGTH * 10 / 7];
1907 				//chromosome_event_t * last_event = NULL;
1908 				int last_event_id = -1;
1909 
1910 				first_correct_base = min(first_correct_base+10, read_len);
1911 				int x1, dyna_steps;
1912 
1913 				dyna_steps = core_dynamic_align(global_context, thread_context, read_text + last_correct_base, first_correct_base - last_correct_base, voting_position + last_correct_base + last_indel, movement_buffer, indels, read_name);
1914 
1915 				movement_buffer[dyna_steps]=0;
1916 
1917 				if(0 && FIXLENstrcmp("simulated.2467286", read_name) == 0){
1918 					SUBREADprintf("DYNAMIC: indels=%d, %s\n", indels, movement_buffer);
1919 				}
1920 
1921 				//#warning ">>>>>>> COMMENT NEXT BLOCK IN RELEASE <<<<<<<<"
1922 				if(0) {
1923 					char outstr[1000];
1924 					sprintf(outstr, "OCT27-STEPDD-IR %s     %d  %d~%d ", read_name, dyna_steps, last_correct_base, first_correct_base);
1925 
1926 					for(x1=0; x1<dyna_steps;x1++)
1927 					{
1928 						int mc, mv=movement_buffer[x1];
1929 						if(mv==0)mc='=';
1930 						else if(mv==1)mc='D';
1931 						else if(mv==2)mc='I';
1932 						else mc='X';
1933 						sprintf(outstr+strlen(outstr),"%c",mc);
1934 					}
1935 					puts(outstr);
1936 				}
1937 				unsigned int cursor_on_chromosome = voting_position + last_correct_base + last_indel, cursor_on_read = last_correct_base;
1938 				int last_mv = 0;
1939 				unsigned int indel_left_boundary = 0;
1940 				int is_in_indel = 0, current_indel_len = 0, total_mismatch = 0;
1941 
1942 				for(x1=0; x1<dyna_steps;x1++)
1943 				{
1944 					int mv=movement_buffer[x1];
1945 					if(mv==3) total_mismatch++;
1946 				}
1947 
1948 				if(total_mismatch<=2 || (global_context->config.maximise_sensitivity_indel && total_mismatch <= 2 ))
1949 					for(x1=0; x1<dyna_steps;x1++)
1950 					{
1951 						int mv=movement_buffer[x1];
1952 
1953 						if(last_mv != mv)
1954 						{
1955 							if( ( mv==1 || mv==2 ) && ! is_in_indel)
1956 							{
1957 								indel_left_boundary = cursor_on_chromosome;
1958 								is_in_indel = 1;
1959 								current_indel_len = 0;
1960 							}
1961 							else if ( is_in_indel && (mv == 0 || mv == 3)  )
1962 							{
1963 								gene_value_index_t * current_value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
1964 
1965 								int ambiguous_count=0;
1966 								if(0){
1967 									int ambiguous_i, best_matched_bases = match_chro(read_text + cursor_on_read - 6, current_value_index, indel_left_boundary - 6, 6, 0, global_context->config.space_type)  +
1968 												 match_chro(read_text + cursor_on_read - min(current_indel_len,0), current_value_index, indel_left_boundary + max(0, current_indel_len), 6, 0, global_context->config.space_type);
1969 									for(ambiguous_i=-5; ambiguous_i<=5; ambiguous_i++)
1970 									{
1971 										int left_match = match_chro(read_text + cursor_on_read - 6, current_value_index, indel_left_boundary - 6, 6+ambiguous_i, 0, global_context->config.space_type);
1972 										int right_match = match_chro(read_text + cursor_on_read + ambiguous_i - min(current_indel_len,0), current_value_index, indel_left_boundary + ambiguous_i + max(0, current_indel_len), 6-ambiguous_i, 0,global_context->config.space_type);
1973 										if(left_match+right_match == best_matched_bases) ambiguous_count ++;
1974 									}
1975 								}
1976 
1977 								if(abs(current_indel_len)<=global_context -> config.max_indel_length)
1978 								{
1979 									int  old_event_id = -1;
1980 
1981 									if(0)if(indel_left_boundary >= 46481340 + 1210 - 200 && indel_left_boundary  <= 46481340 + 1210 + 200 && abs(current_indel_len )==4){
1982 										SUBREADprintf("ADD AN INDEL: %s : %u ; len = %d\n", read_name, indel_left_boundary, current_indel_len);
1983 									}
1984 									chromosome_event_t * new_event = local_add_indel_event(global_context, thread_context, event_table, read_text + cursor_on_read + min(0,current_indel_len), indel_left_boundary - 1, current_indel_len, 1, ambiguous_count, 0, &old_event_id);
1985 									mark_gapped_read(current_result);
1986 
1987 									chromosome_event_t * event_space = NULL;
1988 									if(thread_context)
1989 										event_space = ((indel_thread_context_t *)thread_context -> module_thread_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
1990 									else
1991 										event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
1992 
1993 									if(last_event_id>=0){
1994 										chromosome_event_t * last_event = event_space + last_event_id;
1995 										int dist = indel_left_boundary - last_event -> event_large_side ;
1996 
1997 										if(last_event -> connected_next_event_distance<1)
1998 											last_event -> connected_next_event_distance = dist;
1999 										else    last_event -> connected_next_event_distance = min(dist, last_event -> connected_next_event_distance);
2000 
2001 										chromosome_event_t * old_new_event = new_event;
2002 										if(old_event_id>=0){
2003 											assert(!new_event);
2004 											old_new_event = event_space + old_event_id;
2005 										}
2006 
2007 										if(old_new_event -> connected_previous_event_distance < 1)
2008 											old_new_event -> connected_previous_event_distance = dist;
2009 										else    old_new_event -> connected_previous_event_distance = min(dist, old_new_event -> connected_previous_event_distance);
2010 									}
2011 
2012 									if (new_event)
2013 										last_event_id = new_event -> global_event_id;
2014 									else	last_event_id = old_event_id;
2015 								}
2016 							}
2017 
2018 
2019 							if(mv == 0 || mv == 3)
2020 								is_in_indel = 0;
2021 						}
2022 
2023 						if(is_in_indel && mv == 1)
2024 							current_indel_len += 1;
2025 						if(is_in_indel && mv == 2)
2026 							current_indel_len -= 1;
2027 
2028 						if(mv == 1 || mv == 3 || mv == 0) cursor_on_chromosome++;
2029 						if(mv == 2 || mv == 3 || mv == 0) cursor_on_read++;
2030 
2031 						last_mv = mv;
2032 					}
2033 
2034 			}
2035 			else
2036 			{
2037 				if(first_correct_base - last_correct_base  < -min(0,indels)) continue;
2038 
2039 				int best_score=-99999, is_ambiguous_indel= 0;
2040 				unsigned int best_pos = 0;
2041 				int cutoff = first_correct_base-last_correct_base  + min(0, indels) - global_context -> config.flanking_subread_indel_mismatch;
2042 				int mid_j = (first_correct_base-last_correct_base )/2, min_j=99990;
2043 				int mismatched_bases = 0;
2044 
2045 				if(first_correct_base > last_correct_base + 4)
2046 					for(j=0; j<(first_correct_base-last_correct_base + min(0, indels)); j++)
2047 					{
2048 						// j is the first UNWANTED base after the first matched part.
2049 						int score , s2, s1;
2050 
2051 						s1 = match_chro(read_text + last_correct_base, current_value_index, voting_position + last_correct_base + last_indel, j, 0, global_context->config.space_type);
2052 						if(s1 >= j - global_context -> config.flanking_subread_indel_mismatch)
2053 						{
2054 							s2 = match_chro(read_text + last_correct_base + j - min(0, indels) , current_value_index, voting_position + last_correct_base + j + max(0,indels) + last_indel, first_correct_base-last_correct_base - j + min(0, indels), 0, global_context->config.space_type);
2055 
2056 							score = s2+s1;
2057 							if(score >= cutoff)
2058 							{
2059 								if(score == best_score)
2060 									is_ambiguous_indel ++;
2061 
2062 								if(score>best_score || (abs(mid_j-j)>min_j && best_score == score))
2063 								{
2064 									mismatched_bases = first_correct_base-last_correct_base  + min(0, indels) - score;
2065 									best_score = score;
2066 									// best_pos here is the absolute offset of the FIRST UNWANTED BASE.
2067 									best_pos = voting_position  + last_correct_base + j + last_indel;
2068 									min_j = abs(mid_j-j);
2069 									if(score>best_score)
2070 										is_ambiguous_indel = 0;
2071 								}
2072 							}
2073 						}
2074 					}
2075 
2076 
2077 
2078 				/*
2079 				SUBREADprintf("\n%s\nRANGE:%d - %d ; BSCORE:%d @  %d   ; AMB=%d ; INDELS=%d ; MM=%d\n", read_name, first_correct_base, last_correct_base, best_score, last_indel + last_correct_base + best_j, is_ambiguous_indel, indels, mismatched_bases);
2080 
2081 				SUBREADprintf("%s\n", read_text);
2082 				for(j=0;j<  last_indel + last_correct_base + best_j; j++)
2083 					putc(' ' , stderr);
2084 				putc('\\' , stderr);
2085 				for(j=0;j<  indels; j++)
2086 					putc(gvindex_get( current_value_index, best_pos + j)  , stderr);
2087 				putc('\n', stderr);
2088 
2089 				*/
2090 
2091 				if(best_score >0) {
2092 					//#warning ">>>>>>>>>>>>>>>>>> COMMEN THIS <<<<<<<<<<<<<<<<<<<<<<"
2093 					//printf("OCT27-STEPAD-%s POS=%u INDEL=%d\n", read_name, best_pos , indels);
2094 					local_add_indel_event(global_context, thread_context, event_table, read_text + last_correct_base + last_indel, best_pos - 1, indels, 1, is_ambiguous_indel, mismatched_bases, NULL);
2095 					mark_gapped_read(current_result);
2096 				}
2097 			}
2098 		}
2099 
2100 		last_correct_subread = indel_recorder[i+1]-1;
2101 		last_indel = indel_recorder[i+2];
2102 	}
2103 
2104 
2105 
2106 	if(global_context -> config.extending_search_indels && global_context -> config.max_indel_length>0)
2107 	{
2108 		// USING EXTENDING TO FIND INDELS
2109 		// THIS IS UNSTABLE!
2110 		int s_head = match_chro(read_text, current_value_index, voting_position, 8, 0, global_context->config.space_type);
2111 		int s_tail = match_chro(read_text+read_len - 8, current_value_index, voting_position + read_len + current_result -> indels_in_confident_coverage - 8, 8, 0, global_context->config.space_type);
2112 
2113 		//SUBREADprintf("EXT_START %d, %d\n", s_head, s_tail);
2114 
2115 		//#warning "============ REMOVE THE FIRST '1' FROM THE NEXT LINE! =========="
2116 		if(s_head<=6 || s_tail<=6)
2117 		{
2118 			int is_head_high_quality = (current_result -> result_flags & CORE_IS_NEGATIVE_STRAND)?0:1;
2119 			int k;
2120 			int cover_start = find_subread_end(read_len, global_context -> config.total_subreads, current_result -> selected_indel_record[0]-1)- 15;
2121 
2122 			for (k=0; current_result -> selected_indel_record[k]; k+=3);
2123 			int cover_end = find_subread_end(read_len, global_context -> config.total_subreads, current_result -> selected_indel_record[k-2]-1) + max(0,-current_result -> selected_indel_record[k-1]);
2124 
2125 			int head_qual = is_head_high_quality?0x7fffffff:0, tail_qual = is_head_high_quality?0:0x7fffffff;
2126 
2127 
2128 			if(qual_text && qual_text[0])
2129 			{
2130 				int j;
2131 				head_qual = 0;
2132 				tail_qual = 0;
2133 				for (j = 0; j<cover_start; j++)
2134 				{
2135 					if(FASTQ_PHRED64 == global_context -> config.phred_score_format)
2136 					{
2137 						head_qual += (1000000-get_base_error_prob64i(qual_text[j]));
2138 					}
2139 					else
2140 					{
2141 						head_qual += (1000000-get_base_error_prob33i(qual_text[j]));
2142 					}
2143 				}
2144 				for (j = 0; j<read_len - cover_end; j++)
2145 				{
2146 					if(FASTQ_PHRED64 == global_context -> config.phred_score_format)
2147 					{
2148 						tail_qual += (1000000-get_base_error_prob64i(qual_text[read_len - j -1]));
2149 					}
2150 					else
2151 					{
2152 						tail_qual += (1000000-get_base_error_prob33i(qual_text[read_len - j -1]));
2153 					}
2154 				}
2155 			}
2156 
2157 			float exon_tail_matching_rate=0.9, exon_head_matching_rate=0.9;
2158 
2159 			int head_must_correct = 4;
2160 			if(cover_start > 0)
2161 			{
2162 				if (head_qual / cover_start < 850000)
2163 				{
2164 					exon_head_matching_rate = 0.75;
2165 					head_must_correct =2;
2166 				}
2167 				else if (head_qual / cover_start < 950000 )
2168 				{
2169 					exon_head_matching_rate = 0.85;
2170 					head_must_correct =3;
2171 				}
2172 				else exon_head_matching_rate = 0.92;
2173 			}else	exon_head_matching_rate = 9999;
2174 
2175 
2176 			int tail_must_correct = 4;
2177 			if(read_len-cover_end > 0)
2178 			{
2179 				if (tail_qual / (read_len-cover_end) < 850000)
2180 				{
2181 					exon_tail_matching_rate = 0.75;
2182 					tail_must_correct =2;
2183 				}
2184 				else if (tail_qual / (read_len-cover_end) < 950000)
2185 				{
2186 					exon_tail_matching_rate = 0.85;
2187 					tail_must_correct =3;
2188 				}
2189 				else exon_tail_matching_rate = 0.92;
2190 			}else	exon_tail_matching_rate = 9999;
2191 
2192 			short head_indel_pos=-1 , tail_indel_pos=-1;
2193 			int head_indel_movement=0, tail_indel_movement=0;
2194 
2195 			if(0)SUBREADprintf("HQ=%.4f; TQ=%.4f; HM=%d; TM=%d; COVG = %d ~ %d\n", exon_head_matching_rate, exon_tail_matching_rate, head_must_correct, tail_must_correct, cover_start, cover_end);
2196 			/*
2197 			if(exon_head_matching_rate<1)
2198 				exon_head_matching_rate = 0.97;
2199 			if(exon_tail_matching_rate<1)
2200 				exon_tail_matching_rate = 0.97;
2201  			*/
2202 
2203 			core_extend_covered_region_15(global_context, current_value_index, current_result->selected_position, read_text, read_len, cover_start, cover_end, 7, head_must_correct,tail_must_correct, global_context -> config.max_indel_length + 1, global_context -> config.space_type, current_result -> selected_indel_record[k-1], & head_indel_pos, & head_indel_movement, & tail_indel_pos, & tail_indel_movement, is_head_high_quality, qual_text,  global_context -> config.phred_score_format, exon_head_matching_rate, exon_tail_matching_rate);
2204 
2205 			//head_indel_movement = -head_indel_movement;
2206 
2207 			if(0)SUBREADprintf("HMV=%d; TMV=%d; TPOS=%d\n", head_indel_movement, tail_indel_movement, tail_indel_pos);
2208 
2209 			if(head_indel_movement && s_head < 7)
2210 			{
2211 				unsigned int head_indel_left_edge = head_indel_pos + current_result->selected_position - 1;
2212 				//head_indel_left_edge -= max(0, head_indel_movement);
2213 				if(abs(head_indel_movement)<=global_context -> config.max_indel_length)
2214 				{
2215 					local_add_indel_event(global_context, thread_context, event_table, read_text + head_indel_pos, head_indel_left_edge, head_indel_movement, 1, 1, 0,NULL);
2216 					mark_gapped_read(current_result);
2217 				}
2218 			}
2219 			if(tail_indel_movement && s_tail < 7)
2220 			{
2221 				unsigned int tail_indel_left_edge = tail_indel_pos + current_result->selected_position + current_result -> indels_in_confident_coverage - 1;
2222 
2223 				if(abs(tail_indel_movement)<=global_context -> config.max_indel_length)
2224 				{
2225 					local_add_indel_event(global_context, thread_context, event_table, read_text + tail_indel_pos, tail_indel_left_edge, tail_indel_movement, 1, 1, 0,NULL);
2226 					mark_gapped_read(current_result);
2227 				}
2228 
2229 			}
2230 		}
2231 
2232 	}
2233 	return 0;
2234 }
2235 
print_indel_table(global_context_t * global_context)2236 void print_indel_table(global_context_t * global_context){
2237 }
2238 
write_indel_final_results(global_context_t * global_context)2239 int write_indel_final_results(global_context_t * global_context)
2240 {
2241 
2242 	int xk1, disk_is_full = 0;
2243 	char * inserted_bases = NULL;
2244 	indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
2245 	char * fn2, *ref_bases, *alt_bases;
2246 	FILE * ofp = NULL;
2247 
2248 	fn2 = malloc(MAX_FILE_NAME_LENGTH+30);
2249 	sprintf(fn2, "%s.indel.vcf", global_context->config.output_prefix);
2250 
2251 	ofp = f_subr_open(fn2, "wb");
2252 
2253 	//if(!ofp)
2254 	//	printf("HOW??? %s\n", fn2);
2255 	inserted_bases = malloc(MAX_INSERTION_LENGTH + 2);
2256 	ref_bases = malloc(1000);
2257 	alt_bases = malloc(1000);
2258 
2259 	fputs("##fileformat=VCFv4.0\n##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">\n##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">\n##INFO=<ID=SR,Number=1,Type=String,Description=\"Number of supporting reads for variants\">\n", ofp);
2260 	fputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", ofp);
2261 
2262 	for(xk1 = 0; xk1 <  indel_context -> total_events ; xk1++)
2263 	{
2264 		char * chro_name;
2265 		int chro_pos;
2266 
2267 		chromosome_event_t * event_body = indel_context -> event_space_dynamic +xk1;
2268 
2269 
2270 //		#warning " ================= REMOVE '- 1' from the next LINE!!! ========================="
2271 		if((event_body -> event_type != CHRO_EVENT_TYPE_INDEL && event_body->event_type != CHRO_EVENT_TYPE_LONG_INDEL  && event_body -> event_type != CHRO_EVENT_TYPE_POTENTIAL_INDEL)|| (event_body -> final_counted_reads < 1  && event_body -> event_type == CHRO_EVENT_TYPE_INDEL) )
2272 			continue;
2273 
2274 		//assert((-event_body -> indel_length) < MAX_INSERTION_LENGTH);
2275 
2276 		if(event_body -> indel_length<0)
2277 		{
2278 			get_insertion_sequence(global_context, NULL , event_body -> inserted_bases , inserted_bases , -event_body -> indel_length);
2279 			free(event_body -> inserted_bases);
2280 		}
2281 
2282 		//assert(event_body -> event_small_side < event_body -> event_large_side );
2283 		locate_gene_position( event_body -> event_small_side , &global_context -> chromosome_table, &chro_name, &chro_pos);
2284 
2285 		// VCF format: chr21  1000001  .   AACC  AAGGGGGCC  29  .  INDEL;DP=20
2286 		if(event_body -> event_type == CHRO_EVENT_TYPE_INDEL || event_body -> event_type == CHRO_EVENT_TYPE_LONG_INDEL)
2287 		{
2288 			ref_bases[0]=0;
2289 			alt_bases[0]=0;
2290 
2291 			gene_value_index_t * current_index = find_current_value_index(global_context , event_body -> event_small_side-1 , max(0, event_body -> indel_length) + 2);
2292 			if(current_index)
2293 			{
2294 				int rlen = max(0, event_body -> indel_length) + 2;
2295 				assert(rlen<900);
2296 				gvindex_get_string(ref_bases, current_index, event_body -> event_small_side-1 , rlen, 0);
2297 				ref_bases[rlen] = 0;
2298 
2299 				if(event_body -> indel_length > 0)  // deletion
2300 				{
2301 					alt_bases[0] = ref_bases[0];
2302 					alt_bases[1] = ref_bases[event_body -> indel_length+1];
2303 					alt_bases[2] = 0;
2304 				}
2305 				else // insertion
2306 				{
2307 					alt_bases[0] = ref_bases[0];
2308 					strcpy(alt_bases+1,inserted_bases);
2309 					strcat(alt_bases,ref_bases+1);
2310 				}
2311 			}
2312 
2313 			if(CHRO_EVENT_TYPE_INDEL == event_body -> event_type)
2314 			{
2315 				if(event_body -> final_counted_reads<30)
2316 					event_body -> event_quality = pow(0.5,(30-event_body -> final_counted_reads));
2317 				else	event_body -> event_quality = 1;
2318 
2319 			}
2320 			int write_len = fprintf(ofp, "%s\t%u\t.\t%s\t%s\t%d\t.\tINDEL;DP=%d;SR=%d\n", chro_name, chro_pos, ref_bases, alt_bases, (int)(max(1, 250 + 10*log(event_body -> event_quality)/log(10))), event_body -> final_counted_reads + event_body -> anti_supporting_reads, event_body -> final_counted_reads);
2321 			if(write_len < 10) disk_is_full = 1;
2322 		}
2323 
2324 		global_context->all_indels++;
2325 	}
2326 
2327 	fclose(ofp);
2328 	free(ref_bases);
2329 	free(alt_bases);
2330 	free(inserted_bases);
2331 	if(disk_is_full){
2332 		unlink(fn2);
2333 		SUBREADprintf("ERROR: disk is full. Unable to write into the indel list.\n");
2334 	}
2335 	free(fn2);
2336 
2337 	return 0;
2338 }
2339 
destroy_indel_module(global_context_t * global_context)2340 int destroy_indel_module(global_context_t * global_context)
2341 {
2342 	int xk1;
2343 	indel_context_t * indel_context = global_context -> module_contexts[MODULE_INDEL_ID];
2344 	if(indel_context -> event_entry_table->appendix1)
2345 	{
2346 		free(indel_context -> event_entry_table -> appendix1);
2347 		free(indel_context -> event_entry_table -> appendix2);
2348 	}
2349 
2350 	// free the entry table: the pointers.
2351 	destory_event_entry_table(indel_context -> event_entry_table);
2352 	HashTableDestroy(indel_context -> event_entry_table);
2353 
2354 	free(indel_context -> event_space_dynamic);
2355 
2356 	for(xk1=0;xk1<MAX_READ_LENGTH; xk1++)
2357 	{
2358 		free(indel_context -> dynamic_align_table[xk1]);
2359 		free(indel_context -> dynamic_align_table_mask[xk1]);
2360 	}
2361 
2362 	for(xk1=0; xk1<EVENT_BODY_LOCK_BUCKETS; xk1++)
2363 		subread_destroy_lock(indel_context -> event_body_locks+xk1);
2364 
2365 	free(indel_context -> dynamic_align_table);
2366 	free(indel_context -> dynamic_align_table_mask);
2367 	return 0;
2368 }
2369 
2370 
2371 
trim_read(global_context_t * global_context,thread_context_t * thread_context,char * read_text,char * qual_text,int read_len,int * delta_position)2372 int trim_read(global_context_t *global_context , thread_context_t *thread_context, char * read_text, char * qual_text, int read_len, int * delta_position)
2373 {
2374 	int xk1, bad_bases = 0;
2375 	if(qual_text[0]==0) return read_len;
2376 
2377 
2378 	int tail_trim_pos, last_ok = read_len /2;
2379 
2380 	bad_bases=0;
2381 	for(xk1=read_len/2 ; xk1< read_len ; xk1++)
2382 	{
2383 		if(qual_text[xk1] <= '#'+2)
2384 			bad_bases++;
2385 		else
2386 			last_ok = xk1;
2387 		if(bad_bases>=2)
2388 			break;
2389 	}
2390 	tail_trim_pos = last_ok;
2391 
2392 	int head_trim_pos;
2393 
2394 	bad_bases=0;
2395 	last_ok = read_len /2;
2396 	for(xk1=read_len/2 ; xk1>=0 ; xk1--)
2397 	{
2398 		if(qual_text[xk1] <= '#'+2)
2399 			bad_bases++;
2400 		else
2401 			last_ok = xk1;
2402 		if(bad_bases>=2)
2403 			break;
2404 	}
2405 	if(bad_bases>=2)
2406 		head_trim_pos = last_ok;
2407 	else	head_trim_pos = 0;
2408 
2409 	if(3*(tail_trim_pos - head_trim_pos )<read_len)return -1;
2410 
2411 	for(xk1=0; xk1<tail_trim_pos - head_trim_pos; xk1++)
2412 	{
2413 		read_text[xk1] = read_text[xk1+head_trim_pos];
2414 		qual_text[xk1] = qual_text[xk1+head_trim_pos];
2415 	}
2416 
2417 	(* delta_position) = head_trim_pos;
2418 	read_text[xk1]=0;
2419 	qual_text[xk1]=0;
2420 
2421 	return tail_trim_pos - head_trim_pos;
2422 
2423 }
2424 
2425 #define READ_HEAD_TAIL_LEN 22
write_local_reassembly(global_context_t * global_context,HashTable * pileup_fp_table,unsigned int anchor_pos,char * read_name,char * read_text,char * qual_text,int read_len,int is_anchor_certain)2426 int write_local_reassembly(global_context_t *global_context, HashTable *pileup_fp_table, unsigned int anchor_pos, char * read_name , char * read_text ,char * qual_text , int read_len, int is_anchor_certain)
2427 {
2428 
2429 	char * chro_name;
2430 	int chro_offset;
2431 	int delta_pos = 0;
2432 	int new_read_len = trim_read(global_context, NULL, read_text, qual_text, read_len, &delta_pos);
2433 
2434 
2435 	anchor_pos += delta_pos;
2436 	if(new_read_len * 3<read_len * 2)
2437 		return 0;
2438 
2439 	read_len = new_read_len;
2440 
2441 	gene_value_index_t * base_index = &global_context->all_value_indexes[0] ;
2442 
2443 	if(is_anchor_certain)
2444 	{
2445 		int head_match = match_chro(read_text, base_index , anchor_pos, READ_HEAD_TAIL_LEN, 0, global_context -> config.space_type);
2446 		int tail_match = match_chro(read_text + read_len - READ_HEAD_TAIL_LEN, base_index , anchor_pos + read_len - READ_HEAD_TAIL_LEN, READ_HEAD_TAIL_LEN, 0, global_context -> config.space_type);
2447 		if(head_match > READ_HEAD_TAIL_LEN-3 && tail_match> READ_HEAD_TAIL_LEN-3)
2448 			return 0;
2449 	}
2450 
2451 
2452 
2453 
2454 	if(0 == locate_gene_position_max(anchor_pos, &global_context -> chromosome_table, &chro_name, &chro_offset, NULL, NULL, read_len))
2455 	{
2456 		char temp_file_name[MAX_FILE_NAME_LENGTH+40];
2457 		int close_now = 0;
2458 
2459 		sprintf(temp_file_name,"%s@%s-%04u.bin", global_context -> config.temp_file_prefix, chro_name , chro_offset / BASE_BLOCK_LENGTH );
2460 
2461 		FILE * pileup_fp = get_temp_file_pointer(temp_file_name, pileup_fp_table, &close_now);
2462 		//assert(read_len == strlen(read_text) && read_len > 90);
2463 		write_read_block_file(pileup_fp , 0, read_name, 0, chro_name , chro_offset, NULL, 0, read_text , qual_text, read_len , 1 , is_anchor_certain , anchor_pos, read_len, 0);
2464 		if(close_now) fclose(pileup_fp);
2465 	}
2466 
2467 
2468 	return 0;
2469 }
2470 
build_local_reassembly(global_context_t * global_context,thread_context_t * thread_context,int pair_number,char * read_name,char * read_text,char * qual_text,int read_len,int mate_read_len,int is_second_read,int best_read_id,int use_mate_pos,mapping_result_t * current_result,mapping_result_t * mate_result)2471 int build_local_reassembly(global_context_t *global_context , thread_context_t *thread_context, int pair_number , char * read_name , char * read_text ,char * qual_text , int read_len , int mate_read_len , int is_second_read, int best_read_id, int use_mate_pos, mapping_result_t *current_result, mapping_result_t *mate_result)
2472 {
2473 	unsigned int read_anchor_position;
2474 	if(!read_text) return 0;
2475 	int is_position_certain = 1;
2476 
2477 	indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
2478 	//mapping_result_t *current_result = _global_retrieve_alignment_ptr(global_context, pair_number, is_second_read, best_read_id);
2479 	//mapping_result_t * mate_result = _global_retrieve_alignment_ptr(global_context, pair_number, !is_second_read, best_read_id);
2480 
2481 
2482 
2483 	// No complex cigar strings are allowed.
2484 	if(use_mate_pos)
2485 	{
2486 		read_anchor_position = mate_result -> selected_position;
2487 
2488 		if(is_second_read + ((mate_result -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0) == 1)
2489 			// I'm the second read so I'm after the first one.
2490 			read_anchor_position += global_context -> config.expected_pair_distance;
2491 		else
2492 			// I'm the first read so I'm ahead the second one.
2493 			read_anchor_position -= global_context -> config.expected_pair_distance;
2494 
2495 		is_position_certain = 0;
2496 
2497 	}
2498 	else
2499 	{
2500 		read_anchor_position = current_result -> selected_position;
2501 	}
2502 
2503 	//gene_alue_index_t * value_index = &global_context->all_value_indexes[0] ;
2504 
2505 	if(qual_text[0] &&global_context -> config.phred_score_format == FASTQ_PHRED64)
2506 		fastq_64_to_33(qual_text);
2507 
2508 	//int window_no = 2*(read_anchor_position / REASSEMBLY_WINDOW_LENGTH);
2509 
2510 
2511 	write_local_reassembly(global_context, indel_context -> local_reassembly_pileup_files, read_anchor_position, read_name , read_text , qual_text , read_len, is_position_certain);
2512 
2513 	return 0;
2514 }
2515 
insert_pileup_read(global_context_t * global_context,reassembly_block_context_t * block_context,unsigned int first_base_pos,char * read_text,char * qual_text,int read_len,int is_read_head_aligned)2516 int insert_pileup_read(global_context_t * global_context , reassembly_block_context_t * block_context , unsigned int first_base_pos ,char * read_text , char * qual_text , int read_len, int is_read_head_aligned)
2517 {
2518 	int window_i,xk1;
2519 
2520 	for(window_i= -1; window_i<1; window_i++)
2521 	{
2522 		int de_brujin_grp_id = 2*(first_base_pos / REASSEMBLY_WINDOW_LENGTH) + window_i;
2523 		int read_pos;
2524 		if(de_brujin_grp_id<0)continue;
2525 
2526 		HashTable * current_table = block_context -> de_bruijn_graphs [de_brujin_grp_id];
2527 		for(read_pos = 0; read_pos <= read_len - global_context -> config.k_mer_length; read_pos +=1)
2528 		{
2529 			unsigned long long int node_key = 0;
2530 			for(xk1=0; xk1<global_context ->config.k_mer_length; xk1++)
2531 			{
2532 				node_key = (node_key << 2) | (base2int(read_text[xk1 + read_pos]));
2533 			}
2534 
2535 
2536 			unsigned long long int next_key_prefix = 0x8000000000000000ull | (node_key >> 2);
2537 
2538 			if(!read_pos && is_read_head_aligned)
2539 			{
2540 				int start_offset = (first_base_pos - de_brujin_grp_id * REASSEMBLY_WINDOW_LENGTH/2);
2541 				if(start_offset < block_context -> start_offsets [de_brujin_grp_id])
2542 				{
2543 					block_context -> start_offsets [de_brujin_grp_id] = start_offset;
2544 					block_context -> start_keys[1+10*de_brujin_grp_id]=0;
2545 					block_context -> start_keys[10*de_brujin_grp_id]=next_key_prefix;
2546 				}
2547 				else if(start_offset == block_context -> start_offsets [de_brujin_grp_id])
2548 				{
2549 					int start_key_id;
2550 					for(start_key_id=0; start_key_id<10&&block_context -> start_keys[de_brujin_grp_id * 10 + start_key_id]; start_key_id++);
2551 					if(start_key_id<10) block_context -> start_keys[start_key_id + 10 * de_brujin_grp_id] = next_key_prefix;
2552 				}
2553 			}
2554 
2555 			int next_node_type = node_key&0x03;
2556 			unsigned int next_jump_array = HashTableGet(current_table ,NULL+next_key_prefix) - NULL;
2557 			if(next_jump_array) next_jump_array-=1;
2558 
2559 			unsigned int current_read_count = 0xff&(next_jump_array >> (next_node_type*8));
2560 			if(current_read_count<254) current_read_count ++;
2561 
2562 			next_jump_array = 1+((next_jump_array & ~(0xff << (next_node_type*8))) | ((current_read_count) << (next_node_type*8)));
2563 
2564 			HashTablePut(current_table, NULL + next_key_prefix , NULL + next_jump_array);
2565 		}
2566 	}
2567 	return 0;
2568 }
2569 
2570 
finalise_db_graph(global_context_t * global_context,reassembly_block_context_t * block_context,int de_brujin_graph_id,HashTable * de_brujin_graph,unsigned long long int start_key,short start_offset)2571 int finalise_db_graph(global_context_t * global_context, reassembly_block_context_t * block_context, int de_brujin_graph_id, HashTable * de_brujin_graph, unsigned long long int start_key, short start_offset)
2572 {
2573 
2574 	unsigned long long int current_key = start_key;
2575 	char inserted_sequence[MAX_INSERTION_LENGTH+REASSEMBLY_WINDOW_LENGTH];
2576 	unsigned long long int used_keys[MAX_INSERTION_LENGTH+REASSEMBLY_WINDOW_LENGTH];
2577 	int inserted_length = 0;
2578 	int is_looped = 0;
2579 
2580 	int xk1;
2581 
2582 	for(xk1=0; xk1<global_context -> config.k_mer_length-1; xk1++)
2583 	{
2584 		int intbase = (start_key >> (2*(global_context -> config.k_mer_length-xk1-2))) & 3;
2585 		inserted_sequence[xk1] = "AGCT"[intbase];
2586 	}
2587 	inserted_length = global_context -> config.k_mer_length-1;
2588 
2589 	while(1)
2590 	{
2591 		unsigned int next_moves;
2592 		int max_move=-1;
2593 		next_moves = HashTableGet(de_brujin_graph , NULL + current_key) - NULL;
2594 		if(!next_moves) break;
2595 		else
2596 		{
2597 			int test_next;
2598 			int all_reads = 0;
2599 			int ignored_moves=0;
2600 			int max_move_reads=0;
2601 			unsigned long long int previous_key = current_key<<2;
2602 
2603 			next_moves-=1;
2604 			for(test_next=0; test_next<4; test_next++)
2605 				all_reads += 0xff&(next_moves>>(8*test_next));
2606 
2607 			while(1)
2608 			{
2609 				max_move = -1;
2610 				max_move_reads = -1;
2611 				for(test_next=0; test_next<4; test_next++)
2612 				{
2613 					if(ignored_moves & (1<<test_next)) continue;
2614 					int test_move_reads=0xff&(next_moves>>(8*test_next));
2615 					if(test_move_reads > max_move_reads)
2616 					{
2617 						max_move_reads = test_move_reads;
2618 						max_move = test_next;
2619 					}
2620 				}
2621 
2622 				if(max_move < 0)
2623 					break;
2624 
2625 				inserted_sequence[inserted_length] = "AGCT"[max_move];
2626 				int xk2, is_used_key = 0;
2627 				current_key = (previous_key | max_move);
2628 
2629 				for(xk2=0; xk2<inserted_length- global_context -> config.k_mer_length + 1; xk2++)
2630 				{
2631 					if(current_key == used_keys[xk2])
2632 					{
2633 						is_used_key = 1;
2634 						is_looped = 1;
2635 						ignored_moves |= (1<<max_move);
2636 						break;
2637 					}
2638 				}
2639 				if(!is_used_key || is_looped) break;
2640 			}
2641 			used_keys[inserted_length - global_context -> config.k_mer_length + 1] = current_key;
2642 			current_key &= (0x3fffffffffffffffull >> (2*(32 - global_context -> config.k_mer_length)));
2643 			current_key |= 0x8000000000000000ull;
2644 			inserted_length++;
2645 		}
2646 		if(inserted_length > MAX_INSERTION_LENGTH + REASSEMBLY_WINDOW_LENGTH - 1 || max_move<0|| is_looped)
2647 		{
2648 			is_looped = 1;
2649 			inserted_sequence[inserted_length]=0;
2650 
2651 			break;
2652 		}
2653 	}
2654 
2655 	if(!is_looped)
2656 	{
2657 		inserted_sequence[inserted_length]=0;
2658 
2659 
2660 		assert(global_context -> index_block_number == 1);
2661 		gene_value_index_t * base_index = &global_context->all_value_indexes[0] ;
2662 		unsigned int window_start = block_context -> block_start_linear_pos + de_brujin_graph_id * REASSEMBLY_WINDOW_LENGTH / 2;
2663 		int best_matched_bases = -1, best_end_pos = -1, best_start_pos = start_offset;
2664 
2665 		#define END_POS_CONFIRM_LENGTH 24
2666 
2667 		for(xk1 = inserted_length + start_offset - END_POS_CONFIRM_LENGTH ; xk1>= start_offset+15 ; xk1--)	//find the end of insertion
2668 		{
2669 			if(inserted_length - xk1 - END_POS_CONFIRM_LENGTH + best_start_pos >= MAX_INSERTION_LENGTH) break;
2670 			int matched_bases = match_chro(inserted_sequence + inserted_length - END_POS_CONFIRM_LENGTH, base_index , window_start + xk1, END_POS_CONFIRM_LENGTH, 0, global_context -> config.space_type );
2671 			if(matched_bases>best_matched_bases)
2672 			{
2673 				best_matched_bases = matched_bases;
2674 				best_end_pos = xk1 + END_POS_CONFIRM_LENGTH;
2675 			}
2676 		}
2677 
2678 
2679 		int new_base_length = inserted_length - best_end_pos + best_start_pos;
2680 		if(new_base_length>0 && best_matched_bases >= END_POS_CONFIRM_LENGTH - 1)
2681 		{
2682 			best_matched_bases = -1;
2683 			int best_insertion_point = -1;
2684 			for(xk1 = 15; xk1<best_end_pos - best_start_pos; xk1++)
2685 			{
2686 				int insertion_matched =
2687 					match_chro(inserted_sequence + xk1 - 15, base_index , window_start + best_start_pos + xk1 - 15, 15, 0, global_context -> config.space_type) +
2688 					match_chro(inserted_sequence + xk1 + new_base_length , base_index , window_start + best_start_pos + xk1 , 15, 0 , global_context -> config.space_type );
2689 				if(insertion_matched>best_matched_bases && insertion_matched>=29)
2690 				{
2691 					best_insertion_point = xk1;
2692 					best_matched_bases = insertion_matched;
2693 				}
2694 			}
2695 			if(best_insertion_point>=0)
2696 			{
2697 				unsigned int best_pos = window_start + best_start_pos + best_insertion_point;
2698 				int indels = -new_base_length;
2699 
2700 				//inserted_sequence[best_insertion_point + new_base_length] = 0;
2701 
2702 
2703 				indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
2704 				HashTable * event_table = indel_context -> event_entry_table;
2705 				chromosome_event_t * event_space = indel_context -> event_space_dynamic;
2706 
2707 				chromosome_event_t * found = NULL;
2708 				chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
2709 				int found_events = search_event(global_context, event_table, event_space, best_pos - 1, EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_INDEL, search_return);
2710 
2711 				if(found_events)
2712 				{
2713 					int kx1;
2714 					for(kx1 = 0; kx1 < found_events ; kx1++)
2715 					{
2716 						if(search_return[kx1] -> indel_length == indels)
2717 						{
2718 							found = search_return[kx1];
2719 							break;
2720 						}
2721 					}
2722 				}
2723 
2724 				if(found){
2725 					found -> supporting_reads ++;
2726 				}
2727 				else
2728 				{
2729 					int event_no;
2730 					event_no = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) ->  total_events ++;
2731 
2732 					event_space = reallocate_event_space(global_context, NULL, event_no);
2733 
2734 					chromosome_event_t * new_event = event_space+event_no;
2735 					memset(new_event,0,sizeof(chromosome_event_t));
2736 
2737 					set_insertion_sequence(global_context, NULL , &new_event -> inserted_bases , inserted_sequence+best_insertion_point, -indels);
2738 
2739 					new_event -> event_small_side = best_pos - 1;
2740 					new_event -> event_large_side = best_pos + max(0, indels);
2741 					assert(new_event -> event_small_side && new_event -> event_large_side);
2742 					new_event -> event_type = CHRO_EVENT_TYPE_INDEL;
2743 					new_event -> indel_length = indels;
2744 					new_event -> supporting_reads = 1;
2745 
2746 					put_new_event(event_table, new_event , event_no);
2747 
2748 				}
2749 
2750 				fprintf(global_context -> long_insertion_FASTA_fp, ">BEGIN%u_%dM%dI%dM\n", window_start + best_start_pos, best_insertion_point , -indels , inserted_length + indels - best_insertion_point);
2751 				int write_cursor;
2752 				for(write_cursor = 0 ; write_cursor < inserted_length ; write_cursor += 70)
2753 				{
2754 					fwrite(inserted_sequence + write_cursor , min(70, inserted_length - write_cursor), 1, global_context -> long_insertion_FASTA_fp);
2755 					fputs("\n",global_context -> long_insertion_FASTA_fp);
2756 				}
2757 			}
2758 		}
2759 
2760 	}
2761 	return 0;
2762 }
2763 
2764 
2765 #define  finalise_pileup_file_by_voting  finalise_pileup_file
2766 
2767 #define REASSEMBLY_SUBREAD_GAP 1
2768 
2769 int search_window_once(global_context_t * global_context, reassembly_by_voting_block_context_t * block_context, int window_no, int start_read_no, int search_direction, int anti_loop, float allele_quality);
2770  // search_direction = 1 : from 5' end to 3' end
2771  // search_direction = -1 : from 3'end to 5' end
search_window(global_context_t * global_context,reassembly_by_voting_block_context_t * block_context,int window_no,int start_read_no,int search_direction)2772 void search_window(global_context_t * global_context, reassembly_by_voting_block_context_t * block_context, int window_no, int start_read_no, int search_direction)
2773 {
2774 
2775 	block_context->rebuilt_window[0] = 0;
2776 	block_context->rebuilt_window_size = 0;
2777 
2778 	memset(block_context->final_alleles, 0, sizeof(struct reassmebly_window_allele ) * global_context -> config.reassembly_window_alleles);
2779 
2780 	block_context -> search_cost = 0;
2781 	block_context -> used_read_number = 0;
2782 	block_context -> total_matched_bases = 0;
2783 	block_context -> max_matched_bases = 0;
2784 
2785 	/*int ret = */search_window_once(global_context, block_context, window_no, start_read_no, search_direction, 0, 1);
2786 
2787 }
2788 
2789 
2790 // this function returns if the read is not rectifiable (i.e. likely to be misalignment)
rectify_read_text(global_context_t * global_context,reassembly_by_voting_block_context_t * block_context,int window_no,gehash_t * index,char * next_read_txt,int really_make_change)2791 int rectify_read_text(global_context_t * global_context, reassembly_by_voting_block_context_t * block_context, int window_no, gehash_t * index, char * next_read_txt, int really_make_change)
2792 {
2793 //	return 1;
2794 	int read_offset,i,j;
2795 	int read_len = strlen(next_read_txt);
2796 
2797 	//int is_test_target = (strcmp("AATGGAACAACCGCCTCAAATGGAATGGAATGGCCTCCAAAGGAAGGGAGTGACATTTAATGAATGCATTCTAATGGAATGGCATGGAATGGACTGGAA", next_read_txt)==0);
2798 	int is_test_target = 0;
2799 
2800 	init_gene_vote(block_context -> vote_list_rectify);
2801 	for(read_offset = 0; read_offset < read_len - global_context -> config.reassembly_subread_length+1; read_offset += 1)
2802 	{
2803 		unsigned int subread_int = 0, xk2;
2804 		for(xk2=0; xk2< global_context -> config.reassembly_subread_length; xk2++)
2805 			subread_int = (subread_int << 2) | base2int(next_read_txt[read_offset + xk2]);
2806 		gehash_go_q(index, subread_int , read_offset, read_len, 0, block_context -> vote_list_rectify, 22, read_offset,  0, 0x7fffffff);
2807 	}
2808 
2809 	memset(block_context -> read_rectify_space, 0, MAX_READ_LENGTH * 4 * sizeof(short));
2810 
2811 	for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
2812 		for(j=0; j< block_context -> vote_list_rectify -> items[i]; j++)
2813 		{
2814 			int xk1;
2815 
2816 			int neighbour_read_no =  (block_context -> vote_list_rectify -> pos[i][j] + MAX_READ_LENGTH+1)/10000;
2817 			int neighbour_read_mapped_start, this_read_mapping_start, overlapped_length;
2818 			unsigned long long int neighbour_read_key = (window_no * 1llu +1)<<32 ;
2819 			neighbour_read_key |= (neighbour_read_no& 0xfffff)<<12;
2820 			char * neighbour_read_text = HashTableGet(block_context->read_sequence_table,NULL+neighbour_read_key);
2821 			assert(neighbour_read_text);
2822 			int neighbour_read_len = strlen(neighbour_read_text);
2823 
2824 			// neighbour_read_mapped_start is the location on the neigibour read of the first base of the current read.
2825 			neighbour_read_mapped_start = - neighbour_read_no*10000 + block_context -> vote_list_rectify -> pos[i][j];// + block_context -> vote_list -> coverage_start[i][j];
2826 
2827 			if(neighbour_read_mapped_start>=0)
2828 			{
2829 				this_read_mapping_start = 0;
2830 				overlapped_length = min(neighbour_read_len - neighbour_read_mapped_start, read_len);
2831 			}
2832 			else
2833 			{
2834 				// this_read_mapping_start is the location on this read of the first base of the neighbour read.
2835 				this_read_mapping_start = -neighbour_read_mapped_start;
2836 				neighbour_read_mapped_start = 0;
2837 				overlapped_length = min(read_len - this_read_mapping_start, neighbour_read_len);
2838 				if(is_test_target)
2839 				{
2840 					for(xk1 = 0; xk1 < 3+this_read_mapping_start; xk1++) fputc(' ',stdout );
2841 					printf("%s\n", neighbour_read_text);
2842 				}
2843 			}
2844 
2845 
2846 			for(xk1 = 0; xk1 < overlapped_length; xk1++)
2847 			{
2848 				int this_read_pos = this_read_mapping_start + xk1;
2849 				int nb_read_pos = neighbour_read_mapped_start + xk1;
2850 				block_context -> read_rectify_space[this_read_pos * 4 +base2int(neighbour_read_text[nb_read_pos])] ++;
2851 			}
2852 		}
2853 
2854 	if(is_test_target)
2855 		printf("   %s\n", next_read_txt);
2856 	char ori[2000];
2857 	strcpy(ori, next_read_txt);
2858 	is_bad_read(ori, read_len, "RECT");
2859 	int rectified = 0;
2860 
2861 
2862 	if(is_test_target)
2863 		printf("R0=%s\n", next_read_txt);
2864 	char changed[200];
2865 //	memset(changed, ' ', 200);
2866 	for(i=0; i<read_len; i++)
2867 	{
2868 		int max_support = block_context -> read_rectify_space[i*4+base2int(next_read_txt[i])];
2869 		int base_sel = -1;
2870 		for(j=0; j<4; j++)
2871 		{
2872 			if(j!=base2int(next_read_txt[i]) && block_context -> read_rectify_space[i*4+j] > max_support)
2873 			{
2874 				base_sel = int2base(j);
2875 				max_support = block_context -> read_rectify_space[i*4+j];
2876 			}
2877 		}
2878 
2879 		if(base_sel>0)
2880 		{
2881 			if(really_make_change)
2882 			{
2883 				next_read_txt[i]=base_sel;
2884 			//	changed[i] = '~';
2885 			}
2886 			rectified ++;
2887 		}
2888 	}
2889 
2890 	changed[i]=0;
2891 
2892 	if(is_test_target)
2893 		printf("R1=%s\n   %s\n\n", next_read_txt, changed);
2894 	is_bad_read(next_read_txt, read_len, "REC2");
2895 	return rectified<=2;
2896 }
2897 
search_window_once(global_context_t * global_context,reassembly_by_voting_block_context_t * block_context,int window_no,int start_read_no,int search_direction,int anti_loop,float allele_quality)2898 int search_window_once(global_context_t * global_context, reassembly_by_voting_block_context_t * block_context, int window_no, int start_read_no, int search_direction, int anti_loop, float allele_quality)
2899 {
2900 
2901 	block_context -> search_cost++;
2902 	if(block_context -> search_cost > 1000) return 0;
2903 
2904 
2905 	int is_start_certain = 1, is_digged_in = 0;
2906 	unsigned long long int starter_read_key = (window_no * 1llu +1)<<32 ;
2907 	starter_read_key |= (start_read_no& 0xfffff)<<12;
2908 
2909 	char * start_read_txt = HashTableGet(block_context->read_sequence_table,NULL+starter_read_key);
2910 	assert(start_read_txt);
2911 
2912 
2913 	int start_read_pos = HashTableGet(block_context->read_position_table, NULL+starter_read_key) - NULL ;
2914 	if(start_read_pos == 0) is_start_certain = 0;
2915 	else start_read_pos --;
2916 
2917 
2918 	int read_len = strlen(start_read_txt), read_offset;
2919 	init_gene_vote(block_context -> vote_list);
2920 
2921 	int minimum_overlap_length = read_len  /4;
2922 	//if(!is_start_certain) minimum_overlap_length *= 1.5;
2923 	//minimum_overlap_length = max(1,16-global_context ->config.reassembly_subread_length);
2924 
2925 
2926 	for(read_offset = 0; read_offset < read_len - global_context -> config.reassembly_subread_length+1; read_offset += 1)
2927 	{
2928 		unsigned int subread_int = 0, xk2;
2929 		for(xk2=0; xk2< global_context -> config.reassembly_subread_length; xk2++)
2930 			subread_int = (subread_int << 2) | base2int(start_read_txt[read_offset + xk2]);
2931 
2932 
2933 		if(global_context -> config.reassembly_tolerable_voting)
2934 			gehash_go_q_tolerable(&block_context->voting_indexes[window_no], subread_int , read_offset, read_len, 0, block_context -> vote_list, 1, 22, 24, 0, read_offset, global_context -> config.reassembly_tolerable_voting, global_context -> config.reassembly_subread_length,  0, 0x7fffffff);
2935 		else
2936 			gehash_go_q(&block_context->voting_indexes[window_no], subread_int , read_offset, read_len, 0, block_context -> vote_list,0,read_offset,  0, 0x7fffffff);
2937 	}
2938 
2939 	int WINDOW_SEARCH_TREE_WIDTH = 1;
2940 	int i,j;
2941 
2942 	int max_read_scoring [8];
2943 	unsigned int max_read_ids [8];
2944 	int max_read_nonoverlap [8];
2945 	float max_allele_quality [8];
2946 
2947 	memset(max_read_scoring, 0, sizeof(int)* 8);
2948 	memset(max_allele_quality, 0, sizeof(float)* 8);
2949 	memset(max_read_ids, 0, sizeof(int)* 8);
2950 	memset(max_read_nonoverlap, 0, sizeof(int)* 8);
2951 
2952 	//intif(is_target_window_X(window_no))
2953 	//if(block_context -> window_start_pos > 1124357402 && block_context -> window_start_pos < 1124359402)
2954 	//	print_votes(block_context -> vote_list, global_context -> config.index_prefix);
2955 
2956 	for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
2957 		for(j=0; j< block_context -> vote_list -> items[i]; j++)
2958 		{
2959 			//if(is_digged_in)break;
2960 			//if((block_context -> vote_list -> coverage_end[i][j] - block_context -> vote_list -> coverage_start[i][j]) * 6 > read_len )
2961 //			if(block_context -> window_start_pos > 1124357402 && block_context -> window_start_pos < 1124359402)
2962 //				SUBREADprintf("LLLX PRINT_VOTE @%u : V=%d\n", block_context -> vote_list -> pos[i][j],  block_context -> vote_list -> votes[i][j] );
2963 
2964 			if( block_context -> vote_list -> votes[i][j] > 4)
2965 			{
2966 				int next_read_no = (block_context -> vote_list -> pos[i][j] + MAX_READ_LENGTH +1 )/10000;
2967 				if(next_read_no == start_read_no) continue;
2968 				int next_read_nonoverlap = next_read_no * 10000 - block_context -> vote_list -> pos[i][j];
2969 				float next_read_allele_quality = allele_quality;
2970 				int anchor_to_index = 0;
2971 
2972 				if(abs(next_read_nonoverlap) < 2)
2973 					anchor_to_index = 0;	// PARALLEL or VALUELESS
2974 				else if(next_read_nonoverlap < 0)
2975 					anchor_to_index = -1;	// INDEX -> ANCHOR
2976 				else	anchor_to_index = 1;    // ANCHOR -> INDEX
2977 
2978 
2979 				unsigned long long int test_next_read_key = (window_no * 1llu +1)<<32 ;
2980 				test_next_read_key |=  ( next_read_no& 0xfffff)<<12;
2981 				int is_next_certain = 1;
2982 				int next_read_pos = HashTableGet(block_context->read_position_table, NULL+test_next_read_key) - NULL;
2983 				if(next_read_pos==0)
2984 					is_next_certain =0;
2985 				else next_read_pos--;
2986 
2987 				/*
2988 				if(!is_start_certain || !is_next_certain)
2989 				{
2990 					if(anchor_to_index == 1 && (block_context -> vote_list -> coverage_start[i][j] - next_read_nonoverlap) > 5)
2991 						anchor_to_index = 0;	// PARALLEL or VALUELESS
2992 
2993 					if(anchor_to_index == -1 && block_context -> vote_list -> coverage_start[i][j] > 5)
2994 						anchor_to_index = 0;	// PARALLEL or VALUELESS
2995 				}*/
2996 
2997 				char * next_read_txt = HashTableGet(block_context->read_sequence_table,NULL+test_next_read_key);
2998 				assert(next_read_txt);
2999 				int next_read_len = strlen(next_read_txt);
3000 				int two_read_overlapped = next_read_len - next_read_nonoverlap;
3001 				if(next_read_nonoverlap<0)
3002 					two_read_overlapped = read_len + next_read_nonoverlap;
3003 				if(two_read_overlapped< minimum_overlap_length)
3004 					anchor_to_index = 0;
3005 
3006 				if(search_direction>0)
3007 				{
3008 					if( next_read_nonoverlap + strlen(next_read_txt) <= read_len)
3009 						anchor_to_index = 0;
3010 				}
3011 
3012 				if(anchor_to_index * search_direction > 0)
3013 				{
3014 
3015 					long long int pos_delta = next_read_pos;
3016 					pos_delta -= start_read_pos;
3017 
3018 					int xk3, next_read_scoring = 0, next_read_matching = 0;
3019 
3020 					int order_reward = (is_next_certain && is_start_certain && search_direction * pos_delta >0)? 25000:0;
3021 					if(is_start_certain * is_next_certain == 0) order_reward = 10000;
3022 					int wrong_bases=0;
3023 
3024 					if(anchor_to_index > 0)
3025 					{
3026 						if(next_read_len+ next_read_nonoverlap< read_len)continue;// no new bases are added by the new read.
3027 
3028 						for(xk3 = next_read_nonoverlap ; xk3 < read_len ; xk3++)
3029 							if(start_read_txt[xk3] == next_read_txt[xk3 - next_read_nonoverlap]) next_read_matching++;
3030 						wrong_bases = read_len - next_read_nonoverlap - next_read_matching;
3031 						if(wrong_bases>=2)
3032 						{
3033 							next_read_scoring = -1;
3034 						}
3035 						else
3036 						{
3037 							block_context -> total_matched_bases += next_read_matching;
3038 
3039 							next_read_scoring = two_read_overlapped * 10000 + order_reward
3040    						    	+(unsigned int)(HashTableGet(block_context -> read_quality_table, NULL+test_next_read_key)-NULL)/10 - wrong_bases * 20000;
3041 						}
3042 					}
3043 					else
3044 					{
3045 						if(0< next_read_nonoverlap) continue; // no new bases are added by the new read.
3046 						int should_match = min(next_read_len, read_len - next_read_nonoverlap);
3047 						if(next_read_len >  read_len - next_read_nonoverlap) continue; // the head of the next read is untested: bad!
3048 						for(xk3 = - next_read_nonoverlap ; xk3 <  should_match ; xk3++)
3049 							if(next_read_txt[xk3] == start_read_txt[xk3 + next_read_nonoverlap]) next_read_matching++;
3050 
3051 						wrong_bases = should_match + next_read_nonoverlap - next_read_matching;
3052 
3053 						if(wrong_bases>=2)
3054 						{
3055 							next_read_scoring = -1;
3056 						}
3057 						else
3058 						{
3059 							block_context -> total_matched_bases += next_read_matching;
3060 
3061 							next_read_scoring = two_read_overlapped * 10000 +order_reward
3062 						 	+ (unsigned int)(HashTableGet(block_context -> read_quality_table, NULL+test_next_read_key)-NULL)/10 - wrong_bases * 20000;
3063 						}
3064 
3065 					}
3066 
3067 
3068 					if(wrong_bases)
3069 						next_read_allele_quality *= 0.5;
3070 
3071 					int is_rectifiable = rectify_read_text(global_context, block_context, window_no, &block_context->voting_indexes[window_no], next_read_txt , 0);
3072 
3073 
3074 /*
3075 					if(block_context -> window_start_pos > 1124357402 && block_context -> window_start_pos < 1124359402)
3076 					{
3077 						SUBREADprintf("LLLX: NREAD #%d HAS QUAL %d ; is_rectifiable=%d ; WB=%d; ANCHOR_TO_I=%d\n", next_read_no, next_read_scoring, is_rectifiable, wrong_bases, anchor_to_index);
3078 						SUBREADprintf("LLLX: NREAD OVL=%d, RLEN=%d, NRLEN=%d, NRMATCH=%d\n", next_read_nonoverlap, read_len, next_read_len, next_read_matching);
3079 						SUBREADprintf("LLLX: THIS_READ=%s\n", start_read_txt);
3080 						SUBREADprintf("LLLX: NEXT_READ=%s\n", next_read_txt);
3081 					}
3082 */
3083 					if((next_read_scoring > max_read_scoring[ WINDOW_SEARCH_TREE_WIDTH-1]) && is_rectifiable)
3084 					{
3085 						int is_unique = 1, is_replace = 0;
3086 						for(xk3=0; xk3<WINDOW_SEARCH_TREE_WIDTH; xk3++)
3087 						{
3088 
3089 							if(global_context -> config.reassembly_window_alleles>1&&max_read_scoring[xk3]>0)
3090 							{
3091 								int position_diff = max_read_nonoverlap[xk3] - next_read_nonoverlap;
3092 								int xk4;
3093 								unsigned long long int test_prev_read_key = (window_no * 1llu +1)<<32 ;
3094 								test_prev_read_key |=  (max_read_ids[xk3] & 0xfffff)<<12;
3095 								char  *prev_read_txt = HashTableGet(block_context->read_sequence_table,NULL+test_prev_read_key);
3096 								int mismatch=0;
3097 
3098 								for(xk4 = abs(position_diff) ; ; xk4++)
3099 								{
3100 									int prev_read_pos = xk4-(position_diff>0?position_diff:0);
3101 									int next_read_pos = xk4+(position_diff<0?position_diff:0);
3102 									if(prev_read_txt[prev_read_pos]==0 || next_read_txt[next_read_pos]==0) break;
3103 									mismatch += (prev_read_txt[prev_read_pos]!=next_read_txt[next_read_pos]);
3104 								}
3105 
3106 
3107 								if(mismatch<2)
3108 								{
3109 									is_unique = 0;
3110 									if(position_diff > 0)
3111 										is_replace = xk3+1;
3112 								}
3113 							}
3114 
3115 						}
3116 						if(is_replace)
3117 						{
3118 							max_read_scoring[is_replace-1] = next_read_scoring;
3119 							max_read_ids[is_replace-1] = next_read_no;
3120 							max_read_nonoverlap[is_replace-1] = next_read_nonoverlap;
3121 							max_allele_quality[is_replace-1] = next_read_allele_quality;
3122 						}
3123 						else
3124 						{
3125 							if(is_unique) for(xk3=0; xk3<WINDOW_SEARCH_TREE_WIDTH; xk3++)
3126 							{
3127 								if(next_read_scoring > max_read_scoring[xk3])
3128 								{
3129 									int xk4;
3130 									for(xk4 = WINDOW_SEARCH_TREE_WIDTH-1; xk4>xk3; xk4--)
3131 									{
3132 										max_read_scoring[xk4] = max_read_scoring[xk4-1];
3133 										max_read_ids[xk4] = max_read_ids[xk4-1];
3134 										max_read_nonoverlap[xk4] = max_read_nonoverlap[xk4-1];
3135 										max_allele_quality[xk4] = max_allele_quality[xk4-1];
3136 									}
3137 									max_read_scoring[xk3] = next_read_scoring;
3138 									max_read_ids[xk3] = next_read_no;
3139 									max_read_nonoverlap[xk3] = next_read_nonoverlap;
3140 									max_allele_quality[xk3] = next_read_allele_quality;
3141 									//break;
3142 								}
3143 							}
3144 						}
3145 					}
3146 				}
3147 
3148 			}
3149 		}
3150 
3151 
3152 	for(i = 0; i<WINDOW_SEARCH_TREE_WIDTH; i++)
3153 	{
3154 		unsigned int next_read_no = max_read_ids[i];
3155 		int is_fresh = 1;
3156 
3157 //			if(block_context -> window_start_pos > 1124357402 && block_context -> window_start_pos < 1124359402)
3158 //				SUBREADprintf("LLLX: TEST: %d\n", i);
3159 
3160 		if(max_read_scoring[i]<=0) break;
3161 		is_digged_in = 1;
3162 		for(j=0; j< block_context -> used_read_number; j++)
3163 		{
3164 			if(block_context -> used_read_ids[j] == next_read_no)
3165 			{
3166 				is_fresh = 0;
3167 
3168 				FRESH_COUNTER ++;
3169 				break;
3170 			}
3171 		}
3172 
3173 		if(!is_fresh)continue;
3174 
3175 		block_context -> used_read_ids[block_context ->used_read_number++] = next_read_no;
3176 
3177 		float next_allele_quality = max_allele_quality[i];
3178 		int next_read_nonoverlap = max_read_nonoverlap[i];
3179 
3180 
3181 		unsigned long long int test_next_read_key = (window_no * 1llu +1)<<32 ;
3182 		test_next_read_key |=  ( next_read_no& 0xfffff)<<12;
3183 
3184 
3185 		char * next_read_txt = HashTableGet(block_context->read_sequence_table,NULL+test_next_read_key);
3186 		assert(next_read_txt);
3187 
3188 		int new_bases = (search_direction>0)?strlen(next_read_txt)-(read_len - next_read_nonoverlap):-next_read_nonoverlap;
3189 		if (new_bases + block_context -> rebuilt_window_size > 1000){
3190 			return 1;
3191 		}
3192 
3193 		int basic_length = 1;//max(read_len, strlen(next_read_txt))/3;
3194 		int max_length = 99999;
3195 
3196 		//if(anti_loop) max_length =  min(read_len, strlen(next_read_txt))/3;
3197 
3198 		int current_rebuilt_size = block_context -> rebuilt_window_size;
3199 		int current_matched_bases = block_context -> total_matched_bases;
3200 		//int current_used_read_number = block_context -> used_read_number;
3201 		if(new_bases>=basic_length && new_bases < max_length)
3202 		{
3203 			if(strlen(block_context -> rebuilt_window) != block_context ->rebuilt_window_size) return 1;
3204 
3205 			//int is_rectifiable = rectify_read_text(global_context, block_context, window_no, &block_context->voting_indexes[window_no], next_read_txt , 1);
3206 			//if(!is_rectifiable) continue;
3207 
3208 			if(search_direction > 0)
3209 			{
3210 				int high_quality_offset = 0*min(2, strlen(next_read_txt) - new_bases);
3211 				assert(high_quality_offset>=0);
3212 				if(block_context ->rebuilt_window_size > high_quality_offset)
3213 				{
3214 					block_context -> rebuilt_window[ block_context ->rebuilt_window_size-high_quality_offset] =0;
3215 					strcat(block_context -> rebuilt_window , next_read_txt + strlen(next_read_txt) - new_bases - high_quality_offset);
3216 				}
3217 				else
3218 					strcat(block_context -> rebuilt_window , next_read_txt + strlen(next_read_txt) - new_bases);
3219 			}
3220 			else
3221 			{
3222 				int xk3;
3223 				for(xk3= block_context -> rebuilt_window_size ; xk3>=0 ; xk3--)
3224 					block_context -> rebuilt_window[new_bases + xk3] = block_context -> rebuilt_window[xk3];
3225 
3226 				int high_quality_offset = 0*min(2, strlen(next_read_txt) - new_bases);
3227 				//high_quality_offset = 0;
3228 				assert(high_quality_offset>=0);
3229 				memcpy(block_context -> rebuilt_window , next_read_txt , new_bases + high_quality_offset);
3230 				block_context -> rebuilt_window [block_context -> rebuilt_window_size +new_bases] = 0;
3231 			}
3232 
3233 			block_context -> rebuilt_window_size += new_bases;
3234 //			if(block_context -> window_start_pos > 1124357402 && block_context -> window_start_pos < 1124359402)
3235 //				SUBREADprintf("LLLX: DIG IN: %d -> %d  (DIR=%d)\n", start_read_no, next_read_no, search_direction);
3236 
3237 			search_window_once(global_context, block_context, window_no, next_read_no, search_direction, anti_loop, next_allele_quality);
3238 			DIGIN_COUNTER ++;
3239 			if(block_context -> search_cost > 1000) return 1;
3240 
3241 		}
3242 		if(search_direction > 0)
3243 			// if the search is to the 3' end, the stack is simply dumpped.
3244 			block_context -> rebuilt_window[current_rebuilt_size] = 0;
3245 		else
3246 		{
3247 			// if the search is to the 5' end, we have to move the tail back to head.
3248 			int xk4;
3249 			int bytes_to_remove = block_context -> rebuilt_window_size - current_rebuilt_size;
3250 			for(xk4 = bytes_to_remove; xk4 <  block_context -> rebuilt_window_size; xk4++)
3251 				block_context -> rebuilt_window[xk4-bytes_to_remove] = block_context -> rebuilt_window[xk4];
3252 			block_context -> rebuilt_window[current_rebuilt_size] = 0;
3253 		}
3254 
3255 		block_context -> rebuilt_window_size = current_rebuilt_size;
3256 		block_context -> total_matched_bases = current_matched_bases;
3257 		//block_context -> used_read_number = current_used_read_number;
3258 	}
3259 
3260 //	if(block_context -> window_start_pos > 1124357402 && block_context -> window_start_pos < 1124359402)
3261 //		SUBREADprintf("LLLX: NOT DIG IN: %d :: %d   DIR=%d\n", is_digged_in, block_context -> rebuilt_window_size, search_direction);
3262 
3263 	if(!is_digged_in && block_context -> rebuilt_window_size > 0)
3264 	{
3265 		//if(block_context -> total_matched_bases * 10000 /block_context -> rebuilt_window_size > block_context -> max_matched_bases)
3266 		int xk1, xk2;
3267 		for(xk1 = 0; xk1 < global_context -> config.reassembly_window_alleles; xk1++)
3268 		{
3269 			//int is_same_allele = 0;
3270 
3271 //			if(block_context -> window_start_pos > 1124357402 && block_context -> window_start_pos < 1124359402)
3272 //				SUBREADprintf("LLLX [%d] CPY: %d<%d\n", xk1, block_context -> final_alleles[xk1].rebuilt_size , block_context -> rebuilt_window_size);
3273 
3274 			if(block_context -> final_alleles[xk1].rebuilt_size < block_context -> rebuilt_window_size)
3275 			{
3276 				//int xk3=0;
3277 				for(xk2= global_context -> config.reassembly_window_alleles - 1 ; xk2 > xk1; xk2--)
3278 					memcpy(&block_context -> final_alleles[xk2], &block_context -> final_alleles[xk2 - 1], sizeof(struct reassmebly_window_allele));
3279 
3280 				strcpy(block_context -> final_alleles[xk1].rebuilt_window , block_context -> rebuilt_window);
3281 				block_context -> max_matched_bases = block_context -> total_matched_bases * 10000 / block_context -> rebuilt_window_size;
3282 				block_context -> final_alleles[xk1].rebuilt_size = block_context -> rebuilt_window_size;
3283 				block_context -> final_alleles[xk1].allele_quality = allele_quality;
3284 				break;
3285 			}
3286 		}
3287 	}
3288 
3289 	return 0;
3290 }
3291 
3292 
3293 #define MAX_INDELS_IN_WINDOW 10
3294 #define PROBE_KEY_NUMBER 12
3295 #define INDEL_FULL_ALIGN_MAX_MISMATCH 3
3296 #define SINGLE_PROBE_MAX_MISMATCH 1
3297 
full_indel_alignment(global_context_t * global_context,reassembly_by_voting_block_context_t * block_context,char * full_rebuilt_window,int full_rebuilt_window_size,gene_value_index_t * base_index,unsigned int window_start_pos,unsigned int * perfect_segment_start_pos,int * perfect_segment_lengths,int * indels_after_perfect_segments,short * indels_read_positions,float * indel_quality,unsigned int * contig_start_pos,unsigned int * contig_end_pos,int * head_removed_bases,int * tail_removed_bases,int * good_quality_indel)3298 int full_indel_alignment(global_context_t * global_context, reassembly_by_voting_block_context_t * block_context, char * full_rebuilt_window, int full_rebuilt_window_size, gene_value_index_t * base_index, unsigned int window_start_pos, unsigned int * perfect_segment_start_pos, int * perfect_segment_lengths,int * indels_after_perfect_segments, short * indels_read_positions, float * indel_quality, unsigned int * contig_start_pos, unsigned int * contig_end_pos, int * head_removed_bases, int * tail_removed_bases, int * good_quality_indel)
3299 {
3300 	int xk1;
3301 	int is_unreliable = 0;
3302 
3303 	unsigned int xk2 = window_start_pos;
3304 	unsigned int probe_poses [PROBE_KEY_NUMBER];
3305 	int used_probe_in_rebuilt_window[PROBE_KEY_NUMBER];
3306 	int used_probes = 0;
3307 
3308 	if(window_start_pos>300) xk2-=300;
3309 	if(full_rebuilt_window_size<101)return 0;
3310 	unsigned int last_best_xk2 = xk2;
3311 
3312 	memset(probe_poses ,0 , sizeof(unsigned int)*PROBE_KEY_NUMBER);
3313 	(*contig_start_pos) = 0;
3314 	(*contig_end_pos) = 0xffffffff;
3315 
3316 	for(xk1=0; xk1<PROBE_KEY_NUMBER; xk1++)
3317 	{
3318 		char * probe_key = full_rebuilt_window + (full_rebuilt_window_size-global_context -> config.reassembly_key_length) * xk1 /(PROBE_KEY_NUMBER-1);
3319 
3320 		int max_probe_match = -1;
3321 		unsigned int next_best_xk2 = 0;
3322 
3323 
3324 		// xk2 is the first base of the location matching this probe.
3325 		for(xk2 = last_best_xk2; xk2 < REASSEMBLY_WINDOW_LENGTH + window_start_pos + 300 + MAX_INSERTION_LENGTH; xk2++)
3326 		{
3327 			int probe_match = match_chro(probe_key, base_index, xk2, global_context -> config.reassembly_key_length, 0,  global_context -> config.space_type);
3328 			if(probe_match > max_probe_match )
3329 			{
3330 				next_best_xk2 = xk2;
3331 				max_probe_match = probe_match;
3332 			}
3333 		}
3334 
3335 		if(max_probe_match >= global_context -> config.reassembly_key_length - SINGLE_PROBE_MAX_MISMATCH)
3336 		{
3337 				// calculate the contig start;
3338 			if((*contig_start_pos)==0)
3339 			{
3340 				(*contig_start_pos) = next_best_xk2 + 1 - (probe_key-full_rebuilt_window);
3341 				(*head_removed_bases) = probe_key-full_rebuilt_window;
3342 			}
3343 
3344 			(*contig_end_pos) = next_best_xk2 + 1 - (probe_key-full_rebuilt_window) + full_rebuilt_window_size;
3345 			(*tail_removed_bases) = full_rebuilt_window_size - (probe_key-full_rebuilt_window) - global_context -> config.reassembly_key_length;
3346 
3347 			probe_poses[used_probes] = next_best_xk2+1;
3348 			used_probe_in_rebuilt_window[used_probes] = (full_rebuilt_window_size-global_context -> config.reassembly_key_length) * xk1 /(PROBE_KEY_NUMBER-1);
3349 			//if(max_probe_match<global_context -> config.reassembly_key_length - 1)
3350 			//	is_unreliable += 2;
3351 
3352 			used_probes +=1;
3353 		}
3354 		else
3355 		{
3356 			if(xk1 ==0 || xk1 == PROBE_KEY_NUMBER-1)
3357 			{
3358 				is_unreliable += 3;
3359 			}
3360 		}
3361 
3362 	//	if(next_best_xk2>0)
3363 	}
3364 
3365 
3366 	for(xk1=0; xk1<used_probes - 1; xk1++)
3367 	{
3368 		if(probe_poses[xk1]>= probe_poses[xk1+1]) return 0;
3369 	}
3370 
3371 	// find indels between every two mapped probes.
3372 	// and we know the indel length already.
3373 
3374 	int ret = 0, total_unmatched = 0;
3375 
3376 	//int H0_matched = match_chro(full_rebuilt_window ,  base_index, probe_poses[0]-1, full_rebuilt_window_size, 0, global_context -> config.space_type);
3377 
3378 	int total_delta = 0;
3379 	for(xk1=0; xk1 < used_probes-1; xk1++)
3380 	{
3381 
3382 		// indels_in_section  < 0 : insertion.
3383 
3384 		// used_probe_in_rebuilt_window is the location of the probe on reads
3385 
3386 		// probe_poses is the location of the probe on the chromosome.
3387 		long long int indels_in_section = probe_poses[xk1+1];
3388 		indels_in_section -= probe_poses[xk1];
3389 		indels_in_section -= used_probe_in_rebuilt_window[xk1+1];
3390 		indels_in_section += used_probe_in_rebuilt_window[xk1];
3391 
3392 		total_delta += indels_in_section;
3393 		if(indels_in_section)
3394 		{
3395 			// indels_in_section = -5: 5 insertion; indels_in_section = 3: 3 deletion
3396 			// find the edge.
3397 			int best_edge_score = -1;
3398 			unsigned int section_best_edge = 0;
3399 
3400 			// the smallest possible location of the second half is the location if the first half + deletion length
3401 			xk2 = probe_poses[xk1] - 1 + max(0, indels_in_section);
3402 
3403 
3404 			// find the exact indel point; xk2 is the tested point on the chromosome ( first WANTED base on the second half ).
3405 			for(; xk2 < probe_poses[xk1+1] -1; xk2++)
3406 			{
3407 				int left_matched = match_chro(full_rebuilt_window + used_probe_in_rebuilt_window[xk1], base_index, probe_poses[xk1] - 1, (xk2 - probe_poses[xk1]+ 1 - max(0, indels_in_section)), 0,  global_context -> config.space_type);
3408 				int right_matched = match_chro(full_rebuilt_window + used_probe_in_rebuilt_window[xk1] + (xk2 - probe_poses[xk1] + 1) -indels_in_section , base_index, xk2,  probe_poses[xk1+1] - 1 - xk2, 0,  global_context -> config.space_type);
3409 				if(left_matched+right_matched > best_edge_score)
3410 				{
3411 					best_edge_score = left_matched+right_matched;
3412 					section_best_edge = xk2;
3413 				}
3414 			}
3415 
3416 			int section_mismatch = probe_poses[xk1+1]-probe_poses[xk1]-max(0, indels_in_section) - best_edge_score;
3417 			is_unreliable += section_mismatch;
3418 
3419 
3420 			perfect_segment_start_pos[ret] = probe_poses[xk1] - 1;
3421 			// perfect_segment_lengths is : first WANTED base on the second half - first base pos on the first half - deletions
3422 			perfect_segment_lengths[ret] = section_best_edge - probe_poses[xk1] + 1 - max(0, indels_in_section);
3423 
3424 			good_quality_indel[ret] = section_mismatch < INDEL_FULL_ALIGN_MAX_MISMATCH;
3425 			indel_quality[ret] = pow(2,-is_unreliable);
3426 			indels_after_perfect_segments[ret] = indels_in_section;
3427 			indels_read_positions[ret] = used_probe_in_rebuilt_window[xk1] + perfect_segment_lengths[ret];
3428 			ret++;
3429 			total_unmatched += section_mismatch;
3430 		}
3431 		else
3432 		{
3433 
3434 			int matched = match_chro(full_rebuilt_window + used_probe_in_rebuilt_window[xk1], base_index, probe_poses[xk1] - 1, (probe_poses[xk1+1] - probe_poses[xk1]), 0,  global_context -> config.space_type);
3435 			total_unmatched += ((probe_poses[xk1+1] - probe_poses[xk1]) - matched);
3436 		}
3437 	}
3438 
3439 	if(ret>3|| total_unmatched>INDEL_FULL_ALIGN_MAX_MISMATCH || total_delta == 0)
3440 	{
3441 		return 0;
3442 	}
3443 
3444 
3445 	for(xk1=0; xk1<ret; xk1++)
3446 	{
3447 		indel_quality[xk1] *= pow(2.,-(ret * (1+total_unmatched)));
3448 	}
3449 
3450 	//printf("\nRET=%d\n\n", ret);
3451 
3452 	return ret;
3453 
3454 }
3455 
3456 
3457 
match_str(char * c1,char * c2,int len)3458 int match_str(char * c1, char * c2, int len)
3459 {
3460 	int xk1,ret=0;
3461 	for(xk1=0; xk1<len; xk1++)
3462 		ret += c1[xk1]==c2[xk1];
3463 	return ret;
3464 }
3465 
3466 
find_best_location_for_probe(global_context_t * global_context,gene_value_index_t * base_index,char * probe,unsigned int search_start,unsigned int search_len,unsigned int * ret_location)3467 int find_best_location_for_probe(global_context_t * global_context ,gene_value_index_t * base_index, char * probe, unsigned int search_start, unsigned int search_len, unsigned int * ret_location)
3468 {
3469 	unsigned int xk2;
3470 	unsigned int best_location = 0;
3471 	int best_match = -1;
3472 	for(xk2=search_start; xk2<search_start+search_len; xk2++)
3473 	{
3474 		int test_match = match_chro(probe, base_index, xk2, global_context -> config.reassembly_key_length, 0, global_context->config.space_type);
3475 		if(test_match>best_match)
3476 		{
3477 			best_match = test_match;
3478 			best_location = xk2;
3479 		}
3480 	}
3481 
3482 	* ret_location = best_location;
3483 	return best_match;
3484 
3485 }
3486 
find_potential_ultralong_indels(global_context_t * global_context,reassembly_by_voting_block_context_t * block_context,int window_no,char * contig_1,char * contig_2,unsigned int * potential_ins_left,unsigned int * potential_ins_len)3487 int find_potential_ultralong_indels(global_context_t * global_context , reassembly_by_voting_block_context_t * block_context , int window_no, char * contig_1, char * contig_2, unsigned int * potential_ins_left, unsigned int * potential_ins_len)
3488 {
3489 	int contig_len_1 = strlen(contig_1);
3490 	int contig_len_2 = strlen(contig_2);
3491 	unsigned int xk1, is_1_left;
3492 	for(is_1_left = 0; is_1_left < 2; is_1_left ++)
3493 	{
3494 		int best_match = -1;
3495 
3496 		char * test_key = is_1_left?contig_2:contig_1;
3497 		char * testee = is_1_left?contig_1:contig_2;
3498 		int testee_len = is_1_left?contig_len_1:contig_len_2;
3499 
3500 
3501 		for(xk1=0; xk1< testee_len - global_context -> config.reassembly_key_length; xk1++)
3502 		{
3503 			int test_match = match_str(test_key, testee + xk1, global_context -> config.reassembly_key_length);
3504 			if(test_match > best_match)
3505 				best_match = test_match;
3506 		}
3507 
3508 		if(best_match >= global_context -> config.reassembly_key_length-1)
3509 			return 0;
3510 	}
3511 
3512 	unsigned int window_left_pos = block_context->block_start_linear_pos + window_no * REASSEMBLY_WINDOW_LENGTH / 2 ;
3513 	unsigned int search_start = window_left_pos;
3514 	if(search_start>300)search_start-=300;
3515 	else search_start=0;
3516 	unsigned int search_end = 300 + global_context -> config.max_indel_length + window_left_pos;
3517 
3518 
3519 	gene_value_index_t * base_index = &global_context->all_value_indexes[0] ;
3520 	if(search_end > base_index -> length + base_index -> start_base_offset) search_end =  base_index -> length + base_index -> start_base_offset;
3521 
3522 	unsigned int contig_1_left_pos;
3523 	int contig_1_left_match = find_best_location_for_probe(global_context, base_index, contig_1, search_start, search_end - search_start , &contig_1_left_pos);
3524 
3525 	unsigned int contig_1_right_pos;
3526 	int contig_1_right_match = find_best_location_for_probe(global_context, base_index, contig_1 + contig_len_1 - global_context -> config.reassembly_key_length, search_start, search_end - search_start , &contig_1_right_pos);
3527 
3528 
3529 
3530 	unsigned int contig_2_left_pos;
3531 	int contig_2_left_match = find_best_location_for_probe(global_context, base_index, contig_2, search_start, search_end - search_start , &contig_2_left_pos);
3532 
3533 	unsigned int contig_2_right_pos;
3534 	int contig_2_right_match = find_best_location_for_probe(global_context, base_index, contig_2 + contig_len_2 - global_context -> config.reassembly_key_length, search_start, search_end - search_start , &contig_2_right_pos);
3535 
3536 
3537 	if(contig_1_left_match >= global_context -> config.reassembly_key_length -1 && contig_1_right_match >= global_context -> config.reassembly_key_length -1)
3538 		return 0;
3539 
3540 	if(contig_2_left_match >= global_context -> config.reassembly_key_length -1 && contig_2_right_match >= global_context -> config.reassembly_key_length -1)
3541 		return 0;
3542 
3543 	if(contig_1_left_match < global_context -> config.reassembly_key_length -1 && contig_1_right_match < global_context -> config.reassembly_key_length -1)
3544 		return 0;
3545 
3546 	if(contig_2_left_match < global_context -> config.reassembly_key_length -1 && contig_2_right_match < global_context -> config.reassembly_key_length -1)
3547 		return 0;
3548 
3549 	int ret = 0;
3550 	unsigned int basic_ins_left = 0, basic_ins_right = 0;
3551 	int is_1_at_left;
3552 
3553 	if(contig_1_left_match >= global_context -> config.reassembly_key_length -1 && contig_2_right_match >= global_context -> config.reassembly_key_length -1  && contig_1_left_pos<contig_2_right_pos)
3554 	{
3555 		if(contig_len_1 + contig_len_2 - global_context -> config.reassembly_key_length > contig_2_right_pos - contig_1_left_pos)
3556 		{
3557 			*potential_ins_len = contig_len_1 + contig_len_2 - global_context -> config.reassembly_key_length - (contig_2_right_pos - contig_1_left_pos);
3558 			*potential_ins_left = contig_1_left_pos;// + global_context -> config.reassembly_key_length;
3559 			basic_ins_left = contig_1_left_pos;
3560 			basic_ins_right = contig_2_right_pos;
3561 			is_1_at_left = 1;
3562 			ret = 1;
3563 		}
3564 	}
3565 
3566 	if(contig_1_right_match >= global_context -> config.reassembly_key_length -1  && contig_2_left_match >= global_context -> config.reassembly_key_length  -1 && contig_1_right_pos>contig_2_left_pos)
3567 	{
3568 		if( contig_len_1 + contig_len_2 - global_context -> config.reassembly_key_length> contig_1_right_pos - contig_2_left_pos)
3569 		{
3570 			*potential_ins_len = contig_len_1 + contig_len_2 - global_context -> config.reassembly_key_length - (contig_1_right_pos - contig_2_left_pos);
3571 			*potential_ins_left = contig_2_left_pos;// + global_context -> config.reassembly_key_length;
3572 			basic_ins_right = contig_1_right_pos;
3573 			basic_ins_left = contig_2_left_pos;
3574 			is_1_at_left = 0;
3575 			ret = 1;
3576 		}
3577 	}
3578 
3579 
3580 	if(ret)
3581 	{
3582 		unsigned int x1=0,left_ins_edge = * potential_ins_left, best_edge = 0;
3583 		char * left_contig = is_1_at_left ? contig_1: contig_2;
3584 		char * right_contig = is_1_at_left ? contig_2: contig_1;
3585 		char probe_window[global_context -> config.reassembly_key_length+1];
3586 		probe_window[global_context -> config.reassembly_key_length]=0;
3587 
3588 		for(; left_ins_edge < (*potential_ins_left) + global_context -> config.reassembly_key_length; left_ins_edge++)
3589 			probe_window[x1++]=gvindex_get(base_index, left_ins_edge);
3590 
3591 		for(; left_ins_edge < basic_ins_right; left_ins_edge++)
3592 		{
3593 			int delta = left_ins_edge - global_context -> config.reassembly_key_length - (*potential_ins_left);
3594 			char * test_key = left_contig + delta;
3595 
3596 			if(probe_window[global_context -> config.reassembly_key_length-1] == test_key[global_context -> config.reassembly_key_length-1]) best_edge = left_ins_edge ;
3597 
3598 			int matched = match_str(test_key, probe_window, global_context -> config.reassembly_key_length);
3599 			if(matched< global_context -> config.reassembly_key_length-1) break;
3600 
3601 
3602 			for(x1=0; x1<global_context -> config.reassembly_key_length - 1; x1++)
3603 				probe_window[x1] = probe_window[x1+1];
3604 			probe_window[global_context -> config.reassembly_key_length-1] = gvindex_get(base_index, left_ins_edge);
3605 		}
3606 
3607 		if(best_edge>0)
3608 			*potential_ins_left = best_edge;
3609 
3610 		if( ( basic_ins_right + global_context -> config.reassembly_key_length - best_edge) >strlen(right_contig) ||base_index -> length + base_index -> start_base_offset <= basic_ins_right + global_context -> config.reassembly_key_length || basic_ins_left <  base_index -> start_base_offset || best_edge >= base_index -> length + base_index -> start_base_offset) ret = 0;
3611 		else
3612 		{
3613 			int left_match = match_chro(left_contig, base_index, basic_ins_left, best_edge - basic_ins_left, 0, global_context->config.space_type);
3614 			int right_match = match_chro(right_contig + strlen(right_contig)- ( basic_ins_right + global_context -> config.reassembly_key_length - best_edge), base_index, best_edge,  basic_ins_right + global_context -> config.reassembly_key_length - best_edge, 0, global_context->config.space_type);
3615 
3616 			if(left_match+right_match< basic_ins_right+global_context -> config.reassembly_key_length-basic_ins_left - 2) ret=0;
3617 		}
3618 	}
3619 
3620 	return ret;
3621 }
3622 
3623 
put_long_indel_event(global_context_t * global_context,unsigned int best_pos,int indels,float indel_quality,char * inserted_bases,int event_type)3624 int put_long_indel_event(global_context_t * global_context, unsigned int best_pos, int indels, float indel_quality, char * inserted_bases, int event_type)
3625 {
3626 	indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
3627 	HashTable * event_table = indel_context -> event_entry_table;
3628 	chromosome_event_t * event_space = indel_context -> event_space_dynamic;
3629 
3630 	chromosome_event_t * found = NULL;
3631 	chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
3632 
3633 	int pos_delta;
3634 	for(pos_delta = -10; pos_delta <= 10; pos_delta++)
3635 	{
3636 		int found_events = search_event(global_context, event_table, event_space, best_pos - 1 + pos_delta, EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_LONG_INDEL|CHRO_EVENT_TYPE_INDEL, search_return);
3637 
3638 		if(found_events)
3639 		{
3640 			int kx1;
3641 			for(kx1 = 0; kx1 < found_events ; kx1++)
3642 			{
3643 				if(search_return[kx1] -> indel_length == indels || event_type == CHRO_EVENT_TYPE_POTENTIAL_INDEL )
3644 				{
3645 					found = search_return[kx1];
3646 					break;
3647 				}
3648 			}
3649 		}
3650 		if(found)break;
3651 	}
3652 
3653 /*
3654 				if(best_pos >= 1124357402&& best_pos < 1124359402)
3655 				{
3656 					fprintf(stderr, "LLLX-BB: %u :: FOUND=%p\n", best_pos, found);
3657 				}
3658 
3659 */
3660 
3661 	if(found)
3662 	{
3663 		found -> supporting_reads ++;
3664 		found -> event_quality = max(found ->event_quality, indel_quality);
3665 	}
3666 	else
3667 	{
3668 		int event_no;
3669 		event_no = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) ->  total_events ++;
3670 		event_space = reallocate_event_space(global_context , NULL, event_no);
3671 		chromosome_event_t * new_event = event_space+event_no;
3672 		memset(new_event,0,sizeof(chromosome_event_t));
3673 
3674 		if(indels<0 && inserted_bases)
3675 			set_insertion_sequence(global_context, NULL , &new_event -> inserted_bases ,  inserted_bases , -indels);
3676 		new_event -> event_small_side = best_pos - 1;
3677 		new_event -> event_large_side = best_pos + max(0, indels);
3678 		assert(new_event -> event_small_side && new_event -> event_large_side);
3679 		new_event -> event_type = event_type;
3680 		new_event -> indel_length = indels;
3681 		new_event -> supporting_reads = 1;
3682 		new_event -> event_quality = indel_quality;
3683 
3684 		put_new_event(event_table, new_event , event_no);
3685 	}
3686 
3687 	return 0;
3688 }
3689 
3690 
3691 #define BASE_NEEDED_PER_WINDOW 60
3692 
3693 #define PILEUP_BOX_LEN_CERTAIN 12
3694 #define MAX_PILEUP_BOX_COVERAGE_CERTAIN 12
3695 
3696 #define PILEUP_BOX_LEN 50
3697 #define MAX_PILEUP_BOX_COVERAGE 50
3698 
finalise_pileup_file_by_voting(global_context_t * global_context,char * temp_file_name,char * chro_name,int block_no)3699 int finalise_pileup_file_by_voting(global_context_t * global_context , char * temp_file_name, char * chro_name, int block_no)
3700 {
3701 	int global_read_no = 0;
3702 	char * read_text, * qual_text;
3703 	int xk1;
3704 	int total_window_number =2 * (BASE_BLOCK_LENGTH / REASSEMBLY_WINDOW_LENGTH+1), window_adjust;
3705 
3706 	unsigned int block_start_linear_pos = linear_gene_position(&global_context->chromosome_table, chro_name, block_no*BASE_BLOCK_LENGTH);
3707 	//if(block_start_linear_pos> 150000000) return 0;
3708 
3709 	FILE * tmp_fp = f_subr_open(temp_file_name,"rb");
3710 	if(!tmp_fp)
3711 		return 0;
3712 
3713 	print_in_box(80,0,0,"Processing %s\n", temp_file_name);
3714 
3715 	unsigned int * window_center_read_no = (unsigned int *) malloc(sizeof(unsigned int)* total_window_number * global_context -> config.reassembly_start_read_number);
3716 	unsigned int * window_base_number = (unsigned int *) malloc(sizeof(unsigned int)* total_window_number);
3717 	unsigned int * window_center_read_qual = (unsigned int *) malloc(sizeof(unsigned int)* total_window_number * global_context -> config.reassembly_start_read_number);
3718 	unsigned int * window_center_read_masks = (unsigned int *) malloc(sizeof(unsigned int)* total_window_number);
3719 	struct reassmebly_window_allele *first_half_alleles = (struct reassmebly_window_allele *) malloc(sizeof(struct reassmebly_window_allele ) * global_context -> config.reassembly_window_alleles);
3720 	struct reassmebly_window_allele *second_half_alleles = (struct reassmebly_window_allele *) malloc(sizeof(struct reassmebly_window_allele ) * global_context -> config.reassembly_window_alleles);
3721 
3722 	unsigned char * mapquality_lower_bounds_certain = calloc(1, BASE_BLOCK_LENGTH / PILEUP_BOX_LEN_CERTAIN * MAX_PILEUP_BOX_COVERAGE_CERTAIN);
3723 	unsigned char * box_read_number_certain = calloc(1, BASE_BLOCK_LENGTH / PILEUP_BOX_LEN_CERTAIN);
3724 
3725 	unsigned char * mapquality_lower_bounds = calloc(1, BASE_BLOCK_LENGTH / PILEUP_BOX_LEN * MAX_PILEUP_BOX_COVERAGE);
3726 	unsigned char * box_read_number = calloc(1, BASE_BLOCK_LENGTH / PILEUP_BOX_LEN);
3727 
3728 	char * full_rebuilt_window = (char *)malloc(3000);
3729 
3730 	reassembly_by_voting_block_context_t block_context;
3731 
3732 	block_context.voting_indexes = (gehash_t*)malloc(sizeof(gehash_t)* total_window_number);
3733 	block_context.block_start_linear_pos = block_start_linear_pos;
3734 	block_context.start_keys = (unsigned long long int *) calloc(sizeof(unsigned long long int),10 * total_window_number);
3735 	block_context.start_offsets = (short *)malloc(sizeof( short) * total_window_number);
3736 	block_context.read_no_counter = (unsigned int *) calloc(sizeof(unsigned int),total_window_number);
3737 	block_context.read_rectify_space = (short *) malloc(sizeof(unsigned short) * 4 * MAX_READ_LENGTH);
3738 	block_context.final_alleles = malloc(sizeof(struct reassmebly_window_allele ) * global_context -> config.reassembly_window_alleles);
3739 
3740 	read_text = (char *) malloc(MAX_READ_LENGTH);
3741 	qual_text = (char *) malloc(MAX_READ_LENGTH);
3742 
3743 	HashTable *read_sequence_table=HashTableCreate(300000);
3744 	HashTable *read_position_table=HashTableCreate(300000);
3745 	HashTable *read_quality_table=HashTableCreate(300000);
3746 	block_context.read_sequence_table = read_sequence_table;
3747 	block_context.read_quality_table = read_quality_table;
3748 	block_context.read_position_table = read_position_table;
3749 
3750 	memset(block_context.final_alleles, 0, sizeof(struct reassmebly_window_allele ) * global_context -> config.reassembly_window_alleles);
3751 	memset(window_center_read_no, 0, sizeof(unsigned int)* total_window_number * global_context -> config.reassembly_start_read_number);
3752 	memset(window_center_read_qual, 0, sizeof(unsigned int)* total_window_number*global_context -> config.reassembly_start_read_number);
3753 	memset(window_center_read_masks, 0, sizeof(unsigned int)* total_window_number);
3754 	memset(window_base_number, 0, sizeof(unsigned int)* total_window_number);
3755 
3756 
3757 	if(!tmp_fp) return 1;
3758 
3759 	while(!feof(tmp_fp))
3760 	{
3761 		short read_len;
3762 		long long int first_base_pos;
3763 		//unsigned int qual_of_this_read = 1;
3764 		base_block_temp_read_t read_rec;
3765 		int rlen = fread(&read_rec, sizeof(read_rec), 1, tmp_fp);
3766 		int is_position_certain = read_rec.strand;
3767 		if(rlen<1) break;
3768 
3769 		rlen = fread(&read_len, sizeof(short), 1, tmp_fp);
3770 		if(rlen < 1){
3771 			SUBREADprintf("ERROR: Unable to parse piles\n");
3772 			return -1;
3773 		}
3774 		rlen = fread(read_text, sizeof(char), read_len, tmp_fp);
3775 		if(rlen <read_len){
3776 			SUBREADprintf("ERROR: Unable to parse piles\n");
3777 			return -1;
3778 		}
3779 		rlen = fread(qual_text, sizeof(char), read_len, tmp_fp);
3780 		if(rlen <read_len){
3781 			SUBREADprintf("ERROR: Unable to parse piles\n");
3782 			return -1;
3783 		}
3784 
3785 		first_base_pos = read_rec.pos;
3786 		first_base_pos -= block_no * BASE_BLOCK_LENGTH;
3787 
3788 		for(window_adjust = 0; window_adjust < global_context -> config.reassembly_window_multiplex ; window_adjust ++)
3789 		{
3790 			int window_no = 2*(first_base_pos / REASSEMBLY_WINDOW_LENGTH)+ window_adjust , xk2;
3791 			if(window_no >=total_window_number)
3792 				break;
3793 			window_base_number[window_no]+=read_len;
3794 
3795 			int qual_of_this_read = 127, read_offset;
3796 			for(read_offset = 0; read_offset < read_len ; read_offset++)
3797 			{
3798 				int nch = qual_text[read_offset] - '!';
3799 				if(nch < 14) qual_of_this_read -=4;
3800 				else if(nch < 22) qual_of_this_read -= 2;
3801 				else if(nch < 33) qual_of_this_read --;
3802 			}
3803 			qual_of_this_read=max(1,qual_of_this_read);
3804 
3805 
3806 			if(is_position_certain)
3807 			{
3808 				int box_no = first_base_pos / PILEUP_BOX_LEN_CERTAIN;
3809 				for(xk1 = box_no * MAX_PILEUP_BOX_COVERAGE_CERTAIN ; xk1 < (box_no+1) * MAX_PILEUP_BOX_COVERAGE_CERTAIN; xk1++)
3810 				{
3811 					if(qual_of_this_read > mapquality_lower_bounds_certain[xk1])
3812 					{
3813 						for(xk2 = (box_no+1) * MAX_PILEUP_BOX_COVERAGE_CERTAIN - 1; xk2 > xk1; xk2--)
3814 							mapquality_lower_bounds_certain[xk2] = mapquality_lower_bounds_certain[xk2-1];
3815 						mapquality_lower_bounds_certain[xk1] = qual_of_this_read;
3816 						break;
3817 					}
3818 				}
3819 			}
3820 			else
3821 			{
3822 				int box_no = first_base_pos / PILEUP_BOX_LEN;
3823 				for(xk1 = box_no * MAX_PILEUP_BOX_COVERAGE ; xk1 < (box_no+1) * MAX_PILEUP_BOX_COVERAGE; xk1++)
3824 				{
3825 					if(qual_of_this_read > mapquality_lower_bounds[xk1])
3826 					{
3827 						for(xk2 = (box_no+1) * MAX_PILEUP_BOX_COVERAGE - 1; xk2 > xk1; xk2--)
3828 							mapquality_lower_bounds[xk2] = mapquality_lower_bounds[xk2-1];
3829 						mapquality_lower_bounds[xk1] = qual_of_this_read;
3830 						break;
3831 					}
3832 				}
3833 
3834 			}
3835 			if(is_position_certain && window_adjust == 1) break;
3836 		}
3837 
3838 	}
3839 	fclose(tmp_fp);
3840 
3841 	for(xk1=0; xk1< total_window_number; xk1++)
3842 	{
3843 		if(window_base_number[xk1] >= BASE_NEEDED_PER_WINDOW)
3844 		{
3845 			gehash_create(&block_context.voting_indexes[xk1], max(4000,window_base_number[xk1]*4), 1);
3846 			block_context . start_offsets[xk1] = 0x7fff;
3847 		}
3848 	}
3849 
3850 
3851 	tmp_fp = f_subr_open(temp_file_name,"rb");
3852 	while(!feof(tmp_fp))
3853 	{
3854 		short read_len;
3855 		long long int first_base_pos;
3856 		base_block_temp_read_t read_rec;
3857 		int rlen = fread(&read_rec, sizeof(read_rec), 1, tmp_fp);
3858 		int is_position_certain = read_rec.strand;
3859 
3860 		if(rlen<1) break;
3861 		rlen = fread(&read_len, sizeof(short), 1, tmp_fp);
3862 		if(rlen <1){
3863 			SUBREADprintf("ERROR: cannot parse piles.\n");
3864 			return -1;
3865 		}
3866 
3867 		rlen = fread(read_text, sizeof(char), read_len, tmp_fp);
3868 		if(rlen <read_len){
3869 			SUBREADprintf("ERROR: cannot parse piles.\n");
3870 			return -1;
3871 		}
3872 
3873 		rlen = fread(qual_text, sizeof(char), read_len, tmp_fp);
3874 		if(rlen <read_len){
3875 			SUBREADprintf("ERROR: cannot parse piles.\n");
3876 			return -1;
3877 		}
3878 		first_base_pos = read_rec.pos;
3879 		first_base_pos -= block_no * BASE_BLOCK_LENGTH;
3880 		assert(first_base_pos>=0);
3881 		read_text [read_len]=0;
3882 		qual_text [read_len]=0;
3883 
3884 		is_bad_read(read_text, read_len, "LOAD");
3885 
3886 		int read_offset;
3887 
3888 		/*
3889 		unsigned int qual_of_this_read = 1;
3890 		for(read_offset = 0; read_offset < read_len ; read_offset++)
3891 			qual_of_this_read += qual_text[read_offset];
3892 
3893 		qual_of_this_read = qual_of_this_read * 100 / read_len + (is_position_certain?200:0);
3894 		if(qual_of_this_read<1)qual_of_this_read=1;
3895 		*/
3896 
3897 		int qual_of_this_read = 127;
3898 		for(read_offset = 0; read_offset < read_len ; read_offset++)
3899 		{
3900 			int nch = qual_text[read_offset] - '!';
3901 
3902 
3903 			if(nch < 14) qual_of_this_read -=4;
3904 			else if(nch < 22) qual_of_this_read -= 2;
3905 			else if(nch < 33) qual_of_this_read --;
3906 		}
3907 
3908 		qual_of_this_read=max(1,qual_of_this_read);
3909 
3910 		for(window_adjust = 0; window_adjust < global_context -> config.reassembly_window_multiplex; window_adjust++)
3911 		{
3912 			if(is_position_certain && window_adjust >= 2) break;
3913 			int window_no = 2*(first_base_pos / REASSEMBLY_WINDOW_LENGTH) + window_adjust;
3914 
3915 			if(window_no >= total_window_number) break;
3916 			if(window_base_number[window_no]<BASE_NEEDED_PER_WINDOW) continue;
3917 
3918 
3919 			if(is_position_certain)
3920 			{
3921 				int box_no = first_base_pos / PILEUP_BOX_LEN_CERTAIN;
3922 				if(qual_of_this_read < mapquality_lower_bounds_certain[box_no * MAX_PILEUP_BOX_COVERAGE_CERTAIN + MAX_PILEUP_BOX_COVERAGE_CERTAIN - 1])
3923 					continue;
3924 				if(box_read_number_certain[box_no]>MAX_PILEUP_BOX_COVERAGE_CERTAIN)continue;
3925 				box_read_number_certain[box_no]++;
3926 				//qual_of_this_read += 200;
3927 			}
3928 			else
3929 			{
3930 				int box_no = first_base_pos / PILEUP_BOX_LEN;
3931 
3932 				if(qual_of_this_read < mapquality_lower_bounds[box_no * MAX_PILEUP_BOX_COVERAGE + MAX_PILEUP_BOX_COVERAGE - 1])
3933 					continue;
3934 				if(box_read_number[box_no]>MAX_PILEUP_BOX_COVERAGE)continue;
3935 				box_read_number[box_no]++;
3936 			}
3937 
3938 			int read_no = block_context.read_no_counter[window_no];
3939 			block_context.read_no_counter[window_no]++;
3940 
3941 
3942 			if(window_center_read_qual[window_no * global_context -> config.reassembly_start_read_number + global_context -> config.reassembly_start_read_number-1] < qual_of_this_read)//&& is_position_certain)
3943 			{
3944 
3945 				int xk44,xk3;
3946 				for(xk44=0; xk44<global_context -> config.reassembly_start_read_number; xk44++)
3947 					if(window_center_read_qual[window_no * global_context -> config.reassembly_start_read_number + xk44] < qual_of_this_read)
3948 						break;
3949 				for(xk3=global_context -> config.reassembly_start_read_number -2; xk3>=xk44 ; xk3--)
3950 				{
3951 					window_center_read_no[window_no * global_context -> config.reassembly_start_read_number + xk3 +1] =   window_center_read_no[window_no * global_context -> config.reassembly_start_read_number + xk3] ;
3952 					window_center_read_qual[window_no * global_context -> config.reassembly_start_read_number + xk3 +1] = window_center_read_qual[window_no * global_context -> config.reassembly_start_read_number + xk3];
3953 				}
3954 
3955 				window_center_read_no[window_no * global_context -> config.reassembly_start_read_number + xk44] = read_no + 1;
3956 				window_center_read_qual[window_no  * global_context -> config.reassembly_start_read_number + xk44] = qual_of_this_read;
3957 
3958 			}
3959 			for(read_offset = 0; read_offset < read_len - global_context -> config.reassembly_subread_length+1; read_offset += REASSEMBLY_SUBREAD_GAP)
3960 			{
3961 				unsigned int subread_int = 0;
3962 				int xk2;
3963 				for(xk2=0; xk2< global_context -> config.reassembly_subread_length; xk2++)
3964 					subread_int = (subread_int << 2) | base2int(read_text[read_offset + xk2]);
3965 
3966 				gehash_insert_limited(&block_context.voting_indexes[window_no], subread_int , read_offset +10000 *read_no, 400, 10000);
3967 			}
3968 
3969 
3970 
3971 		/*
3972 			if(strstr(temp_file_name, "chr1-0005"))
3973 				if(window_no == 63763 || window_no == 63762)
3974 				{
3975 					SUBREADprintf("LLLX-INSERT_WIN: [%d] : %s  RN=%d  RL=%d  ITEMS=%llu\n",window_no, read_text, read_no, read_len, block_context.voting_indexes[window_no].current_items);
3976 
3977 				}
3978 		*/
3979 
3980 			char * sequence_for_hash = (char *) malloc(read_len+1);
3981 			assert(sequence_for_hash);
3982 			strcpy(sequence_for_hash, read_text);
3983 			unsigned long long int read_hashed_key = (window_no * 1llu +1)<<32 ;
3984 			read_hashed_key |= (read_no & 0xfffff)<<12;
3985 
3986 			if(is_position_certain)
3987 				HashTablePut(read_position_table, NULL + read_hashed_key, NULL+first_base_pos+1);
3988 			HashTablePut(read_sequence_table, NULL + read_hashed_key, sequence_for_hash);
3989 			HashTablePut(read_quality_table, NULL + read_hashed_key, NULL + qual_of_this_read);
3990 		}
3991 
3992 		global_read_no ++;
3993 	}
3994 	fclose(tmp_fp);
3995 
3996 	for(xk1=0; xk1< total_window_number; xk1++)
3997 	{
3998 		if(window_base_number[xk1]>=BASE_NEEDED_PER_WINDOW)
3999 			gehash_sort(&block_context.voting_indexes[xk1]);
4000 	}
4001 
4002 
4003 	gene_vote_t *vote_list;
4004 	vote_list = (gene_vote_t *) malloc(sizeof(gene_vote_t));
4005 	assert(vote_list);
4006 	block_context.vote_list = vote_list;
4007 	block_context.vote_list_rectify = (gene_vote_t *) malloc(sizeof(gene_vote_t));
4008 
4009 	indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
4010 
4011 	HashTable * event_table = indel_context -> event_entry_table;
4012 
4013 	for(xk1 = 0; xk1 <  total_window_number; xk1++)
4014 	{
4015 		// xk1 is the number of the current window.
4016 
4017 		if(window_base_number[xk1]<BASE_NEEDED_PER_WINDOW)
4018 		{
4019 			//printf("SKIPPED : %d : %d/%d\n", xk1, window_base_number[xk1], BASE_NEEDED_PER_WINDOW);
4020 			continue;
4021 		}
4022 		// full_window = 5'end -> anchor_read -> 3'end
4023 
4024 
4025 
4026 		//int IGNORED_BASES_IN_WINDOW  = 10;
4027 		//for(IGNORED_BASES_IN_WINDOW = 0; IGNORED_BASES_IN_WINDOW <=60; IGNORED_BASES_IN_WINDOW+=30)
4028 		{
4029 			int start_read_i;
4030 			for(start_read_i = 0; start_read_i < global_context -> config.reassembly_start_read_number; start_read_i ++)
4031 			{
4032 
4033 				// from middle read to 5' end
4034 
4035 				if(!window_center_read_no[xk1 * global_context -> config.reassembly_start_read_number + start_read_i ]) continue;
4036 
4037 				unsigned int this_start_read_no = window_center_read_no[xk1 * global_context -> config.reassembly_start_read_number + start_read_i ]-1 ;
4038 				unsigned int window_start_pos = block_context.block_start_linear_pos + xk1 * REASSEMBLY_WINDOW_LENGTH / 2;
4039 				block_context.window_start_pos = window_start_pos;
4040 
4041 				search_window(global_context, &block_context, xk1, this_start_read_no , -1);
4042 				memcpy(first_half_alleles, block_context.final_alleles, global_context -> config.reassembly_window_alleles * sizeof(struct reassmebly_window_allele ));
4043 				search_window(global_context, &block_context, xk1, this_start_read_no,  1);
4044 				memcpy(second_half_alleles, block_context.final_alleles, global_context -> config.reassembly_window_alleles * sizeof(struct reassmebly_window_allele));
4045 
4046 				/*
4047 				if(window_start_pos >= 1124357402&& window_start_pos < 1124359402)
4048 				{
4049 					fprintf(stderr, "LLLX: %u [%d] :: START READ = %u ; FN=%s ; BASES=%u\n", window_start_pos, xk1, this_start_read_no, temp_file_name , window_base_number[xk1]);
4050 					char fname [100];
4051 					sprintf(fname, "DP-%d-%d", getpid(), xk1);
4052 					gehash_dump(&block_context.voting_indexes[xk1], fname);
4053 				}*/
4054 
4055 				unsigned long long int starter_read_key = (xk1 * 1llu +1)<<32 ;
4056 				starter_read_key |=  ( (this_start_read_no)& 0xfffff)<<12;
4057 
4058 				char * start_read_txt = HashTableGet(block_context.read_sequence_table,NULL+starter_read_key);
4059 				assert(start_read_txt);
4060 
4061 				int first_allele_no, second_allele_no;
4062 
4063 				for(first_allele_no = 0; first_allele_no <  global_context -> config.reassembly_window_alleles && first_half_alleles[first_allele_no].rebuilt_size>0 ; first_allele_no++)
4064 					for(second_allele_no = 0; second_allele_no <  global_context -> config.reassembly_window_alleles && second_half_alleles[second_allele_no].rebuilt_size>0  ; second_allele_no++)
4065 					{
4066 						int all_indels_in_window = 0;
4067 						strcpy(full_rebuilt_window, first_half_alleles [first_allele_no]. rebuilt_window);
4068 						strcat(full_rebuilt_window, start_read_txt);
4069 						strcat(full_rebuilt_window, second_half_alleles[second_allele_no].rebuilt_window);
4070 
4071 
4072 						int xk2, full_rebuilt_window_size = strlen(full_rebuilt_window);
4073 
4074 						gene_value_index_t * base_index = &global_context->all_value_indexes[0] ;
4075 
4076 						unsigned int perfect_segment_start_pos[MAX_INDELS_IN_WINDOW], contig_start_pos, contig_end_pos, all_fresh = 0;
4077 						int perfect_segment_lengths[MAX_INDELS_IN_WINDOW], head_removed_bases=0, tail_removed_bases=0;
4078 						int indels_after_perfect_segments[MAX_INDELS_IN_WINDOW];
4079 						int good_quality_indels[MAX_INDELS_IN_WINDOW];
4080 						float quality_of_indels_aln[MAX_INDELS_IN_WINDOW];
4081 						short indels_read_positions[MAX_INDELS_IN_WINDOW];
4082 						char contig_CIGAR[200];
4083 
4084 						memset(good_quality_indels, 0, sizeof(int) * MAX_INDELS_IN_WINDOW);
4085 						int indels_in_window = full_indel_alignment(global_context, &block_context, full_rebuilt_window, full_rebuilt_window_size, base_index, window_start_pos, perfect_segment_start_pos, perfect_segment_lengths, indels_after_perfect_segments, indels_read_positions, quality_of_indels_aln, &contig_start_pos, &contig_end_pos, &head_removed_bases, &tail_removed_bases, good_quality_indels);
4086 						contig_CIGAR[0]=0;
4087 						int read_position_cursor = head_removed_bases;
4088 						int is_indel_contig = 0;
4089 
4090 						for(xk2 = 0; xk2 < indels_in_window; xk2++)
4091 						{
4092 							unsigned int best_pos = perfect_segment_start_pos [xk2] + perfect_segment_lengths[xk2];
4093 							int indels = indels_after_perfect_segments[xk2], is_fresh = 1;
4094 							float quality_of_this_indel = quality_of_indels_aln[xk2] * first_half_alleles [first_allele_no].allele_quality * second_half_alleles[second_allele_no].allele_quality;
4095 
4096 							if(abs(indels) >= global_context -> config.max_indel_length){
4097 								//#warning "====================== REMOVE THE debug output ==============="
4098 								continue;
4099 							}
4100 
4101 							/*if(abs(indels) <= 16 && 0){
4102 								continue;
4103 							}*/
4104 
4105 							is_indel_contig=1;
4106 							sprintf(contig_CIGAR+strlen(contig_CIGAR), "%dM%d%c", indels_read_positions[xk2] - read_position_cursor, abs(indels), indels<0?'I':'D');
4107 							read_position_cursor = indels_read_positions[xk2];
4108 							if(indels<0) read_position_cursor -= indels;
4109 
4110 							all_indels_in_window++;
4111 
4112 							if(indels >= -global_context -> config.max_indel_length && good_quality_indels[xk2])
4113 							{
4114 								int neighbour_delta;
4115 								for(neighbour_delta = - 30; neighbour_delta < 30 ; neighbour_delta++)
4116 								{
4117 									chromosome_event_t * search_return [MAX_EVENT_ENTRIES_PER_SITE];
4118 									chromosome_event_t * event_space = indel_context -> event_space_dynamic;
4119 
4120 									int xk3, found_events = search_event(global_context, event_table, event_space, best_pos + neighbour_delta , EVENT_SEARCH_BY_SMALL_SIDE, CHRO_EVENT_TYPE_LONG_INDEL|CHRO_EVENT_TYPE_INDEL, search_return);
4121 
4122 									for(xk3 = 0; xk3<found_events; xk3++)
4123 									{
4124 										chromosome_event_t * tested_neighbour = search_return[xk3];
4125 
4126 										if(tested_neighbour -> event_quality >= quality_of_this_indel)
4127 											is_fresh = 0;
4128 										else tested_neighbour -> event_type = CHRO_EVENT_TYPE_REMOVED;
4129 									}
4130 
4131 
4132 								}
4133 
4134 
4135 								if(is_fresh)
4136 									put_long_indel_event(global_context, best_pos,  indels, quality_of_this_indel, full_rebuilt_window + indels_read_positions[xk2], CHRO_EVENT_TYPE_LONG_INDEL);
4137 								all_fresh += is_fresh;
4138 							}
4139 						}
4140 
4141 
4142 						//#warning "====================== Make sure ' is_indel_contig '  in the next line is harmless before RELEASE ==============="
4143 						if( is_indel_contig || all_fresh)
4144 						{
4145 							int write_cursor;
4146 							char * chro_begin;
4147 							int chro_offset_start = 0;
4148 							int chro_offset_end = 0;
4149 							locate_gene_position_max(contig_start_pos + head_removed_bases ,& global_context -> chromosome_table, &chro_begin, &chro_offset_start, NULL, NULL, 0);
4150 							locate_gene_position_max(contig_end_pos - tail_removed_bases ,& global_context -> chromosome_table, &chro_begin, &chro_offset_end, NULL, NULL, 0);
4151 							if(full_rebuilt_window_size - read_position_cursor - tail_removed_bases)
4152 								fprintf(global_context -> long_insertion_FASTA_fp, ">%s-%u-%u-%s%dM\n", chro_name, chro_offset_start, chro_offset_end - 1, contig_CIGAR, full_rebuilt_window_size - read_position_cursor - tail_removed_bases);
4153 							else
4154 								fprintf(global_context -> long_insertion_FASTA_fp, ">%s-%u-%u-%s0M\n", chro_name, chro_offset_start, chro_offset_end - 1, contig_CIGAR);
4155 							full_rebuilt_window[full_rebuilt_window_size - tail_removed_bases] = 0;
4156 							for(write_cursor = head_removed_bases ; write_cursor < full_rebuilt_window_size - tail_removed_bases ; write_cursor += 70)
4157 							{
4158 								fwrite(full_rebuilt_window + write_cursor , min(70, full_rebuilt_window_size - write_cursor), 1, global_context -> long_insertion_FASTA_fp);
4159 								fputs("\n",global_context -> long_insertion_FASTA_fp);
4160 							}
4161 						}
4162 
4163 					}
4164 			}
4165 		}
4166 
4167 
4168 	}
4169 
4170 	free(vote_list);
4171 	free(block_context.vote_list_rectify);
4172 
4173 	for(xk1=0; xk1< total_window_number; xk1++)
4174 	{
4175 		if(window_base_number[xk1]>=BASE_NEEDED_PER_WINDOW)
4176 			gehash_destory(&block_context.voting_indexes[xk1]);
4177 	}
4178 
4179 
4180 	free_values_destroy(read_sequence_table);
4181 
4182 	free(mapquality_lower_bounds_certain);
4183 	free(mapquality_lower_bounds);
4184 	free(box_read_number);
4185 	free(box_read_number_certain);
4186 	free(full_rebuilt_window);
4187 	free(block_context.read_no_counter);
4188 	free(block_context.voting_indexes);
4189 	free(block_context.start_keys);
4190 	free(block_context.start_offsets);
4191 	free(block_context.read_rectify_space);
4192 	free(block_context.final_alleles);
4193 	free(first_half_alleles);
4194 	free(second_half_alleles);
4195 	free(window_base_number);
4196 	free(window_center_read_no);
4197 	free(window_center_read_qual);
4198 	free(window_center_read_masks);
4199 	free(read_text);
4200 	free(qual_text);
4201 
4202 
4203 	return 0;
4204 }
finalise_pileup_file_by_debrujin(global_context_t * global_context,char * temp_file_name,char * chro_name,int block_no)4205 int finalise_pileup_file_by_debrujin(global_context_t * global_context , char * temp_file_name, char * chro_name, int block_no)
4206 {
4207 	FILE * tmp_fp = f_subr_open(temp_file_name,"rb");
4208 	if(!tmp_fp)
4209 		return 0;
4210 	char * read_text, * qual_text;
4211 	int xk1;
4212 	unsigned int block_start_linear_pos = linear_gene_position(&global_context->chromosome_table, chro_name, block_no*BASE_BLOCK_LENGTH);
4213 	int total_window_number =2 * (BASE_BLOCK_LENGTH / REASSEMBLY_WINDOW_LENGTH+1);
4214 
4215 	reassembly_block_context_t * block_context = (reassembly_block_context_t*)malloc(sizeof(reassembly_block_context_t));
4216 	block_context -> block_start_linear_pos = block_start_linear_pos;
4217 	block_context -> start_keys = (unsigned long long int *) calloc(sizeof(unsigned long long int),10 * total_window_number);
4218 	block_context -> start_offsets = (short *)malloc(sizeof( short) * total_window_number);
4219 
4220 	assert(block_context -> start_offsets);
4221 	assert(block_context -> start_keys);
4222 
4223 	block_context -> de_bruijn_graphs = ( HashTable ** )malloc(sizeof( HashTable *) * total_window_number);
4224 	for(xk1=0; xk1< total_window_number; xk1++)
4225 	{
4226 		block_context -> de_bruijn_graphs[xk1] = HashTableCreate(10* REASSEMBLY_WINDOW_LENGTH);
4227 		block_context -> start_offsets[xk1] = 0x7fff;
4228 	}
4229 
4230 	read_text = (char *) malloc(MAX_READ_LENGTH);
4231 	qual_text = (char *) malloc(MAX_READ_LENGTH);
4232 	while(!feof(tmp_fp))
4233 	{
4234 		short read_len;
4235 		unsigned int first_base_pos;
4236 		base_block_temp_read_t read_rec;
4237 		int rlen = fread(&read_rec, sizeof(read_rec), 1, tmp_fp);
4238 		if(rlen<1) break;
4239 		rlen = fread(&read_len, sizeof(short), 1, tmp_fp);
4240 		if(rlen<1) return -1;
4241 		rlen = fread(read_text, sizeof(char), read_len, tmp_fp);
4242 		if(rlen<read_len) return -1;
4243 		rlen = fread(qual_text, sizeof(char), read_len, tmp_fp);
4244 		if(rlen<read_len) return -1;
4245 		first_base_pos = read_rec.pos - block_no * BASE_BLOCK_LENGTH;
4246 
4247 		insert_pileup_read(global_context , block_context , first_base_pos , read_text , qual_text , read_len, read_rec.read_pos>0);
4248 	}
4249 
4250 	for(xk1=0; xk1< total_window_number; xk1++)
4251 	{
4252 		if(block_context -> start_offsets[xk1]<0x7fff && block_context -> de_bruijn_graphs[xk1] -> numOfElements >3)
4253 		{
4254 			finalise_db_graph(global_context, block_context, xk1, block_context -> de_bruijn_graphs[xk1] , block_context -> start_keys [xk1*10], block_context -> start_offsets [xk1]);
4255 		}
4256 		HashTableDestroy(block_context -> de_bruijn_graphs[xk1]);
4257 	}
4258 
4259 
4260 	free(block_context -> de_bruijn_graphs);
4261 	free(block_context -> start_keys);
4262 	free(block_context -> start_offsets);
4263 	free(block_context);
4264 	free(read_text);
4265 	free(qual_text);
4266 	fclose(tmp_fp);
4267 	return 0;
4268 }
4269 
finalise_long_insertions_by_hashtable(global_context_t * global_context)4270 int finalise_long_insertions_by_hashtable(global_context_t * global_context)
4271 {
4272 	int chro_i;
4273 	assert(global_context -> index_block_number == 1);
4274 	unsigned int chro_start_pos = 0;
4275 	char tmp_fname[MAX_FILE_NAME_LENGTH+40];
4276 
4277 	sprintf(tmp_fname,"%s.reassembly.fa", global_context->config.output_prefix);
4278 	global_context->long_insertion_FASTA_fp = f_subr_open(tmp_fname ,"wb");
4279 
4280 	for(chro_i=0; chro_i < global_context -> chromosome_table.total_offsets ; chro_i++)
4281 	{
4282 		unsigned int chro_length = global_context -> chromosome_table.read_offsets[chro_i] - chro_start_pos;
4283 		unsigned int chro_current_pos = 0;
4284 		for(chro_current_pos = 0; chro_current_pos < chro_length ; chro_current_pos += BASE_BLOCK_LENGTH)
4285 		{
4286 			char temp_file_name[MAX_FILE_NAME_LENGTH + 50];
4287 			int block_no = chro_current_pos / BASE_BLOCK_LENGTH;
4288 
4289 			sprintf(temp_file_name,"%s@%s-%04u.bin", global_context -> config.temp_file_prefix, global_context -> chromosome_table.read_names+chro_i*MAX_CHROMOSOME_NAME_LEN , block_no );
4290 
4291 			finalise_pileup_file(global_context , temp_file_name, global_context -> chromosome_table.read_names+MAX_CHROMOSOME_NAME_LEN*chro_i, block_no);
4292 
4293 			unlink(temp_file_name);
4294 		}
4295 		chro_start_pos = global_context -> chromosome_table.read_offsets[chro_i];
4296 	}
4297 
4298 	fclose(global_context -> long_insertion_FASTA_fp);
4299 
4300 	/*
4301 	for(xk1=0; xk1 < global_context -> config.long_indel_iterations ; xk1++)
4302 	{
4303 		explorer_insertions_in_unmapped(global_context);
4304 	}
4305 	*/
4306 
4307 	return 0;
4308 
4309 }
destroy_pileup_table(HashTable * local_reassembly_pileup_files)4310 void destroy_pileup_table(HashTable* local_reassembly_pileup_files)
4311 {
4312 	int bucket;
4313 	KeyValuePair * cursor;
4314 
4315 	for(bucket=0; bucket< local_reassembly_pileup_files -> numOfBuckets; bucket++)
4316 	{
4317 		cursor = local_reassembly_pileup_files -> bucketArray[bucket];
4318 		while (1)
4319 		{
4320 			if (!cursor) break;
4321 			FILE * fp = (FILE *)cursor->value;
4322 			if(fp != NULL+1)
4323 				fclose(fp);
4324 			free((void *)cursor->key);
4325 			cursor = cursor->next;
4326 		}
4327 	}
4328 
4329 	HashTableDestroy(local_reassembly_pileup_files);
4330 
4331 
4332 }
4333 
4334 char * _COREMAIN_delete_temp_prefix = NULL;
COREMAIN_SIGINT_hook(int param)4335 void COREMAIN_SIGINT_hook(int param)
4336 {
4337 	#ifdef MAKE_STANDALONE
4338 	int xk1, last_slash = -1;
4339 	if(_COREMAIN_delete_temp_prefix != NULL)
4340 	{
4341 		char del2[MAX_FILE_NAME_LENGTH], del_suffix[MAX_FILE_NAME_LENGTH], del_name[MAX_FILE_NAME_LENGTH];
4342 		SUBREADprintf("\n\nReceived a terminal signal. The temporary files were removed.\n");
4343 		for(xk1=0; _COREMAIN_delete_temp_prefix[xk1]; xk1++)
4344 		{
4345 			if(_COREMAIN_delete_temp_prefix[xk1]=='/') last_slash = xk1;
4346 			else if(_COREMAIN_delete_temp_prefix[xk1]=='\\')
4347 			{
4348 				SUBREADprintf("The file name is unknown.\n");
4349 				return;
4350 			}
4351 		}
4352 		if(last_slash>=0)
4353 		{
4354 			memcpy(del2, _COREMAIN_delete_temp_prefix, last_slash);
4355 			del2[last_slash] = 0;
4356 			strcpy(del_suffix , _COREMAIN_delete_temp_prefix + last_slash + 1);
4357 		}
4358 		else
4359 		{
4360 			strcpy(del2,".");
4361 			strcpy(del_suffix , _COREMAIN_delete_temp_prefix);
4362 		}
4363 
4364 		if(strlen(del_suffix)>8)
4365 		{
4366 			DIR           *d;
4367 			struct dirent *dir;
4368 
4369 			d = opendir(del2);
4370 			if (d)
4371 			{
4372 				while ((dir = readdir(d)) != NULL)
4373 				{
4374 					if(strstr(dir->d_name, del_suffix))
4375 					{
4376 						//printf("%s\n", dir->d_name);
4377 						strcpy(del_name, del2);
4378 						strcat(del_name, "/");
4379 						strcat(del_name, dir->d_name);
4380 						unlink(del_name);
4381 					}
4382 				}
4383 				closedir(d);
4384 			}
4385 		}
4386 
4387 	}
4388 
4389 	exit(param);
4390 	#endif
4391 }
4392 
4393 
4394 
finalise_long_insertions(global_context_t * global_context)4395 int finalise_long_insertions(global_context_t * global_context)
4396 {
4397 	indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
4398 	HashTable * local_reassembly_pileup_files = indel_context -> local_reassembly_pileup_files;
4399 
4400 	destroy_pileup_table(local_reassembly_pileup_files);
4401 
4402 	return finalise_long_insertions_by_hashtable(global_context);
4403 }
4404 
init_global_context(global_context_t * context)4405 void init_global_context(global_context_t * context)
4406 {
4407 	memset(context->module_contexts, 0, 5*sizeof(void *));
4408 	memset(&context->config, 0, sizeof(configuration_t));
4409 
4410 	context->config.fast_run = 0;
4411 	context->config.memory_use_multiplex = 1;
4412 	context->config.report_sam_file = 1;
4413 	context->config.do_breakpoint_detection = 0;
4414 	context->config.do_fusion_detection = 0;
4415 	context->config.do_long_del_detection = 0;
4416 	context->config.do_structural_variance_detection = 0;
4417 	context->config.more_accurate_fusions = 1;
4418 	context->config.report_multi_mapping_reads = 0;
4419 
4420 	//#warning "============= best values for the SVs application: 8; 5; 32 ==============="
4421 	context->config.top_scores = 8 - 5;
4422 	context->config.max_vote_combinations = 5 - 3;
4423 	context->config.max_vote_simples = 5;
4424 	context->config.max_vote_number_cutoff = 1;
4425 
4426 	context->config.experiment_type = 0;
4427 	context->config.prefer_donor_receptor_junctions = 1;
4428 	context->config.maximum_translocation_length = 10000;
4429 	context->config.maximum_colocating_distance = 500;
4430 	context->config.do_big_margin_filtering_for_reads = 0;
4431 	context->config.do_big_margin_filtering_for_junctions = 0;
4432 	context->config.maximum_intron_length = 500000;
4433 	context->config.use_hamming_distance_in_exon = 0;
4434 	context->config.is_third_iteration_running = 0;
4435 	context->input_reads.is_paired_end_reads = 0;
4436 	context->will_remove_input_file = 0;
4437 	context->config.ignore_unmapped_reads = 0;
4438 	context->config.report_unmapped_using_mate_pos = 1;
4439 	context->config.downscale_mapping_quality=0;
4440 	context->config.ambiguous_mapping_tolerance = 39;
4441 	context->config.use_hamming_distance_break_ties = 0;
4442 	context->config.use_quality_score_break_ties = 0;
4443 	context->config.extending_search_indels = 0;
4444 	context->config.PE_predominant_weight = 0;
4445 
4446 	//#warning "============= best values for the SVs application: 3 ====================="
4447 	context->config.multi_best_reads = 1;
4448 	context->config.reported_multi_best_reads = 1;
4449 	context->config.is_SAM_file_input=0;
4450 	context->config.use_dynamic_programming_indel=0;
4451 	context->config.use_bitmap_event_table = 1;
4452 	context->config.convert_color_to_base = 0;
4453 	context->config.is_gzip_fastq = 0;
4454 
4455 	context->config.is_BAM_output = 1;
4456 	context->config.is_BAM_input = 0;
4457 	context->config.is_input_read_order_required = 0;
4458 	context->config.read_trim_5 = 0;
4459 	context->config.read_trim_3 = 0;
4460 	context->config.minimum_exonic_subread_fraction = -1.0;
4461 	context->config.is_first_read_reversed = 0;
4462 	context->config.is_second_read_reversed = 1;
4463 	context->config.space_type = GENE_SPACE_BASE;
4464 	context->config.minimum_pair_distance = 50;
4465 	context->config.maximum_pair_distance = 600;
4466 	context->config.expected_pair_distance = 300;
4467 	context->config.restrected_read_order = 1;
4468 	context->config.k_mer_length = 28;
4469 
4470 	context->config.reassembly_start_read_number = 2;
4471 	context->config.reassembly_key_length = 28;
4472 	context->config.reassembly_subread_length = 12;
4473 	context->config.reassembly_window_multiplex = 3;
4474 	context->config.reassembly_tolerable_voting = 1;
4475 	context->config.reassembly_window_alleles = 1;
4476 	context->config.do_superlong_indel_detection = 0;
4477 	context->config.flanking_subread_indel_mismatch = 1;
4478 
4479 	context->config.total_subreads = 10;
4480 	context->config.minimum_subread_for_first_read = 3;
4481 	context->config.minimum_subread_for_second_read = 1;
4482 	context->config.min_mapped_fraction = 0;
4483 	context->config.max_mismatch_exonic_reads = 5;
4484 	context->config.max_mismatch_junction_reads = 2;
4485 	context->config.all_threads = 1;
4486 	context->config.is_first_iteration_running = 1;
4487 	context->config.is_second_iteration_running = 1;
4488 
4489 	context->config.reads_per_chunk = 20*1024*1024;
4490 
4491 //#warning "=========== 2*1024*1024 IS FOR TESTING BLOCKING AND SHOULD BE COMMENTED ==============="
4492 //	context->config.reads_per_chunk = 2*1024*1024;
4493 
4494 	context->config.use_memory_buffer = 1;
4495 	context->config.is_methylation_reads = 0;
4496 	context->config.report_no_unpaired_reads = 0;
4497 	context->config.limited_tree_scan = 0;
4498 	context->config.high_quality_base_threshold = 500000;
4499 	context->config.report_multiple_best_in_pairs = 0;
4500 	context->config.realignment_minimum_variant_distance = 16;
4501 
4502 	context->config.init_max_event_number = 70000;
4503 	context->config.show_soft_cliping = 1 ;
4504 	context->config.big_margin_record_size = 9;
4505 
4506 	context->config.read_group_id[0] = 0;
4507 	context->config.read_group_txt[0] = 0;
4508 	context->config.first_read_file[0] = 0;
4509 	context->config.second_read_file[0] = 0;
4510 	context->config.index_prefix[0] = 0;
4511 	context->config.output_prefix[0] = 0;
4512 	context->config.exon_annotation_file[0] = 0;
4513 	context->config.SAM_extra_columns = 0;
4514 	context->config.exon_annotation_file_type = FILE_TYPE_GTF;
4515 	strcpy(context->config.exon_annotation_gene_id_column, "gene_id");
4516 	strcpy(context->config.exon_annotation_feature_name_column, "exon");
4517 
4518 	context->config.DP_penalty_create_gap = -1;
4519 	context->config.DP_penalty_extend_gap = 0;
4520 	context->config.DP_match_score = 2;
4521 	context->config.DP_mismatch_penalty = 0;
4522 
4523 	context->config.max_insertion_at_junctions=0;
4524 	context->config.check_donor_at_junctions=1;
4525 	memset(&context -> input_reads, 0, sizeof(read_input_t));
4526 
4527 	signal (SIGTERM, COREMAIN_SIGINT_hook);
4528 	signal (SIGINT, COREMAIN_SIGINT_hook);
4529 
4530 
4531 	int seed_rand[2];
4532 	double double_time = miltime();
4533 	memcpy(seed_rand, &double_time, 2*sizeof(int));
4534 	myrand_srand(seed_rand[0]^seed_rand[1]); // the seed is NOT used in R because myrand_srand will always takes four random numbers from R's RNG
4535 
4536 	context->config.max_indel_length = 5;
4537 	context->config.phred_score_format = FASTQ_PHRED33;
4538 	context->start_time = miltime();
4539 
4540 	context->timecost_load_index = 0;
4541 	context->timecost_voting = 0;
4542 	context->timecost_before_realign = 0;
4543 	context->timecost_for_realign = 0;
4544 }
4545 
init_core_temp_path(global_context_t * context)4546 void init_core_temp_path(global_context_t * context){
4547 	int x1;
4548 	char mac_rand[13];
4549 	mac_or_rand_str(mac_rand);
4550 
4551 	context->config.temp_file_prefix[0] = 0;
4552 	if(context->config.output_prefix[0]){
4553 		for(x1 = strlen(context->config.output_prefix); x1 >=0; x1--){
4554 			if(context->config.output_prefix[x1]=='/'){
4555 				memcpy(context->config.temp_file_prefix, context->config.output_prefix, x1);
4556 				context->config.temp_file_prefix[x1]=0;
4557 				break;
4558 			}
4559 		}
4560 	}
4561 
4562 	if(context->config.temp_file_prefix[0] == 0)strcpy(context->config.temp_file_prefix, "./");
4563 	sprintf(context->config.temp_file_prefix+strlen(context->config.temp_file_prefix), "/core-temp-sum-%06u-%s", getpid(), mac_rand );
4564 	_COREMAIN_delete_temp_prefix = context->config.temp_file_prefix;
4565 }
4566 
4567 #define INDEL_MASK_BY_INSERTION 1
4568 #define INDEL_MASK_BY_DELETION 2
4569 #define INDEL_MASK_BY_MATCH 0
4570 #define INDEL_MASK_BY_MISMATCH 3
4571 
4572 
4573 	int CORE_DPALIGN_CREATEGAP_PENALTY = -1;
4574 	int CORE_DPALIGN_EXTENDGAP_PENALTY = 0;
4575 	int CORE_DPALIGN_MATCH_SCORE = 2;
4576 	int CORE_DPALIGN_MISMATCH_PENALTY = 0;
4577 
4578 
core_dynamic_align(global_context_t * global_context,thread_context_t * thread_context,char * read,int read_len,unsigned int begin_position,char * movement_buffer,int expected_offset,char * read_name)4579 int core_dynamic_align(global_context_t * global_context, thread_context_t * thread_context, char * read, int read_len, unsigned int begin_position, char * movement_buffer, int expected_offset, char * read_name)
4580 // read must be converted to the positive strand.
4581 // movement buffer: 0:match, 1: read-insert, 2: gene-insert, 3:mismatch
4582 // the size of the movement buffer must be equal to the length of the read plus max_indel * 3.
4583 {
4584 	int max_indel = min(16 , global_context->config.max_indel_length);
4585 	int i,j;
4586 
4587 	CORE_DPALIGN_CREATEGAP_PENALTY = global_context -> config.DP_penalty_create_gap;
4588 	CORE_DPALIGN_EXTENDGAP_PENALTY = global_context -> config.DP_penalty_extend_gap;
4589 	CORE_DPALIGN_MATCH_SCORE = global_context -> config.DP_match_score;
4590 	CORE_DPALIGN_MISMATCH_PENALTY = global_context -> config.DP_mismatch_penalty;
4591 
4592 	if(read_len < 3 || abs(expected_offset) > max_indel)
4593 		return 0;
4594 	if(expected_offset < 0 && read_len < (3-expected_offset))
4595 		return 0;
4596 
4597 	gene_value_index_t * current_value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
4598 
4599 	indel_context_t * indel_context = (indel_context_t*)global_context -> module_contexts[MODULE_INDEL_ID];
4600 
4601 	//unsigned long long table_ptr = (unsigned long long) indel_context -> dynamic_align_table;
4602 
4603 	short ** table = indel_context -> dynamic_align_table;
4604 	char ** table_mask = indel_context -> dynamic_align_table_mask;
4605 	if(thread_context)
4606 	{
4607 		indel_thread_context_t * indel_thread_context = (indel_thread_context_t*)thread_context -> module_thread_contexts[MODULE_INDEL_ID];
4608 		table = indel_thread_context -> dynamic_align_table;
4609 		table_mask = indel_thread_context -> dynamic_align_table_mask;
4610 	}
4611 
4612 
4613 	if(0 && strcmp(read_name, "MISEQ:13:000000000-A1H1M:1:1112:12194:5511") == 0)
4614 	{
4615 		int ii;
4616 		for(ii = 0; ii<read_len - expected_offset; ii++)
4617 			SUBREADprintf("%c",gvindex_get(current_value_index, begin_position + ii));
4618 
4619 		SUBREADprintf ("\n%s\n", read);
4620 	}
4621 
4622 
4623 	// vertical move: deletion (1)
4624 	// horizontal move: insertion (2)
4625 	// cross move: match (0) or mismatch (3)
4626 	// i: vertical move; j: horizontal move
4627 
4628 	//SUBREADprintf("DM[%d]: %p %d,%d\n", thread_context -> thread_id, table_mask, read_len +  expected_offset,  read_len);
4629 	for (i=0; i<read_len +  expected_offset; i++)
4630 	{
4631 		for(j=0; j<read_len; j++)
4632 		{
4633 			if(0&&(i >= read_len +  expected_offset || j >= read_len))
4634 				SUBREADprintf("XXDM[%d]: %p %d,%d\n", thread_context -> thread_id, table_mask, read_len +  expected_offset,  read_len);
4635 			table_mask[i][j]=0;
4636 
4637 			if(0&&(i >= read_len +  expected_offset || j >= read_len))
4638 				SUBREADprintf("YYDM[%d]: %p %d,%d\n", thread_context -> thread_id, table_mask, read_len +  expected_offset,  read_len);
4639 
4640 			if (j < i - max_indel || j > max_indel + i)
4641 			{
4642 				table[i][j]=-9999;
4643 				if(0 && strcmp(read_name, "MISEQ:13:000000000-A1H1M:1:1112:12194:5511") == 0)
4644 				{
4645 					putchar('\t');
4646 				}
4647 				continue;
4648 			}
4649 
4650 			short from_upper;
4651 
4652 			if (i>0) from_upper = table[i-1][j] + (table_mask[i-1][j] == INDEL_MASK_BY_DELETION?CORE_DPALIGN_EXTENDGAP_PENALTY:CORE_DPALIGN_CREATEGAP_PENALTY);
4653 			else     from_upper = -9999;
4654 
4655 			short from_left;
4656 
4657 			if (j>0) from_left = table[i][j-1] + (table_mask[i][j-1] == INDEL_MASK_BY_INSERTION?CORE_DPALIGN_EXTENDGAP_PENALTY:CORE_DPALIGN_CREATEGAP_PENALTY);
4658 			else     from_left = -9999;
4659 
4660 			char chromo_ch = gvindex_get(current_value_index, begin_position + i);
4661 			char is_matched_ij = (chromo_ch == read[j])?CORE_DPALIGN_MATCH_SCORE:CORE_DPALIGN_MISMATCH_PENALTY;
4662 
4663 			short from_upperleft;
4664 
4665 			if (i>0 && j>0) from_upperleft = table[i-1][j-1] + is_matched_ij;
4666 			else if(i==0 && j==0) from_upperleft = is_matched_ij;
4667 			else	    from_upperleft = -9999;
4668 
4669 			if (from_upperleft == from_upper && from_upperleft > from_left)
4670 			{
4671 				table_mask[i][j]= INDEL_MASK_BY_DELETION;
4672 				table[i][j] = from_upper;
4673 			}
4674 			else if(from_upperleft == from_left && from_upperleft > from_upper)
4675 			{
4676 				table_mask[i][j]= INDEL_MASK_BY_INSERTION;
4677 				table[i][j] = from_left;
4678 			}
4679 			else if(from_upperleft > from_left && from_upperleft > from_upper)
4680 			{
4681 				table_mask[i][j]= (chromo_ch == read[j])?INDEL_MASK_BY_MATCH:INDEL_MASK_BY_MISMATCH;
4682 				table[i][j] = from_upperleft;
4683 			}
4684 			else if(from_upperleft == from_left && from_upperleft == from_upper)
4685 			{
4686 				table_mask[i][j]= (chromo_ch == read[j])?INDEL_MASK_BY_MATCH:INDEL_MASK_BY_MISMATCH;
4687 				table[i][j] = from_upperleft;
4688 			}
4689 			else if(from_left > from_upper)
4690 			{
4691 				table_mask[i][j]= INDEL_MASK_BY_INSERTION;
4692 				table[i][j] = from_left;
4693 			}
4694 			else if(from_left <= from_upper)
4695 			{
4696 				table_mask[i][j]= INDEL_MASK_BY_DELETION;
4697 				table[i][j] = from_upper;
4698 			}
4699 
4700 			if(0 && strcmp(read_name, "MISEQ:13:000000000-A1H1M:1:1112:12194:5511") == 0)
4701 				SUBREADprintf("%c%c\t", chromo_ch, read[j]);
4702 		}
4703 		if(0 && strcmp(read_name, "MISEQ:13:000000000-A1H1M:1:1112:12194:5511") == 0)
4704 			SUBREADputs("");
4705 
4706 	}
4707 	#ifdef indel_debug
4708 	//SUBREADputs("");
4709 	//SUBREADputs("");
4710 
4711 	#endif
4712 
4713 	short path_i = read_len + expected_offset - 1;
4714 	int out_pos = 0, delta=0;
4715 	j = read_len - 1;
4716 
4717 	if(0 && strcmp(read_name, "MISEQ:13:000000000-A1H1M:1:1112:12194:5511") == 0)
4718 	{
4719 		int ii,jj;
4720 		for(ii=0;ii< path_i+1; ii++)
4721 		{
4722 			SUBREADprintf("%d\t", ii);
4723 			for(jj=0; jj<j+1; jj++)
4724 				SUBREADprintf("% 6d",table[ii][jj]);
4725 			SUBREADputs("");
4726 		}
4727 		SUBREADprintf("  \t");
4728 		for(jj=0; jj<j+1; jj++)
4729 			SUBREADprintf("#%4d ",jj);
4730 		SUBREADputs("");
4731 		SUBREADputs("");
4732 
4733 		for(ii=0;ii< path_i+1; ii++)
4734 		{
4735 			for(jj=0; jj<j+1; jj++)
4736 				SUBREADprintf("% 6d",table_mask[ii][jj]);
4737 			SUBREADputs("");
4738 		}
4739 	}
4740 
4741 	while(1)
4742 	{
4743 		if(table_mask[path_i][j] == INDEL_MASK_BY_INSERTION)
4744 		{
4745 			j--;
4746 			delta --;
4747 			movement_buffer[out_pos++] = 2;
4748 		}
4749 		else if(table_mask[path_i][j] == INDEL_MASK_BY_DELETION)
4750 		{
4751 			path_i--;
4752 			delta ++;
4753 			movement_buffer[out_pos++] = 1;
4754 		}
4755 		else if(table_mask[path_i][j] == INDEL_MASK_BY_MATCH || table_mask[path_i][j] == INDEL_MASK_BY_MISMATCH)
4756 		{
4757 			movement_buffer[out_pos++] = table_mask[path_i][j] == INDEL_MASK_BY_MATCH?0:3;
4758 			path_i--;
4759 			j--;
4760 		}
4761 
4762 		if(path_i == -1 && j == -1) break;
4763 		if(j<0 || path_i<0) return 0;
4764 	}
4765 	//out_pos++;
4766 	//movement_buffer[out_pos]= 0;
4767 
4768 	if(expected_offset!=delta)return 0;
4769 	for(i=0; i<out_pos/2; i++)
4770 	{
4771 		char tmp;
4772 		tmp = movement_buffer[out_pos-1-i];
4773 		movement_buffer[out_pos-1-i] = movement_buffer[i];
4774 		movement_buffer[i] = tmp;
4775 	}
4776 
4777 	if(0 && strcmp(read_name, "MISEQ:13:000000000-A1H1M:1:1112:12194:5511") == 0)
4778 	{
4779 		for(i=0; i<out_pos; i++)
4780 		{
4781 			char tmp = movement_buffer[i];
4782 			switch(tmp){
4783 				case 0: putchar('=');break;
4784 				case 1: putchar('D');break;
4785 				case 2: putchar('I');break;
4786 				case 3: putchar('X');break;
4787 			}
4788 		}
4789 		putchar('\n');
4790 	}
4791 	return out_pos;
4792 
4793 }
4794 
4795 
4796