1 #include <string.h>
2 #include <ctype.h>
3 #include <assert.h>
4 #include "LRMconfig.h"
5 #include "LRMchro-event.h"
6 #include "LRMfile-io.h"
7 #include "LRMbase-index.h"
8 #include "LRMhelper.h"
9
10 #define JUNCTION_CONFIRM_WINDOW 14
11
12 #define ceq(c,t) ((c)[0]==(t)[0] && (c)[1]==(t)[1])
13 #define c2eq(ch1, ch2, tg1, tg2) ((ceq(ch1, tg1) && ceq(ch2, tg2)) || (ceq(ch1, tg2) && ceq(ch2, tg1)) )
14
15 #define LRMis_donor_chars(cc) (((cc)[0]=='G' && (cc)[1]=='T') || \
16 ((cc)[0]=='A' && (cc)[1]=='G') || \
17 ((cc)[0]=='A' && (cc)[1]=='C') || \
18 ((cc)[0]=='C' && (cc)[1]=='T') ||\
19 ((cc)[0]=='G' && (cc)[1]=='C'))
20
21
LRMpaired_chars(char * ch1,char * ch2)22 int LRMpaired_chars(char * ch1, char * ch2){
23 if (c2eq(ch1, ch2, "GC", "AG") || c2eq(ch1, ch2, "GT", "AG") || c2eq(ch1, ch2, "CT", "AC")) {
24 if ( ceq(ch1, "GC") || ceq(ch1, "CT") || ceq(ch1, "GT")) return 1;
25 }
26 return 0;
27 }
28
29
LRMscanning_events_compare(void * arr,int l,int r)30 int LRMscanning_events_compare(void * arr, int l, int r){
31 void ** arrr = (void **) arr;
32 LRMcontext_t * context = arrr[0];
33 int * event_ids = arrr[1];
34 LRMevent_t * body_l = context -> event_space+event_ids[l];
35 LRMevent_t * body_r = context -> event_space+event_ids[r];
36
37 if(body_l->small_side > body_r->small_side)return 1;
38 if(body_l->small_side < body_r->small_side)return -1;
39
40 if(body_l->large_side > body_r->large_side)return 1;
41 if(body_l->large_side < body_r->large_side)return -1;
42
43 if(body_l->event_type > body_r->event_type) return 1;
44 if(body_l->event_type < body_r->event_type) return -1;
45
46 if(body_l -> indel_length > body_r -> indel_length) return -1; // same length, but L is del and R is ins -- prefer del than ins
47 if(body_l -> indel_length < body_r -> indel_length) return 1;
48
49 return -1;
50 }
51
LRMscanning_events_merge(void * arr,int start,int items,int items2)52 void LRMscanning_events_merge(void * arr, int start, int items, int items2){
53 void ** arrr = (void **) arr;
54 int * records = arrr[1];
55
56 int read_1_ptr = start, read_2_ptr = start+items, write_ptr;
57 int * merged_records = malloc(sizeof(int) * (items+items2));
58
59 for(write_ptr=0; write_ptr<items+items2; write_ptr++){
60 if((read_1_ptr >= start+items)||(read_2_ptr < start+items+items2 && LRMscanning_events_compare(arr, read_1_ptr, read_2_ptr) > 0))
61 merged_records[write_ptr] = records[read_2_ptr++];
62 else
63 merged_records[write_ptr] = records[read_1_ptr++];
64 }
65 memcpy(records + start, merged_records, sizeof(int) * (items+items2));
66 free(merged_records);
67 }
68
LRMscanning_events_exchange(void * arr,int l,int r)69 void LRMscanning_events_exchange(void * arr, int l, int r){
70 void ** arrr = (void **) arr;
71 int * records = arrr[1];
72
73 int tmpi;
74
75 tmpi = records[l];
76 records[l] = records[r];
77 records[r] = tmpi;
78 }
79
LRMevents_build_entries(LRMcontext_t * context)80 int LRMevents_build_entries(LRMcontext_t * context){
81 int x1,side_i;
82
83 for(x1=0; x1 < context->event_number; x1++){
84 LRMevent_t * te = context->event_space+ x1;
85 for(side_i = 0; side_i <2; side_i++){
86 unsigned int sidepos = side_i?te->large_side:te->small_side;
87 int * entries_list = LRMHashTableGet(context -> events_realignment, NULL+sidepos);
88 //LRMprintf("INSERT ENTRY : %u -> %p ; SRC: %u ~ %u\n", sidepos, entries_list, te->small_side, te->large_side);
89 if(NULL == entries_list){
90 entries_list = malloc(sizeof(int) * 3);
91 if(!entries_list){
92 LRMprintf("ERROR: NO MEMORY CAN BE ALLOCATED.\n");
93 assert(0);
94 }
95 entries_list[0]=2;
96 entries_list[1]=0;
97 LRMHashTablePut(context -> events_realignment , NULL+sidepos, entries_list);
98 }
99 int x2 = 0, inserted = 0;
100 for(x2=1; x2< 1+ min( LRMMAX_EVENTS_PER_SITE , entries_list[0] ); x2++){
101 //#warning ">>>>>>>>> COMMENT NEXT LINE <<<<<<<<"
102 //if( x1 + 1 == entries_list[x2] )LRMprintf("REPEATED ENTRY: %d\n");
103
104 if(0 == entries_list[x2]){
105 entries_list[x2] = x1+1;
106 if( x2 < entries_list[0] )entries_list[x2+1]=0;
107 inserted = 1;
108 break;
109 }
110 }
111 if((!inserted) && entries_list[0] < LRMMAX_EVENTS_PER_SITE){
112 int last_x1 = entries_list[0];
113 entries_list[0] = LRMMAX_EVENTS_PER_SITE;
114 entries_list = realloc(entries_list, sizeof(int)*(1+LRMMAX_EVENTS_PER_SITE));
115 entries_list[last_x1] = x1+1;
116 entries_list[last_x1+1] = 0;
117
118 if(te -> small_side == 457511654 ) LRMprintf("INSERT_NEW EVENT : %d AT %u\n",x1, sidepos );
119
120 LRMHashTablePut(context -> events_realignment, NULL+sidepos, entries_list);
121 }
122 }
123 }
124 return 0;
125 }
126
LRMevents_reorder_merge_next(LRMcontext_t * context,int * order_index)127 void LRMevents_reorder_merge_next(LRMcontext_t * context, int *order_index){
128 LRMevent_t *prev_event = NULL, * new_space = malloc(sizeof(LRMevent_t) * 10000);
129 int x1, new_space_size = 10000, new_space_used = 0;
130
131 for(x1=0; x1 <=context->event_number; x1++){
132 LRMevent_t *this_event = NULL;
133 if(x1 < context->event_number) this_event = context->event_space+order_index[x1];
134 if( x1 < context->event_number && prev_event!=NULL &&
135 prev_event->large_side == this_event->large_side &&
136 prev_event->small_side == this_event->small_side &&
137 prev_event->event_type == this_event->event_type &&
138 prev_event->indel_length == this_event->indel_length){
139 prev_event -> supporting_reads ++;
140 }else{
141 if(new_space_size -1 < new_space_used){
142 new_space_size*=1.7;
143 new_space = realloc(new_space, sizeof(LRMevent_t)*new_space_size);
144 }
145 if(prev_event) memcpy(new_space+(new_space_used++), prev_event, sizeof(LRMevent_t));
146
147 if(this_event){
148 prev_event = this_event;
149 prev_event -> supporting_reads = 1;
150 }
151 }
152 }
153
154 free(context -> event_space);
155 context -> event_space = new_space;
156 context -> event_space_size = new_space_size;
157 context -> event_number = new_space_used;
158 }
159
LRMevents_reorder(LRMcontext_t * context)160 int LRMevents_reorder(LRMcontext_t * context){
161 int * order_index = malloc(context -> event_number*sizeof(int));
162 int x1=0;
163 while(x1<context -> event_number){
164 order_index[x1]=x1;
165 x1++;
166 }
167 void * sort_arr[2];
168 sort_arr [0] = context;
169 sort_arr [1] = order_index;
170
171 LRMmerge_sort(sort_arr, context -> event_number, LRMscanning_events_compare, LRMscanning_events_exchange, LRMscanning_events_merge);
172 //basic_sort(sort_arr, context -> event_number, LRMscanning_events_compare, LRMscanning_events_exchange);
173 LRMevents_reorder_merge_next(context, order_index);
174
175 if(0){
176 LRMprintf("Total events : %d\n", context -> event_number);
177 for(x1=0; x1<context -> event_number; x1++){
178 LRMevent_t * te = context->event_space+ x1;
179 if(1 || te -> small_side == 457511654){
180
181 char pos1txt[100], pos2txt[100];
182 LRMpos2txt(context, te->small_side, pos1txt);
183 LRMpos2txt(context, te->large_side, pos2txt);
184
185 LRMprintf("SORTED EVENT: TYPE: %d - INS %d %s ~ %s, nsup=%d\n", te -> event_type, te -> indel_length, pos1txt, pos2txt, te->supporting_reads);
186 }
187 }
188 }
189
190 free(order_index);
191 return 0;
192 }
193
LRMchro_event_new(LRMcontext_t * context,LRMthread_context_t * thread_context,LRMread_iteration_context_t * iteration_context,LRMevent_t * new_event)194 int LRMchro_event_new(LRMcontext_t * context, LRMthread_context_t * thread_context, LRMread_iteration_context_t * iteration_context, LRMevent_t * new_event){
195 //#warning " ================ NO INDEL EVENTS ================"
196 if(new_event -> event_type == LRMEVENT_TYPE_INDEL) return 0;
197
198 LRMthread_lock(&context -> event_space_lock);
199 if(context -> event_space_size < context -> event_number + 1 ){
200 context -> event_space_size *= 1.7;
201 context -> event_space =realloc(context -> event_space, sizeof(LRMevent_t) * context -> event_space_size);
202 if(!context -> event_space )return 1;
203 }
204 memcpy(context -> event_space+context -> event_number, new_event, sizeof(LRMevent_t));
205 context -> event_number++;
206 //LRMprintf("Total events after adding : %d\n", context -> event_number);
207 LRMthread_lockrelease(&context -> event_space_lock);
208
209 return 0;
210 }
211
LRMevents_search(LRMcontext_t * context,unsigned int testing_pos,int search_for_smaller_coordinate,int * event_ids_buff)212 int LRMevents_search(LRMcontext_t * context, unsigned int testing_pos, int search_for_smaller_coordinate, int * event_ids_buff){
213 int * event_here = LRMHashTableGet(context -> events_realignment, NULL + testing_pos);
214 if(NULL == event_here)return 0;
215 int x1, ret = 0;
216 for(x1 = 1; x1< 1+ min(LRMMAX_EVENTS_PER_SITE, event_here[0]); x1++){
217 int event_i = event_here[x1] - 1;
218 if(0 > event_i)break;
219
220 //LRMprintf("TESTING EVENT AT %u [%d] : %d \n", testing_pos, x1 , event_i);
221 LRMevent_t * thise = context -> event_space + event_i;
222 if((thise -> large_side == testing_pos && !search_for_smaller_coordinate)||
223 (thise -> small_side == testing_pos && search_for_smaller_coordinate) )
224 event_ids_buff[ret++]=event_i;
225 }
226 return ret;
227 }
228
229
230
231
232
233
234
235 #define LRMDP_score(x,y) dynamic_score_buffer[ (x)*dynamic_row_width + (y) ]
236 #define LRMDP_move(x,y) dynamic_movement_BEFORE_buffer[ (x)*dynamic_row_width + (y) ]
237
238 #define LRMSOFTCLIPPING_WINDOW 30
239 #define LRMSOFTCLIPPING_MATCHED 25
240
LRMsoftclipping_moves(LRMcontext_t * context,LRMthread_context_t * thread_context,LRMread_iteration_context_t * iteration_context,char * move_buff,int moves,int bases_in_read,int to_large)241 int LRMsoftclipping_moves(LRMcontext_t* context, LRMthread_context_t* thread_context, LRMread_iteration_context_t * iteration_context, char * move_buff, int moves, int bases_in_read, int to_large){
242 int ii;
243
244 int included_read_length = 0;
245 int last_M = 0x7fffffff;
246 int window_end = moves - 1;
247 int window_start = 0;
248 int window_MX = 0, window_M=0;
249
250
251 //LRMprintf("MOVES for %s =%s\n", iteration_context -> read_name, move_buff);
252 for(ii = moves -1; ii>= 0; ii--){
253 if( move_buff[ii] == 'M' || move_buff[ii] == 'X' ){
254 window_MX++;
255 if(move_buff[ii] == 'M')
256 window_M++;
257 }
258 if(window_MX == LRMSOFTCLIPPING_WINDOW)break;
259 }
260 window_start = ii;
261
262 if(window_MX ==LRMSOFTCLIPPING_WINDOW){
263 for(; window_start >=0 ; window_start--){
264 if(move_buff[window_start]=='M' || move_buff[window_start] =='X'){
265 window_MX ++;
266 if(move_buff[window_start]=='M')
267 window_M++;
268 }
269
270 if(window_MX > LRMSOFTCLIPPING_WINDOW){
271 while(1){
272 char nch = move_buff[window_end--];
273 if(nch == 'M' || nch == 'X'){
274 window_MX --;
275 if(nch == 'M') window_M--;
276 break;
277 }
278 }
279 }
280
281 //LRMprintf("M=%d, W = %d - %d, windows_MX=%d in %d\n", window_M, window_start, window_end, window_MX, bases_in_read);
282 if(window_M < LRMSOFTCLIPPING_MATCHED)
283 break;
284 }
285 }
286
287 int smallwindow_Xs = 0;
288 last_M = window_end;
289
290 for(ii = window_end; ii>=0 && ii >= window_start; ii--){
291 if(move_buff[ii] == 'M')
292 last_M = ii;
293
294 if(move_buff[ii] == 'X' && window_M < LRMSOFTCLIPPING_MATCHED){
295 smallwindow_Xs++;
296 if(smallwindow_Xs > 1) break;
297 }
298 }
299
300 if(last_M > 0){
301 //LRMprintf("M=%d, last 'M' at %d, windows_MX=%d in %d\n", window_M, last_M, window_MX, bases_in_read);
302 for(ii = moves -1; ii>= last_M; ii--){
303 if(move_buff[ii] == 'M' || move_buff[ii] == 'X' || move_buff[ii] == 'I')
304 included_read_length++;
305 }
306
307 //assert( (last_M - 1) >=(bases_in_read - included_read_length));
308 //LRMprintf("last_M=%d, included_read_length=%d in %d\n", last_M, included_read_length, bases_in_read );
309 int Ss = (bases_in_read - included_read_length);
310 if(Ss < 2 || last_M < 11){
311 for(ii = last_M - 1; ii > (last_M - 1) - Ss ; ii--){
312 if(ii<0){
313 LRMprintf("MINUS_MOVE : %s , last_M = %d, Ss = %d\n", iteration_context -> read_name, last_M, Ss);
314 return -1;
315 }
316 move_buff[ii]='S';
317 }
318 for(; ii >= 0; ii--)
319 move_buff[ii]='.';
320 }else{
321 for(ii = last_M - 1; ii >= 0; ii--)
322 move_buff[ii]='.';
323 int splen = sprintf(move_buff + last_M - 10, "%dS", Ss);
324 if(to_large){
325 for(ii = 0; ii < splen/2; ii++){
326 int tv;
327 tv = move_buff[ last_M - 10 + ii];
328 move_buff[ last_M - 10 + ii] = move_buff[ last_M - 10 + splen -1 - ii ];
329 move_buff[ last_M - 10 + splen -1 - ii ] = tv;
330 }
331 }
332 move_buff[ last_M - 10+splen ] = '.';
333 }
334 }
335 return 0;
336 }
337
LRMindel_dynamic_search_debug(LRMcontext_t * context,int * dynamic_score_buffer,char * dynamic_movement_BEFORE_buffer,int dynamic_row_width,int dynamic_rows,int * best_offset_history)338 void LRMindel_dynamic_search_debug(LRMcontext_t* context, int * dynamic_score_buffer, char * dynamic_movement_BEFORE_buffer, int dynamic_row_width, int dynamic_rows, int *best_offset_history){
339 int rr, ii;
340
341 LRMprintf(" ");
342 for(ii=0;ii<dynamic_row_width;ii++)
343 LRMprintf(" % 4d ", ii - context -> max_dynamic_indel_length);
344 LRMprintf("\n");
345
346 for(rr=0; rr<dynamic_rows; rr++){
347 LRMprintf("%4d | %4d ", best_offset_history?best_offset_history[ rr ]:-1, rr);
348 for(ii=0;ii<dynamic_row_width;ii++){
349 LRMprintf("% 4d %c ", LRMDP_score(rr,ii), LRMDP_move(rr,ii));
350 }
351 LRMprintf("\n");
352 }
353 }
354
355
LRMtest_move_buff(LRMcontext_t * context,LRMthread_context_t * thread_context,LRMread_iteration_context_t * iteration_context,char * movtxt,int moves,int rlen)356 void LRMtest_move_buff( LRMcontext_t* context, LRMthread_context_t* thread_context, LRMread_iteration_context_t * iteration_context, char * movtxt, int moves, int rlen){
357 return;
358 int ii, tmpi=-1, cumlen=0, oldopt = -1;
359 int rebuild_len = 0;
360 for(ii = 0; ii < moves; ii++){
361 int nch = movtxt[ii];
362 assert(nch);
363 if(nch == '.' || nch == '/') continue;
364 if(nch == 'X') nch = 'M';
365
366 if(isdigit(nch)){
367 if(tmpi<0)tmpi=0;
368 tmpi=tmpi*10+(nch-'0');
369 }else{
370 if(tmpi<0) tmpi = 1;
371 if(oldopt != nch){
372 if(oldopt == 'M' || oldopt == 'I' || oldopt == 'S') rebuild_len += cumlen;
373 cumlen = 0;
374 }
375 cumlen += tmpi;
376 tmpi = -1;
377 oldopt = nch;
378 }
379 }
380 if(cumlen && (oldopt == 'M' || oldopt == 'I' || oldopt == 'S'))rebuild_len += cumlen;
381 if(rebuild_len != rlen){
382 LRMprintf("WRONG MOVES %s (moves = %d): %d (rebuild) != %d (subread_dist)\t%s \n", iteration_context -> read_name, moves, rebuild_len, rlen, movtxt);
383 }
384 }
385
386 int my_debug = 0;
LRMdynamic_in_middle(LRMcontext_t * context,LRMthread_context_t * thread_context,LRMread_iteration_context_t * iteration_context,int last_correct_base,int first_correct_base,unsigned int last_correct_base_on_chro,int expected_indels)387 int LRMdynamic_in_middle(LRMcontext_t* context, LRMthread_context_t* thread_context, LRMread_iteration_context_t * iteration_context, int last_correct_base, int first_correct_base , unsigned int last_correct_base_on_chro, int expected_indels){
388 int moves, xx1;
389
390 char * corrected_read = iteration_context -> read_text;//+ last_correct_base;
391 int high_penalty_create_gap = 0;
392
393 if(0) {
394 char postxt[100];
395 LRMpos2txt(context, last_correct_base_on_chro, postxt);
396 LRMprintf("Dynamic: at %s : %d - %d ; expected = %d\n", postxt, last_correct_base, first_correct_base, expected_indels);
397 for(xx1 = 0 ; xx1 < first_correct_base - last_correct_base ; xx1++ ){
398 LRMprintf("%c", corrected_read[ xx1 + last_correct_base ]);
399 }
400 LRMprintf("\n");
401 for(xx1 = 0 ; xx1 < first_correct_base - last_correct_base - expected_indels; xx1++){
402 LRMprintf("%c", LRMgvindex_get(& context -> current_base_index, last_correct_base_on_chro + xx1));
403 }
404 LRMprintf("\n");
405 }
406
407 int dynamic_rows = first_correct_base - last_correct_base;
408 int trying_indel_length ;
409
410 if(abs(expected_indels) < 10) trying_indel_length = abs(expected_indels) * 2;
411 else trying_indel_length = (abs(expected_indels) -10) * 3/2 + 20;
412
413 trying_indel_length = max(trying_indel_length, 10);
414 trying_indel_length = min(trying_indel_length, LRMINDEL_DYNAMIC_CHANNEL_TOLERANCE);
415
416 int best_offset_history [dynamic_rows];
417 int score_match = context -> dynamic_programming_score_match;
418 int score_mismatch = context -> dynamic_programming_score_mismatch;
419 int score_create_gap = context -> dynamic_programming_score_create_gap * (1+high_penalty_create_gap);
420 int score_extend_gap = context -> dynamic_programming_score_extend_gap;
421 int dynamic_row_width = 2* trying_indel_length + 1;
422
423
424 char postxt[100];
425 LRMpos2txt(context, last_correct_base_on_chro, postxt);
426 //LRMprintf("DDDDD1 SIZE=%d ; last_correct = %d; first_correct = %d ; delta=%d ; pos=%u (%s)\n", dynamic_row_width * dynamic_rows, last_correct_base, first_correct_base, expected_indels, last_correct_base_on_chro, postxt);
427 int * dynamic_score_buffer = (int *)thread_context -> dynamic_programming_score_buffer;
428 char * dynamic_movement_BEFORE_buffer = thread_context -> dynamic_programming_movement_buffer;
429 memset(dynamic_score_buffer, 0, sizeof(int) * dynamic_row_width * dynamic_rows);
430 memset(dynamic_movement_BEFORE_buffer, 0, sizeof(char) * dynamic_row_width * dynamic_rows);
431 char * indel_movement_buff = (char *) thread_context -> dynamic_programming_indel_movement_buf;
432 int this_movement_start = thread_context -> dynamic_programming_indel_movement_start;
433
434 LRMDP_score(0, trying_indel_length )=0;
435 int read_cursor = last_correct_base, row_i, indel_i;
436 unsigned int chro_cursor = last_correct_base_on_chro;
437
438 int last_slope_offset = 0;
439
440 if(1){
441 float slope = expected_indels *1. / dynamic_rows;
442 for(xx1 = 0; xx1 < dynamic_rows; xx1++)
443 best_offset_history[xx1] = 1* (int)(xx1 * slope);
444 }
445
446 for(; read_cursor < first_correct_base; read_cursor++){
447 row_i = read_cursor - last_correct_base;
448 int slope_offset = row_i>0?best_offset_history[row_i-1]:0;
449 int last_slope_delta = slope_offset - last_slope_offset;
450
451 for(indel_i = dynamic_row_width-1 ; indel_i >=0; indel_i --){ // negative: deletion; positive: insertion
452 int testing_indel = indel_i - trying_indel_length;
453 if(1){
454 int score_from_del = -0x7fffffff, score_from_ins = -0x7fffffff, score_from_match = -0x7fffffff;
455 int is_matched_base = toupper(corrected_read[read_cursor]) == toupper(LRMgvindex_get(& context -> current_base_index, chro_cursor - slope_offset - testing_indel));
456
457 if(row_i>0 && (indel_i-1+ last_slope_delta)>=0 && (indel_i-1+ last_slope_delta)< dynamic_row_width && LRMDP_score(row_i-1, indel_i-1 + last_slope_delta) > -0x7ffffff0)
458 score_from_ins = LRMDP_score(row_i-1, indel_i-1 + last_slope_delta) + ( (LRMDP_move(row_i-1, indel_i-1+ last_slope_delta) == 'M' || LRMDP_move(row_i-1, indel_i-1 +last_slope_delta) == 'X')?score_create_gap:score_extend_gap);
459
460 if(testing_indel < 0 || testing_indel < row_i)if(indel_i < dynamic_row_width-1 && LRMDP_score(row_i, indel_i+1) > -0x7ffffff0)
461 score_from_del = LRMDP_score(row_i, indel_i+1) + ((LRMDP_move(row_i, indel_i+1) == 'M' || LRMDP_move(row_i, indel_i+1) == 'X')?score_create_gap:score_extend_gap);
462
463 if((indel_i+ last_slope_delta)>=0 && (indel_i+ last_slope_delta)< dynamic_row_width && (row_i ==0 || LRMDP_score(row_i-1, indel_i + last_slope_delta) > -0x7ffffff0)){
464 score_from_match =(row_i > 0 ?LRMDP_score(row_i-1, indel_i + last_slope_delta): 0)+ (is_matched_base?score_match:score_mismatch);
465 if(row_i == 0 && testing_indel > 0) score_from_match += score_create_gap + (testing_indel-1) * score_extend_gap;
466 }
467
468 int final_score = max(score_from_del, max(score_from_ins, score_from_match));
469 if(testing_indel + slope_offset > 0 && row_i < slope_offset + testing_indel){
470 LRMDP_score(row_i, indel_i) = score_create_gap + (score_extend_gap-1) * (testing_indel+slope_offset) ;
471 LRMDP_move(row_i, indel_i) = 'I';
472 }else{
473 LRMDP_score(row_i, indel_i) = final_score;
474 if(final_score < -0x7ffffff0) LRMDP_move(row_i, indel_i) = '?';
475 else LRMDP_move(row_i, indel_i) = score_from_del == final_score?'D':((score_from_ins == final_score)?'I': ( is_matched_base ?'M':'X'));
476 }
477 }
478 }
479 last_slope_offset = slope_offset;
480 chro_cursor ++;
481 }
482
483 if(0) LRMindel_dynamic_search_debug(context, dynamic_score_buffer, dynamic_movement_BEFORE_buffer, dynamic_row_width, dynamic_rows, best_offset_history);
484
485 row_i = first_correct_base - last_correct_base - 1;
486 indel_i = trying_indel_length + expected_indels -(row_i >0?best_offset_history[row_i - 1]:0);
487
488 moves = 0;
489 while(row_i >= 0 && indel_i >=0 && indel_i < dynamic_row_width){
490 int slope_offset = best_offset_history[row_i-1];
491 int next_slope_offset = row_i > 1?best_offset_history[row_i-2]:0;
492 int last_slope_delta = slope_offset - next_slope_offset;
493 //#warning "========= DO NOT ASSERT ============="
494 indel_movement_buff[ this_movement_start + moves] = LRMDP_move(row_i, indel_i);
495 if(indel_movement_buff[ this_movement_start + moves]=='?')LRMprintf("Assertion_Error:%s\n", iteration_context -> read_name);
496 assert(indel_movement_buff[ this_movement_start + moves]!='?');
497
498 if(indel_movement_buff[ this_movement_start + moves] == 'M' || indel_movement_buff[ this_movement_start + moves] == 'X'){
499 row_i--;
500 indel_i += last_slope_delta;
501 } else if(indel_movement_buff[ this_movement_start + moves] == 'D')indel_i++;
502 else {
503 indel_i --;
504 indel_i += last_slope_delta;
505 row_i--;
506 }
507 moves ++;
508
509 if(row_i < 0 && indel_i < trying_indel_length)
510 for(; indel_i < trying_indel_length; indel_i++) indel_movement_buff[ this_movement_start + ( moves++ ) ] ='D';
511
512 if(moves > max( LRMDYNAMIC_MAXIMUM_GAP_LENGTH * 15, 300 ) + context -> max_dynamic_indel_length ){
513 LRMprintf("ERROR: Dynamic programming moves more than %d\n", max( (int)(LRMDYNAMIC_MAXIMUM_GAP_LENGTH * 15), 300 ) + context -> max_dynamic_indel_length);
514 return -1;
515 }
516 }
517
518 indel_movement_buff[ this_movement_start + moves]=0;
519 for(row_i = 0; row_i < moves/2; row_i++){
520 char tmp = indel_movement_buff[ this_movement_start + row_i];
521 indel_movement_buff[ this_movement_start + row_i] = indel_movement_buff[ this_movement_start + moves - row_i - 1];
522 indel_movement_buff[ this_movement_start + moves - row_i - 1] = tmp;
523 }
524
525 indel_movement_buff[ this_movement_start + (moves++)]='/';
526 indel_movement_buff[ this_movement_start + moves] = 0;
527
528 LRMtest_move_buff( context, thread_context, iteration_context , indel_movement_buff + this_movement_start, moves , dynamic_rows);
529 if(0)LRMprintf("MOVES = %s\n", indel_movement_buff + this_movement_start);
530
531 return moves;
532 }
533
LRMdynamic_to_ends(LRMcontext_t * context,LRMthread_context_t * thread_context,LRMread_iteration_context_t * iteration_context,int last_mapped_in_read,unsigned int last_correct_base_on_chro,int search_to_3end)534 int LRMdynamic_to_ends(LRMcontext_t* context, LRMthread_context_t* thread_context, LRMread_iteration_context_t * iteration_context, int last_mapped_in_read, unsigned int last_correct_base_on_chro, int search_to_3end){
535 int moves = 0;
536 int high_penalty_create_gap = 0;
537
538 int last_correct_base = search_to_3end? last_mapped_in_read : 0;
539 int first_correct_base = search_to_3end? iteration_context -> read_length : last_mapped_in_read;
540 int head_SS = 0, tail_SS = 0;
541
542 int this_movement_start = thread_context -> dynamic_programming_indel_movement_start;
543 //LRMprintf("SEARCH_END_01 %s : start=%d, moves=%d\n", iteration_context -> read_name , this_movement_start, moves);
544 char * indel_movement_buff = (char *) thread_context -> dynamic_programming_indel_movement_buf;
545 int bases_in_read_no_SS = first_correct_base - last_correct_base;
546 //if(bases_in_read_no_SS<3){
547 // return sprintf(indel_movement_buff + this_movement_start,"%dS/",bases_in_read_no_SS);
548 //}
549
550 if((!search_to_3end) && first_correct_base >= LRMDYNAMIC_MAXIMUM_GAP_LENGTH){
551 last_correct_base = first_correct_base - LRMDYNAMIC_MAXIMUM_GAP_LENGTH;
552 head_SS = last_correct_base;
553 //LRMprintf("HEAD_SKIP = %d\n", head_SS);
554 }
555 if(search_to_3end && iteration_context -> read_length - last_correct_base >= LRMDYNAMIC_MAXIMUM_GAP_LENGTH){
556 first_correct_base = last_correct_base + LRMDYNAMIC_MAXIMUM_GAP_LENGTH;
557 tail_SS = iteration_context -> read_length - first_correct_base;
558 }
559
560 char * corrected_read = iteration_context -> read_text;// + first_correct_base;
561 int bases_in_read = first_correct_base - last_correct_base;
562 int trying_indel_length = LRMINDEL_DYNAMIC_CHANNEL_TOLERANCE/3;
563 //if( bases_in_read > 1000 ) trying_indel_length = LRMINDEL_DYNAMIC_CHANNEL_TOLERANCE/2;
564 //if( bases_in_read > 10000 ) trying_indel_length = LRMINDEL_DYNAMIC_CHANNEL_TOLERANCE;
565 if(bases_in_read_no_SS<2) trying_indel_length = 1;
566
567 //int best_offset_history [bases_in_read];
568 int best_offset_history[bases_in_read];
569 int score_match = context -> dynamic_programming_score_match;
570 int score_mismatch = context -> dynamic_programming_score_mismatch;
571 int score_create_gap = context -> dynamic_programming_score_create_gap * (1+high_penalty_create_gap);
572 int score_extend_gap = context -> dynamic_programming_score_extend_gap;
573 int dynamic_row_width = 2* trying_indel_length + 1;
574
575 int * dynamic_score_buffer = (int *)thread_context -> dynamic_programming_score_buffer;
576 char * dynamic_movement_BEFORE_buffer = thread_context -> dynamic_programming_movement_buffer;
577 //LRMprintf("EEEEE1 %s: Memsize=4 * %d = %d x %d\n", iteration_context -> read_name, dynamic_row_width * bases_in_read , dynamic_row_width, bases_in_read);
578 memset(dynamic_score_buffer, 0, sizeof(int) * dynamic_row_width * bases_in_read);
579 memset(dynamic_movement_BEFORE_buffer, 0, sizeof(char) * dynamic_row_width * bases_in_read);
580
581 LRMDP_score(0, trying_indel_length )=0;
582 unsigned int chro_cursor ;
583
584 int last_slope_offset = 0, read_i, indel_i, previous_base_in_read = 0;
585
586 if(0){
587 char postxt[100];
588 LRMpos2txt(context, last_correct_base_on_chro, postxt);
589 LRMprintf("EXTEND_UNKNOWN: %s\n", postxt);
590 if(!search_to_3end){
591 int bb;
592 bb = corrected_read[first_correct_base];
593 corrected_read[first_correct_base] = 0;
594 LRMprintf("READ: %s\n", corrected_read);
595 corrected_read[first_correct_base] = bb;
596
597 LRMprintf("CHRO: ");
598 for(chro_cursor = last_correct_base_on_chro - bases_in_read - 10; chro_cursor < last_correct_base_on_chro; chro_cursor ++){
599 bb = LRMgvindex_get(& context -> current_base_index, chro_cursor);
600 LRMprintf("%c", bb);
601 }
602 LRMprintf("\n\n");
603 }else{
604 int bb;
605 bb = corrected_read[first_correct_base];
606 corrected_read[first_correct_base] = 0;
607 LRMprintf("READ: %s\n", corrected_read + last_correct_base);
608 corrected_read[first_correct_base] = bb;
609
610 LRMprintf("CHRO: ");
611 for(chro_cursor = last_correct_base_on_chro; chro_cursor < last_correct_base_on_chro + 10 + bases_in_read; chro_cursor ++){
612 bb = LRMgvindex_get(& context -> current_base_index, chro_cursor);
613 LRMprintf("%c", bb);
614 }
615 LRMprintf("\n\n");
616 }
617 }
618
619 for(read_i = 0; read_i < bases_in_read; read_i ++){
620 int this_base_in_read = search_to_3end? read_i :(bases_in_read - read_i -1);
621 int this_base_value_in_read = corrected_read [ last_correct_base + this_base_in_read ];
622
623 int slope_offset = read_i>0?best_offset_history[previous_base_in_read]:0;
624 int last_slope_delta = slope_offset - last_slope_offset;
625 if(0 && ! search_to_3end)LRMprintf("GET READ_BASE='%c' LAST_OFF=%d SLP_OFF=%d LAST_DELTA=%d\n", this_base_value_in_read, last_slope_offset, slope_offset, last_slope_delta);
626
627 int thisrow_max_score = -0x7fffffff, thisrow_max_indel_from_start = -0x7fffffff;
628
629
630 for(indel_i = search_to_3end?dynamic_row_width-1 : 0; indel_i !=(search_to_3end?-1:dynamic_row_width); indel_i+=(search_to_3end?-1:1)){
631 int indel_from_start = slope_offset + ( indel_i - trying_indel_length); // if to_3end: +:ins, -:del; if !to_3end: +:del, -:ins
632 unsigned int chro_location_after_indel = last_correct_base_on_chro + ( search_to_3end ?( read_i):(- read_i -1)) - indel_from_start; // if !to_3end: "DEL" from right is "INS" from left
633 int this_base_value_in_chro = LRMgvindex_get(& context -> current_base_index, chro_location_after_indel);
634
635 int score_from_del = -0x7fffffff, score_from_ins = -0x7fffffff, score_from_match = -0x7fffffff;
636 int is_matched_base = toupper(this_base_value_in_read)== toupper(this_base_value_in_chro);
637
638
639 if(search_to_3end){
640 if(read_i > 0 && indel_i-1 + last_slope_delta >=0 && indel_i-1 + last_slope_delta < dynamic_row_width && read_i > 0 && LRMDP_score(read_i-1, indel_i-1 + last_slope_delta) > -0x7ffffff0)
641 score_from_ins = LRMDP_score(read_i - 1, indel_i-1 + last_slope_delta) + ( (LRMDP_move(read_i - 1, indel_i-1+ last_slope_delta) == 'I')?score_extend_gap:score_create_gap);
642
643 if( indel_from_start < 0 || indel_from_start < read_i ) if(indel_i < dynamic_row_width-1 && read_i > 0 && LRMDP_score(read_i, indel_i+1) > -0x7ffffff0)
644 score_from_del = LRMDP_score(read_i, indel_i+1) + ((LRMDP_move(read_i, indel_i+1) == 'D')?score_extend_gap:score_create_gap);
645
646 if((indel_i+ last_slope_delta)>=0 && (indel_i+ last_slope_delta)< dynamic_row_width && (read_i ==0 || LRMDP_score(read_i-1, indel_i + last_slope_delta) > -0x7ffffff0))
647 score_from_match =(read_i > 0 ?LRMDP_score(read_i-1, indel_i + last_slope_delta): 0)+ (is_matched_base?score_match:score_mismatch);
648 //if(read_i == 2) LRMprintf("INDEL_i=%d, F_INS=%d, F_DEL=%d, F_MAT=%d\n", indel_i, score_from_ins, score_from_del, score_from_match);
649 }else{
650 if( indel_from_start > 0 || ( -indel_from_start < read_i ) ) if(indel_i>0 && LRMDP_score(read_i, indel_i-1) > -0x7ffffff0)
651 score_from_ins = LRMDP_score(read_i, indel_i-1) + ( (LRMDP_move(read_i, indel_i-1) == 'I')?score_extend_gap:score_create_gap);
652
653 if(indel_i < dynamic_row_width-1 && indel_i+1 + last_slope_delta >= 0 && indel_i+1 + last_slope_delta < dynamic_row_width && read_i > 0 && LRMDP_score(read_i - 1, indel_i+1+last_slope_delta) > -0x7ffffff0)
654 score_from_del = LRMDP_score(read_i - 1, indel_i+1 + last_slope_delta) + ((LRMDP_move(read_i - 1, indel_i+1+last_slope_delta) == 'D')?score_extend_gap:score_create_gap);
655
656 if((indel_i+ last_slope_delta)>=0 && (indel_i+ last_slope_delta)< dynamic_row_width && (read_i ==0 || LRMDP_score(read_i-1, indel_i + last_slope_delta) > -0x7ffffff0))
657 score_from_match =(read_i > 0 ?LRMDP_score(read_i-1, indel_i + last_slope_delta): 0)+ (is_matched_base?score_match:score_mismatch);
658 //if(read_i == 4 && !search_to_3end) // LRMprintf("INDEL_i=%d, F_INS=%d, F_DEL=%d, F_MAT=%d\n", indel_i, score_from_ins, score_from_del, score_from_match);
659 // LRMprintf("SCORE AT readi=%d, indel_i=%d = %d ;; MOVE = '%c'\n", read_i, indel_i-1+ last_slope_delta, LRMDP_score(read_i, indel_i-1+ last_slope_delta), LRMDP_move(read_i, indel_i-1+ last_slope_delta));
660 }
661
662 if(read_i == 0 && indel_from_start > 0 && !search_to_3end) score_from_match += score_create_gap + (indel_from_start-1) * score_extend_gap;
663 if(read_i == 0 && indel_from_start > 0 && search_to_3end) score_from_match += score_create_gap + (indel_from_start-1) * score_extend_gap;
664
665 int final_score = max(score_from_del, max(score_from_ins, score_from_match));
666 //if(read_i == 4 && !search_to_3end)LRMprintf(" == READ_I %d; INDEL_I %d; INDEL_START %d; FINAL %d\n", read_i, indel_i, indel_from_start, final_score);
667
668 if(indel_from_start < 0 && read_i < - indel_from_start && !search_to_3end){
669 LRMDP_score(read_i, indel_i) = score_create_gap - score_extend_gap * indel_from_start ;
670 LRMDP_move(read_i, indel_i) = 'D';
671 } else if(indel_from_start > 0 && read_i < indel_from_start && search_to_3end) {
672 LRMDP_score(read_i, indel_i) = score_create_gap + score_extend_gap * (indel_from_start -1) ;
673 LRMDP_move(read_i, indel_i) = 'I';
674 } else if(final_score < -0x7ffffff0){
675 LRMDP_move( read_i, indel_i) = '?';
676 }else{
677 if(0 && ! search_to_3end)LRMprintf("#%d %c%c%c %c %d |", indel_i, indel_from_start?' ':'>', this_base_value_in_chro, indel_from_start?' ':'<', score_from_del == final_score?'D':((score_from_ins == final_score)?'I': ( is_matched_base ?'M':'X')), final_score);
678 LRMDP_score( read_i, indel_i) = final_score;
679 //LRMprintf(" !! READ_I %d; INDEL_I %d; INDEL_START %d; FINAL %d\n", read_i, indel_i, indel_from_start, final_score);
680 if(final_score > thisrow_max_score && abs( indel_i - dynamic_row_width/2 ) < dynamic_row_width/3){
681 thisrow_max_score = final_score;
682 thisrow_max_indel_from_start = indel_from_start;
683 }
684 LRMDP_move(read_i, indel_i) =(score_from_del == final_score?'D':((score_from_ins == final_score)?'I': ( is_matched_base ?'M':'X')));
685 }
686 }
687 //if(thisrow_max_indel_from_start < -0x7ffffff0) LRMprintf("ERROR READ\n");
688 //else LRMprintf(" READ\n");
689 assert(thisrow_max_indel_from_start > -0x7ffffff0);
690 best_offset_history[this_base_in_read] = thisrow_max_indel_from_start;
691 if(0 && ! search_to_3end)LRMprintf("\nSET %d-th BEST_FROM_START=%d ; SLOPE_OFFSET=%d ; MAX_SCORE=%d\n\n", this_base_in_read, thisrow_max_indel_from_start, slope_offset, thisrow_max_score);
692
693 last_slope_offset = slope_offset;
694 previous_base_in_read = this_base_in_read;
695 }
696 read_i = bases_in_read - 1;
697
698 //LRMprintf("RESTART SEARCH: best_offset_history[read_i]=%d, best_offset_history[read_i-1]=%d\n", best_offset_history[ search_to_3end? bases_in_read - 1 : 0 ] - best_offset_history[ search_to_3end? bases_in_read - 2 : 1 ] );
699 if(bases_in_read_no_SS > 2)
700 indel_i = best_offset_history[ search_to_3end? bases_in_read - 1 : 0 ] - best_offset_history[ search_to_3end? bases_in_read - 2 : 1 ] + trying_indel_length;
701 else
702 indel_i = 1;
703
704 int error_in = 0;
705 while(read_i >= 0 && indel_i >=0 && indel_i < dynamic_row_width){
706 int real_base_in_read = search_to_3end? read_i:(bases_in_read - (read_i+1));
707 int slope_offset;
708 int next_slope_offset;
709
710 if(search_to_3end){
711 slope_offset = (real_base_in_read>0)?best_offset_history[real_base_in_read - 1]:0;
712 next_slope_offset = (real_base_in_read>1)?best_offset_history[real_base_in_read - 2]:0;
713 }else{
714 slope_offset = (real_base_in_read<bases_in_read - 1)?best_offset_history[real_base_in_read + 1]:0;
715 next_slope_offset = (real_base_in_read<bases_in_read - 2)?best_offset_history[real_base_in_read + 2]:0;
716 }
717
718 int last_slope_delta = next_slope_offset - slope_offset;
719 indel_movement_buff[ this_movement_start + moves] = LRMDP_move(read_i, indel_i);
720
721 //LRMprintf("%s R %d , SCORE+=%d (INDEL_i=%d) MOVE='%c' LAST_DELTA=%d\n", iteration_context -> read_name, read_i, LRMDP_score(read_i, indel_i), indel_i, LRMDP_move(read_i, indel_i), last_slope_delta);
722 //LRMprintf("SEARCH_END_01 %s : start=%d, moves=%d\n", iteration_context -> read_name , this_movement_start, moves);
723 if(indel_movement_buff[ this_movement_start + moves]=='?'){
724 error_in = 1;
725 //LRMprintf("WRONG_READ_ERROR_MOVE : %s\n", iteration_context -> read_name);
726 break;
727 }
728
729 if(indel_movement_buff[ this_movement_start + moves] == 'M' || indel_movement_buff[ this_movement_start + moves] == 'X'){
730 read_i--;
731 if(read_i>=0)indel_i -= last_slope_delta;
732 }else {
733
734 if(search_to_3end){
735 if(indel_movement_buff[ this_movement_start + moves] == 'D')indel_i++;
736 else {
737 indel_i --;
738 if(read_i>=0)indel_i -= last_slope_delta;
739 read_i--;
740 }
741 }else{
742 if(indel_movement_buff[ this_movement_start + moves] == 'I')indel_i--;
743 else {
744 indel_i ++;
745 if(read_i>=0)indel_i -= last_slope_delta;
746 read_i--;
747 }
748 }
749 }
750 moves ++;
751
752 //if(read_i<0) LRMprintf("END_UP: indel_i = %d > %d\n", indel_i, trying_indel_length);
753 if(search_to_3end){
754 if(read_i < 0 && indel_i < trying_indel_length)
755 for(; indel_i <trying_indel_length; indel_i++) indel_movement_buff[ this_movement_start + (moves++)] = 'D';
756 }else if(read_i < 0 && indel_i > trying_indel_length)
757 for(; indel_i > trying_indel_length; indel_i--) indel_movement_buff[ this_movement_start + (moves++)] = 'I';
758
759 if(moves > max( LRMDYNAMIC_MAXIMUM_GAP_LENGTH * 15, 300 ) + context -> max_dynamic_indel_length ){
760 LRMprintf("ERROR: Dynamic programming moves more than %d\n", max( (int)(LRMDYNAMIC_MAXIMUM_GAP_LENGTH * 15), 300 ) + context -> max_dynamic_indel_length);
761 return -1;
762 }
763 }
764
765 //LRMprintf("SEARCH_END_02 %s : start=%d, moves=%d\n", iteration_context -> read_name , this_movement_start, moves);
766 if(0){
767 indel_movement_buff[this_movement_start + moves] = 0;
768 LRMprintf("MOVE_ENDS [ %s E=%d] = %s\n",iteration_context -> read_name , error_in, indel_movement_buff);
769 }
770 if(error_in){
771 moves = sprintf(indel_movement_buff + this_movement_start, "%dS", bases_in_read);
772 return moves;
773 }
774
775 indel_movement_buff[ this_movement_start + moves]=0;
776 if(!search_to_3end){
777 for(read_i = 0; read_i < moves; read_i++){
778 char tmp = indel_movement_buff[ this_movement_start + read_i];
779 if(tmp == 'I') indel_movement_buff[ this_movement_start + read_i]='D';
780 else if(tmp == 'D') indel_movement_buff[ this_movement_start + read_i]='I';
781 }
782 }
783
784 error_in = LRMsoftclipping_moves(context,thread_context, iteration_context ,indel_movement_buff + this_movement_start, moves, bases_in_read, search_to_3end);
785 if(error_in){
786 LRMprintf("SOFT CLIPPING ERROR : %d of %s\n", error_in, iteration_context -> read_name);
787 return -1;
788 }
789
790 if(search_to_3end){
791 for(read_i = 0; read_i < moves/2; read_i++){
792 char tmp = indel_movement_buff[this_movement_start + read_i];
793 indel_movement_buff[ this_movement_start + read_i] = indel_movement_buff[this_movement_start + moves - read_i - 1];
794 indel_movement_buff[ this_movement_start + moves - read_i - 1] = tmp;
795 }
796 }
797
798 if(head_SS>0){
799 int iii;
800 for(iii = moves-1; iii >=0; iii--){
801 indel_movement_buff[ this_movement_start + iii + 10] = indel_movement_buff[ this_movement_start + iii];
802 }
803 for(iii = 0; iii < 10; iii++) indel_movement_buff[ this_movement_start + iii] = '.';
804 moves += 10;
805 iii = sprintf(indel_movement_buff + this_movement_start, "%dS",head_SS);
806 indel_movement_buff[this_movement_start + iii] = '.';
807 }
808 if(tail_SS>0)moves += sprintf(indel_movement_buff + this_movement_start + moves, "%dS",tail_SS);
809 indel_movement_buff[ this_movement_start + (moves++)]='/';
810 indel_movement_buff[this_movement_start + moves] = 0;
811 LRMtest_move_buff( context, thread_context, iteration_context , indel_movement_buff + this_movement_start, moves , bases_in_read_no_SS );
812 if(0)LRMprintf("MOVE_ENDS [ %s E=%d] = %s; rlen = %d\n",iteration_context -> read_name , error_in, indel_movement_buff + this_movement_start, bases_in_read);
813 return moves;
814 }
815
816