1
2%{
3#include "kmer_assembly.h"
4#include "kmer_assembly_untangler.h"
5
6%}
7
8
9
10%{
11#include "kmer_assembly_error.h"
12
13
14boolean lift_indel_error_KmerAssembly(KmerAssemblyIndex * kai,KmerAssemblyPath * real,KmerAssemblyPath * error,long int * start_labels,int label_len,int error_position,int error_len)
15{
16  int i,j,k,true;
17
18  for(i=0,true=0;i<error->stack_len && true<real->stack_len;i++,true++) {
19    if( i == error_position ) {
20      if( error_len > 0 ) {
21	for(j=0;j<error_len;) {
22	  i++;
23	  j++;
24	}
25      } else {
26	error_len = abs(error_len);
27	for(j=0;j<error_len;) {
28	  true++;
29	  j++;
30	}
31      }
32    }
33
34    for(k=0;k<label_len;k++) {
35      /* remove label from error, add to real */
36      if( error->stack[i] == NULL ) {
37	fprintf(stderr,"issue - error path is missing a link!");
38      } else {
39	remove_sequence_label_KmerAssemblyLink(error->stack[i],start_labels[k]+i);
40      }
41
42      if( real->stack[true] == NULL ) {
43	fprintf(stderr,"issue - true path is missing a link!");
44      } else {
45	add_sequence_label_KmerAssemblyLink(real->stack[true],start_labels[k]+i);
46      }
47      if( error->stack[i] != NULL && error->stack[i]->sequence_label_len == 0 ) {
48	detach_KmerAssemblyLink(kai,error->stack[i]);
49      }
50    }
51  }
52
53  return TRUE;
54
55}
56
57boolean lift_forward_error_KmerAssembly(KmerAssemblyIndex * kai,KmerAssemblyPath * real,KmerAssemblyPath * error,long int * start_labels,int label_len)
58{
59  int i,k;
60
61  for(i=0;i<error->stack_len;i++) {
62    for(k=0;k<label_len;k++) {
63      /* remove label from error, add to real */
64      remove_sequence_label_KmerAssemblyLink(error->stack[i],start_labels[k]+i);
65      add_sequence_label_KmerAssemblyLink(real->stack[i],start_labels[k]+i);
66      if( error->stack[i]->sequence_label_len == 0 ) {
67	detach_KmerAssemblyLink(kai,error->stack[i]);
68      }
69    }
70  }
71
72  return TRUE;
73}
74
75int mark_tangles_KmerAssembly(KmerAssemblyIndex * kai)
76{
77  int i;
78  kmer_t kmer;
79  KmerAssemblyNode * node;
80  char buffer[256];
81
82  int count = 0;
83
84  kmer = -1;
85  kmer = (*kai->kii->next_filled_kmer)(kai->kii->handle,kmer);
86
87  for(;kmer != -1;  kmer = (*kai->kii->next_filled_kmer)(kai->kii->handle,kmer)) {
88    node = (*kai->kii->retrieve_by_kmer)(kai->kii->handle,kmer);
89    if( node->prev_len > 1 ) {
90      reverse_map_dna_number(node->number,kai->kii->kmer_size,buffer);
91      buffer[kai->kii->kmer_size] = '\0';
92
93      fprintf(stderr,"Marking node (%ld) [%s] as next tangled\n",node->number,buffer);
94      count++;
95      for(i=0;i<node->prev_len;i++) {
96	node->prev[i]->state |= KMER_ASSEMBLY_NEXT_TANGLED;
97      }
98    }
99
100    if( node->next_len > 1 ) {
101      reverse_map_dna_number(node->number,kai->kii->kmer_size,buffer);
102      buffer[kai->kii->kmer_size] = '\0';
103
104
105      fprintf(stderr,"Marking node (%ld) [%s] as prev tangled\n",node->number,buffer);
106      count++;
107      for(i=0;i<node->next_len;i++) {
108	node->next[i]->state |= KMER_ASSEMBLY_PREV_TANGLED;
109      }
110    }
111  }
112
113  return count;
114}
115
116int resolve_forward_errors_KmerAssembly(KmerAssemblyIndex * kai,int depth,int verbose,int max_path_enum)
117{
118  int i;
119  int total = 0;
120
121  kmer_t kmer;
122  KmerAssemblyNode * node;
123
124  char buffer[128];
125  SinglePosSequence * sps;
126
127  assert(kai != NULL);
128  assert(kai->kii != NULL );
129  assert(kai->kii->next_filled_kmer != NULL);
130
131  kmer = -1;
132  kmer = (*kai->kii->next_filled_kmer)(kai->kii->handle,kmer);
133
134  for(;kmer != -1;  kmer = (*kai->kii->next_filled_kmer)(kai->kii->handle,kmer)) {
135    node = (*kai->kii->retrieve_by_kmer)(kai->kii->handle,kmer);
136    assert(node != NULL);
137
138    if( node->next_len < 2 ) {
139      continue;
140    }
141
142    for(i=0;i<node->next_len;i++) {
143      if( node->next[i]->sequence_label_len <= depth ) {
144	if( verbose ) {
145	  reverse_map_dna_number(node->next[i]->next->number,kai->kii->kmer_size,buffer);
146	  buffer[kai->kii->kmer_size] = '\0';
147	  sps = lookup_Sequence_SinglePosSpace(kai->sps,node->next[i]->sequence_label[0]);
148
149	  fprintf(stderr,"Attempting to resolve error [%s] from sequence %s, depth %d\n",buffer,((AssemblySequence*)sps->data)->seq->name,node->next[i]->sequence_label_len);
150	}
151
152	total += attempt_resolve_forward_error_KmerAssembly(kai,node,depth,max_path_enum);
153	break;
154      }
155    }
156  }
157
158  return total;
159}
160
161
162
163void remove_errors_KmerAssemblyIndex(KmerAssemblyIndex * kai,int max_error_depth)
164{
165  int i;
166  kmer_t kmer;
167  KmerAssemblyNode * node;
168
169  assert(kai != NULL);
170  assert(kai->kii != NULL);
171  assert(kai->kii->next_filled_kmer != NULL);
172
173  kmer = -1;
174  kmer = (*kai->kii->next_filled_kmer)(kai->kii->handle,kmer);
175
176  for(;kmer != -1;  kmer = (*kai->kii->next_filled_kmer)(kai->kii->handle,kmer)) {
177    node = (*kai->kii->retrieve_by_kmer)(kai->kii->handle,kmer);
178    assert(node);
179
180    if( node->next_len > 1 ) {
181      for(i=0;i<node->next_len;i++) {
182	if( node->next[i]->sequence_label_len <= max_error_depth ) {
183	  detach_KmerAssemblyLink(kai,node->next[i]);
184	}
185      }
186    }
187
188    if( node->prev_len > 1 ) {
189      for(i=0;i<node->prev_len;i++) {
190	if( node->prev[i]->sequence_label_len <= max_error_depth ) {
191	  detach_KmerAssemblyLink(kai,node->prev[i]);
192	}
193      }
194    }
195  }
196
197
198}
199
200int attempt_resolve_forward_error_KmerAssembly(KmerAssemblyIndex * kai,KmerAssemblyNode * node,int depth,int max_path_enum)
201{
202  int success;
203  int i,j,k;
204  KmerAssemblyPath * final_real;
205  KmerAssemblyPath * final_error;
206  char * alphabet = "ATGC";
207  char buffer[256];
208  char next[256];
209  char selfbase;
210
211  kmer_t new_number;
212
213  int no_success = 0;
214  int no_failures = 0;
215
216  assert(kai != NULL);
217  assert(node != NULL);
218
219
220  final_real = new_KmerAssemblyPath();
221  final_error = new_KmerAssemblyPath();
222
223  if( node->next_len == 0 ) {
224    warn("No outgoing links, therefore no errors");
225    return TRUE;
226  }
227
228  if( node->next_len == 0 ) {
229    warn("Only one outgoing link, so no error");
230    return TRUE;
231  }
232
233  /* look for links with depth outgoing */
234  for(i=0;i<node->next_len;i++) {
235    if( node->next[i]->sequence_label_len <= depth ) {
236      /* see whether this is an error*/
237
238      success = 0;
239      reverse_map_dna_number(node->next[i]->next->number,kai->kii->kmer_size,buffer);
240
241      selfbase = buffer[kai->kii->kmer_size-1];
242
243      /* test each base */
244      for(j=0;j<4;j++) {
245	if( alphabet[j] == selfbase ) {
246	  continue;
247	}
248	buffer[kai->kii->kmer_size-1] = alphabet[j];
249	new_number = forward_dna_number_from_string(buffer,kai->kii->kmer_size);
250	for(k=0;k<node->next_len;k++) {
251	  if( k == i ) {
252	    continue;
253	  }
254	  if( new_number == node->next[k]->next->number ) {
255	    /* potential fix */
256	    final_error->stack_len = 0;
257	    final_real->stack_len  = 0;
258
259	    fprintf(stderr,"Found potential fix of %c to %c in %.*s (real:%c (%d), error:%c (%d))\n",selfbase,alphabet[j],kai->kii->kmer_size,buffer,node->next[k]->base,k,node->next[i]->base,i);
260
261	    /* consider potential insertion of one base */
262
263	    if( find_indel_path_KmerAssembly(node->next[k],node->next[i],-1,kai->kii->kmer_size-1,final_real,final_error,kai->kii->kmer_size+1,max_path_enum) == TRUE ) {
264	      fprintf(stderr,"SUCCESS in insertion modelling\n");
265	      lift_indel_error_KmerAssembly(kai,final_real,final_error,node->next[i]->sequence_label,node->next[i]->sequence_label_len,kai->kii->kmer_size-1,-1);
266	      success = 1;
267	      break;
268	    }
269
270	    if( find_indel_path_KmerAssembly(node->next[k],node->next[i],1,kai->kii->kmer_size-1,final_real,final_error,kai->kii->kmer_size+1,max_path_enum) == TRUE ) {
271	      fprintf(stderr,"SUCCESS in deletion modelling\n");
272	      lift_indel_error_KmerAssembly(kai,final_real,final_error,node->next[i]->sequence_label,node->next[i]->sequence_label_len,kai->kii->kmer_size-1,1);
273	      success = 1;
274	      break;
275	    }
276
277
278
279	    /* otherwise substitution is all we can really suggest... */
280
281	    if( find_error_path_KmerAssembly(node->next[k],node->next[i],alphabet[j],kai->kii->kmer_size-1,final_real,final_error,kai->kii->kmer_size+1,max_path_enum) == TRUE ) {
282	      /* fix error */
283	      /*
284		fprintf(stderr,"Found Error: Real\n");
285		show_KmerAssemblyPath(final_real,stderr);
286		fprintf(stderr,"   Error\n");
287		show_KmerAssemblyPath(final_error,stderr);
288		fprintf(stderr,"\n");
289	      */
290
291	      fprintf(stderr,"SUCCESS in substitution modelling\n");
292
293	      lift_forward_error_KmerAssembly(kai,final_real,final_error,node->next[i]->sequence_label,node->next[i]->sequence_label_len);
294	      success = 1;
295	      break;
296	    }
297	  }
298	}
299      }
300      if( success == 0 ) {
301	no_failures++;
302      } else {
303	no_success++;
304      }
305    }
306  }
307
308  free_KmerAssemblyPath(final_real);
309  free_KmerAssemblyPath(final_error);
310
311  return no_success;
312}
313
314
315boolean find_indel_path_KmerAssembly(KmerAssemblyLink *real,KmerAssemblyLink * error,int delete_length,int proposed_position,KmerAssemblyPath * final_real,KmerAssemblyPath * final_error,int max_search,int max_path_enum)
316{
317  int current_path = 1;
318
319  return extend_indel_path_KmerAssembly(real,error,0,0,delete_length,proposed_position,final_real,final_error,max_search,&current_path,max_path_enum);
320}
321
322boolean extend_indel_path_KmerAssembly(KmerAssemblyLink* real,KmerAssemblyLink * error,int real_pos,int error_pos,int delete_length,int proposed_position,KmerAssemblyPath * final_real,KmerAssemblyPath * final_error,int max_search,int *current_path,int max_path_enum)
323{
324  int real_start;
325  int error_start;
326  int k;
327  int i,j;
328
329  KmerAssemblyLink * temp_real[256];
330  KmerAssemblyLink * temp_error[256];
331
332  if( max_search > 256 ) {
333    fatal("Madness. Max search greater than 256 - shouldn't happen!");
334  }
335
336  if( max_path_enum < *current_path ) {
337    fprintf(stderr,"Terminating due too many paths\n");
338    return FALSE;
339  }
340
341  real_start = real_pos;
342  error_start = error_pos;
343
344
345  /*  fprintf(stderr,"Entering into extension code with real %d, error %d\n",real_pos,error_pos);*/
346
347  while( real != NULL && error != NULL ) {
348
349    if( proposed_position != real_pos ) {
350      if( real->base != error->base ) {
351	fprintf(stderr,"in considering indel (%d, path %d), real (%c) and error (%c) do not agree at position %d,%d\n",delete_length,current_path,real->base,error->base,real_pos,error_pos);
352	return FALSE;
353      }
354    } else {
355      if( delete_length > 0 ) {
356	/* we move along in the real positions by delete number */
357	for(k=0;k<delete_length;k++) {
358	  temp_real[real_pos] = real;
359	  real_pos++;
360	  real = real->next->next[0];
361	  if( real == NULL ) {
362	    return FALSE;
363	  }
364	  if( real->next->next_len > 1 ) {
365	    fprintf(stderr,"Cannot handle over branched cases in indel area\n");
366	    return FALSE;
367	  }
368	}
369      } else {
370	/* we move along in the error positions by the delete number */
371	delete_length = abs(delete_length);
372	for(k=0;k<delete_length;k++) {
373	  temp_error[error_pos] = error;
374	  error_pos++;
375	  error = error->next->next[0];
376	  if( error == NULL ) {
377	    return FALSE;
378	  }
379	  if( error->next->next_len > 1 ) {
380	    fprintf(stderr,"Cannot handle over branched cases in indel area\n");
381	    return FALSE;
382	  }
383	}
384      }
385    }
386
387
388    temp_real[real_pos++] = real;
389    temp_error[error_pos++] = error;
390
391
392    if( real_pos > max_search ) {
393      /* will return TRUE after loop */
394      break;
395    }
396
397    if( real == error ) {
398      /* same node, therefore the streams have successfully merged */
399      break;
400    }
401
402
403    if( real->next->next_len == 1 && error->next->next_len == 1 ) {
404      real = real->next->next[0];
405      error = error->next->next[0];
406      continue;
407    }
408
409    /* this is a tail position */
410    if( error->next->next_len == 0 ) {
411
412      /* will return TRUE after loop */
413      break;
414    }
415
416
417    /* recursively call for each possible path: if any return
418       true, return TRUE */
419
420    /*    fprintf(stderr,"paths are branching, real with %d, error with %d\n",real->next->next_len,error->next->next_len);*/
421
422    if( real->next->next_len > 1 || error->next->next_len > 1 ) {
423      for(i=0;i<real->next->next_len;i++) {
424	for(j=0;j<error->next->next_len;j++) {
425	  (*current_path)++;
426	  if( extend_indel_path_KmerAssembly(real->next->next[i],error->next->next[j],real_pos,error_pos,delete_length,proposed_position,final_real,final_error,max_search,current_path,max_path_enum) == TRUE ) {
427	    break;
428	  }
429	}
430      }
431
432      /*      fprintf(stderr,"Recursively unable to find path\n");*/
433      /* unable to find a good path */
434
435      return FALSE;
436    }
437
438    fprintf(stderr,"should be impossible to reach here, all other options have been considered");
439
440    return FALSE;
441  }
442
443
444  /* have to put the right path into the path datastructures */
445
446  /* we might have to extend the path datastructures */
447
448  if( final_real->max_stack < real_pos || final_error->max_stack < error_pos ) {
449    fatal("Have to extend path; should be easy, but not done yet");
450  }
451
452  if( final_real->stack_len < real_pos ) {
453    /* first return */
454    final_real->stack_len = real_pos;
455  }
456
457  if( final_error->stack_len < error_pos ) {
458    /* first return */
459    final_error->stack_len = error_pos;
460  }
461
462  for(i=real_start;i<real_pos;i++) {
463    final_real->stack[i]   = temp_real[i];
464  }
465
466  for(i=error_start;i<error_pos;i++) {
467    final_error->stack[i]   = temp_error[i];
468  }
469
470  return TRUE;
471
472
473}
474
475
476boolean find_error_path_KmerAssembly(KmerAssemblyLink * start_real,KmerAssemblyLink * start_error,char proposed_fix,int proposed_fix_position,KmerAssemblyPath * final_real,KmerAssemblyPath * final_error,int max_search,int max_path_enum)
477{
478  int current_path = 1;
479  return extend_error_path_KmerAssembly(start_real,start_error,0,proposed_fix,proposed_fix_position,final_real,final_error,max_search,&current_path,max_path_enum);
480}
481
482
483boolean extend_error_path_KmerAssembly(KmerAssemblyLink * real,KmerAssemblyLink * error,int current_pos,char proposed_fix,int proposed_fix_position,KmerAssemblyPath * final_real,KmerAssemblyPath * final_error,int max_search,int * current_path,int max_path_enum)
484{
485  int i;
486  int j;
487
488  int temp_pos;
489  int startpos = current_pos;
490  KmerAssemblyLink * temp_real[256];
491  KmerAssemblyLink * temp_error[256];
492
493  if( max_search > 256 ) {
494    fatal("Madness. Max search greater than 256 - shouldn't happen!");
495  }
496
497
498  if( max_path_enum < *current_path ) {
499    fprintf(stderr,"Terminating due too many paths\n");
500    return FALSE;
501  }
502
503  temp_pos = current_pos;
504
505
506  /*  fprintf(stderr,"Entering into extension code with %d temp_pos\n",temp_pos);*/
507
508  while( real != NULL && error != NULL ) {
509
510    if( proposed_fix_position != temp_pos ) {
511      if( real->base != error->base ) {
512	/*	fprintf(stderr,"real (%c) and error (%c) do not agree at position %d\n",real->base,error->base,temp_pos); */
513	return FALSE;
514      }
515    } else if ( real->base != proposed_fix ) {
516      fprintf(stderr,"In fixed position real (%c) and proposed (%c) do not agree\n",real->base,proposed_fix);
517      return FALSE;
518    }
519
520
521    temp_real[temp_pos] = real;
522    temp_error[temp_pos] = error;
523    temp_pos++;
524
525    if( temp_pos > max_search ) {
526      /* will return TRUE after loop */
527      break;
528    }
529
530    if( real->next->next_len == 1 && error->next->next_len == 1 ) {
531      real = real->next->next[0];
532      error = error->next->next[0];
533      continue;
534    }
535
536    /* this is a tail position */
537    if( error->next->next_len == 0 ) {
538
539      /* will return TRUE after loop */
540      break;
541    }
542
543
544    /* recursively call for each possible path: if any return
545       true, return TRUE */
546
547    /*    fprintf(stderr,"paths are branching, real with %d, error with %d\n",real->next->next_len,error->next->next_len);*/
548
549    if( real->next->next_len > 1 || error->next->next_len > 1 ) {
550      for(i=0;i<real->next->next_len;i++) {
551	for(j=0;j<error->next->next_len;j++) {
552	  (*current_path)++;
553	  if( extend_error_path_KmerAssembly(real->next->next[i],error->next->next[j],temp_pos,proposed_fix,proposed_fix_position,final_real,final_error,max_search,current_path,max_path_enum) == TRUE ) {
554	    break;
555	  }
556	}
557      }
558
559      /*      fprintf(stderr,"Recursively unable to find path\n");*/
560      /* unable to find a good path */
561
562      return FALSE;
563    }
564
565    fprintf(stderr,"should be impossible to reach here, all other options have been considered");
566
567    return FALSE;
568  }
569
570
571  /* have to put the right path into the path datastructures */
572
573  /* we might have to extend the path datastructures */
574
575  if( final_real->max_stack < temp_pos || final_error->max_stack < temp_pos ) {
576    fatal("Have to extend path; should be easy, but not done yet");
577  }
578
579  if( final_real->stack_len < temp_pos ) {
580    /* first return */
581    final_real->stack_len = temp_pos;
582    final_error->stack_len = temp_pos;
583  }
584
585  for(i=startpos;i<temp_pos;i++) {
586    final_real->stack[i]   = temp_real[i];
587    final_error->stack[i]  = temp_error[i];
588  }
589
590  return TRUE;
591
592}
593