1 2%{ 3#include "kmer_assembly.h" 4#include "kmer_assembly_untangler.h" 5 6%} 7 8 9 10%{ 11#include "kmer_assembly_error.h" 12 13 14boolean lift_indel_error_KmerAssembly(KmerAssemblyIndex * kai,KmerAssemblyPath * real,KmerAssemblyPath * error,long int * start_labels,int label_len,int error_position,int error_len) 15{ 16 int i,j,k,true; 17 18 for(i=0,true=0;i<error->stack_len && true<real->stack_len;i++,true++) { 19 if( i == error_position ) { 20 if( error_len > 0 ) { 21 for(j=0;j<error_len;) { 22 i++; 23 j++; 24 } 25 } else { 26 error_len = abs(error_len); 27 for(j=0;j<error_len;) { 28 true++; 29 j++; 30 } 31 } 32 } 33 34 for(k=0;k<label_len;k++) { 35 /* remove label from error, add to real */ 36 if( error->stack[i] == NULL ) { 37 fprintf(stderr,"issue - error path is missing a link!"); 38 } else { 39 remove_sequence_label_KmerAssemblyLink(error->stack[i],start_labels[k]+i); 40 } 41 42 if( real->stack[true] == NULL ) { 43 fprintf(stderr,"issue - true path is missing a link!"); 44 } else { 45 add_sequence_label_KmerAssemblyLink(real->stack[true],start_labels[k]+i); 46 } 47 if( error->stack[i] != NULL && error->stack[i]->sequence_label_len == 0 ) { 48 detach_KmerAssemblyLink(kai,error->stack[i]); 49 } 50 } 51 } 52 53 return TRUE; 54 55} 56 57boolean lift_forward_error_KmerAssembly(KmerAssemblyIndex * kai,KmerAssemblyPath * real,KmerAssemblyPath * error,long int * start_labels,int label_len) 58{ 59 int i,k; 60 61 for(i=0;i<error->stack_len;i++) { 62 for(k=0;k<label_len;k++) { 63 /* remove label from error, add to real */ 64 remove_sequence_label_KmerAssemblyLink(error->stack[i],start_labels[k]+i); 65 add_sequence_label_KmerAssemblyLink(real->stack[i],start_labels[k]+i); 66 if( error->stack[i]->sequence_label_len == 0 ) { 67 detach_KmerAssemblyLink(kai,error->stack[i]); 68 } 69 } 70 } 71 72 return TRUE; 73} 74 75int mark_tangles_KmerAssembly(KmerAssemblyIndex * kai) 76{ 77 int i; 78 kmer_t kmer; 79 KmerAssemblyNode * node; 80 char buffer[256]; 81 82 int count = 0; 83 84 kmer = -1; 85 kmer = (*kai->kii->next_filled_kmer)(kai->kii->handle,kmer); 86 87 for(;kmer != -1; kmer = (*kai->kii->next_filled_kmer)(kai->kii->handle,kmer)) { 88 node = (*kai->kii->retrieve_by_kmer)(kai->kii->handle,kmer); 89 if( node->prev_len > 1 ) { 90 reverse_map_dna_number(node->number,kai->kii->kmer_size,buffer); 91 buffer[kai->kii->kmer_size] = '\0'; 92 93 fprintf(stderr,"Marking node (%ld) [%s] as next tangled\n",node->number,buffer); 94 count++; 95 for(i=0;i<node->prev_len;i++) { 96 node->prev[i]->state |= KMER_ASSEMBLY_NEXT_TANGLED; 97 } 98 } 99 100 if( node->next_len > 1 ) { 101 reverse_map_dna_number(node->number,kai->kii->kmer_size,buffer); 102 buffer[kai->kii->kmer_size] = '\0'; 103 104 105 fprintf(stderr,"Marking node (%ld) [%s] as prev tangled\n",node->number,buffer); 106 count++; 107 for(i=0;i<node->next_len;i++) { 108 node->next[i]->state |= KMER_ASSEMBLY_PREV_TANGLED; 109 } 110 } 111 } 112 113 return count; 114} 115 116int resolve_forward_errors_KmerAssembly(KmerAssemblyIndex * kai,int depth,int verbose,int max_path_enum) 117{ 118 int i; 119 int total = 0; 120 121 kmer_t kmer; 122 KmerAssemblyNode * node; 123 124 char buffer[128]; 125 SinglePosSequence * sps; 126 127 assert(kai != NULL); 128 assert(kai->kii != NULL ); 129 assert(kai->kii->next_filled_kmer != NULL); 130 131 kmer = -1; 132 kmer = (*kai->kii->next_filled_kmer)(kai->kii->handle,kmer); 133 134 for(;kmer != -1; kmer = (*kai->kii->next_filled_kmer)(kai->kii->handle,kmer)) { 135 node = (*kai->kii->retrieve_by_kmer)(kai->kii->handle,kmer); 136 assert(node != NULL); 137 138 if( node->next_len < 2 ) { 139 continue; 140 } 141 142 for(i=0;i<node->next_len;i++) { 143 if( node->next[i]->sequence_label_len <= depth ) { 144 if( verbose ) { 145 reverse_map_dna_number(node->next[i]->next->number,kai->kii->kmer_size,buffer); 146 buffer[kai->kii->kmer_size] = '\0'; 147 sps = lookup_Sequence_SinglePosSpace(kai->sps,node->next[i]->sequence_label[0]); 148 149 fprintf(stderr,"Attempting to resolve error [%s] from sequence %s, depth %d\n",buffer,((AssemblySequence*)sps->data)->seq->name,node->next[i]->sequence_label_len); 150 } 151 152 total += attempt_resolve_forward_error_KmerAssembly(kai,node,depth,max_path_enum); 153 break; 154 } 155 } 156 } 157 158 return total; 159} 160 161 162 163void remove_errors_KmerAssemblyIndex(KmerAssemblyIndex * kai,int max_error_depth) 164{ 165 int i; 166 kmer_t kmer; 167 KmerAssemblyNode * node; 168 169 assert(kai != NULL); 170 assert(kai->kii != NULL); 171 assert(kai->kii->next_filled_kmer != NULL); 172 173 kmer = -1; 174 kmer = (*kai->kii->next_filled_kmer)(kai->kii->handle,kmer); 175 176 for(;kmer != -1; kmer = (*kai->kii->next_filled_kmer)(kai->kii->handle,kmer)) { 177 node = (*kai->kii->retrieve_by_kmer)(kai->kii->handle,kmer); 178 assert(node); 179 180 if( node->next_len > 1 ) { 181 for(i=0;i<node->next_len;i++) { 182 if( node->next[i]->sequence_label_len <= max_error_depth ) { 183 detach_KmerAssemblyLink(kai,node->next[i]); 184 } 185 } 186 } 187 188 if( node->prev_len > 1 ) { 189 for(i=0;i<node->prev_len;i++) { 190 if( node->prev[i]->sequence_label_len <= max_error_depth ) { 191 detach_KmerAssemblyLink(kai,node->prev[i]); 192 } 193 } 194 } 195 } 196 197 198} 199 200int attempt_resolve_forward_error_KmerAssembly(KmerAssemblyIndex * kai,KmerAssemblyNode * node,int depth,int max_path_enum) 201{ 202 int success; 203 int i,j,k; 204 KmerAssemblyPath * final_real; 205 KmerAssemblyPath * final_error; 206 char * alphabet = "ATGC"; 207 char buffer[256]; 208 char next[256]; 209 char selfbase; 210 211 kmer_t new_number; 212 213 int no_success = 0; 214 int no_failures = 0; 215 216 assert(kai != NULL); 217 assert(node != NULL); 218 219 220 final_real = new_KmerAssemblyPath(); 221 final_error = new_KmerAssemblyPath(); 222 223 if( node->next_len == 0 ) { 224 warn("No outgoing links, therefore no errors"); 225 return TRUE; 226 } 227 228 if( node->next_len == 0 ) { 229 warn("Only one outgoing link, so no error"); 230 return TRUE; 231 } 232 233 /* look for links with depth outgoing */ 234 for(i=0;i<node->next_len;i++) { 235 if( node->next[i]->sequence_label_len <= depth ) { 236 /* see whether this is an error*/ 237 238 success = 0; 239 reverse_map_dna_number(node->next[i]->next->number,kai->kii->kmer_size,buffer); 240 241 selfbase = buffer[kai->kii->kmer_size-1]; 242 243 /* test each base */ 244 for(j=0;j<4;j++) { 245 if( alphabet[j] == selfbase ) { 246 continue; 247 } 248 buffer[kai->kii->kmer_size-1] = alphabet[j]; 249 new_number = forward_dna_number_from_string(buffer,kai->kii->kmer_size); 250 for(k=0;k<node->next_len;k++) { 251 if( k == i ) { 252 continue; 253 } 254 if( new_number == node->next[k]->next->number ) { 255 /* potential fix */ 256 final_error->stack_len = 0; 257 final_real->stack_len = 0; 258 259 fprintf(stderr,"Found potential fix of %c to %c in %.*s (real:%c (%d), error:%c (%d))\n",selfbase,alphabet[j],kai->kii->kmer_size,buffer,node->next[k]->base,k,node->next[i]->base,i); 260 261 /* consider potential insertion of one base */ 262 263 if( find_indel_path_KmerAssembly(node->next[k],node->next[i],-1,kai->kii->kmer_size-1,final_real,final_error,kai->kii->kmer_size+1,max_path_enum) == TRUE ) { 264 fprintf(stderr,"SUCCESS in insertion modelling\n"); 265 lift_indel_error_KmerAssembly(kai,final_real,final_error,node->next[i]->sequence_label,node->next[i]->sequence_label_len,kai->kii->kmer_size-1,-1); 266 success = 1; 267 break; 268 } 269 270 if( find_indel_path_KmerAssembly(node->next[k],node->next[i],1,kai->kii->kmer_size-1,final_real,final_error,kai->kii->kmer_size+1,max_path_enum) == TRUE ) { 271 fprintf(stderr,"SUCCESS in deletion modelling\n"); 272 lift_indel_error_KmerAssembly(kai,final_real,final_error,node->next[i]->sequence_label,node->next[i]->sequence_label_len,kai->kii->kmer_size-1,1); 273 success = 1; 274 break; 275 } 276 277 278 279 /* otherwise substitution is all we can really suggest... */ 280 281 if( find_error_path_KmerAssembly(node->next[k],node->next[i],alphabet[j],kai->kii->kmer_size-1,final_real,final_error,kai->kii->kmer_size+1,max_path_enum) == TRUE ) { 282 /* fix error */ 283 /* 284 fprintf(stderr,"Found Error: Real\n"); 285 show_KmerAssemblyPath(final_real,stderr); 286 fprintf(stderr," Error\n"); 287 show_KmerAssemblyPath(final_error,stderr); 288 fprintf(stderr,"\n"); 289 */ 290 291 fprintf(stderr,"SUCCESS in substitution modelling\n"); 292 293 lift_forward_error_KmerAssembly(kai,final_real,final_error,node->next[i]->sequence_label,node->next[i]->sequence_label_len); 294 success = 1; 295 break; 296 } 297 } 298 } 299 } 300 if( success == 0 ) { 301 no_failures++; 302 } else { 303 no_success++; 304 } 305 } 306 } 307 308 free_KmerAssemblyPath(final_real); 309 free_KmerAssemblyPath(final_error); 310 311 return no_success; 312} 313 314 315boolean find_indel_path_KmerAssembly(KmerAssemblyLink *real,KmerAssemblyLink * error,int delete_length,int proposed_position,KmerAssemblyPath * final_real,KmerAssemblyPath * final_error,int max_search,int max_path_enum) 316{ 317 int current_path = 1; 318 319 return extend_indel_path_KmerAssembly(real,error,0,0,delete_length,proposed_position,final_real,final_error,max_search,¤t_path,max_path_enum); 320} 321 322boolean extend_indel_path_KmerAssembly(KmerAssemblyLink* real,KmerAssemblyLink * error,int real_pos,int error_pos,int delete_length,int proposed_position,KmerAssemblyPath * final_real,KmerAssemblyPath * final_error,int max_search,int *current_path,int max_path_enum) 323{ 324 int real_start; 325 int error_start; 326 int k; 327 int i,j; 328 329 KmerAssemblyLink * temp_real[256]; 330 KmerAssemblyLink * temp_error[256]; 331 332 if( max_search > 256 ) { 333 fatal("Madness. Max search greater than 256 - shouldn't happen!"); 334 } 335 336 if( max_path_enum < *current_path ) { 337 fprintf(stderr,"Terminating due too many paths\n"); 338 return FALSE; 339 } 340 341 real_start = real_pos; 342 error_start = error_pos; 343 344 345 /* fprintf(stderr,"Entering into extension code with real %d, error %d\n",real_pos,error_pos);*/ 346 347 while( real != NULL && error != NULL ) { 348 349 if( proposed_position != real_pos ) { 350 if( real->base != error->base ) { 351 fprintf(stderr,"in considering indel (%d, path %d), real (%c) and error (%c) do not agree at position %d,%d\n",delete_length,current_path,real->base,error->base,real_pos,error_pos); 352 return FALSE; 353 } 354 } else { 355 if( delete_length > 0 ) { 356 /* we move along in the real positions by delete number */ 357 for(k=0;k<delete_length;k++) { 358 temp_real[real_pos] = real; 359 real_pos++; 360 real = real->next->next[0]; 361 if( real == NULL ) { 362 return FALSE; 363 } 364 if( real->next->next_len > 1 ) { 365 fprintf(stderr,"Cannot handle over branched cases in indel area\n"); 366 return FALSE; 367 } 368 } 369 } else { 370 /* we move along in the error positions by the delete number */ 371 delete_length = abs(delete_length); 372 for(k=0;k<delete_length;k++) { 373 temp_error[error_pos] = error; 374 error_pos++; 375 error = error->next->next[0]; 376 if( error == NULL ) { 377 return FALSE; 378 } 379 if( error->next->next_len > 1 ) { 380 fprintf(stderr,"Cannot handle over branched cases in indel area\n"); 381 return FALSE; 382 } 383 } 384 } 385 } 386 387 388 temp_real[real_pos++] = real; 389 temp_error[error_pos++] = error; 390 391 392 if( real_pos > max_search ) { 393 /* will return TRUE after loop */ 394 break; 395 } 396 397 if( real == error ) { 398 /* same node, therefore the streams have successfully merged */ 399 break; 400 } 401 402 403 if( real->next->next_len == 1 && error->next->next_len == 1 ) { 404 real = real->next->next[0]; 405 error = error->next->next[0]; 406 continue; 407 } 408 409 /* this is a tail position */ 410 if( error->next->next_len == 0 ) { 411 412 /* will return TRUE after loop */ 413 break; 414 } 415 416 417 /* recursively call for each possible path: if any return 418 true, return TRUE */ 419 420 /* fprintf(stderr,"paths are branching, real with %d, error with %d\n",real->next->next_len,error->next->next_len);*/ 421 422 if( real->next->next_len > 1 || error->next->next_len > 1 ) { 423 for(i=0;i<real->next->next_len;i++) { 424 for(j=0;j<error->next->next_len;j++) { 425 (*current_path)++; 426 if( extend_indel_path_KmerAssembly(real->next->next[i],error->next->next[j],real_pos,error_pos,delete_length,proposed_position,final_real,final_error,max_search,current_path,max_path_enum) == TRUE ) { 427 break; 428 } 429 } 430 } 431 432 /* fprintf(stderr,"Recursively unable to find path\n");*/ 433 /* unable to find a good path */ 434 435 return FALSE; 436 } 437 438 fprintf(stderr,"should be impossible to reach here, all other options have been considered"); 439 440 return FALSE; 441 } 442 443 444 /* have to put the right path into the path datastructures */ 445 446 /* we might have to extend the path datastructures */ 447 448 if( final_real->max_stack < real_pos || final_error->max_stack < error_pos ) { 449 fatal("Have to extend path; should be easy, but not done yet"); 450 } 451 452 if( final_real->stack_len < real_pos ) { 453 /* first return */ 454 final_real->stack_len = real_pos; 455 } 456 457 if( final_error->stack_len < error_pos ) { 458 /* first return */ 459 final_error->stack_len = error_pos; 460 } 461 462 for(i=real_start;i<real_pos;i++) { 463 final_real->stack[i] = temp_real[i]; 464 } 465 466 for(i=error_start;i<error_pos;i++) { 467 final_error->stack[i] = temp_error[i]; 468 } 469 470 return TRUE; 471 472 473} 474 475 476boolean find_error_path_KmerAssembly(KmerAssemblyLink * start_real,KmerAssemblyLink * start_error,char proposed_fix,int proposed_fix_position,KmerAssemblyPath * final_real,KmerAssemblyPath * final_error,int max_search,int max_path_enum) 477{ 478 int current_path = 1; 479 return extend_error_path_KmerAssembly(start_real,start_error,0,proposed_fix,proposed_fix_position,final_real,final_error,max_search,¤t_path,max_path_enum); 480} 481 482 483boolean extend_error_path_KmerAssembly(KmerAssemblyLink * real,KmerAssemblyLink * error,int current_pos,char proposed_fix,int proposed_fix_position,KmerAssemblyPath * final_real,KmerAssemblyPath * final_error,int max_search,int * current_path,int max_path_enum) 484{ 485 int i; 486 int j; 487 488 int temp_pos; 489 int startpos = current_pos; 490 KmerAssemblyLink * temp_real[256]; 491 KmerAssemblyLink * temp_error[256]; 492 493 if( max_search > 256 ) { 494 fatal("Madness. Max search greater than 256 - shouldn't happen!"); 495 } 496 497 498 if( max_path_enum < *current_path ) { 499 fprintf(stderr,"Terminating due too many paths\n"); 500 return FALSE; 501 } 502 503 temp_pos = current_pos; 504 505 506 /* fprintf(stderr,"Entering into extension code with %d temp_pos\n",temp_pos);*/ 507 508 while( real != NULL && error != NULL ) { 509 510 if( proposed_fix_position != temp_pos ) { 511 if( real->base != error->base ) { 512 /* fprintf(stderr,"real (%c) and error (%c) do not agree at position %d\n",real->base,error->base,temp_pos); */ 513 return FALSE; 514 } 515 } else if ( real->base != proposed_fix ) { 516 fprintf(stderr,"In fixed position real (%c) and proposed (%c) do not agree\n",real->base,proposed_fix); 517 return FALSE; 518 } 519 520 521 temp_real[temp_pos] = real; 522 temp_error[temp_pos] = error; 523 temp_pos++; 524 525 if( temp_pos > max_search ) { 526 /* will return TRUE after loop */ 527 break; 528 } 529 530 if( real->next->next_len == 1 && error->next->next_len == 1 ) { 531 real = real->next->next[0]; 532 error = error->next->next[0]; 533 continue; 534 } 535 536 /* this is a tail position */ 537 if( error->next->next_len == 0 ) { 538 539 /* will return TRUE after loop */ 540 break; 541 } 542 543 544 /* recursively call for each possible path: if any return 545 true, return TRUE */ 546 547 /* fprintf(stderr,"paths are branching, real with %d, error with %d\n",real->next->next_len,error->next->next_len);*/ 548 549 if( real->next->next_len > 1 || error->next->next_len > 1 ) { 550 for(i=0;i<real->next->next_len;i++) { 551 for(j=0;j<error->next->next_len;j++) { 552 (*current_path)++; 553 if( extend_error_path_KmerAssembly(real->next->next[i],error->next->next[j],temp_pos,proposed_fix,proposed_fix_position,final_real,final_error,max_search,current_path,max_path_enum) == TRUE ) { 554 break; 555 } 556 } 557 } 558 559 /* fprintf(stderr,"Recursively unable to find path\n");*/ 560 /* unable to find a good path */ 561 562 return FALSE; 563 } 564 565 fprintf(stderr,"should be impossible to reach here, all other options have been considered"); 566 567 return FALSE; 568 } 569 570 571 /* have to put the right path into the path datastructures */ 572 573 /* we might have to extend the path datastructures */ 574 575 if( final_real->max_stack < temp_pos || final_error->max_stack < temp_pos ) { 576 fatal("Have to extend path; should be easy, but not done yet"); 577 } 578 579 if( final_real->stack_len < temp_pos ) { 580 /* first return */ 581 final_real->stack_len = temp_pos; 582 final_error->stack_len = temp_pos; 583 } 584 585 for(i=startpos;i<temp_pos;i++) { 586 final_real->stack[i] = temp_real[i]; 587 final_error->stack[i] = temp_error[i]; 588 } 589 590 return TRUE; 591 592} 593