1 package assemble; 2 3 import java.util.ArrayList; 4 import java.util.Arrays; 5 import java.util.BitSet; 6 import java.util.concurrent.atomic.AtomicInteger; 7 8 import dna.AminoAcid; 9 import jgi.BBMerge; 10 import kmer.AbstractKmerTable; 11 import kmer.HashArray1D; 12 import kmer.HashForest; 13 import kmer.KmerNode; 14 import kmer.KmerTableSet; 15 import shared.KillSwitch; 16 import shared.Shared; 17 import shared.Timer; 18 import shared.Tools; 19 import stream.ConcurrentReadInputStream; 20 import stream.Read; 21 import structures.ByteBuilder; 22 import structures.IntList; 23 import structures.ListNum; 24 import structures.LongList; 25 import ukmer.Kmer; 26 27 28 /** 29 * Short-kmer assembler based on KmerCountExact. 30 * @author Brian Bushnell 31 * @date May 15, 2015 32 * 33 */ 34 public class Tadpole1 extends Tadpole { 35 36 /** 37 * Code entrance from the command line. 38 * @param args Command line arguments 39 */ main(String[] args)40 public static void main(String[] args){ 41 Timer t=new Timer(), t2=new Timer(); 42 t.start(); 43 t2.start(); 44 45 //Create a new CountKmersExact instance 46 Tadpole1 wog=new Tadpole1(args, true); 47 t2.stop(); 48 outstream.println("Initialization Time: \t"+t2); 49 50 ///And run it 51 wog.process(t); 52 } 53 54 /** 55 * Constructor. 56 * @param args Command line arguments 57 */ Tadpole1(String[] args, boolean setDefaults)58 public Tadpole1(String[] args, boolean setDefaults){ 59 super(args, setDefaults); 60 61 final int bytesPerKmer; 62 { 63 int mult=12; 64 if(useOwnership){mult+=4;} 65 if(processingMode==correctMode || processingMode==discardMode){} 66 else if(processingMode==contigMode || processingMode==extendMode){mult+=1;} 67 bytesPerKmer=mult; 68 } 69 70 tables=new KmerTableSet(args, bytesPerKmer); 71 k=tables.k; 72 k2=tables.k2; 73 74 shift=2*k; 75 shift2=shift-2; 76 mask=(shift>63 ? -1L : ~((-1L)<<shift)); 77 } 78 79 80 /*--------------------------------------------------------------*/ 81 /*---------------- Outer Methods ----------------*/ 82 /*--------------------------------------------------------------*/ 83 84 85 /*--------------------------------------------------------------*/ 86 /*---------------- Inner Methods ----------------*/ 87 /*--------------------------------------------------------------*/ 88 89 @Override initializeOwnership()90 void initializeOwnership(){ 91 tables.initializeOwnership(); 92 } 93 94 @Override shave(boolean shave, boolean rinse)95 long shave(boolean shave, boolean rinse){ 96 long sum=0; 97 98 for(int i=0; i<maxShaveDepth; i++){ 99 int a=1, b=maxShaveDepth, c=i+1; 100 // if(i>3){Shaver.verbose2=true;} 101 outstream.println("\nShave("+a+", "+b+", "+c+")"); 102 final Shaver shaver=Shaver.makeShaver(tables, THREADS, a, b, c, minCountExtend, branchMult2, Tools.max(minContigLen, shaveDiscardLen), shaveExploreDist, shave, rinse); 103 long removed=shaver.shave(a, b); 104 sum+=removed; 105 if(removed<100 || i>2){break;} 106 } 107 108 outstream.println(); 109 return sum; 110 } 111 112 @Override loadKmers(Timer t)113 public long loadKmers(Timer t){ 114 tables.process(t); 115 return tables.kmersLoaded; 116 } 117 118 119 /*--------------------------------------------------------------*/ 120 /*---------------- Recall Methods ----------------*/ 121 /*--------------------------------------------------------------*/ 122 rcomp(long kmer)123 final long rcomp(long kmer){return AminoAcid.reverseComplementBinaryFast(kmer, k);} toValue(long kmer, long rkmer)124 final long toValue(long kmer, long rkmer){return tables.toValue(kmer, rkmer);} getCount(long kmer, long rkmer)125 public final int getCount(long kmer, long rkmer){return tables.getCount(kmer, rkmer);} claim(long kmer, int id)126 final boolean claim(long kmer, int id){return claim(kmer, rcomp(kmer), id);} claim(long kmer, long rkmer, int id)127 final boolean claim(long kmer, long rkmer, int id){return tables.claim(kmer, rkmer, id);} doubleClaim(ByteBuilder bb, int id )128 final boolean doubleClaim(ByteBuilder bb, int id/*, long rid*/){return tables.doubleClaim(bb, id/*, rid*/);} claim(ByteBuilder bb, int id, boolean earlyExit)129 final boolean claim(ByteBuilder bb, int id, /*long rid, */boolean earlyExit){return tables.claim(bb, id/*, rid*/, earlyExit);} claim(byte[] array, int len, int id, boolean earlyExit)130 final boolean claim(byte[] array, int len, int id, /*long rid, */boolean earlyExit){return tables.claim(array, len, id/*, rid*/, earlyExit);} findOwner(long kmer)131 final int findOwner(long kmer){return tables.findOwner(kmer);} findOwner(ByteBuilder bb, int id)132 final int findOwner(ByteBuilder bb, int id){return tables.findOwner(bb, id);} findOwner(byte[] array, int len, int id)133 final int findOwner(byte[] array, int len, int id){return tables.findOwner(array, len, id);} release(long key, int id)134 final void release(long key, int id){tables.release(key, id);} release(ByteBuilder bb, int id)135 final void release(ByteBuilder bb, int id){tables.release(bb, id);} release(byte[] array, int len, int id)136 final void release(byte[] array, int len, int id){tables.release(array, len, id);} fillRightCounts(long kmer, long rkmer, int[] counts)137 final int fillRightCounts(long kmer, long rkmer, int[] counts){return tables.fillRightCounts(kmer, rkmer, counts, mask, shift2);} fillLeftCounts(long kmer, long rkmer, int[] counts)138 final int fillLeftCounts(long kmer, long rkmer, int[] counts){return tables.fillLeftCounts(kmer, rkmer, counts, mask, shift2);} toText(long kmer)139 final StringBuilder toText(long kmer){return AbstractKmerTable.toText(kmer, k);} 140 141 /*--------------------------------------------------------------*/ 142 /*---------------- Inner Classes ----------------*/ 143 /*--------------------------------------------------------------*/ 144 145 /*--------------------------------------------------------------*/ 146 /*---------------- BuildThread ----------------*/ 147 /*--------------------------------------------------------------*/ 148 149 @Override makeBuildThread(int id, int mode, ConcurrentReadInputStream[] crisa)150 BuildThread makeBuildThread(int id, int mode, ConcurrentReadInputStream[] crisa){ 151 return new BuildThread(id, mode, crisa); 152 } 153 154 /** 155 * Builds contigs. 156 */ 157 private class BuildThread extends AbstractBuildThread{ 158 BuildThread(int id_, int mode_, ConcurrentReadInputStream[] crisa_)159 public BuildThread(int id_, int mode_, ConcurrentReadInputStream[] crisa_){ 160 super(id_, mode_, crisa_); 161 } 162 163 @Override run()164 public void run(){ 165 if(crisa==null || crisa.length==0){ 166 //Build from kmers 167 168 if(id==0){outstream.print("Seeding with min count = ");} 169 String comma=""; 170 for(int i=contigPasses-1; i>0; i--){ 171 minCountSeedCurrent=(int)Tools.min(Integer.MAX_VALUE, Tools.max(minCountSeed+i, (long)Math.floor((minCountSeed)*Math.pow(contigPassMult, i)*0.92-0.25) )); 172 if(id==0){ 173 outstream.print(comma+minCountSeedCurrent); 174 comma=", "; 175 } 176 while(processNextTable(nextTable[i])){} 177 while(processNextVictims(nextVictims[i])){} 178 } 179 //Final pass 180 minCountSeedCurrent=minCountSeed; 181 if(id==0){outstream.println(comma+minCountSeedCurrent);} 182 while(processNextTable(nextTable[0])){} 183 while(processNextVictims(nextVictims[0])){} 184 }else{ 185 //Extend reads 186 for(ConcurrentReadInputStream cris : crisa){ 187 synchronized(crisa){ 188 if(!cris.started()){ 189 cris.start(); 190 } 191 } 192 run(cris); 193 } 194 } 195 } 196 processNextTable(AtomicInteger aint)197 private boolean processNextTable(AtomicInteger aint){ 198 final int tnum=aint.getAndAdd(1); 199 if(tnum>=tables.ways){return false;} 200 final HashArray1D table=tables.getTable(tnum); 201 if(verbose2 && id==0){outstream.println("Processing table "+tnum+", size "+table.size());} 202 final int max=table.arrayLength(); 203 for(int cell=0; cell<max; cell++){ 204 int x=processCell(table, cell); 205 } 206 return true; 207 } 208 processNextVictims(AtomicInteger aint)209 private boolean processNextVictims(AtomicInteger aint){ 210 final int tnum=aint.getAndAdd(1); 211 if(tnum>=tables.ways){return false;} 212 final HashArray1D table=tables.getTable(tnum); 213 final HashForest forest=table.victims(); 214 if(verbose2 && id==0){outstream.println("Processing forest "+tnum+", size "+forest.size());} 215 final int max=forest.arrayLength(); 216 for(int cell=0; cell<max; cell++){ 217 KmerNode kn=forest.getNode(cell); 218 int x=traverseKmerNode(kn); 219 } 220 return true; 221 } 222 processCell(HashArray1D table, int cell)223 private int processCell(HashArray1D table, int cell){ 224 int count=table.readCellValue(cell); 225 if(count<minCountSeedCurrent){return 0;} 226 227 long key=table.getKmer(cell); 228 229 if(verbose){outstream.println("id="+id+" processing cell "+cell+"; \tkmer="+key+"\t"+toText(key));} 230 if(useOwnership){ 231 int owner=table.getCellOwner(cell); 232 if(verbose){outstream.println("Owner is initially "+owner);} 233 if(owner>-1){return 0;} 234 owner=table.setOwner(key, id, cell); 235 if(verbose){outstream.println("Owner is now "+owner);} 236 if(owner!=id){return 0;} 237 } 238 return processKmer(key); 239 } 240 traverseKmerNode(KmerNode kn)241 private int traverseKmerNode(KmerNode kn){ 242 int sum=0; 243 if(kn!=null){ 244 sum+=processKmerNode(kn); 245 if(kn.left()!=null){ 246 sum+=traverseKmerNode(kn.left()); 247 } 248 if(kn.right()!=null){ 249 sum+=traverseKmerNode(kn.right()); 250 } 251 } 252 return sum; 253 } 254 processKmerNode(KmerNode kn)255 private int processKmerNode(KmerNode kn){ 256 final long key=kn.pivot(); 257 final int count=kn.getValue(key); 258 if(count<minCountSeedCurrent){return 0;} 259 260 if(verbose){outstream.println("id="+id+" processing KmerNode; \tkmer="+key+"\t"+toText(key));} 261 if(useOwnership){ 262 int owner=kn.getOwner(key); 263 if(verbose){outstream.println("Owner is initially "+owner);} 264 if(owner>-1){return 0;} 265 owner=kn.setOwner(key, id); 266 if(verbose){outstream.println("Owner is now "+owner);} 267 if(owner!=id){return 0;} 268 } 269 return processKmer(key); 270 } 271 processKmer(long key)272 private int processKmer(long key){ 273 Contig contig=makeContig(key, builderT, true); 274 if(contig!=null){ 275 float coverage=tables.calcCoverage(contig); 276 if(coverage<minCoverage || coverage>maxCoverage){return 0;} 277 if(verbose){outstream.println("Added "+contig.length());} 278 contig.id=(int)contigNum.incrementAndGet(); 279 contigs.add(contig); 280 return contig.length(); 281 }else{ 282 if(verbose){outstream.println("Created null contig.");} 283 } 284 return 0; 285 } 286 run(ConcurrentReadInputStream cris)287 private void run(ConcurrentReadInputStream cris){ 288 289 ListNum<Read> ln=cris.nextList(); 290 ArrayList<Read> reads=(ln!=null ? ln.list : null); 291 292 //While there are more reads lists... 293 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 294 295 //For each read (or pair) in the list... 296 for(int i=0; i<reads.size(); i++){ 297 final Read r1=reads.get(i); 298 final Read r2=r1.mate; 299 300 processReadPair(r1, r2); 301 } 302 303 //Fetch a new read list 304 cris.returnList(ln); 305 ln=cris.nextList(); 306 reads=(ln!=null ? ln.list : null); 307 } 308 cris.returnList(ln); 309 } 310 processReadPair(Read r1, Read r2)311 private void processReadPair(Read r1, Read r2){ 312 if(verbose){outstream.println("Considering read "+r1.id+" "+new String(r1.bases));} 313 314 readsInT++; 315 basesInT+=r1.length(); 316 if(r2!=null){ 317 readsInT++; 318 basesInT+=r2.length(); 319 } 320 321 if(mode==insertMode){ 322 int x=BBMerge.findOverlapStrict(r1, r2, false); 323 if(x<1){ 324 x=findInsertSize(r1, r2, rightCounts); 325 } 326 insertSizes.increment(Tools.max(x, 0)); 327 return; 328 } 329 330 if(ecco && r1!=null && r2!=null && !r1.discarded() && !r2.discarded()){BBMerge.findOverlapStrict(r1, r2, true);} 331 332 if(r1!=null){ 333 if(r1.discarded()){ 334 lowqBasesT+=r1.length(); 335 lowqReadsT++; 336 }else{ 337 byte[] bases=makeContig(r1.bases, builderT, r1.numericID); 338 if(bases!=null){ 339 if(verbose){outstream.println("Added "+bases.length);} 340 final long num=contigNum.incrementAndGet(); 341 Contig temp=new Contig(bases, "contig_"+num+"_length_"+bases.length, (int)num); 342 contigs.add(temp); 343 } 344 } 345 } 346 if(r2!=null){ 347 if(r2.discarded()){ 348 lowqBasesT+=r2.length(); 349 lowqReadsT++; 350 }else{ 351 byte[] bases=makeContig(r2.bases, builderT, r2.numericID); 352 if(bases!=null){ 353 if(verbose){outstream.println("Added "+bases.length);} 354 final long num=contigNum.incrementAndGet(); 355 Contig temp=new Contig(bases, "contig_"+num+"_length_"+bases.length, (int)num); 356 contigs.add(temp); 357 } 358 } 359 } 360 } 361 362 /** From kmers */ makeContig(final long key, final ByteBuilder bb, boolean alreadyClaimed)363 private Contig makeContig(final long key, final ByteBuilder bb, boolean alreadyClaimed){ 364 builderT.setLength(0); 365 builderT.appendKmer(key, k); 366 if(verbose){outstream.println("Filled builder: "+builderT);} 367 368 final int initialLength=bb.length(); 369 assert(initialLength==k); 370 if(initialLength<k){return null;} 371 // outstream.print("A"); 372 373 boolean success=(alreadyClaimed || !useOwnership ? true : claim(key, id)); 374 if(verbose){outstream.println("Thread "+id+" checking owner after setting: "+findOwner(bb, id));} 375 if(!success){ 376 assert(bb.length()==k); 377 // release(bb, id); //no need to release 378 return null; 379 } 380 // outstream.print("B"); 381 if(verbose /*|| true*/){outstream.println("Thread "+id+" building contig; initial length "+bb.length());} 382 if(verbose){outstream.println("Extending to right.");} 383 final int rightStatus, leftStatus; 384 float leftRatio=0, rightRatio=0; 385 { 386 final int status=extendToRight(bb, leftCounts, rightCounts, id); 387 388 if(status==DEAD_END){ 389 //do nothing 390 }else if(status==LOOP){//TODO 391 //special case - handle specially, for a loop with no obvious junction, e.g. long tandem repeat. 392 //Perhaps, the last kmer should be reclassified as a junction and removed. 393 }else if(status==BAD_SEED){ 394 assert(bb.length()==k); 395 release(key, id); 396 // outstream.print("B1"); 397 return null; 398 }else{ 399 if(bb.length()==k){ 400 if(status==BAD_OWNER){ 401 if(IGNORE_BAD_OWNER){ 402 //do nothing 403 }else{ 404 release(key, id); 405 // outstream.print("B2"); 406 return null; 407 } 408 }else if(isBranchCode(status)){ 409 release(key, id); 410 // outstream.print("B3"); 411 return null; 412 }else{ 413 throw new RuntimeException("Bad return value: "+status); 414 } 415 }else{ 416 if(status==BAD_OWNER){ 417 if(IGNORE_BAD_OWNER){ 418 bb.length--; 419 }else{ 420 release(bb, id); 421 // outstream.print("B4"); 422 return null; 423 } 424 }else if(status==F_BRANCH || status==D_BRANCH){ 425 rightRatio=calcRatio(rightCounts); 426 }else if(status==B_BRANCH){ 427 rightRatio=calcRatio(leftCounts); 428 }else{ 429 throw new RuntimeException("Bad return value: "+status); 430 } 431 } 432 } 433 rightStatus=status; 434 } 435 // outstream.print("C"); 436 bb.reverseComplementInPlace(); 437 if(verbose /*|| true*/){outstream.println("Extending rcomp to right; current length "+bb.length());} 438 { 439 final int status=extendToRight(bb, leftCounts, rightCounts, id); 440 441 if(status==DEAD_END){ 442 //do nothing 443 }else if(status==LOOP){//TODO 444 //special case - handle specially, for a loop with no obvious junction, e.g. long tandem repeat. 445 //Perhaps, the last kmer should be reclassified as a junction and removed. 446 }else if(status==BAD_SEED){ 447 assert(false) : bb;//This should never happen. 448 assert(bb.length()==k); 449 release(key, id); 450 return null; 451 }else{ 452 if(status==BAD_OWNER){ 453 if(IGNORE_BAD_OWNER){ 454 if(bb.length()>k){bb.length--;} 455 }else{ 456 release(bb, id); 457 // outstream.print("C1"); 458 return null; 459 } 460 }else if(status==F_BRANCH || status==D_BRANCH){ 461 leftRatio=calcRatio(rightCounts); 462 }else if(status==B_BRANCH){ 463 leftRatio=calcRatio(leftCounts); 464 }else{ 465 throw new RuntimeException("Bad return value: "+status); 466 } 467 } 468 leftStatus=status; 469 } 470 // outstream.print("D"); 471 472 if(verbose /*|| true*/){outstream.println("A: Final length for thread "+id+": "+bb.length());} 473 474 // if(useOwnership && THREADS==1){assert(claim(bases, bases.length, id, rid));} 475 success=(useOwnership ? doubleClaim(bb, id) : true); 476 if(verbose /*|| true*/){outstream.println("Success for thread "+id+": "+success);} 477 478 if(trimEnds>0){bb.trimByAmount(trimEnds, trimEnds);} 479 else if(trimCircular && leftStatus==LOOP && rightStatus==LOOP){bb.trimByAmount(0, k-1);} 480 if(joinContigs || (bb.length()>=initialLength+minExtension && (bb.length()>=minContigLen || popBubbles))){ 481 if(success){ 482 bb.reverseComplementInPlace(); 483 byte[] bases=bb.toBytes(); 484 Contig c=new Contig(bases); 485 c.leftCode=leftStatus; 486 c.rightCode=rightStatus; 487 c.rightRatio=rightRatio; 488 c.leftRatio=leftRatio; 489 if(!c.canonical()){c.rcomp();} 490 return c; 491 }else{ 492 // outstream.print("E"); 493 // assert(false) : bb.length()+", "+id; 494 release(bb, id); 495 return null; 496 } 497 } 498 if(verbose /*|| true*/){outstream.println("A: Contig was too short for "+id+": "+bb.length());} 499 // assert(false) : bb.length()+", "+initialLength+", "+minExtension+", "+minContigLen; 500 // outstream.print("F"); 501 return null; 502 } 503 504 /** From a seed */ makeContig(final byte[] bases, final ByteBuilder bb, long rid)505 private byte[] makeContig(final byte[] bases, final ByteBuilder bb, long rid){ 506 if(bases==null || bases.length<k){return null;} 507 // if(verbose /*|| true*/){outstream.println("Thread "+id+" checking owner: "+findOwner(bases, bases.length, id));} 508 int owner=useOwnership ? findOwner(bases, bases.length, id) : -1; 509 if(owner>=id){return null;} 510 boolean success=(useOwnership ? claim(bases, bases.length, id, true) : true); 511 if(verbose /*|| true*/){outstream.println("Thread "+id+" checking owner after setting: "+findOwner(bases, bases.length, id));} 512 if(!success){ 513 release(bases, bases.length, id); 514 return null; 515 } 516 if(verbose /*|| true*/){outstream.println("Thread "+id+" building contig; initial length "+bases.length);} 517 bb.setLength(0); 518 bb.append(bases); 519 if(verbose){outstream.println("Extending to right.");} 520 { 521 final int status=extendToRight(bb, leftCounts, rightCounts, id); 522 523 if(status==DEAD_END){ 524 //do nothing 525 }else if(status==LOOP){//TODO 526 //special case - handle specially, for a loop with no obvious junction, e.g. long tandem repeat. 527 //Perhaps, the last kmer should be reclassified as a junction and removed. 528 }else if(status==BAD_SEED){ 529 //do nothing 530 }else{ 531 if(status==BAD_OWNER){ 532 release(bb, id); 533 return null; 534 }else if(isBranchCode(status)){ 535 //do nothing 536 }else{ 537 throw new RuntimeException("Bad return value: "+status); 538 } 539 } 540 } 541 bb.reverseComplementInPlace(); 542 if(verbose /*|| true*/){outstream.println("Extending rcomp to right; current length "+bb.length());} 543 { 544 final int status=extendToRight(bb, leftCounts, rightCounts, id); 545 546 if(status==DEAD_END){ 547 //do nothing 548 }else if(status==LOOP){//TODO 549 //special case - handle specially, for a loop with no obvious junction, e.g. long tandem repeat. 550 //Perhaps, the last kmer should be reclassified as a junction and removed. 551 }else if(status==BAD_SEED){ 552 //do nothing 553 }else{ 554 if(status==BAD_OWNER){ 555 release(bb, id); 556 return null; 557 }else if(isBranchCode(status)){ 558 //do nothing 559 }else{ 560 throw new RuntimeException("Bad return value: "+status); 561 } 562 } 563 } 564 if(verbose /*|| true*/){outstream.println("B: Final length for thread "+id+": "+bb.length());} 565 566 // if(useOwnership && THREADS==1){assert(claim(bases, bases.length, id, rid));} 567 success=(useOwnership ? doubleClaim(bb, id) : true); 568 if(verbose /*|| true*/){outstream.println("Success for thread "+id+": "+success);} 569 if(bb.length()>=bases.length+minExtension && (bb.length()>=minContigLen || popBubbles)){ 570 if(success){ 571 bb.reverseComplementInPlace(); 572 return bb.toBytes(); 573 }else{ 574 // assert(false) : bb.length()+", "+id; 575 release(bb.array, bb.length(), id); 576 return null; 577 } 578 } 579 580 if(verbose /*|| true*/){outstream.println("B: Contig was too short for "+id+": "+bb.length());} 581 return null; 582 } 583 584 } 585 586 587 /*--------------------------------------------------------------*/ 588 /*---------------- Contig Processing ----------------*/ 589 /*--------------------------------------------------------------*/ 590 591 @Override makeProcessContigThread(ArrayList<Contig> contigs, AtomicInteger next)592 ProcessContigThread makeProcessContigThread(ArrayList<Contig> contigs, AtomicInteger next){ 593 return new ProcessContigThread(contigs, next); 594 } 595 596 @Override initializeContigs(ArrayList<Contig> contigs)597 public void initializeContigs(ArrayList<Contig> contigs){ 598 tables.clearOwnership(); 599 tables.initializeOwnership(); 600 { 601 int cnum=0; 602 for(Contig c : contigs){ 603 c.id=cnum; 604 // if(c.leftBranch()){ 605 if(true){ 606 long kmer=c.leftKmer(k); 607 tables.claim(kmer, rcomp(kmer), cnum); 608 } 609 // if(c.rightBranch()){ 610 if(true){ 611 long kmer=c.rightKmer(k); 612 tables.claim(kmer, rcomp(kmer), cnum); 613 } 614 cnum++; 615 } 616 } 617 } 618 619 class ProcessContigThread extends AbstractProcessContigThread { 620 ProcessContigThread(ArrayList<Contig> contigs_, AtomicInteger next_)621 ProcessContigThread(ArrayList<Contig> contigs_, AtomicInteger next_){ 622 super(contigs_, next_); 623 lastExitCondition=BAD_SEED; 624 } 625 626 @Override processContigLeft(Contig c, int[] leftCounts, int[] rightCounts, int[] extraCounts, ByteBuilder bb)627 public void processContigLeft(Contig c, int[] leftCounts, int[] rightCounts, int[] extraCounts, ByteBuilder bb){ 628 if(c.leftCode==DEAD_END){return;} 629 630 final long kmer0=c.leftKmer(k); 631 final long rkmer0=rcomp(kmer0); 632 633 assert(tables.getCount(kmer0, rkmer0)>0); 634 assert(tables.findOwner(kmer0)==c.id) : tables.findOwner(kmer0)+", "+c.id; 635 // System.err.println(tables.findOwner(kmer0)); 636 637 int leftMaxPos=fillLeftCounts(kmer0, rkmer0, leftCounts); 638 int leftMax=leftCounts[leftMaxPos]; 639 int leftSecondPos=Tools.secondHighestPosition(leftCounts); 640 int leftSecond=leftCounts[leftSecondPos]; 641 642 for(int x=0; x<leftCounts.length; x++){ 643 bb.clear(); 644 final int count=leftCounts[x]; 645 int target=-1; 646 if(count>0 && isJunction(leftMax, count)){ 647 long x2=3-x; 648 long rkmer=((rkmer0<<2)|(long)x2)&mask; 649 long kmer=(kmer0>>>2)|(((long)x)<<shift2); 650 assert(kmer==rcomp(rkmer)); 651 assert(tables.getCount(kmer, rkmer)==count) : count+", "+tables.getCount(kmer, rkmer); 652 bb.append(AminoAcid.numberToBase[x]); 653 target=exploreRight(rkmer, kmer, extraCounts, rightCounts, bb); 654 if(verbose){ 655 outstream.println(c.id+"L_F: x="+x+", cnt="+count+", dest="+target 656 +", "+codeStrings[lastExitCondition]+", len="+lastLength+", orient="+lastOrientation); 657 } 658 } 659 if(target>=0){ 660 Edge se=new Edge(c.id, target, lastLength, lastOrientation, count, bb.toBytes()); 661 c.addLeftEdge(se); 662 edgesMadeT++; 663 } 664 } 665 } 666 667 @Override processContigRight(Contig c, int[] leftCounts, int[] rightCounts, int[] extraCounts, ByteBuilder bb)668 public void processContigRight(Contig c, int[] leftCounts, int[] rightCounts, int[] extraCounts, ByteBuilder bb){ 669 if(c.rightCode==DEAD_END){return;} 670 671 final long kmer0=c.rightKmer(k); 672 final long rkmer0=rcomp(kmer0); 673 674 int rightMaxPos=fillRightCounts(kmer0, rkmer0, rightCounts); 675 int rightMax=rightCounts[rightMaxPos]; 676 int rightSecondPos=Tools.secondHighestPosition(rightCounts); 677 int rightSecond=rightCounts[rightSecondPos]; 678 679 for(int x=0; x<rightCounts.length; x++){ 680 bb.clear(); 681 final int count=rightCounts[x]; 682 int target=-1; 683 if(count>0 && isJunction(rightMax, count)){ 684 long x2=3-x; 685 long kmer=((kmer0<<2)|(long)x)&mask; 686 long rkmer=(rkmer0>>>2)|(((long)x2)<<shift2); 687 assert(kmer==rcomp(rkmer)); 688 assert(tables.getCount(kmer, rkmer)==count) : count+", "+tables.getCount(kmer, rkmer); 689 bb.append(AminoAcid.numberToBase[x]); 690 target=exploreRight(kmer, rkmer, leftCounts, extraCounts, bb); 691 if(verbose){ 692 outstream.println(c.id+"R_F: x="+x+", cnt="+count+", dest="+target+", "+codeStrings[lastExitCondition]+", len="+lastLength+", orient="+lastOrientation); 693 } 694 } 695 if(target>=0){ 696 lastOrientation|=1; 697 Edge se=new Edge(c.id, target, lastLength, lastOrientation, count, bb.toBytes()); 698 c.addRightEdge(se); 699 edgesMadeT++; 700 } 701 } 702 } 703 exploreRight(long kmer, long rkmer, int[] leftCounts, int[] rightCounts, ByteBuilder bb)704 private int exploreRight(long kmer, long rkmer, int[] leftCounts, int[] rightCounts, ByteBuilder bb){ 705 int length=1; 706 int owner=-1; 707 lastTarget=-1; 708 for(; length<500; length++){ 709 owner=tables.findOwner(kmer, rkmer); 710 if(owner>=0){break;} 711 712 final int leftMaxPos=fillLeftCounts(kmer, rkmer, leftCounts); 713 final int leftMax=leftCounts[leftMaxPos]; 714 final int leftSecondPos=Tools.secondHighestPosition(leftCounts); 715 final int leftSecond=leftCounts[leftSecondPos]; 716 if(isJunction(leftMax, leftSecond)){ 717 lastExitCondition=B_BRANCH; 718 lastLength=length; 719 return -1; 720 } 721 722 final int rightMaxPos=fillRightCounts(kmer, rkmer, rightCounts); 723 final int rightMax=rightCounts[rightMaxPos]; 724 final int rightSecondPos=Tools.secondHighestPosition(rightCounts); 725 final int rightSecond=rightCounts[rightSecondPos]; 726 727 // outstream.println("* "+Arrays.toString(leftCounts)+", "+Arrays.toString(rightCounts)+", "+rightMaxPos); 728 729 if(rightMax<minCountExtend){ 730 // assert(false) : Arrays.toString(rightCounts); 731 lastExitCondition=DEAD_END; 732 lastLength=length; 733 return -1; 734 }else if(isJunction(rightMax, rightSecond)){ 735 lastExitCondition=F_BRANCH; 736 lastLength=length; 737 return -1; 738 } 739 bb.append(AminoAcid.numberToBase[rightMaxPos]); 740 long x=rightMaxPos; 741 long x2=3-x; 742 kmer=((kmer<<2)|x)&mask; 743 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 744 } 745 lastLength=length; 746 lastTarget=owner; 747 if(owner>=0){ 748 lastExitCondition=SUCCESS; 749 Contig dest=contigs.get(owner); 750 long left=dest.leftKmer(k); 751 // if(left==kmer){ 752 // lastOrientation=0; 753 // }else if(left==rkmer){ 754 // lastOrientation=1; 755 // }else{ 756 // long right=dest.rightKmer(k); 757 // if(right==kmer){ 758 // lastOrientation=2; 759 // }else if(right==rkmer){ 760 // lastOrientation=3; 761 // } 762 // } 763 if(left==kmer || left==rkmer){ 764 lastOrientation=0; 765 }else{ 766 long right=dest.rightKmer(k); 767 if(right==kmer || right==rkmer){ 768 lastOrientation=2; 769 }else{ 770 assert(false); 771 } 772 } 773 }else{ 774 lastExitCondition=TOO_LONG; 775 } 776 return owner; 777 } 778 779 } 780 781 /*--------------------------------------------------------------*/ 782 /*---------------- Extension Methods ----------------*/ 783 /*--------------------------------------------------------------*/ 784 785 findInsertSize(Read r1, Read r2, int[] rightCounts)786 public int findInsertSize(Read r1, Read r2, int[] rightCounts){ 787 final long kmer1=tables.rightmostKmer(r1.bases, r1.length()); 788 final long kmer2=tables.rightmostKmer(r2.bases, r2.length()); 789 if(kmer1<0 || kmer2<0){return -1;} 790 final long rkmer1=rcomp(kmer1); 791 final long rkmer2=rcomp(kmer2); 792 final int x=measureInsert(kmer1, rkmer1, kmer2, rkmer2, 24000, rightCounts); 793 if(x<0){return -1;} 794 return r1.length()+r2.length()+x-k;//TODO: May be off by 1. 795 } 796 797 @Override extendRead(Read r, ByteBuilder bb, int[] leftCounts, int[] rightCounts, int distance, final Kmer kmer)798 public int extendRead(Read r, ByteBuilder bb, int[] leftCounts, int[] rightCounts, int distance, final Kmer kmer){ 799 return extendRead(r, bb, leftCounts, rightCounts, distance); 800 } 801 802 @Override extendRead(Read r, ByteBuilder bb, int[] leftCounts, int[] rightCounts, int distance)803 public int extendRead(Read r, ByteBuilder bb, int[] leftCounts, int[] rightCounts, int distance){ 804 final int initialLen=r.length(); 805 if(initialLen<k){return 0;} 806 bb.setLength(0); 807 bb.append(r.bases); 808 final int extension=extendToRight2(bb, leftCounts, rightCounts, distance, true); 809 if(extension>0){ 810 r.bases=bb.toBytes(); 811 if(r.quality!=null){ 812 final byte q=Shared.FAKE_QUAL; 813 r.quality=KillSwitch.copyOf(r.quality, r.bases.length); 814 for(int i=initialLen; i<r.quality.length; i++){ 815 r.quality[i]=q; 816 } 817 } 818 } 819 assert(extension==r.length()-initialLen); 820 return extension; 821 } 822 823 /** Returns distance between the two kmers, or -1 */ measureInsert(final long kmer1, final long rkmer1, final long kmer2, final long rkmer2, final int maxlen, final int[] rightCounts)824 public int measureInsert(final long kmer1, final long rkmer1, final long kmer2, final long rkmer2, final int maxlen, final int[] rightCounts){ 825 final long key2=toValue(kmer2, rkmer2); 826 long kmer=kmer1; 827 long rkmer=rkmer1; 828 int len=0; 829 830 { 831 int count=tables.getCount(key2); 832 if(count<minCountSeed){return -1;} 833 } 834 835 long key=toValue(kmer, rkmer); 836 int count=tables.getCount(key); 837 if(count<minCountSeed){return -1;} 838 if(count<minCountSeed){ 839 if(verbose){outstream.println("Returning because count was too low: "+count);} 840 return -1; 841 } 842 843 int rightMaxPos=fillRightCounts(kmer, rkmer, rightCounts); 844 int rightMax=rightCounts[rightMaxPos]; 845 // int rightSecondPos=Tools.secondHighestPosition(rightCounts); 846 // int rightSecond=rightCounts[rightSecondPos]; 847 848 if(rightMax<minCountExtend){return -1;} 849 // if(isJunction(rightMax, rightSecond)){return -1;} 850 851 while(key!=key2 && len<maxlen){ 852 853 //Generate the new kmer 854 // final byte b=AminoAcid.numberToBase[rightMaxPos]; 855 final long x=rightMaxPos; 856 final long x2=AminoAcid.numberToComplement[(int)x]; 857 858 //Now consider the next kmer 859 kmer=((kmer<<2)|(long)x)&mask; 860 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 861 862 assert(tables.getCount(kmer, rkmer)==rightMax); 863 count=rightMax; 864 865 assert(count>=minCountExtend) : count; 866 867 rightMaxPos=fillRightCounts(kmer, rkmer, rightCounts); 868 rightMax=rightCounts[rightMaxPos]; 869 // rightSecondPos=Tools.secondHighestPosition(rightCounts); 870 // rightSecond=rightCounts[rightSecondPos]; 871 872 if(verbose){ 873 outstream.println("kmer: "+toText(kmer)+", "+toText(rkmer)); 874 outstream.println("Counts: "+count+", "+Arrays.toString(rightCounts)); 875 outstream.println("rightMaxPos="+rightMaxPos); 876 outstream.println("rightMax="+rightMax); 877 // outstream.println("rightSecondPos="+rightSecondPos); 878 // outstream.println("rightSecond="+rightSecond); 879 } 880 881 if(rightMax<minCountExtend){ 882 if(verbose){outstream.println("A: Breaking because highest right was too low:"+rightMax);} 883 break; 884 } 885 886 // if(isJunction(rightMax, rightSecond)){return -1;} 887 888 len++; 889 } 890 return len>=maxlen ? -1 : len; 891 } 892 893 894 895 /** 896 * Extend these bases into a contig. 897 * Stops at both left and right junctions. 898 * Claims ownership. 899 */ extendToRight(final ByteBuilder bb, final int[] leftCounts, final int[] rightCounts, final int id)900 public int extendToRight(final ByteBuilder bb, final int[] leftCounts, final int[] rightCounts, final int id){ 901 if(bb.length()<k){return BAD_SEED;} 902 long kmer=0; 903 long rkmer=0; 904 int len=0; 905 906 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts, to get the rightmost kmer */ 907 { 908 final int bblen=bb.length(); 909 final byte[] bases=bb.array; 910 for(int i=bblen-k; i<bblen; i++){ 911 final byte b=bases[i]; 912 final long x=AminoAcid.baseToNumber[b]; 913 final long x2=AminoAcid.baseToComplementNumber[b]; 914 kmer=((kmer<<2)|x)&mask; 915 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 916 if(x<0){ 917 len=0; 918 kmer=rkmer=0; 919 }else{len++;} 920 if(verbose){outstream.println("A: Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 921 } 922 } 923 924 if(len<k){return BAD_SEED;} 925 else{assert(len==k);} 926 927 /* Now the trailing kmer has been initialized. */ 928 929 long key=toValue(kmer, rkmer); 930 HashArray1D table=tables.getTableForKey(key); 931 int count=table.getValue(key); 932 if(count<minCountSeed){ 933 if(verbose){outstream.println("Returning because count was too low: "+count);} 934 return BAD_SEED; 935 } 936 937 int owner=(useOwnership ? table.getOwner(key) : id); 938 if(verbose){outstream.println("Owner: "+owner);} 939 if(owner>id){return BAD_OWNER;} 940 941 int leftMaxPos=0; 942 int leftMax=minCountExtend; 943 int leftSecondPos=1; 944 int leftSecond=0; 945 946 if(leftCounts!=null){ 947 leftMaxPos=fillLeftCounts(kmer, rkmer, leftCounts); 948 leftMax=leftCounts[leftMaxPos]; 949 leftSecondPos=Tools.secondHighestPosition(leftCounts); 950 leftSecond=leftCounts[leftSecondPos]; 951 } 952 953 int rightMaxPos=fillRightCounts(kmer, rkmer, rightCounts); 954 int rightMax=rightCounts[rightMaxPos]; 955 int rightSecondPos=Tools.secondHighestPosition(rightCounts); 956 int rightSecond=rightCounts[rightSecondPos]; 957 958 if(verbose){ 959 outstream.println("kmer: "+toText(kmer)+", "+toText(rkmer)); 960 outstream.println("Counts: "+count+", "+(leftCounts==null ? "null" : Arrays.toString(leftCounts))+", "+Arrays.toString(rightCounts)); 961 outstream.println("leftMaxPos="+leftMaxPos); 962 outstream.println("leftMax="+leftMax); 963 outstream.println("leftSecondPos="+leftSecondPos); 964 outstream.println("leftSecond="+leftSecond); 965 outstream.println("rightMaxPos="+rightMaxPos); 966 outstream.println("rightMax="+rightMax); 967 outstream.println("rightSecondPos="+rightSecondPos); 968 outstream.println("rightSecond="+rightSecond); 969 } 970 971 if(rightMax<minCountExtend){return DEAD_END;} 972 if(isJunction(rightMax, rightSecond)){//Returning here is fine because nothing can be added 973 if(verbose){outstream.println("B: Breaking because isJunction("+rightMax+", "+rightSecond+", "+leftMax+", "+leftSecond+")");} 974 return isJunction(leftMax, leftSecond) ? D_BRANCH : F_BRANCH; 975 } 976 if(isJunction(leftMax, leftSecond)){//Returning here is necessary, but this should mean the the length is exactly K 977 assert(bb.length()==k) : bb.length()+", "+k+", "+leftMax+", "+leftSecond; 978 if(verbose){outstream.println("B: Breaking because isJunction("+rightMax+", "+rightSecond+", "+leftMax+", "+leftSecond+")");} 979 return B_BRANCH; 980 } 981 982 if(useOwnership){ 983 owner=table.setOwner(key, id); 984 if(verbose){outstream.println("A. Owner is now "+id+" for key "+key);} 985 if(owner!=id){ 986 if(verbose){outstream.println("Returning early because owner was "+owner+" for thread "+id+".");} 987 return BAD_OWNER; 988 } 989 } 990 991 final int maxLen=Tools.min((extendRight<0 ? maxContigLen : bb.length()+extendRight), maxContigLen); 992 993 while(owner==id && bb.length()<maxLen){ 994 995 //Generate the new kmer 996 final byte b=AminoAcid.numberToBase[rightMaxPos]; 997 final long x=rightMaxPos; 998 final long x2=AminoAcid.numberToComplement[(int)x]; 999 1000 final long evicted=(kmer>>>shift2); //Binary value that falls off the end. 1001 1002 //Now consider the next kmer 1003 kmer=((kmer<<2)|(long)x)&mask; 1004 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 1005 1006 key=toValue(kmer, rkmer); 1007 table=tables.getTableForKey(key); 1008 1009 assert(table.getValue(key)==rightMax || rightMax==0) : key+", "+table.getValue(key)+", "+rightMax+", "+Arrays.toString(rightCounts); 1010 count=rightMax; 1011 1012 assert(count>=minCountExtend) : count; 1013 1014 if(leftCounts!=null){ 1015 leftMaxPos=fillLeftCounts(kmer, rkmer, leftCounts); 1016 leftMax=leftCounts[leftMaxPos]; 1017 leftSecondPos=Tools.secondHighestPosition(leftCounts); 1018 leftSecond=leftCounts[leftSecondPos]; 1019 } 1020 1021 rightMaxPos=fillRightCounts(kmer, rkmer, rightCounts); 1022 rightMax=rightCounts[rightMaxPos]; 1023 rightSecondPos=Tools.secondHighestPosition(rightCounts); 1024 rightSecond=rightCounts[rightSecondPos]; 1025 1026 if(verbose){ 1027 outstream.println("kmer: "+toText(kmer)+", "+toText(rkmer)); 1028 outstream.println("Counts: "+count+", "+(leftCounts==null ? "null" : Arrays.toString(leftCounts))+", "+Arrays.toString(rightCounts)); 1029 outstream.println("leftMaxPos="+leftMaxPos); 1030 outstream.println("leftMax="+leftMax); 1031 outstream.println("leftSecondPos="+leftSecondPos); 1032 outstream.println("leftSecond="+leftSecond); 1033 outstream.println("rightMaxPos="+rightMaxPos); 1034 outstream.println("rightMax="+rightMax); 1035 outstream.println("rightSecondPos="+rightSecondPos); 1036 outstream.println("rightSecond="+rightSecond); 1037 } 1038 1039 final boolean fbranch=isJunction(rightMax, rightSecond); 1040 final boolean bbranch=isJunction(leftMax, leftSecond); 1041 final boolean hbranch=(leftCounts!=null && leftMaxPos!=evicted && branchMult1>0); 1042 if(bbranch){ 1043 if(verbose){outstream.println("B: Breaking - isJunction("+rightMax+", "+rightSecond+", "+leftMax+", "+leftSecond+"); " 1044 + "("+fbranch+", "+bbranch+", "+hbranch+")");} 1045 return fbranch ? D_BRANCH : B_BRANCH; 1046 }else if(hbranch){ 1047 if(verbose){outstream.println("B: Breaking - isJunction("+rightMax+", "+rightSecond+", " 1048 + ""+leftMax+", "+leftSecond+"); ("+fbranch+", "+bbranch+", "+hbranch+")");} 1049 if(verbose){outstream.println("Hidden branch: leftMaxPos!=evicted ("+leftMaxPos+"!="+evicted+")" + 1050 "\nleftMaxPos="+leftMaxPos+", leftMax="+leftMax+", leftSecondPos="+leftSecondPos+", leftSecond="+leftSecond);} 1051 return fbranch ? D_BRANCH : B_BRANCH; 1052 } 1053 1054 bb.append(b); 1055 if(verbose){outstream.println("Added base "+(char)b);} 1056 1057 if(useOwnership){ 1058 owner=table.getOwner(key); 1059 if(verbose){outstream.println("Owner is initially "+id+" for key "+key);} 1060 if(owner==id){//loop detection 1061 if(verbose /*|| true*/){ 1062 // outstream.println(new String(bb.array, bb.length()-31, 31)); 1063 outstream.println(bb); 1064 outstream.println(toText(kmer)); 1065 outstream.println(toText(rkmer)); 1066 outstream.println("Breaking because owner was "+owner+" for thread "+id+"."); 1067 } 1068 return fbranch ? F_BRANCH : LOOP; 1069 } 1070 owner=table.setOwner(key, id); 1071 if(verbose){outstream.println("B. Owner is now "+id+" for key "+key);} 1072 } 1073 1074 if(fbranch){ 1075 if(verbose){outstream.println("B: Breaking - isJunction("+rightMax+", "+rightSecond+", "+leftMax+", "+leftSecond+"); " 1076 + "("+fbranch+", "+bbranch+", "+hbranch+")");} 1077 return F_BRANCH; 1078 }else if(rightMax<minCountExtend){ 1079 if(verbose){outstream.println("B: Breaking because highest right was too low:"+rightMax);} 1080 return DEAD_END; 1081 } 1082 } 1083 assert(owner!=id); 1084 if(verbose /*|| true*/){ 1085 outstream.println("Current contig: "+bb+"\nReturning because owner was "+owner+" for thread "+id+"."); 1086 } 1087 return BAD_OWNER; 1088 } 1089 1090 @Override extendToRight2(final ByteBuilder bb, final int[] leftCounts, final int[] rightCounts, final int distance, boolean includeJunctionBase)1091 public int extendToRight2(final ByteBuilder bb, final int[] leftCounts, final int[] rightCounts, final int distance, boolean includeJunctionBase){ 1092 if(verbose || verbose2){outstream.println("Entering extendToRight2 (no kmers).");} 1093 final int initialLength=bb.length(); 1094 if(initialLength<k){return 0;} 1095 long kmer=0; 1096 long rkmer=0; 1097 1098 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts, to get the rightmost kmer */ 1099 { 1100 int len=0; 1101 final byte[] bases=bb.array; 1102 for(int i=initialLength-k; i<initialLength; i++){ 1103 final byte b=bases[i]; 1104 final long x=AminoAcid.baseToNumber[b]; 1105 final long x2=AminoAcid.baseToComplementNumber[b]; 1106 kmer=((kmer<<2)|x)&mask; 1107 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 1108 if(x<0){ 1109 len=0; 1110 kmer=rkmer=0; 1111 }else{len++;} 1112 if(verbose){outstream.println("B: Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 1113 } 1114 if(len<k){ 1115 if(verbose || verbose2){outstream.println("Returning because len<k: "+len+"<"+k);} 1116 return 0; 1117 } 1118 else{assert(len==k);} 1119 } 1120 return extendToRight2(bb, leftCounts, rightCounts, distance, includeJunctionBase, kmer, rkmer); 1121 } 1122 1123 /** 1124 * Extend these bases to the right by at most 'distance'. 1125 * Stops at right junctions only. 1126 * Does not claim ownership. 1127 */ extendToRight2(final ByteBuilder bb, final int[] leftCounts, final int[] rightCounts, final int distance, boolean includeJunctionBase, long kmer, long rkmer)1128 public int extendToRight2(final ByteBuilder bb, final int[] leftCounts, final int[] rightCounts, final int distance, boolean includeJunctionBase, 1129 long kmer, long rkmer){ 1130 if(verbose || verbose2){outstream.println("Entering extendToRight2 (with kmers).");} 1131 final int initialLength=bb.length(); 1132 1133 /* Now the trailing kmer has been initialized. */ 1134 1135 long key=toValue(kmer, rkmer); 1136 HashArray1D table=tables.getTableForKey(key); 1137 int count=table.getValue(key); 1138 if(count<minCountSeed){ 1139 if(verbose || verbose2){outstream.println("Returning because count was too low: "+count+"<"+minCountSeed);} 1140 return 0; 1141 } 1142 1143 int leftMaxPos=0; 1144 int leftMax=minCountExtend; 1145 int leftSecondPos=1; 1146 int leftSecond=0; 1147 1148 if(leftCounts!=null){ 1149 leftMaxPos=fillLeftCounts(kmer, rkmer, leftCounts); 1150 leftMax=leftCounts[leftMaxPos]; 1151 leftSecondPos=Tools.secondHighestPosition(leftCounts); 1152 leftSecond=leftCounts[leftSecondPos]; 1153 } 1154 1155 int rightMaxPos=fillRightCounts(kmer, rkmer, rightCounts); 1156 int rightMax=rightCounts[rightMaxPos]; 1157 int rightSecondPos=Tools.secondHighestPosition(rightCounts); 1158 int rightSecond=rightCounts[rightSecondPos]; 1159 1160 if(verbose){ 1161 outstream.println("kmer: "+toText(kmer)+", "+toText(rkmer)); 1162 outstream.println("Counts: "+count+", "+Arrays.toString(rightCounts)); 1163 outstream.println("rightMaxPos="+rightMaxPos); 1164 outstream.println("rightMax="+rightMax); 1165 outstream.println("rightSecondPos="+rightSecondPos); 1166 outstream.println("rightSecond="+rightSecond); 1167 } 1168 1169 if(rightMax<minCountExtend){ 1170 if(verbose || verbose2){outstream.println("Returning because rightMax was too low: "+rightMax+"<"+minCountExtend+"\n"+count+", "+Arrays.toString(rightCounts));} 1171 return 0; 1172 } 1173 if(isJunction(rightMax, rightSecond, leftMax, leftSecond)){ 1174 if(verbose || verbose2){outstream.println("Returning because isJunction: "+rightMax+", "+rightSecond+"; "+leftMax+", "+leftSecond);} 1175 return 0; 1176 } 1177 1178 final int maxLen=Tools.min(bb.length()+distance, maxContigLen); 1179 1180 while(bb.length()<maxLen){ 1181 1182 //Generate the new kmer 1183 final byte b=AminoAcid.numberToBase[rightMaxPos]; 1184 final long x=rightMaxPos; 1185 final long x2=AminoAcid.numberToComplement[(int)x]; 1186 1187 final long evicted=(kmer>>>shift2); //Binary value that falls off the end. 1188 1189 //Now consider the next kmer 1190 kmer=((kmer<<2)|(long)x)&mask; 1191 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 1192 1193 key=toValue(kmer, rkmer); 1194 table=tables.getTableForKey(key); 1195 1196 assert(table.getValue(key)==rightMax || rightMax==0); 1197 count=rightMax; 1198 1199 assert(count>=minCountExtend) : count; 1200 1201 if(leftCounts!=null){ 1202 leftMaxPos=fillLeftCounts(kmer, rkmer, leftCounts); 1203 leftMax=leftCounts[leftMaxPos]; 1204 leftSecondPos=Tools.secondHighestPosition(leftCounts); 1205 leftSecond=leftCounts[leftSecondPos]; 1206 } 1207 1208 rightMaxPos=fillRightCounts(kmer, rkmer, rightCounts); 1209 rightMax=rightCounts[rightMaxPos]; 1210 rightSecondPos=Tools.secondHighestPosition(rightCounts); 1211 rightSecond=rightCounts[rightSecondPos]; 1212 1213 if(verbose){ 1214 outstream.println("kmer: "+toText(kmer)+", "+toText(rkmer)); 1215 outstream.println("Counts: "+count+", "+Arrays.toString(rightCounts)); 1216 outstream.println("rightMaxPos="+rightMaxPos); 1217 outstream.println("rightMax="+rightMax); 1218 outstream.println("rightSecondPos="+rightSecondPos); 1219 outstream.println("rightSecond="+rightSecond); 1220 } 1221 1222 if(isJunction(rightMax, rightSecond, leftMax, leftSecond)){ 1223 if(verbose){outstream.println("B: Breaking because isJunction("+rightMax+", "+rightSecond+", "+leftMax+", "+leftSecond+")");} 1224 if(includeJunctionBase && kmer>rkmer){ 1225 bb.append(b); 1226 if(verbose){outstream.println("Added base "+(char)b);} 1227 } 1228 break; 1229 } 1230 1231 if(leftCounts!=null && leftMaxPos!=evicted){ 1232 if(verbose){outstream.println("B: Breaking because of hidden branch: leftMaxPos!=evicted ("+leftMaxPos+"!="+evicted+")" + 1233 "\nleftMaxPos="+leftMaxPos+", leftMax="+leftMax+", leftSecondPos="+leftSecondPos+", leftSecond="+leftSecond);} 1234 if(includeJunctionBase && kmer>rkmer){ 1235 bb.append(b); 1236 if(verbose){outstream.println("Added base "+(char)b);} 1237 } 1238 break; 1239 } 1240 1241 bb.append(b); 1242 if(verbose){outstream.println("Added base "+(char)b);} 1243 1244 if(rightMax<minCountExtend){ 1245 if(verbose || verbose2){outstream.println("C: Breaking because highest right was too low: "+rightMax+"<"+minCountExtend);} 1246 break; 1247 } 1248 } 1249 if(verbose || verbose2){outstream.println("Extended by "+(bb.length()-initialLength));} 1250 return bb.length()-initialLength; 1251 } 1252 1253 1254 /*--------------------------------------------------------------*/ 1255 /*---------------- Junk Detection ----------------*/ 1256 /*--------------------------------------------------------------*/ 1257 1258 @Override isJunk(Read r)1259 public boolean isJunk(Read r){ 1260 boolean junk=isJunk(r, localRightCounts.get()); 1261 return junk; 1262 } 1263 1264 @Override isJunk(Read r, final int[] counts, Kmer kmer)1265 public boolean isJunk(Read r, final int[] counts, Kmer kmer){ 1266 return isJunk(r, counts); 1267 } 1268 isJunk(Read r, final int[] counts)1269 public boolean isJunk(Read r, final int[] counts){ 1270 final int blen=r.length(); 1271 if(blen<k){return true;} 1272 final byte[] bases=r.bases; 1273 1274 long kmer=0; 1275 long rkmer=0; 1276 int len=0; 1277 1278 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts, to get the leftmost kmer */ 1279 { 1280 for(int i=0; i<k; i++){ 1281 final byte b=bases[i]; 1282 final long x=AminoAcid.baseToNumber[b]; 1283 final long x2=AminoAcid.baseToComplementNumber[b]; 1284 kmer=((kmer<<2)|x)&mask; 1285 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 1286 if(x<0){ 1287 len=0; 1288 kmer=rkmer=0; 1289 }else{len++;} 1290 if(verbose){outstream.println("C: Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 1291 } 1292 } 1293 1294 if(len>=k){ 1295 int maxPos=fillLeftCounts(kmer, rkmer, counts); 1296 if(counts[maxPos]>0){ 1297 //outstream.println("Not junk, left counts[maxPos]="+counts[maxPos]+"; right="+fillRightCounts(kmer, rkmer, counts, mask, shift2)); 1298 return false; 1299 } 1300 } 1301 1302 final boolean paired=(r.mateLength()>=k); 1303 int maxDepth=0; 1304 { 1305 for(int i=k; i<blen; i++){ 1306 final byte b=bases[i]; 1307 final long x=AminoAcid.baseToNumber[b]; 1308 final long x2=AminoAcid.baseToComplementNumber[b]; 1309 kmer=((kmer<<2)|x)&mask; 1310 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 1311 if(x<0){ 1312 len=0; 1313 kmer=rkmer=0; 1314 }else{ 1315 len++; 1316 if(len>=k){ 1317 int depth=getCount(kmer, rkmer); 1318 if(depth>maxDepth){ 1319 maxDepth=depth; 1320 if(maxDepth>1 && (!paired || maxDepth>2)){ 1321 //outstream.println("Not junk, maxDepth="+maxDepth); 1322 return false; 1323 } 1324 } 1325 } 1326 } 1327 if(verbose){outstream.println("D: Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 1328 } 1329 } 1330 1331 if(len>=k && !paired){ 1332 int maxPos=fillRightCounts(kmer, rkmer, counts); 1333 if(counts[maxPos]>0){ 1334 //outstream.println("Not junk, right counts[maxPos]="+counts[maxPos]); 1335 return false; 1336 } 1337 } 1338 return true; 1339 } 1340 1341 @Override hasKmersAtOrBelow(Read r, int tooLow, final float fraction, Kmer kmer)1342 public boolean hasKmersAtOrBelow(Read r, int tooLow, final float fraction, Kmer kmer){ 1343 return hasKmersAtOrBelow(r, tooLow, fraction); 1344 } 1345 1346 @Override hasKmersAtOrBelow(Read r, final int tooLow, final float fraction)1347 public boolean hasKmersAtOrBelow(Read r, final int tooLow, final float fraction){ 1348 final int blen=r.length(); 1349 if(blen<k){return true;} 1350 final byte[] bases=r.bases; 1351 1352 // outstream.println("\n"+new String(r.bases)+":"); 1353 1354 long kmer=0; 1355 long rkmer=0; 1356 int len=0; 1357 1358 final int limit=Tools.max(1, Math.round((bases.length-kbig+1)*fraction)); 1359 int valid=0, invalid=0; 1360 1361 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts, to get the leftmost kmer */ 1362 { 1363 for(int i=0; i<blen; i++){ 1364 final byte b=bases[i]; 1365 final long x=AminoAcid.baseToNumber[b]; 1366 final long x2=AminoAcid.baseToComplementNumber[b]; 1367 kmer=((kmer<<2)|x)&mask; 1368 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 1369 if(x<0){ 1370 len=0; 1371 kmer=rkmer=0; 1372 }else{ 1373 len++; 1374 if(len>=k){ 1375 int depth=getCount(kmer, rkmer); 1376 // outstream.println("depth="+depth+", kmer="+toText(kmer)); 1377 if(depth>tooLow){valid++;} 1378 else{ 1379 invalid++; 1380 if(invalid>=limit){return true;} 1381 } 1382 } 1383 } 1384 if(verbose){outstream.println("E: Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 1385 } 1386 } 1387 1388 //Compensate for nocalls changing the expected kmer count 1389 final int limit2=Tools.max(1, Math.round((valid+invalid)*fraction)); 1390 return valid<1 || invalid>=limit2; 1391 } 1392 1393 1394 /*--------------------------------------------------------------*/ 1395 /*---------------- Error Correction ----------------*/ 1396 /*--------------------------------------------------------------*/ 1397 1398 @Override errorCorrect(Read r)1399 public int errorCorrect(Read r){ 1400 initializeThreadLocals(); 1401 int corrected=errorCorrect(r, localLeftCounts.get(), localRightCounts.get(), localLongList.get(), 1402 localIntList.get(), localIntList2.get(), localByteBuilder.get(), localByteBuilder2.get(), localTracker.get(), localBitSet.get()); 1403 return corrected; 1404 } 1405 1406 @Override errorCorrect(Read r, final int[] leftCounts, final int[] rightCounts, LongList kmers, IntList counts, IntList counts2, final ByteBuilder bb, final ByteBuilder bb2, final ErrorTracker tracker, final BitSet bs, Kmer kmer, Kmer kmer2)1407 public int errorCorrect(Read r, final int[] leftCounts, final int[] rightCounts, LongList kmers, IntList counts, IntList counts2, 1408 final ByteBuilder bb, final ByteBuilder bb2, final ErrorTracker tracker, final BitSet bs, Kmer kmer, Kmer kmer2){ 1409 return errorCorrect(r, leftCounts, rightCounts, kmers, counts, counts2, bb, bb2, tracker, bs); 1410 } 1411 hasErrorsFast(LongList kmers)1412 boolean hasErrorsFast(LongList kmers){ 1413 if(kmers.size<1){return false;} 1414 int prev=-1; 1415 1416 final int incr=Tools.mid(1, k/2, 9), mcc=minCountCorrect(); 1417 for(int i=0; i<kmers.size; i+=incr){ 1418 long kmer=kmers.get(i); 1419 if(kmer<0){ 1420 return true; 1421 } 1422 long rkmer=rcomp(kmer); 1423 int count=getCount(kmer, rkmer); 1424 final int min=Tools.min(count, prev), max=Tools.max(count, prev); 1425 if(count<mcc || (i>0 && (isError(max+1, min-1)))){return true;} 1426 prev=count; 1427 } 1428 1429 long kmer=kmers.get(kmers.size()-1); 1430 if(kmer<0){return true;} 1431 long rkmer=rcomp(kmer); 1432 int count=getCount(kmer, rkmer); 1433 final int min=Tools.min(count, prev), max=Tools.max(count, prev); 1434 return count<mcc || isError(max+1, min-1); 1435 } 1436 errorCorrect(Read r, final int[] leftCounts, final int[] rightCounts, LongList kmers, IntList counts, IntList counts2, final ByteBuilder bb, final ByteBuilder bb2, final ErrorTracker tracker, final BitSet bs)1437 public int errorCorrect(Read r, final int[] leftCounts, final int[] rightCounts, LongList kmers, IntList counts, IntList counts2, 1438 final ByteBuilder bb, final ByteBuilder bb2, final ErrorTracker tracker, final BitSet bs){ 1439 1440 final byte[] bases=r.bases; 1441 final byte[] quals=r.quality; 1442 tracker.clear(); 1443 int valid=tables.fillKmers(bases, kmers); 1444 if(valid<2){return 0;} 1445 if(!r.containsUndefined() && !hasErrorsFast(kmers)){return 0;} 1446 1447 tables.fillCounts(kmers, counts); 1448 final int possibleErrors=tracker.suspected=countErrors(counts, quals); 1449 if(possibleErrors<0){return 0;} 1450 final float expectedErrors=r.expectedErrors(true, r.length()); 1451 final Rollback roll=ECC_ROLLBACK ? new Rollback(r, counts) : null; 1452 1453 assert(counts.size>0); 1454 1455 int correctedPincer=0; 1456 int correctedTail=0; 1457 int correctedBrute=0; 1458 int correctedReassemble=0; 1459 1460 if(ECC_PINCER){ 1461 correctedPincer+=errorCorrectPincer(bases, quals, leftCounts, rightCounts, kmers, counts, bb, tracker, errorExtensionPincer); 1462 } 1463 1464 if(ECC_TAIL || ECC_ALL){ 1465 int start=(ECC_ALL ? 0 : counts.size-k-1); 1466 // if(ECC_PINCER && tracker!=null && tracker.detected>correctedPincer){start=start-k;} 1467 correctedTail+=errorCorrectTail(bases, quals, leftCounts, rightCounts, kmers, counts, bb, tracker, start, errorExtensionTail); 1468 r.reverseComplement(); 1469 valid=tables.fillKmers(bases, kmers); 1470 counts.reverse(); 1471 correctedTail+=errorCorrectTail(bases, quals, leftCounts, rightCounts, kmers, counts, bb, tracker, start, errorExtensionTail); 1472 r.reverseComplement(); 1473 counts.reverse(); 1474 } 1475 1476 if(ECC_REASSEMBLE){ 1477 if(verbose){outstream.println("Correcting "+possibleErrors+" errors. Counts:\n"+counts);} 1478 if((correctedPincer<1 && correctedTail<1) || countErrors(counts, quals)>0){ 1479 correctedReassemble=reassemble(bases, quals, rightCounts, counts, counts2, tracker, errorExtensionReassemble, bb, bb2, null, null, bs); 1480 } 1481 // assert(tracker.detectedReassemble>0) : counts; 1482 if(verbose){outstream.println("Corrected "+correctedReassemble+" errors. Counts:\n"+counts);} 1483 } 1484 assert(counts.size>0); 1485 1486 //123 For testing. 1487 if(false && tracker.detected()>tracker.corrected()){ 1488 correctedBrute+=errorCorrectBruteForce(bases, quals, leftCounts, rightCounts, kmers, counts, bb, tracker, errorExtensionPincer); 1489 } 1490 1491 assert(correctedPincer+correctedTail+correctedReassemble+correctedBrute==tracker.corrected()) 1492 : correctedPincer+", "+correctedTail+", "+correctedReassemble+", "+correctedBrute+", "+tracker; 1493 1494 if(ECC_ROLLBACK && (tracker.corrected()>0 || tracker.rollback)){ 1495 1496 if(!tracker.rollback && quals!=null && tracker.corrected()>3){ 1497 float mult=Tools.max(1, 0.5f*(0.5f+0.01f*r.length()));//1 for a 150bp read. 1498 if(countErrors(counts, quals)>0 && tracker.corrected()>mult+expectedErrors){tracker.rollback=true;} 1499 else if(tracker.corrected()>2.5f*mult+expectedErrors){tracker.rollback=true;} 1500 } 1501 1502 // boolean printed=false; 1503 IntList counts0=roll.counts0; 1504 for(int i=0; !tracker.rollback && i<counts.size; i++){ 1505 int a=Tools.max(0, counts0.get(i)); 1506 int b=Tools.max(0, counts.get(i)); 1507 // assert(b+1>=a) : "Z: RID="+r.numericID+"; "+a+"->"+b+"\n"+counts0+"\n"+counts; 1508 if(b<a-1 && !isSimilar(a, b)){ 1509 // assert(false) : "Y: RID="+r.numericID+"; "+a+"->"+b+"\n"+counts0+"\n"+counts; 1510 if(verbose){outstream.println("Y: RID="+r.numericID+"; "+a+"->"+b+"\n"+counts0+"\n"+counts);} 1511 tracker.rollback=true; 1512 } 1513 // else if(b<a-1 && !printed){ 1514 // assert(false); 1515 // if(verbose){outstream.println("X: RID="+r.numericID+"; "+a+"->"+b+"\n"+counts0+"\n"+counts);} 1516 // printed=true; 1517 // } 1518 } 1519 1520 if(tracker.rollback){ 1521 roll.rollback(r, counts); 1522 tracker.clearCorrected(); 1523 return 0; 1524 } 1525 } 1526 1527 if(MARK_BAD_BASES>0 && (!MARK_ERROR_READS_ONLY || countErrors(counts, quals)>0 || 1528 r.expectedErrors(false, r.length())>3)){ 1529 int marked=markBadBases(bases, quals, counts, bs, MARK_BAD_BASES, MARK_DELTA_ONLY, MARK_QUALITY); 1530 tracker.marked=marked; 1531 } 1532 1533 return tracker.corrected(); 1534 } 1535 findBestMutant(final byte[] bases, final int a, final LongList kmers, final long[] bestMutant)1536 private int findBestMutant(final byte[] bases, final int a, final LongList kmers, final long[] bestMutant){ 1537 Arrays.fill(bestMutant, -1); 1538 final long kmer0=kmers.get(a); 1539 if(kmer0<0){return -1;} 1540 long mask=3; 1541 int max=0, second=0; 1542 for(int i=0; i<k; i++){ 1543 for(long num=0; num<4; num++){ 1544 final long kmer=(kmer0&~mask)|(num<<(2*i)); 1545 if(kmer!=kmer0){ 1546 final long rkmer=rcomp(kmer); 1547 int count=getCount(kmer, rkmer); 1548 if(count>max){ 1549 second=max; 1550 max=count; 1551 bestMutant[0]=kmer; 1552 bestMutant[1]=rkmer; 1553 bestMutant[2]=i; 1554 }else if(count>second){ 1555 second=count; 1556 } 1557 } 1558 } 1559 mask<<=2; 1560 } 1561 bestMutant[3]=max; 1562 bestMutant[4]=second; 1563 return max>second ? max : -1; 1564 } 1565 fixBestMutant(final byte[] bases, final int a, final LongList kmers, final IntList counts, final int pos, final long kmer)1566 private boolean fixBestMutant(final byte[] bases, final int a, final LongList kmers, final IntList counts, final int pos, final long kmer){ 1567 int shift=2*pos; 1568 long mask=(3L)<<shift; 1569 byte num=(byte)((kmer>>shift)&mask); 1570 byte base=AminoAcid.numberToBase[num]; 1571 byte old=bases[a+pos]; 1572 bases[a+pos]=base; 1573 1574 // long[] copy=counts.toArray(); 1575 1576 // tables.regenerateKmers(bases, kmers, counts, a); 1577 tables.fillKmers(bases, kmers); 1578 tables.fillCounts(kmers, counts); 1579 1580 // for(int i=0; i<copy.length; i++){ 1581 // if(counts.get(i)<copy[i]){ 1582 // bases[a+pos]=old; 1583 // tables.fillKmers(bases, kmers); 1584 // tables.fillCounts(kmers, counts); 1585 // return false; 1586 // } 1587 // } 1588 1589 return true; 1590 } 1591 errorCorrectBruteForce(final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer, final LongList kmers, final IntList counts, final ByteBuilder bb, final ErrorTracker tracker, final int errorExtension)1592 public int errorCorrectBruteForce(final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer, 1593 final LongList kmers, final IntList counts, final ByteBuilder bb, final ErrorTracker tracker, final int errorExtension){ 1594 1595 int detected=0; 1596 int corrected=0; 1597 1598 final long[] bestMutant=new long[5]; 1599 final int[] countCopy=counts.toArray(); 1600 final byte[] basesCopy=bases.clone(); 1601 1602 final int maxCount=Tools.max(counts.array, 0, counts.size-1); 1603 if(maxCount<minCountCorrect()){ 1604 return 0; 1605 } 1606 int thresh=Tools.max(3, maxCount/4); 1607 1608 for(int a=0, d=k+1; a<counts.size; a++, d++){ 1609 final int aCount=counts.get(a); 1610 if(aCount<thresh){ 1611 detected++; 1612 boolean fixed=false; 1613 int mCount=findBestMutant(bases, a, kmers, bestMutant); 1614 if(mCount>thresh && mCount>=minCountCorrect() && (isSimilar(mCount, maxCount) || (mCount<maxCount && mCount>(1+aCount)*3))){ 1615 long kmer=bestMutant[0]; 1616 long rkmer=bestMutant[1]; 1617 int pos=(int)bestMutant[2]; 1618 assert(mCount==bestMutant[3]); 1619 int secondBestCount=(int)bestMutant[4]; 1620 if(mCount>(1+secondBestCount*3) && pos>=errorExtension && pos<=(k-errorExtension)){ 1621 fixBestMutant(bases, a, kmers, counts, pos, kmer); 1622 fixed=true; 1623 corrected++; 1624 } 1625 } 1626 } 1627 } 1628 1629 for(int i=0; i<countCopy.length; i++){ 1630 if(counts.get(i)<countCopy[i]){ 1631 for(int j=0; j<bases.length; j++){bases[j]=basesCopy[j];} 1632 counts.clear(); 1633 for(int count : countCopy){counts.add(count);} 1634 tables.fillKmers(bases, kmers); 1635 corrected=0; 1636 } 1637 } 1638 1639 { 1640 tracker.detectedBrute+=detected; 1641 tracker.correctedBrute+=corrected; 1642 } 1643 1644 return corrected; 1645 } 1646 errorCorrectPincer(final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer, final LongList kmers, final IntList counts, final ByteBuilder bb, final ErrorTracker tracker, final int errorExtension)1647 public int errorCorrectPincer(final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer, 1648 final LongList kmers, final IntList counts, final ByteBuilder bb, final ErrorTracker tracker, final int errorExtension){ 1649 1650 int detected=0; 1651 int corrected=0; 1652 1653 //a is the index of the left kmer 1654 //b is a+1 (right-extension of left kmer) 1655 //c is d-1 (left-extension of right kmer) 1656 //d is the index of the right kmer 1657 //the base between the kmers is at a+k 1658 for(int a=0, d=k+1; d<counts.size; a++, d++){ 1659 final int aCount=counts.get(a); 1660 final int bCount=counts.get(a+1); 1661 final int cCount=counts.get(d-1); 1662 final int dCount=counts.get(d); 1663 final byte qb=(quals==null ? 20 : quals[a+kbig]); 1664 if(isError(aCount, bCount, qb) && isError(dCount, cCount, qb) && isSimilar(aCount, dCount)){ 1665 if(verbose){ 1666 outstream.println("Found error: "+aCount+", "+bCount+", "+cCount+", "+dCount); 1667 } 1668 //Looks like a 1bp substitution; attempt to correct. 1669 detected++; 1670 int ret=correctSingleBasePincer(a, d, bases, quals, leftBuffer, rightBuffer, kmers, counts, bb, errorExtension); 1671 corrected+=ret; 1672 if(verbose){ 1673 outstream.println("Corrected error."); 1674 } 1675 }else{ 1676 if(verbose){ 1677 outstream.println("Not an error: "+aCount+", "+bCount+", "+cCount+", "+dCount+ 1678 "; "+isError(aCount, bCount, qb)+", "+isError(dCount, cCount, qb)+", "+isSimilar(aCount, dCount)); 1679 } 1680 } 1681 } 1682 1683 // if(detected==0 && counts.get(0)>2 && counts.get(counts.size-1)>2){ 1684 // assert(!verbose); 1685 // verbose=true; 1686 // outstream.println("\n"+counts); 1687 // errorCorrectPincer(bases, quals, leftBuffer, rightBuffer, kmers, counts, bb, tracker); 1688 // assert(false); 1689 // } 1690 1691 { 1692 tracker.detectedPincer+=detected; 1693 tracker.correctedPincer+=corrected; 1694 } 1695 1696 return corrected; 1697 } 1698 errorCorrectTail(final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer, final LongList kmers, final IntList counts, final ByteBuilder bb, final ErrorTracker tracker, final int startPos, final int errorExtension)1699 public int errorCorrectTail(final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer, 1700 final LongList kmers, final IntList counts, final ByteBuilder bb, final ErrorTracker tracker, final int startPos, final int errorExtension){ 1701 if(bases.length<k+2+errorExtension+deadZone){return 0;} 1702 int detected=0; 1703 int corrected=0; 1704 1705 //a is the index of the left kmer 1706 //b is a+1 1707 //the base between the kmers is at a+k 1708 for(int a=Tools.max(startPos, errorExtension), lim=counts.size-deadZone-1; a<lim; a++){//errorExtension-1 1709 final int aCount=counts.get(a); 1710 final int bCount=counts.get(a+1); 1711 final byte qb=(quals==null ? 20 : quals[a+kbig]); 1712 if(isError(aCount, bCount, qb) && isSimilar(aCount, a-errorExtension, a-1, counts) && isError(aCount, a+2, a+k, counts)){ 1713 if(verbose){ 1714 outstream.println("Found error: "+aCount+", "+bCount); 1715 } 1716 //Assume like a 1bp substitution; attempt to correct. 1717 detected++; 1718 int ret=correctSingleBaseRight(a, bases, quals, leftBuffer, rightBuffer, kmers, counts, bb, errorExtension); 1719 corrected+=ret; 1720 if(verbose){ 1721 outstream.println("Corrected error."); 1722 } 1723 }else{ 1724 if(verbose){ 1725 outstream.println("Not an error: "+aCount+", "+bCount+ 1726 "; "+isError(aCount, bCount, qb)+", "+isSimilar(aCount, a-errorExtension, a-1, counts)+", "+isError(aCount, a+2, a+k, counts)); 1727 } 1728 } 1729 } 1730 1731 // if(detected==0 && counts.get(0)>2 && counts.get(counts.size-1)>2){ 1732 // assert(!verbose); 1733 // verbose=true; 1734 // outstream.println("\n"+counts); 1735 // errorCorrectPincer(bases, quals, leftBuffer, rightBuffer, kmers, counts, bb, tracker); 1736 // assert(false); 1737 // } 1738 1739 { 1740 tracker.detectedTail+=detected; 1741 tracker.correctedTail+=corrected; 1742 } 1743 1744 return corrected; 1745 } 1746 1747 // public int reassemble_inner(final ByteBuilder bases, final byte[] quals, final int[] rightCounts, final IntList counts, 1748 // final int errorExtension, final Kmer kmer, final Kmer kmer2){ 1749 // throw new RuntimeException("TODO"); 1750 // } 1751 1752 @Override reassemble_inner(final ByteBuilder bb, final byte[] quals, final int[] rightCounts, final IntList counts, final int errorExtension, final Kmer kmer, final Kmer regenKmer)1753 public int reassemble_inner(final ByteBuilder bb, final byte[] quals, final int[] rightCounts, final IntList counts, 1754 final int errorExtension, final Kmer kmer, final Kmer regenKmer){ 1755 return reassemble_inner(bb, quals, rightCounts, counts, errorExtension); 1756 } 1757 reassemble_inner(final ByteBuilder bb, final byte[] quals, final int[] rightCounts, final IntList counts, final int errorExtension)1758 public int reassemble_inner(final ByteBuilder bb, final byte[] quals, final int[] rightCounts, final IntList counts, 1759 final int errorExtension){ 1760 final int length=bb.length(); 1761 if(length<k+1+deadZone){return 0;} 1762 final byte[] bases=bb.array; 1763 1764 long kmer=0, rkmer=0; 1765 1766 int detected=0; 1767 int corrected=0; 1768 int len=0; 1769 1770 //a is the index of the first base of the left kmer 1771 //b=a+1 is the index of the next base 1772 //ca=a-k is the index of the next base 1773 //cb=a-k is the index of the next base 1774 //the base between the kmers is at a+k 1775 for(int a=0, lim=length-deadZone-1; a<lim; a++){ 1776 1777 //Generate the new kmer 1778 final byte aBase=bases[a]; 1779 final long x=AminoAcid.baseToNumber[aBase]; 1780 final long x2=AminoAcid.baseToComplementNumber[aBase]; 1781 1782 if(x<0){ 1783 len=0; 1784 kmer=rkmer=0; 1785 }else{ 1786 1787 1788 //Now consider the next kmer 1789 kmer=((kmer<<2)|(long)x)&mask; 1790 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 1791 len++; 1792 1793 if(verbose){ 1794 outstream.println("len: "+len+" vs "+k+"; a="+a); 1795 } 1796 1797 if(len>=k){ 1798 1799 final int b=a+1; 1800 final int ca=a-k+1; 1801 final int cb=ca+1; 1802 1803 final int aCount=counts.get(ca); 1804 final int bCount=counts.get(cb); 1805 final byte qb=(quals==null ? 20 : quals[b]); 1806 1807 if(verbose){ 1808 outstream.println("ca="+ca+", cb="+cb+"; aCount="+aCount+", bCount="+bCount); 1809 outstream.println(isError(aCount, bCount, qb)+", "+isSimilar(aCount, ca-errorExtension, ca-1, counts)+ 1810 ", "+isError(aCount, ca+2, ca+k, counts)); 1811 } 1812 1813 // if(isError(aCount, bCount) && isSimilar(aCount, ca-errorExtension, ca-1, counts) && isError(aCount, ca+2, ca+k, counts)){ 1814 if(isSubstitution(ca, errorExtension, qb, counts)){ 1815 if(verbose){ 1816 outstream.println("***Found error: "+aCount+", "+bCount); 1817 } 1818 //Assume a 1bp substitution; attempt to correct. 1819 1820 int rightMaxPos=fillRightCounts(kmer, rkmer, rightCounts); 1821 int rightMax=rightCounts[rightMaxPos]; 1822 int rightSecondPos=Tools.secondHighestPosition(rightCounts); 1823 int rightSecond=rightCounts[rightSecondPos]; 1824 1825 byte base=bases[b]; 1826 byte num=AminoAcid.baseToNumber[base]; 1827 1828 if(rightMax>=minCountExtend){ 1829 detected++; 1830 if(num==rightMax){ 1831 detected--; 1832 // bases2[b]=base; 1833 }else if((isError(rightMax, rightSecond, qb) || !isJunction(rightMax, rightSecond)) && isSimilar(aCount, rightMax)){ 1834 bases[b]=AminoAcid.numberToBase[rightMaxPos]; 1835 corrected++; 1836 tables.regenerateCounts(bases, counts, ca); 1837 if(verbose){outstream.println("Corrected error: "+num+"->"+rightMaxPos+". New counts:\n"+counts);} 1838 } 1839 1840 // else if(rightSecond>=minCountExtend && isJunction(rightMax, rightSecond) && isSimilar(aCount, rightSecond) 1841 // && !isSimilar(aCount, rightMax)){//This branch may not be very safe. 1842 // bases2[b]=AminoAcid.numberToBase[rightSecondPos]; 1843 // corrected++; 1844 // if(verbose){outstream.println("Corrected error.");} 1845 // } 1846 } 1847 1848 }else{ 1849 if(verbose){ 1850 outstream.println("Not an error: "+aCount+", "+bCount+ 1851 "; "+isError(aCount, bCount, qb)+", "+isSimilar(aCount, a-errorExtension, a-1, counts)+", "+isError(aCount, a+2, a+k, counts)); 1852 } 1853 } 1854 } 1855 } 1856 } 1857 1858 return corrected; 1859 } 1860 correctSingleBasePincer(final int a, final int d, final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer, final LongList kmers, final IntList counts, final ByteBuilder bb, final int errorExtension)1861 private int correctSingleBasePincer(final int a, final int d, final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer, 1862 final LongList kmers, final IntList counts, final ByteBuilder bb, final int errorExtension){ 1863 final byte leftReplacement, rightReplacement; 1864 final int loc=a+k; 1865 { 1866 bb.clear(); 1867 final long kmer=kmers.get(a); 1868 final long rkmer=rcomp(kmer); 1869 int extension=extendToRight2(bb, null, rightBuffer, errorExtension, true, kmer, rkmer); 1870 if(extension<errorExtension){return 0;} 1871 for(int i=1; i<extension; i++){ 1872 if(bb.get(i)!=bases[loc+i]){ 1873 return 0; 1874 } 1875 } 1876 leftReplacement=bb.get(0); 1877 } 1878 { 1879 bb.clear(); 1880 final long rkmer=kmers.get(d); 1881 final long kmer=rcomp(rkmer); 1882 int extension=extendToRight2(bb, null, rightBuffer, errorExtension, true, kmer, rkmer); 1883 if(extension<errorExtension){return 0;} 1884 bb.reverseComplementInPlace(); 1885 for(int i=0; i<extension-1; i++){ 1886 if(bb.get(i)!=bases[loc+i+1-extension]){ 1887 return 0; 1888 } 1889 } 1890 rightReplacement=bb.get(extension-1); 1891 } 1892 if(leftReplacement!=rightReplacement){return 0;} 1893 if(bases[loc]==leftReplacement){return 0;} 1894 if(!isSimilar(a, leftReplacement, kmers, counts)){return 0;} 1895 1896 bases[loc]=leftReplacement; 1897 assert(d==a+k+1); 1898 tables.regenerateKmers(bases, kmers, counts, a); 1899 return 1; 1900 } 1901 correctSingleBaseRight(final int a, final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer, final LongList kmers, final IntList counts, final ByteBuilder bb, final int errorExtension0)1902 private int correctSingleBaseRight(final int a, final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer, 1903 final LongList kmers, final IntList counts, final ByteBuilder bb, final int errorExtension0){ 1904 final byte leftReplacement; 1905 final int loc=a+k; 1906 final int errorExtension=Tools.min(errorExtension0, bases.length-loc); 1907 { 1908 bb.clear(); 1909 final long kmer=kmers.get(a); 1910 final long rkmer=rcomp(kmer); 1911 int extension=extendToRight2(bb, null, rightBuffer, errorExtension, true, kmer, rkmer); 1912 if(extension<errorExtension){return 0;} 1913 for(int i=1; i<extension; i++){ 1914 if(bb.get(i)!=bases[loc+i]){ 1915 return 0; 1916 } 1917 } 1918 leftReplacement=bb.get(0); 1919 } 1920 1921 if(bases[loc]==leftReplacement){return 0;} 1922 if(!isSimilar(a, leftReplacement, kmers, counts)){return 0;} 1923 1924 bases[loc]=leftReplacement; 1925 tables.regenerateKmers(bases, kmers, counts, a); 1926 return 1; 1927 } 1928 isSimilar(int a, byte newBase, LongList kmers, IntList counts)1929 private boolean isSimilar(int a, byte newBase, LongList kmers, IntList counts){ 1930 long kmer=kmers.get(a); 1931 1932 final long x=AminoAcid.baseToNumber[newBase]; 1933 kmer=((kmer<<2)|x)&mask; 1934 long rkmer=rcomp(kmer); 1935 int count=getCount(kmer, rkmer); 1936 int aCount=counts.get(a); 1937 boolean similar=isSimilar(aCount, count); 1938 return similar; 1939 } 1940 1941 /*--------------------------------------------------------------*/ 1942 /*---------------- Inherited Abstract Methods ----------------*/ 1943 /*--------------------------------------------------------------*/ 1944 1945 @Override makeKhist()1946 final void makeKhist(){ 1947 tables.makeKhist(outHist, histColumns, histMax, histHeader, histZeros, true, smoothHist, gcHist, false, 0.01, 1, 1); 1948 } 1949 @Override dumpKmersAsText()1950 final void dumpKmersAsText(){ 1951 tables.dumpKmersAsBytes_MT(outKmers, minToDump, maxToDump, true, null); 1952 } 1953 1954 /*--------------------------------------------------------------*/ 1955 /*---------------- Fields ----------------*/ 1956 /*--------------------------------------------------------------*/ 1957 1958 @Override tables()1959 public final KmerTableSet tables(){return tables;} 1960 public final KmerTableSet tables; 1961 1962 /** Normal kmer length */ 1963 final int k; 1964 /** k-1; used in some expressions */ 1965 final int k2; 1966 1967 final int shift; 1968 final int shift2; 1969 final long mask; 1970 1971 } 1972