1 package bloom; 2 3 import java.util.ArrayList; 4 import java.util.Arrays; 5 import java.util.BitSet; 6 7 import dna.AminoAcid; 8 import dna.Data; 9 import fileIO.FileFormat; 10 import fileIO.ReadWrite; 11 import shared.KillSwitch; 12 import shared.Parse; 13 import shared.Parser; 14 import shared.PreParser; 15 import shared.ReadStats; 16 import shared.Timer; 17 import shared.Tools; 18 import stream.ConcurrentReadInputStream; 19 import stream.ConcurrentReadOutputStream; 20 import stream.Read; 21 import structures.ListNum; 22 23 /** 24 * @author Brian Bushnell 25 * @date Aug 20, 2012 26 * 27 */ 28 public class ErrorCorrect extends Thread{ 29 main(String[] args)30 public static void main(String[] args){ 31 32 {//Preparse block for help, config files, and outstream 33 PreParser pp=new PreParser(args, new Object() { }.getClass().getEnclosingClass(), false); 34 args=pp.args; 35 //outstream=pp.outstream; 36 } 37 38 String reads1=args[0]; 39 String reads2=(args.length>1 ? args[1] : null); 40 41 int k=23; 42 int cbits=4; 43 int gap=0; 44 int hashes=1; 45 int thresh1=1; 46 int thresh2=2; 47 int matrixbits=34; 48 long maxReads=-1; 49 int buildpasses=1; 50 long tablereads=-1; //How many reads to process when building the hashtable 51 int buildStepsize=4; 52 String output=null; 53 boolean ordered=true; 54 boolean overwrite=false; 55 boolean append=false; 56 57 for(int i=2; i<args.length; i++){ 58 final String arg=args[i]; 59 final String[] split=arg.split("="); 60 String a=split[0].toLowerCase(); 61 String b=split.length>1 ? split[1] : null; 62 63 if(Parser.parseZip(arg, a, b)){ 64 //do nothing 65 }else if(Parser.parseQuality(arg, a, b)){ 66 //do nothing 67 }else if(a.equals("k") || a.equals("kmer")){ 68 k=Integer.parseInt(b); 69 }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ 70 cbits=Integer.parseInt(b); 71 }else if(a.equals("initialthresh") || a.equals("thresh1")){ 72 thresh1=Integer.parseInt(b); 73 }else if(a.equals("thresh") || a.equals("thresh2")){ 74 thresh2=Integer.parseInt(b); 75 }else if(a.startsWith("gap")){ 76 gap=Integer.parseInt(b); 77 }else if(a.startsWith("matrixbits")){ 78 matrixbits=Integer.parseInt(b); 79 }else if(a.startsWith("hashes") || a.startsWith("multihash")){ 80 hashes=Integer.parseInt(b); 81 }else if(a.startsWith("maxerrors")){ 82 ERROR_CORRECTION_LIMIT=Integer.parseInt(b); 83 }else if(a.startsWith("passes")){ 84 buildpasses=Integer.parseInt(b); 85 }else if(a.startsWith("stepsize") || a.startsWith("buildstepsize")){ 86 buildStepsize=Integer.parseInt(b); 87 }else if(a.equals("threads") || a.equals("t")){ 88 System.err.println("Can't change threadcount for this class."); //THREADS=Integer.parseInt(b); 89 }else if(a.startsWith("reads") || a.startsWith("maxreads")){ 90 maxReads=Parse.parseKMG(b); 91 }else if(a.startsWith("tablereads")){ 92 tablereads=Parse.parseKMG(b); 93 }else if(a.startsWith("build") || a.startsWith("genome")){ 94 Data.setGenome(Integer.parseInt(b)); 95 Data.sysout.println("Set genome to "+Data.GENOME_BUILD); 96 }else if(a.equals("outputinfo") || a.startsWith("info")){ 97 OUTPUT_INFO=Parse.parseBoolean(b); 98 }else if(a.startsWith("out")){ 99 output=b; 100 }else if(a.startsWith("verbose")){ 101 KCountArray.verbose=Parse.parseBoolean(b); 102 // verbose=KCountArray.verbose=Parse.parseBoolean(b); 103 }else if(a.equals("ordered") || a.equals("ord")){ 104 ordered=Parse.parseBoolean(b); 105 }else if(a.equals("append") || a.equals("app")){ 106 append=ReadStats.append=Parse.parseBoolean(b); 107 }else if(a.equals("overwrite") || a.equals("ow")){ 108 overwrite=Parse.parseBoolean(b); 109 }else{ 110 throw new RuntimeException("Unknown parameter "+args[i]); 111 } 112 } 113 114 {//Process parser fields 115 Parser.processQuality(); 116 } 117 118 KCountArray kca=makeTable(reads1, reads2, k, cbits, gap, hashes, buildpasses, matrixbits, tablereads, buildStepsize, thresh1, thresh2); 119 120 detect(reads1, reads2, kca, k, thresh2, maxReads, output, ordered, append, overwrite); 121 } 122 makeTable(String reads1, String reads2, int k, int cbits, int gap, int hashes, int buildpasses, int matrixbits, long maxreads, int stepsize, int thresh1, int thresh2)123 public static KCountArray makeTable(String reads1, String reads2, int k, int cbits, int gap, int hashes, int buildpasses, int matrixbits, 124 long maxreads, int stepsize, int thresh1, int thresh2){ 125 126 Timer thash=new Timer(); 127 128 KmerCount6.maxReads=maxreads; 129 int kbits=2*k; 130 matrixbits=Tools.min(kbits, matrixbits); 131 132 thash.start(); 133 // Data.sysout.println("kbits="+(kbits)+" -> "+(1L<<kbits)+", matrixbits="+(matrixbits)+" -> "+(1L<<matrixbits)+", cbits="+cbits+", gap="+gap+", hashes="+hashes); 134 KCountArray kca=KCountArray.makeNew(1L<<kbits, 1L<<matrixbits, cbits, gap, hashes); 135 136 assert(gap==0) : "TODO"; 137 if(buildpasses==1){ 138 139 KmerCount6.count(reads1, reads2, k, cbits, gap, true, kca); 140 kca.shutdown(); 141 142 }else{ 143 assert(buildpasses>1); 144 KCountArray trusted=null; 145 for(int i=1; i<buildpasses; i++){ 146 boolean conservative=i>2;// /*or, alternately, (trusted==null || trusted.capacity()>0.3) 147 int step=(stepsize==1 ? 1 : stepsize+i%2); 148 // if(!conservative){step=(step+3)/4;} 149 if(!conservative){step=Tools.min(3, (step+3)/4);} 150 151 KmerCount6.count(reads1, reads2, k, cbits, true, kca, trusted, maxreads, thresh1, step, conservative); 152 153 kca.shutdown(); 154 Data.sysout.println("Trusted: \t"+kca.toShortString()); 155 trusted=kca; 156 kca=KCountArray.makeNew(1L<<kbits, 1L<<matrixbits, cbits, gap, hashes); 157 158 } 159 160 KmerCount6.count(reads1, reads2, k, cbits, true, kca, trusted, maxreads, thresh2, stepsize, true); 161 162 kca.shutdown(); 163 } 164 165 166 thash.stop(); 167 // Data.sysout.println(kca.toString()); 168 Data.sysout.println("Table : \t"+kca.toShortString()); 169 Data.sysout.println("Hash time: \t"+thash); 170 return kca; 171 } 172 detect(String reads1, String reads2, KCountArray kca, int k, int thresh, long maxReads, String output, boolean ordered, boolean append, boolean overwrite)173 public static void detect(String reads1, String reads2, KCountArray kca, int k, int thresh, long maxReads, String output, boolean ordered, boolean append, boolean overwrite) { 174 175 176 final ConcurrentReadInputStream cris; 177 { 178 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); 179 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); 180 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); 181 } 182 assert(cris!=null) : reads1; 183 184 cris.start(); 185 if(verbose){System.err.println("Started cris");} 186 boolean paired=cris.paired(); 187 if(verbose){System.err.println("Paired: "+paired);} 188 189 ConcurrentReadOutputStream ros=null; 190 if(output!=null){ 191 String out1=output.replaceFirst("#", "1"), out2=null; 192 193 if(cris.paired()){ 194 if(output.contains("#")){ 195 out2=output.replaceFirst("#", "2"); 196 }else{ 197 System.err.println("Writing interleaved."); 198 } 199 } 200 201 final int buff=(!ordered ? 8 : Tools.max(16, 2*THREADS)); 202 203 FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, OUTPUT_INFO ? ".info" : null, true, overwrite, append, ordered); 204 FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, OUTPUT_INFO ? ".info" : null, true, overwrite, append, ordered); 205 ros=ConcurrentReadOutputStream.getStream(ff1, ff2, buff, null, true); 206 207 assert(!ff1.sam()) : "Sam files need reference info for the header."; 208 } 209 210 211 if(ros!=null){ 212 ros.start(); 213 Data.sysout.println("Started output threads."); 214 } 215 216 detect(cris, kca, k, thresh, maxReads, ros); 217 218 ReadWrite.closeStreams(cris, ros); 219 if(verbose){System.err.println("Closed stream");} 220 } 221 detect(ConcurrentReadInputStream cris, KCountArray kca, int k, int thresh, long maxReads, ConcurrentReadOutputStream ros)222 public static void detect(ConcurrentReadInputStream cris, KCountArray kca, int k, int thresh, long maxReads, ConcurrentReadOutputStream ros) { 223 Timer tdetect=new Timer(); 224 tdetect.start(); 225 226 ListNum<Read> ln=cris.nextList(); 227 ArrayList<Read> reads=(ln!=null ? ln.list : null); 228 229 long covered=0; 230 long uncovered=0; 231 232 long coveredFinal=0; 233 long uncoveredFinal=0; 234 235 long fullyCorrected=0; 236 long failed=0; 237 238 long totalBases=0; 239 long totalReads=0; 240 241 242 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 243 for(Read r : reads){ 244 Read r2=r.mate; 245 { 246 247 // if(r.numericID==23){verbose=true;} 248 249 totalReads++; 250 if(verbose){System.err.println();} 251 totalBases+=r.length(); 252 // BitSet bs=detectErrors(r, kca, k, thresh); 253 BitSet bs=detectErrorsBulk(r, kca, k, thresh, 1); 254 if(verbose){System.err.println(toString(bs, r.length()));} 255 // Data.sysout.println(toString(detectErrorsTips(r, kca, k, thresh), r.length())); 256 if(verbose){System.err.println(toString(detectErrors(r, kca, k, thresh), r.length()-k+1));} 257 if(bs==null){//No errors, or can't detect errors 258 assert(false); 259 }else{ 260 int x=bs.cardinality(); 261 covered+=x; 262 uncovered+=(r.length()-x); 263 if(x<r.length()){ 264 bs=correctErrors(r, kca, k, thresh, bs, ERROR_CORRECTION_LIMIT, MAX_ERROR_BURST); 265 } 266 int y=bs.cardinality(); 267 coveredFinal+=y; 268 uncoveredFinal+=(r.length()-y); 269 if(x<r.length()){ 270 if(y==r.length()){ 271 fullyCorrected++; 272 }else{ 273 failed++; 274 } 275 } 276 } 277 } 278 if(r2!=null){ 279 totalReads++; 280 totalBases+=r2.length(); 281 // BitSet bs=detectErrors(r2, kca, k, thresh); 282 BitSet bs=detectErrorsBulk(r2, kca, k, thresh, 1); 283 if(verbose){System.err.println(toString(bs, r2.length()));} 284 // Data.sysout.println(toString(detectErrorsTips(r2, kca, k, thresh), r2.length())); 285 if(verbose){System.err.println(toString(detectErrors(r2, kca, k, thresh), r2.length()-k+1));} 286 if(bs==null){//No errors, or can't detect errors 287 }else{ 288 int x=bs.cardinality(); 289 covered+=x; 290 uncovered+=(r2.length()-x); 291 if(x<r2.length()){ 292 bs=correctErrors(r2, kca, k, thresh, bs, ERROR_CORRECTION_LIMIT, MAX_ERROR_BURST); 293 } 294 int y=bs.cardinality(); 295 coveredFinal+=y; 296 uncoveredFinal+=(r2.length()-y); 297 if(x<r2.length()){ 298 if(y==r2.length()){ 299 fullyCorrected++; 300 }else{ 301 failed++; 302 } 303 } 304 } 305 } 306 } 307 308 if(ros!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight. 309 if(DONT_OUTPUT_BAD_READS){removeBad(reads);} 310 for(Read r : reads){ 311 if(r!=null){ 312 r.samline=null; 313 assert(r.bases!=null); 314 if(r.sites!=null && r.sites.isEmpty()){r.sites=null;} 315 } 316 } 317 // System.err.println("Adding list of length "+readlist.size()); 318 ros.add(reads, ln.id); 319 } 320 321 cris.returnList(ln); 322 //System.err.println("fetching list"); 323 ln=cris.nextList(); 324 reads=(ln!=null ? ln.list : null); 325 } 326 if(verbose){System.err.println("Finished reading");} 327 cris.returnList(ln); 328 if(verbose){System.err.println("Returned list");} 329 330 tdetect.stop(); 331 Data.sysout.println("Detect time: \t"+tdetect); 332 Data.sysout.println("Total reads: \t"+totalReads); 333 Data.sysout.println("Total bases: \t"+totalBases); 334 Data.sysout.println("Reads Corrected:\t"+fullyCorrected); 335 Data.sysout.println("Reads Failed: \t"+failed); 336 337 Data.sysout.println("\n - before correction - "); 338 Data.sysout.println("Covered: \t"+covered); 339 Data.sysout.println("Uncovered: \t"+uncovered); 340 341 Data.sysout.println("\n - after correction - "); 342 Data.sysout.println("Covered: \t"+coveredFinal); 343 Data.sysout.println("Uncovered: \t"+uncoveredFinal); 344 } 345 346 /** Sets a 1 bit at start of each kmer with count at least thresh */ detectErrors(final Read r, final KCountArray kca, final int k, final int thresh)347 public static BitSet detectErrors(final Read r, final KCountArray kca, final int k, final int thresh){ 348 349 final int kbits=2*k; 350 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits)); 351 final int gap=kca.gap; 352 final byte[] bases=r.bases; 353 assert(kca.gap==0); 354 355 int bslen=r.length()-k-gap+1; 356 if(bslen<1){return null;} //Read is too short to detect errors 357 BitSet bs=new BitSet(bslen); 358 359 int len=0; 360 long kmer=0; 361 for(int i=0; i<bases.length; i++){ 362 byte b=bases[i]; 363 int x=AminoAcid.baseToNumber[b]; 364 if(x<0){ 365 len=0; 366 kmer=0; 367 }else{ 368 kmer=((kmer<<2)|x)&mask; 369 len++; 370 371 if(len>=k){ 372 int count=kca.read(kmer); 373 if(count>=thresh){ 374 bs.set(i+1-k); 375 } 376 } 377 } 378 } 379 380 return bs; 381 } 382 383 /** Sets a 1 bit for every base covered by a kmer with count at least thresh */ detectErrorsBulk(final Read r, final KCountArray kca, final int k, final int thresh, final int stepsize)384 public static BitSet detectErrorsBulk(final Read r, final KCountArray kca, final int k, final int thresh, final int stepsize){ 385 386 final int kbits=2*k; 387 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits)); 388 final int gap=kca.gap; 389 final byte[] bases=r.bases; 390 assert(gap==0); 391 392 if(r.bases==null || r.length()<k+gap){return null;} //Read is too short to detect errors 393 BitSet bs=new BitSet(r.length()); 394 final int setlen=k+gap; 395 396 int len=0; 397 long kmer=0; 398 for(int i=0; i<bases.length; i++){ 399 byte b=bases[i]; 400 int x=AminoAcid.baseToNumber[b]; 401 if(x<0){ 402 len=0; 403 kmer=0; 404 }else{ 405 kmer=((kmer<<2)|x)&mask; 406 len++; 407 408 if(len>=k && ((len-k)%stepsize==0 || i==bases.length-1)){ 409 int count=kca.read(kmer); 410 if(count>=thresh){ 411 bs.set(i+1-setlen, i+1); 412 } 413 } 414 } 415 } 416 r.errors=bs.cardinality()-r.length(); 417 418 return bs; 419 } 420 421 /** Sets 1 for all bases. 422 * Then clears all bits covered by incorrect kmers. */ detectTrusted(final Read r, final KCountArray kca, final int k, final int thresh, final int detectStepsize)423 public static BitSet detectTrusted(final Read r, final KCountArray kca, final int k, final int thresh, final int detectStepsize){ 424 425 final int kbits=2*k; 426 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits)); 427 final int gap=kca.gap; 428 final byte[] bases=r.bases; 429 assert(gap==0); 430 431 if(r.bases==null || r.length()<k+gap){return null;} //Read is too short to detect errors 432 BitSet bs=new BitSet(r.length()); 433 bs.set(0, r.length()); 434 final int setlen=k+gap; 435 436 int len=0; 437 long kmer=0; 438 for(int i=0; i<bases.length; i++){ 439 byte b=bases[i]; 440 int x=AminoAcid.baseToNumber[b]; 441 if(x<0){ 442 len=0; 443 kmer=0; 444 }else{ 445 kmer=((kmer<<2)|x)&mask; 446 len++; 447 448 if(len>=k && (i%detectStepsize==0 || i==bases.length-1)){ 449 int count=kca.read(kmer); 450 if(count<thresh){ 451 bs.clear(i+1-setlen, i+1); 452 // bs.clear(i+1-setlen+detectStepsize, i+1-detectStepsize); 453 // bs.clear(i+k/2-detectStepsize, i+k/2+detectStepsize); 454 // bs.clear(i+k/2); 455 } 456 } 457 } 458 } 459 // assert(bases.length==r.length()); 460 return bs; 461 } 462 detectErrorsTips(final Read r, final KCountArray kca, final int k, final int thresh)463 public static BitSet detectErrorsTips(final Read r, final KCountArray kca, final int k, final int thresh){ 464 // if(kca.gap>0){return detectErrorsSplit(r, kca, k, thresh);} 465 466 final int kbits=2*k; 467 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits)); 468 final int gap=kca.gap; 469 final byte[] bases=r.bases; 470 assert(gap==0); 471 472 if(r.bases==null || r.length()<k+gap){return null;} //Read is too short to detect errors 473 BitSet bs=new BitSet(r.length()); 474 final int setlen=k+gap; 475 476 int len=0; 477 long kmer=0; 478 for(int i=0; i<bases.length; i++){ 479 byte b=bases[i]; 480 int x=AminoAcid.baseToNumber[b]; 481 if(x<0){ 482 len=0; 483 kmer=0; 484 }else{ 485 kmer=((kmer<<2)|x)&mask; 486 len++; 487 488 if(len>=k){ 489 int count=kca.read(kmer); 490 if(count>=thresh){ 491 bs.set(i+1-setlen); 492 bs.set(i); 493 } 494 } 495 } 496 } 497 return bs; 498 } 499 500 501 /** Assumes bulk mode was used; e.g., any '0' bit is covered by no correct kmers */ correctErrors(final Read r, final KCountArray kca, final int k, final int thresh, BitSet bs, final int maxCorrections, final int maxBurst)502 public static BitSet correctErrors(final Read r, final KCountArray kca, final int k, final int thresh, BitSet bs, final int maxCorrections, final int maxBurst){ 503 if(kca.gap>0){assert(false) : "TODO";} 504 505 assert(!OUTPUT_INFO) : "TODO: Outputting correction data is not yet supported."; 506 507 int corrections=0; //Alternately, corrections=r.errorsCorrected 508 r.errors=0; 509 510 if(bs.cardinality()==0){//Cannot be corrected 511 r.errors=r.length(); 512 return bs; 513 } 514 515 // verbose=!bs.get(0); 516 if(verbose){ 517 Data.sysout.println(); 518 Data.sysout.println(toString(bs, r.length())); 519 Data.sysout.println(toString(detectErrorsTips(r, kca, k, thresh), r.length())); 520 Data.sysout.println(toString(detectErrors(r, kca, k, thresh), r.length()-k+1)); 521 } 522 523 524 int lastloc=-99; 525 int burst=1; 526 while(!bs.get(0) && corrections<maxCorrections){//While the read starts with a '0', correct from the right. 527 // Data.sysout.println("Could not correct."); 528 // return bs; 529 int errorLoc=bs.nextSetBit(0)-1;//Location to left of first '1' 530 if(Tools.absdif(errorLoc,lastloc)<=BURST_THRESH){burst++;} 531 else{burst=1;} 532 lastloc=errorLoc; 533 boolean success=(burst<=MAX_ERROR_BURST) && correctFromRight(r, kca, k, thresh, bs, errorLoc); 534 if(success){ 535 corrections++; 536 bs=detectErrorsBulk(r, kca, k, thresh, 1); 537 if(verbose){System.err.println(">\n"+toString(bs, r.length()));} 538 }else{ 539 r.errors=r.length()-bs.cardinality(); 540 // r.errorsCorrected+=corrections; 541 if(verbose){System.err.println("Could not correct.");} 542 r.bases[errorLoc]='N'; 543 r.quality[errorLoc]=0; 544 return bs; 545 } 546 } 547 548 burst=1; 549 while(bs.cardinality()<r.length() && corrections<maxCorrections){ 550 if(bs.get(0)){//First bit is a "1", can correct from the left 551 int errorLoc=bs.nextClearBit(0);//Location to left of first '0' 552 if(Tools.absdif(errorLoc,lastloc)<=BURST_THRESH){burst++;} 553 else{burst=1;} 554 lastloc=errorLoc; 555 boolean success=(burst<=MAX_ERROR_BURST) && correctFromLeft(r, kca, k, thresh, bs, errorLoc); 556 if(success){ 557 corrections++; 558 bs=detectErrorsBulk(r, kca, k, thresh, 1); 559 if(verbose){System.err.println(">\n"+toString(bs, r.length()));} 560 }else{ 561 r.errors=r.length()-bs.cardinality(); 562 // r.errorsCorrected+=corrections; 563 r.bases[errorLoc]='N'; 564 r.quality[errorLoc]=0; 565 if(verbose){System.err.println("Could not correct.");} 566 return bs; 567 } 568 } 569 } 570 571 r.errors=r.length()-bs.cardinality(); 572 // r.errorsCorrected+=corrections; 573 assert(corrections<=maxCorrections); 574 return bs; 575 } 576 577 578 /** 579 * @param r 580 * @param kca 581 * @param k 582 * @param thresh 583 * @param bs 584 * @param errorLoc 585 * @return 586 */ correctFromLeft(Read r, KCountArray kca, int k, int thresh, BitSet bs, int error)587 private static boolean correctFromLeft(Read r, KCountArray kca, int k, int thresh, BitSet bs, int error) { 588 final int kbits=2*k; 589 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits)); 590 final int gap=kca.gap; 591 final int setlen=k+gap; 592 final int startLoc=error-(setlen)+1; 593 final byte oldBase=r.bases[error]; 594 final byte[] bases=r.bases; 595 596 final int minAdvance=Tools.min(MIN_ADVANCE, bases.length-error); 597 598 long kmer=0; 599 int len=0; 600 for(int i=startLoc; i<error; i++){ 601 byte b=bases[i]; 602 int x=AminoAcid.baseToNumber[b]; 603 if(x<0){ 604 len=0; 605 kmer=0; 606 throw new RuntimeException("Can't correct from left!\nerror="+error+"\n"+toString(bs, bases.length)+"\n"+new String(bases)+"\nreadID: "+r.numericID); 607 }else{ 608 kmer=((kmer<<2)|x)&mask; 609 len++; 610 } 611 } 612 assert(len==setlen-1) : setlen+", "+len+", "+error+", "+startLoc; 613 614 int[] counts=KillSwitch.allocInt1D(4); 615 int[] dists=KillSwitch.allocInt1D(4); 616 int maxLoc=Tools.min(bases.length-1, error+setlen-1); 617 if(!bs.get(error+1)){maxLoc=Tools.min(maxLoc, error+9);} 618 else{ 619 for(int i=error+2; i<=maxLoc; i++){ 620 if(!bs.get(i)){ 621 maxLoc=i-1; 622 break; 623 } 624 } 625 } 626 627 if(verbose){System.err.println("correctFromLeft. Error = "+error+", maxloc="+maxLoc);} 628 for(int bnum=0; bnum<4; bnum++){ 629 byte c=AminoAcid.numberToBase[bnum]; 630 bases[error]=c; 631 if(verbose){System.err.println("Considering "+(char)c);} 632 long key=kmer; 633 for(int loc=error; loc<=maxLoc; loc++){ 634 c=bases[loc]; 635 int x=AminoAcid.baseToNumber[c]; 636 if(x<0){ 637 if(verbose){System.err.println("break: N");} 638 break; 639 } 640 key=((key<<2)|x)&mask; 641 int count=kca.read(key); 642 if(count<thresh){ 643 if(verbose){System.err.println("break: count="+count);} 644 break; 645 } 646 dists[bnum]++; 647 counts[bnum]+=count; 648 } 649 } 650 bases[error]=oldBase; 651 652 //Note: I could require both to be the same, to decrease false-positives 653 654 final int muid=maxUniqueIndex(dists); 655 Arrays.sort(dists); 656 final int advance=dists[3]; 657 final int delta=dists[3]-dists[2]; 658 // if(advance<minAdvance){return false;} 659 if(delta<minAdvance){return false;} 660 661 int best=(muid<0 ? maxUniqueIndex(counts) : muid); 662 663 if(verbose){System.err.println("Best="+best+": "+Arrays.toString(dists)+" \t"+Arrays.toString(counts));} 664 if(best<0){return false;} 665 byte bestC=AminoAcid.numberToBase[best]; 666 if(bestC==oldBase){return false;} 667 bases[error]=bestC; 668 669 r.quality[error]=(byte)Tools.min(10, 3+delta); 670 671 return true; 672 } 673 674 675 676 /** 677 * @param r 678 * @param kca 679 * @param k 680 * @param thresh 681 * @param bs 682 * @param errorLoc 683 * @return 684 */ correctFromRight(Read r, KCountArray kca, int k, int thresh, BitSet bs, int error)685 private static boolean correctFromRight(Read r, KCountArray kca, int k, int thresh, BitSet bs, int error) { 686 final int kbits=2*k; 687 final int shift=kbits-2; 688 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits)); 689 final int gap=kca.gap; 690 final int setlen=k+gap; 691 final int stopLoc=error+(setlen)-1; 692 final byte oldBase=r.bases[error]; 693 final byte[] bases=r.bases; 694 695 final int minAdvance=Tools.min(MIN_ADVANCE, error+1); 696 697 long kmer=0; 698 int len=0; 699 700 for(int i=error+1; i<=stopLoc; i++){ 701 byte b=bases[i]; 702 int x=AminoAcid.baseToNumber[b]; 703 if(x<0){ 704 len=0; 705 kmer=0; 706 throw new RuntimeException("Can't correct from right!\nerror="+error+"\n"+toString(bs, bases.length)+"\n"+new String(bases)); 707 }else{ 708 kmer=((kmer<<2)|x)&mask; 709 len++; 710 } 711 // Data.sysout.print((char)b); 712 } 713 kmer<<=2; 714 715 if(verbose){ 716 Data.sysout.println(); 717 String s=Long.toBinaryString(kmer); 718 while(s.length()<kbits){s="0"+s;} 719 Data.sysout.println("kmer = \t"+s); 720 } 721 assert(len==setlen-1) : setlen+", "+len+", "+error+", "+stopLoc; 722 723 int[] counts=KillSwitch.allocInt1D(4); 724 int[] dists=KillSwitch.allocInt1D(4); 725 int minLoc=Tools.max(0, error-setlen+1); 726 if(error==0 || !bs.get(error-1)){minLoc=Tools.max(minLoc, error-9);} 727 else{ 728 for(int i=error-2; i>=minLoc; i--){ 729 if(!bs.get(i)){ 730 minLoc=i+1; 731 break; 732 } 733 } 734 } 735 736 if(verbose){ 737 Data.sysout.println("correctFromRight. Error = "+error+", minloc="+minLoc); 738 Data.sysout.println(new String(r.bases)); 739 } 740 for(int bnum=0; bnum<4; bnum++){ 741 byte c=AminoAcid.numberToBase[bnum]; 742 bases[error]=c; 743 if(verbose){System.err.println("Considering "+(char)c);} 744 long key=kmer; 745 for(int loc=error; loc>=minLoc; loc--){ 746 c=bases[loc]; 747 int x=AminoAcid.baseToNumber[c]; 748 if(x<0){ 749 if(verbose){System.err.println("break: N");} 750 break; 751 } 752 key=((key>>2)|(((long)x)<<shift))&mask; 753 // { 754 // String s=Long.toBinaryString(key); 755 // while(s.length()<kbits){s="0"+s;} 756 // Data.sysout.println("mask="+Long.toBinaryString(mask)+", shift="+shift+", c="+c+", x="+x+", key = \t"+s); 757 // } 758 int count=kca.read(key); 759 if(count<thresh){ 760 if(verbose){System.err.println("break: count="+count);} 761 break; 762 } 763 dists[bnum]++; 764 counts[bnum]+=count; 765 } 766 } 767 bases[error]=oldBase; 768 769 //Note: I could require both to be the same, to decrease false-positives 770 771 final int muid=maxUniqueIndex(dists); 772 Arrays.sort(dists); 773 final int advance=dists[3]; 774 final int delta=dists[3]-dists[2]; 775 // if(advance<minAdvance){return false;} 776 if(delta<minAdvance){return false;} 777 778 int best=(muid<0 ? maxUniqueIndex(counts) : muid); 779 780 if(verbose){System.err.println("Best="+best+": "+Arrays.toString(dists)+" \t"+Arrays.toString(counts));} 781 if(best<0){return false;} 782 byte bestC=AminoAcid.numberToBase[best]; 783 if(bestC==oldBase){return false;} 784 bases[error]=bestC; 785 786 r.quality[error]=(byte)Tools.min(10, 3+delta); 787 788 return true; 789 } 790 791 792 /** returns index of highest value, if unique; else a negative number */ maxUniqueIndex(int[] array)793 private static int maxUniqueIndex(int[] array){ 794 int max=array[0]; 795 int maxIndex=0; 796 for(int i=1; i<array.length; i++){ 797 if(array[i]>max){ 798 max=array[i]; 799 maxIndex=i; 800 }else if(max==array[i]){ 801 maxIndex=-1; 802 } 803 } 804 return maxIndex; 805 } 806 toString(BitSet bs, int len)807 public static final String toString(BitSet bs, int len){ 808 // assert(verbose); 809 StringBuilder sb=new StringBuilder(len); 810 for(int i=0; i<len; i++){sb.append(bs.get(i) ? '1' : '0');} 811 return sb.toString(); 812 } 813 removeBad(ArrayList<Read> list)814 private static void removeBad(ArrayList<Read> list){ 815 816 for(int i=0; i<list.size(); i++){ 817 Read r=list.get(i); 818 if(r.errors>0){ 819 if(r.mate==null || r.mate.errors>0){ 820 list.set(i, null); 821 } 822 } 823 } 824 825 } 826 827 public static boolean verbose=false; 828 /** Bails out if a read still has errors after correcting this many. */ 829 public static int ERROR_CORRECTION_LIMIT=6; 830 /** Max allowed number of nearby corrections. 831 * A long error burst indicates the read simply has low coverage, and is not being corrected correctly. */ 832 public static int MAX_ERROR_BURST=3; 833 /** Bursts have at most this distance between errors. E.G. '1' means errors are adjacent. */ 834 public static int BURST_THRESH=2; 835 /** Withhold uncorrectable reads from output. */ 836 public static boolean DONT_OUTPUT_BAD_READS=false; 837 /** Do not correct an error if it is at most this far from the next error. Instead, bail out. */ 838 public static int MIN_ADVANCE=1; 839 840 /** Number of threads used for error correction. Does not control number of threads for creating the hash table. 841 * Additionally, up to 2 threads are used for reading and up to 2 for writing. For this (singlethreaded) class, the number does nothing. */ 842 public static final int THREADS=1; 843 844 /** Output correction data instead of the corrected read */ 845 public static boolean OUTPUT_INFO=false; 846 847 848 } 849