1 package kmer; 2 3 import java.io.File; 4 import java.util.ArrayList; 5 import java.util.Arrays; 6 import java.util.BitSet; 7 import java.util.concurrent.atomic.AtomicLong; 8 9 import assemble.Contig; 10 import bloom.KmerCountAbstract; 11 import dna.AminoAcid; 12 import fileIO.ByteStreamWriter; 13 import fileIO.FileFormat; 14 import fileIO.ReadWrite; 15 import jgi.BBMerge; 16 import shared.Parse; 17 import shared.Parser; 18 import shared.PreParser; 19 import shared.Primes; 20 import shared.ReadStats; 21 import shared.Shared; 22 import shared.Timer; 23 import shared.Tools; 24 import shared.TrimRead; 25 import stream.ConcurrentReadInputStream; 26 import stream.FastaReadInputStream; 27 import stream.Read; 28 import structures.ByteBuilder; 29 import structures.IntList; 30 import structures.ListNum; 31 import structures.LongList; 32 import ukmer.Kmer; 33 34 35 /** 36 * Loads and holds kmers for Tadpole 37 * @author Brian Bushnell 38 * @date Jun 22, 2015 39 * 40 */ 41 public class KmerTableSet extends AbstractKmerTableSet { 42 43 /** 44 * Code entrance from the command line. 45 * @param args Command line arguments 46 */ main(String[] args)47 public static void main(String[] args){ 48 Timer t=new Timer(), t2=new Timer(); 49 t.start(); 50 t2.start(); 51 KmerTableSet set=new KmerTableSet(args); 52 t2.stop(); 53 outstream.println("Initialization Time: \t"+t2); 54 55 ///And run it 56 set.process(t); 57 58 //Close the print stream if it was redirected 59 Shared.closeStream(outstream); 60 } 61 62 63 /** 64 * Constructor. 65 * @param args Command line arguments 66 */ KmerTableSet(String[] args)67 private KmerTableSet(String[] args){ 68 this(args, 12);//+5 if using ownership and building contigs 69 } 70 71 72 /** 73 * Constructor. 74 * @param args Command line arguments 75 */ KmerTableSet(String[] args, final int bytesPerKmer_)76 public KmerTableSet(String[] args, final int bytesPerKmer_){ 77 {//Preparse block for help, config files, and outstream 78 PreParser pp=new PreParser(args, getClass(), false); 79 args=pp.args; 80 outstream=pp.outstream; 81 }//TODO - no easy way to close outstream 82 83 /* Initialize local variables with defaults */ 84 Parser parser=new Parser(); 85 boolean prealloc_=false; 86 int k_=31; 87 int ways_=-1; 88 int filterMax_=2; 89 boolean ecco_=false, merge_=false; 90 boolean rcomp_=true; 91 double minProb_=defaultMinprob; 92 int tableType_=AbstractKmerTable.ARRAY1D; 93 /* Parse arguments */ 94 for(int i=0; i<args.length; i++){ 95 96 final String arg=args[i]; 97 String[] split=arg.split("="); 98 String a=split[0].toLowerCase(); 99 String b=split.length>1 ? split[1] : null; 100 101 if(Parser.parseCommonStatic(arg, a, b)){ 102 //do nothing 103 }else if(Parser.parseZip(arg, a, b)){ 104 //do nothing 105 }else if(Parser.parseQuality(arg, a, b)){ 106 //do nothing 107 }else if(Parser.parseFasta(arg, a, b)){ 108 //do nothing 109 }else if(parser.parseInterleaved(arg, a, b)){ 110 //do nothing 111 }else if(parser.parseTrim(arg, a, b)){ 112 //do nothing 113 }else if(a.equals("in") || a.equals("in1")){ 114 in1.clear(); 115 if(b!=null){ 116 String[] s=b.split(","); 117 for(String ss : s){ 118 in1.add(ss); 119 } 120 } 121 }else if(a.equals("in2")){ 122 in2.clear(); 123 if(b!=null){ 124 String[] s=b.split(","); 125 for(String ss : s){ 126 in2.add(ss); 127 } 128 } 129 }else if(a.equals("extra")){ 130 extra.clear(); 131 if(b!=null){ 132 String[] s=b.split(","); 133 for(String ss : s){ 134 extra.add(ss); 135 } 136 } 137 }else if(a.equals("append") || a.equals("app")){ 138 append=ReadStats.append=Parse.parseBoolean(b); 139 }else if(a.equals("overwrite") || a.equals("ow")){ 140 overwrite=Parse.parseBoolean(b); 141 }else if(a.equals("initialsize")){ 142 initialSize=Parse.parseIntKMG(b); 143 }else if(a.equals("showstats") || a.equals("stats")){ 144 showStats=Parse.parseBoolean(b); 145 }else if(a.equals("ways")){ 146 ways_=Parse.parseIntKMG(b); 147 }else if(a.equals("buflen") || a.equals("bufflen") || a.equals("bufferlength")){ 148 buflen=Parse.parseIntKMG(b); 149 }else if(a.equals("k")){ 150 assert(b!=null) : "\nk needs an integer value from 1 to 31, such as k=27. Default is 31.\n"; 151 k_=Parse.parseIntKMG(b); 152 }else if(a.equals("threads") || a.equals("t")){ 153 THREADS=(b==null || b.equalsIgnoreCase("auto") ? Shared.threads() : Integer.parseInt(b)); 154 }else if(a.equals("showspeed") || a.equals("ss")){ 155 showSpeed=Parse.parseBoolean(b); 156 }else if(a.equals("ecco")){ 157 ecco_=Parse.parseBoolean(b); 158 }else if(a.equals("merge")){ 159 merge_=Parse.parseBoolean(b); 160 }else if(a.equals("verbose")){ 161 // assert(false) : "Verbose flag is currently static final; must be recompiled to change."; 162 verbose=Parse.parseBoolean(b); 163 }else if(a.equals("verbose2")){ 164 // assert(false) : "Verbose flag is currently static final; must be recompiled to change."; 165 verbose2=Parse.parseBoolean(b); 166 }else if(a.equals("minprob")){ 167 minProb_=Double.parseDouble(b); 168 }else if(a.equals("minprobprefilter") || a.equals("mpp")){ 169 minProbPrefilter=Parse.parseBoolean(b); 170 }else if(a.equals("minprobmain") || a.equals("mpm")){ 171 minProbMain=Parse.parseBoolean(b); 172 }else if(a.equals("reads") || a.startsWith("maxreads")){ 173 maxReads=Parse.parseKMG(b); 174 }else if(a.equals("prealloc") || a.equals("preallocate")){ 175 if(b==null || b.length()<1 || Character.isLetter(b.charAt(0))){ 176 prealloc_=Parse.parseBoolean(b); 177 }else{ 178 preallocFraction=Tools.max(0, Double.parseDouble(b)); 179 prealloc_=(preallocFraction>0); 180 } 181 }else if(a.equals("prefilter")){ 182 if(b==null || b.length()<1 || !Tools.isDigit(b.charAt(0))){ 183 prefilter=Parse.parseBoolean(b); 184 }else{ 185 filterMax_=Parse.parseIntKMG(b); 186 prefilter=filterMax_>0; 187 } 188 }else if(a.equals("prefiltersize") || a.equals("prefilterfraction") || a.equals("pff")){ 189 prefilterFraction=Tools.max(0, Double.parseDouble(b)); 190 assert(prefilterFraction<=1) : "prefiltersize must be 0-1, a fraction of total memory."; 191 prefilter=prefilterFraction>0; 192 }else if(a.equals("prehashes") || a.equals("hashes")){ 193 prehashes=Parse.parseIntKMG(b); 194 }else if(a.equals("prefilterpasses") || a.equals("prepasses")){ 195 assert(b!=null) : "Bad parameter: "+arg; 196 if(b.equalsIgnoreCase("auto")){ 197 prepasses=-1; 198 }else{ 199 prepasses=Parse.parseIntKMG(b); 200 } 201 }else if(a.equals("onepass")){ 202 onePass=Parse.parseBoolean(b); 203 }else if(a.equals("passes")){ 204 int passes=Parse.parseIntKMG(b); 205 onePass=(passes<2); 206 }else if(a.equals("rcomp")){ 207 rcomp_=Parse.parseBoolean(b); 208 }else if(a.equals("tabletype")){ 209 tableType_=Integer.parseInt(b); 210 } 211 212 else if(a.equalsIgnoreCase("filterMemoryOverride") || a.equalsIgnoreCase("filterMemory") || 213 a.equalsIgnoreCase("prefilterMemory") || a.equalsIgnoreCase("filtermem")){ 214 filterMemoryOverride=Parse.parseKMG(b); 215 } 216 217 else if(IGNORE_UNKNOWN_ARGS){ 218 //Do nothing 219 }else{ 220 throw new RuntimeException("Unknown parameter "+args[i]); 221 } 222 } 223 224 {//Process parser fields 225 Parser.processQuality(); 226 227 qtrimLeft=parser.qtrimLeft; 228 qtrimRight=parser.qtrimRight; 229 trimq=parser.trimq; 230 trimE=parser.trimE(); 231 232 minAvgQuality=parser.minAvgQuality; 233 minAvgQualityBases=parser.minAvgQualityBases; 234 amino=Shared.AMINO_IN; 235 if(amino){k_=Tools.min(k_, 12);} 236 } 237 238 if(prepasses==0 || !prefilter){ 239 prepasses=0; 240 prefilter=false; 241 } 242 243 244 // assert(false) : prepasses+", "+onePass+", "+prefilter; 245 246 { 247 long memory=Runtime.getRuntime().maxMemory(); 248 double xmsRatio=Shared.xmsRatio(); 249 // long tmemory=Runtime.getRuntime().totalMemory(); 250 usableMemory=(long)Tools.max(((memory-96000000)*(xmsRatio>0.97 ? 0.82 : 0.72)), memory*0.45); 251 if(prepasses==0 || !prefilter){ 252 filterMemory0=filterMemory1=0; 253 }else if(filterMemoryOverride>0){ 254 filterMemory0=filterMemory1=filterMemoryOverride; 255 }else{ 256 double low=Tools.min(prefilterFraction, 1-prefilterFraction); 257 double high=1-low; 258 if(prepasses<0 || (prepasses&1)==1){//odd passes 259 filterMemory0=(long)(usableMemory*low); 260 filterMemory1=(long)(usableMemory*high); 261 }else{//even passes 262 filterMemory0=(long)(usableMemory*high); 263 filterMemory1=(long)(usableMemory*low); 264 } 265 } 266 tableMemory=(long)(usableMemory*.95-filterMemory0); 267 } 268 269 tableType=tableType_; 270 prealloc=prealloc_; 271 bytesPerKmer=bytesPerKmer_; 272 if(ways_<1){ 273 long maxKmers=(2*tableMemory)/bytesPerKmer; 274 long minWays=Tools.min(10000, maxKmers/Integer.MAX_VALUE); 275 ways_=(int)Tools.max(31, (int)(THREADS*2.5), minWays); 276 ways_=(int)Primes.primeAtLeast(ways_); 277 assert(ways_>0); 278 // System.err.println("ways="+ways_); 279 } 280 281 /* Set final variables; post-process and validate argument combinations */ 282 283 onePass=onePass&prefilter; 284 ways=ways_; 285 filterMax=Tools.min(filterMax_, 0x7FFFFFFF); 286 ecco=ecco_; 287 merge=merge_; 288 minProb=(float)minProb_; 289 rcomp=rcomp_; 290 // assert(false) : tableMemory+", "+bytesPerKmer+", "+prealloc+", "+preallocFraction; 291 estimatedKmerCapacity=(long)((tableMemory*1.0/bytesPerKmer)*((prealloc ? preallocFraction : 0.81))*(HashArray.maxLoadFactorFinal*0.97)); 292 293 // System.err.println("tableMemory="+tableMemory+", bytesPerKmer="+bytesPerKmer+", estimatedKmerCapacity="+estimatedKmerCapacity); 294 KmerCountAbstract.minProb=(minProbPrefilter ? minProb : 0); 295 k=k_; 296 k2=k-1; 297 int bitsPerBase=(amino ? 5 : 2); 298 int mask=(amino ? 31 : 3); 299 coreMask=(AbstractKmerTableSet.MASK_CORE ? ~(((-1L)<<(bitsPerBase*(k_-1)))|mask) : -1L); 300 301 if(k<1 || k>31){throw new RuntimeException("\nk needs an integer value from 1 to 31, such as k=27. Default is 31.\n");} 302 303 if(initialSize<1){ 304 final long memOverWays=tableMemory/(bytesPerKmer*ways); 305 final double mem2=(prealloc ? preallocFraction : 1)*tableMemory; 306 initialSize=(prealloc || memOverWays<initialSizeDefault ? (int)Tools.min(2142000000, (long)(mem2/(bytesPerKmer*ways))) : initialSizeDefault); 307 if(initialSize!=initialSizeDefault){ 308 System.err.println("Initial size set to "+initialSize); 309 } 310 } 311 312 /* Adjust I/O settings and filenames */ 313 314 assert(FastaReadInputStream.settingsOK()); 315 316 if(in1.isEmpty()){ 317 //throw new RuntimeException("Error - at least one input file is required."); 318 } 319 320 for(int i=0; i<in1.size(); i++){ 321 String s=in1.get(i); 322 if(s!=null && s.contains("#") && !new File(s).exists()){ 323 int pound=s.lastIndexOf('#'); 324 String a=s.substring(0, pound); 325 String b=s.substring(pound+1); 326 in1.set(i, a+1+b); 327 in2.add(a+2+b); 328 } 329 } 330 331 if(!extra.isEmpty()){ 332 ArrayList<String> temp=(ArrayList<String>) extra.clone(); 333 extra.clear(); 334 for(String s : temp){ 335 if(s!=null && s.contains("#") && !new File(s).exists()){ 336 int pound=s.lastIndexOf('#'); 337 String a=s.substring(0, pound); 338 String b=s.substring(pound+1); 339 extra.add(a); 340 extra.add(b); 341 }else{ 342 extra.add(s); 343 } 344 } 345 } 346 347 { 348 boolean allowDuplicates=true; 349 if(!Tools.testInputFiles(allowDuplicates, true, in1, in2, extra)){ 350 throw new RuntimeException("\nCan't read some input files.\n"); 351 } 352 } 353 assert(THREADS>0); 354 355 if(DISPLAY_PROGRESS){ 356 // assert(false); 357 outstream.println("Initial:"); 358 outstream.println("Ways="+ways+", initialSize="+initialSize+", prefilter="+(prefilter ? "t" : "f")+", prealloc="+(prealloc ? (""+preallocFraction) : "f")); 359 Shared.printMemory(); 360 outstream.println(); 361 } 362 } 363 364 /*--------------------------------------------------------------*/ 365 /*---------------- Outer Methods ----------------*/ 366 /*--------------------------------------------------------------*/ 367 368 @Override clear()369 public void clear(){ 370 tables=null; 371 } 372 373 /*--------------------------------------------------------------*/ 374 /*---------------- Inner Methods ----------------*/ 375 /*--------------------------------------------------------------*/ 376 377 @Override allocateTables()378 public void allocateTables(){ 379 assert(tables==null); 380 tables=null; 381 final long coreMask=(MASK_CORE ? ~(((-1L)<<(2*(k-1)))|3) : -1L); 382 383 ScheduleMaker scheduleMaker=new ScheduleMaker(ways, bytesPerKmer, prealloc, 384 (prealloc ? preallocFraction : 1.0), -1, (prefilter ? prepasses : 0), prefilterFraction, filterMemoryOverride); 385 int[] schedule=scheduleMaker.makeSchedule(); 386 tables=AbstractKmerTable.preallocate(ways, tableType, schedule, coreMask); 387 388 // tables=AbstractKmerTable.preallocate(ways, tableType, initialSize, coreMask, (!prealloc || preallocFraction<1)); 389 } 390 391 /** 392 * Load reads into tables, using multiple LoadThread. 393 */ 394 @Override loadKmers(String fname1, String fname2)395 public long loadKmers(String fname1, String fname2){ 396 397 /* Create read input stream */ 398 final ConcurrentReadInputStream cris; 399 { 400 FileFormat ff1=FileFormat.testInput(fname1, FileFormat.FASTQ, null, true, true); 401 FileFormat ff2=FileFormat.testInput(fname2, FileFormat.FASTQ, null, true, true); 402 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, false, ff1, ff2); 403 cris.start(); //4567 404 } 405 406 // /* Optionally skip the first reads, since initial reads may have lower quality */ 407 // if(skipreads>0){ 408 // long skipped=0; 409 // 410 // ListNum<Read> ln=cris.nextList(); 411 // ArrayList<Read> reads=(ln!=null ? ln.list : null); 412 // 413 // while(skipped<skipreads && reads!=null && reads.size()>0){ 414 // skipped+=reads.size(); 415 // 416 // cris.returnList(ln); 417 // ln=cris.nextList(); 418 // reads=(ln!=null ? ln.list : null); 419 // } 420 // cris.returnList(ln); 421 // if(reads==null || reads.isEmpty()){ 422 // ReadWrite.closeStreams(cris); 423 // System.err.println("Skipped all of the reads."); 424 // System.exit(0); 425 // } 426 // } 427 428 /* Create ProcessThreads */ 429 ArrayList<LoadThread> alpt=new ArrayList<LoadThread>(THREADS); 430 for(int i=0; i<THREADS; i++){alpt.add(new LoadThread(cris));} 431 for(LoadThread pt : alpt){pt.start();} 432 433 long added=0; 434 435 /* Wait for threads to die, and gather statistics */ 436 for(LoadThread pt : alpt){ 437 while(pt.getState()!=Thread.State.TERMINATED){ 438 try { 439 pt.join(); 440 } catch (InterruptedException e) { 441 // TODO Auto-generated catch block 442 e.printStackTrace(); 443 } 444 } 445 added+=pt.added; 446 447 readsIn+=pt.readsInT; 448 basesIn+=pt.basesInT; 449 lowqReads+=pt.lowqReadsT; 450 lowqBases+=pt.lowqBasesT; 451 readsTrimmed+=pt.readsTrimmedT; 452 basesTrimmed+=pt.basesTrimmedT; 453 kmersIn+=pt.kmersInT; 454 } 455 456 /* Shut down I/O streams; capture error status */ 457 errorState|=ReadWrite.closeStreams(cris); 458 return added; 459 } 460 461 /*--------------------------------------------------------------*/ 462 /*---------------- Inner Classes ----------------*/ 463 /*--------------------------------------------------------------*/ 464 465 /** 466 * Loads kmers. 467 */ 468 private class LoadThread extends Thread{ 469 470 /** 471 * Constructor 472 * @param cris_ Read input stream 473 */ LoadThread(ConcurrentReadInputStream cris_)474 public LoadThread(ConcurrentReadInputStream cris_){ 475 cris=cris_; 476 table=new HashBuffer(tables, buflen, k, false, true); 477 } 478 479 @Override run()480 public void run(){ 481 ListNum<Read> ln=cris.nextList(); 482 ArrayList<Read> reads=(ln!=null ? ln.list : null); 483 484 //While there are more reads lists... 485 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 486 487 //For each read (or pair) in the list... 488 for(int i=0; i<reads.size(); i++){ 489 Read r1=reads.get(i); 490 Read r2=r1.mate; 491 492 if(!r1.validated()){r1.validate(true);} 493 if(r2!=null && !r2.validated()){r2.validate(true);} 494 495 if(verbose){System.err.println("Considering read "+r1.id+" "+new String(r1.bases));} 496 497 readsInT++; 498 basesInT+=r1.length(); 499 if(r2!=null){ 500 readsInT++; 501 basesInT+=r2.length(); 502 } 503 504 if(maxNs<Integer.MAX_VALUE){ 505 if(r1!=null && r1.countUndefined()>maxNs){r1.setDiscarded(true);} 506 if(r2!=null && r2.countUndefined()>maxNs){r2.setDiscarded(true);} 507 } 508 509 //Determine whether to discard the reads based on average quality 510 if(minAvgQuality>0){ 511 if(r1!=null && r1.quality!=null && r1.avgQuality(false, minAvgQualityBases)<minAvgQuality){r1.setDiscarded(true);} 512 if(r2!=null && r2.quality!=null && r2.avgQuality(false, minAvgQualityBases)<minAvgQuality){r2.setDiscarded(true);} 513 } 514 515 if(r1!=null){ 516 if(qtrimLeft || qtrimRight){ 517 int x=TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq, trimE, 1, false); 518 basesTrimmedT+=x; 519 readsTrimmedT+=(x>0 ? 1 : 0); 520 } 521 if(r1.length()<k){r1.setDiscarded(true);} 522 } 523 if(r2!=null){ 524 if(qtrimLeft || qtrimRight){ 525 int x=TrimRead.trimFast(r2, qtrimLeft, qtrimRight, trimq, trimE, 1, false); 526 basesTrimmedT+=x; 527 readsTrimmedT+=(x>0 ? 1 : 0); 528 } 529 if(r2.length()<k){r2.setDiscarded(true);} 530 } 531 532 if((ecco || merge) && r1!=null && r2!=null && !r1.discarded() && !r2.discarded()){ 533 if(merge){ 534 final int insert=BBMerge.findOverlapStrict(r1, r2, false); 535 if(insert>0){ 536 r2.reverseComplement(); 537 r1=r1.joinRead(insert); 538 r2=null; 539 } 540 }else if(ecco){ 541 BBMerge.findOverlapStrict(r1, r2, true); 542 } 543 } 544 545 if(minLen>0){ 546 if(r1!=null && r1.length()<minLen){r1.setDiscarded(true);} 547 if(r2!=null && r2.length()<minLen){r2.setDiscarded(true);} 548 } 549 550 if(r1!=null){ 551 if(r1.discarded()){ 552 lowqBasesT+=r1.length(); 553 lowqReadsT++; 554 }else{ 555 long temp=addKmersToTable(r1); 556 added+=temp; 557 if(verbose){System.err.println("A: Added "+temp);} 558 } 559 } 560 if(r2!=null){ 561 if(r2.discarded()){ 562 lowqBasesT+=r2.length(); 563 lowqReadsT++; 564 }else{ 565 long temp=addKmersToTable(r2); 566 added+=temp; 567 if(verbose){System.err.println("B: Added "+temp);} 568 } 569 } 570 } 571 572 //Fetch a new read list 573 cris.returnList(ln); 574 ln=cris.nextList(); 575 reads=(ln!=null ? ln.list : null); 576 } 577 cris.returnList(ln); 578 long temp=table.flush(); 579 if(verbose){System.err.println("Flush: Added "+temp);} 580 added+=temp; 581 } 582 addKmersToTableAA(final Read r)583 private final int addKmersToTableAA(final Read r){ 584 if(r==null || r.bases==null){return 0;} 585 final float minProb2=(minProbMain ? minProb : 0); 586 final byte[] bases=r.bases; 587 final byte[] quals=r.quality; 588 final int shift=5*k; 589 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 590 long kmer=0; 591 int created=0; 592 int len=0; 593 594 if(bases==null || bases.length<k){return -1;} 595 596 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ 597 float prob=1; 598 for(int i=0; i<bases.length; i++){ 599 final byte b=bases[i]; 600 final long x=AminoAcid.acidToNumber[b]; 601 602 //Update kmers 603 kmer=((kmer<<5)|x)&mask; 604 605 if(minProb2>0 && quals!=null){//Update probability 606 prob=prob*PROB_CORRECT[quals[i]]; 607 if(len>k){ 608 byte oldq=quals[i-k]; 609 prob=prob*PROB_CORRECT_INVERSE[oldq]; 610 } 611 } 612 613 //Handle Ns 614 if(x<0){ 615 len=0; 616 kmer=0; 617 prob=1; 618 }else{len++;} 619 620 if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 621 if(len>=k && prob>=minProb2){ 622 kmersInT++; 623 final long key=kmer; 624 if(!prefilter || prefilterArray.read(key)>filterMax2){ 625 int temp=table.incrementAndReturnNumCreated(key, 1); 626 created+=temp; 627 if(verbose){System.err.println("C: Added "+temp);} 628 } 629 } 630 } 631 632 return created; 633 } 634 addKmersToTable(final Read r)635 private final int addKmersToTable(final Read r){ 636 if(amino){return addKmersToTableAA(r);} 637 if(onePass){return addKmersToTable_onePass(r);} 638 if(r==null || r.bases==null){return 0;} 639 final float minProb2=(minProbMain ? minProb : 0); 640 final byte[] bases=r.bases; 641 final byte[] quals=r.quality; 642 final int shift=2*k; 643 final int shift2=shift-2; 644 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 645 long kmer=0; 646 long rkmer=0; 647 int created=0; 648 int len=0; 649 650 if(bases==null || bases.length<k){return -1;} 651 652 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ 653 float prob=1; 654 for(int i=0; i<bases.length; i++){ 655 final byte b=bases[i]; 656 final long x=AminoAcid.baseToNumber[b]; 657 final long x2=AminoAcid.baseToComplementNumber[b]; 658 659 //Update kmers 660 kmer=((kmer<<2)|x)&mask; 661 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 662 663 if(minProb2>0 && quals!=null){//Update probability 664 prob=prob*PROB_CORRECT[quals[i]]; 665 if(len>k){ 666 byte oldq=quals[i-k]; 667 prob=prob*PROB_CORRECT_INVERSE[oldq]; 668 } 669 } 670 671 //Handle Ns 672 if(x<0){ 673 len=0; 674 kmer=rkmer=0; 675 prob=1; 676 }else{len++;} 677 678 if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 679 if(len>=k && prob>=minProb2){ 680 kmersInT++; 681 final long key=toValue(kmer, rkmer); 682 if(!prefilter || prefilterArray.read(key)>filterMax2){ 683 int temp=table.incrementAndReturnNumCreated(key, 1); 684 created+=temp; 685 if(verbose){System.err.println("C: Added "+temp);} 686 } 687 } 688 } 689 690 return created; 691 } 692 693 addKmersToTable_onePass(final Read r)694 private final int addKmersToTable_onePass(final Read r){ 695 assert(prefilter); 696 if(r==null || r.bases==null){return 0;} 697 final byte[] bases=r.bases; 698 final byte[] quals=r.quality; 699 final int shift=2*k; 700 final int shift2=shift-2; 701 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 702 long kmer=0; 703 long rkmer=0; 704 int created=0; 705 int len=0; 706 707 if(bases==null || bases.length<k){return -1;} 708 709 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ 710 float prob=1; 711 for(int i=0; i<bases.length; i++){ 712 final byte b=bases[i]; 713 final long x=AminoAcid.baseToNumber[b]; 714 final long x2=AminoAcid.baseToComplementNumber[b]; 715 716 //Update kmers 717 kmer=((kmer<<2)|x)&mask; 718 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 719 720 if(minProb>0 && quals!=null){//Update probability 721 prob=prob*PROB_CORRECT[quals[i]]; 722 if(len>k){ 723 byte oldq=quals[i-k]; 724 prob=prob*PROB_CORRECT_INVERSE[oldq]; 725 } 726 } 727 728 //Handle Ns 729 if(x<0){ 730 len=0; 731 kmer=rkmer=0; 732 prob=1; 733 }else{len++;} 734 735 if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 736 if(len>=k){ 737 final long key=toValue(kmer, rkmer); 738 int count=prefilterArray.incrementAndReturnUnincremented(key, 1); 739 if(count>=filterMax2){ 740 int temp=table.incrementAndReturnNumCreated(key, 1); 741 created+=temp; 742 if(verbose){System.err.println("D: Added "+temp);} 743 } 744 } 745 } 746 return created; 747 } 748 749 /*--------------------------------------------------------------*/ 750 751 /** Input read stream */ 752 private final ConcurrentReadInputStream cris; 753 754 private final HashBuffer table; 755 756 public long added=0; 757 758 private long readsInT=0; 759 private long basesInT=0; 760 private long lowqReadsT=0; 761 private long lowqBasesT=0; 762 private long readsTrimmedT=0; 763 private long basesTrimmedT=0; 764 private long kmersInT=0; 765 766 } 767 768 769 /*--------------------------------------------------------------*/ 770 /*---------------- Convenience ----------------*/ 771 /*--------------------------------------------------------------*/ 772 regenerateKmers(byte[] bases, LongList kmers, IntList counts, final int a)773 public void regenerateKmers(byte[] bases, LongList kmers, IntList counts, final int a){ 774 final int loc=a+k; 775 final int lim=Tools.min(counts.size, a+k+1); 776 final int shift=2*k; 777 final int shift2=shift-2; 778 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 779 long kmer=kmers.get(a); 780 long rkmer=rcomp(kmer); 781 int len=k; 782 783 // assert(false) : a+", "+loc+", "+lim; 784 785 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ 786 for(int i=loc, j=a+1; j<lim; i++, j++){ 787 final byte b=bases[i]; 788 final long x=AminoAcid.baseToNumber[b]; 789 final long x2=AminoAcid.baseToComplementNumber[b]; 790 kmer=((kmer<<2)|x)&mask; 791 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 792 if(x<0){ 793 len=0; 794 kmer=rkmer=0; 795 }else{len++;} 796 797 if(len>=k){ 798 assert(kmers.get(j)!=kmer); 799 kmers.set(j, kmer); 800 int count=getCount(kmer, rkmer); 801 counts.set(j, count); 802 }else{ 803 kmers.set(j, -1); 804 counts.set(j, 0); 805 } 806 } 807 } 808 809 @Override regenerateCounts(byte[] bases, IntList counts, final Kmer dummy, BitSet changed)810 public int regenerateCounts(byte[] bases, IntList counts, final Kmer dummy, BitSet changed){ 811 assert(!changed.isEmpty()); 812 final int firstBase=changed.nextSetBit(0), lastBase=changed.length()-1; 813 final int ca=firstBase-k; 814 // final int b=changed.nextSetBit(0);ca+kbig; //first base changed 815 final int firstCount=Tools.max(firstBase-k+1, 0), lastCount=Tools.min(counts.size-1, lastBase); //count limit 816 // System.err.println("ca="+ca+", b="+b+", lim="+lim); 817 // System.err.println("Regen from count "+(ca+1)+"-"+lim); 818 819 final int shift=2*k; 820 final int shift2=shift-2; 821 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 822 long kmer=0, rkmer=0; 823 int len=0; 824 int valid=0; 825 826 // System.err.println("ca="+ca+", b="+b+", lim="+lim+", "+counts); 827 828 for(int i=Tools.max(0, firstBase-k+1), lim=Tools.min(lastBase+k-1, bases.length-1); i<=lim; i++){ 829 final byte base=bases[i]; 830 final long x=AminoAcid.baseToNumber[base]; 831 final long x2=AminoAcid.baseToComplementNumber[base]; 832 kmer=((kmer<<2)|x)&mask; 833 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 834 835 if(x<0){ 836 len=0; 837 kmer=rkmer=0; 838 }else{ 839 len++; 840 } 841 842 final int c=i-k+1; 843 if(i>=firstBase){ 844 if(len>=k){ 845 valid++; 846 int count=getCount(kmer, rkmer); 847 counts.set(c, count); 848 }else if(c>=0){ 849 counts.set(c, 0); 850 } 851 } 852 } 853 854 return valid; 855 } 856 857 @Override fillSpecificCounts(byte[] bases, IntList counts, BitSet positions, final Kmer dummy)858 public int fillSpecificCounts(byte[] bases, IntList counts, BitSet positions, final Kmer dummy){ 859 final int b=k-1; 860 final int lim=(positions==null ? bases.length : Tools.min(bases.length, positions.length()+k-1)); 861 final int shift=2*k; 862 final int shift2=shift-2; 863 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 864 long kmer=0, rkmer=0; 865 int len=0; 866 int valid=0; 867 868 counts.clear(); 869 870 for(int i=0, j=-b; i<lim; i++, j++){ 871 final byte base=bases[i]; 872 final long x=AminoAcid.baseToNumber[base]; 873 final long x2=AminoAcid.baseToComplementNumber[base]; 874 kmer=((kmer<<2)|x)&mask; 875 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 876 877 if(x<0){ 878 len=0; 879 kmer=rkmer=0; 880 }else{ 881 len++; 882 } 883 884 if(j>=0){ 885 if(len>=k && (positions==null || positions.get(j))){ 886 valid++; 887 int count=getCount(kmer, rkmer); 888 assert(i-k+1==counts.size()) : "j="+j+", counts="+counts.size()+", b="+(b)+", (i-k+1)="+(i-k+1); 889 assert(j==counts.size()); 890 counts.add(Tools.max(count, 0)); 891 // counts.set(i-k+1, count); 892 }else{ 893 counts.add(0); 894 // counts.set(i-k+1, 0); 895 } 896 } 897 } 898 899 // assert((counts.size==0 && bases.length<k) || counts.size==bases.length-k+1) : bases.length+", "+k+", "+counts.size; 900 assert((counts.size==0 && bases.length<k) || counts.size==lim-k+1) : bases.length+", "+k+", "+counts.size; 901 902 return valid; 903 } 904 regenerateCounts(byte[] bases, IntList counts, final int ca)905 public int regenerateCounts(byte[] bases, IntList counts, final int ca){ 906 final int b=ca+k-1; 907 final int lim=Tools.min(bases.length, b+k+1); 908 final int shift=2*k; 909 final int shift2=shift-2; 910 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 911 long kmer=0, rkmer=0; 912 int len=0; 913 int valid=0; 914 915 final int clen=counts.size; 916 917 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts. 918 * i is an index in the base array, j is an index in the count array. */ 919 for(int i=b-k+1; i<lim; i++){ 920 final byte base=bases[i]; 921 final long x=AminoAcid.baseToNumber[base]; 922 final long x2=AminoAcid.baseToComplementNumber[base]; 923 kmer=((kmer<<2)|x)&mask; 924 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 925 926 if(x<0){ 927 len=0; 928 kmer=rkmer=0; 929 }else{ 930 len++; 931 } 932 933 if(i>=b){ 934 if(len>=k){ 935 valid++; 936 int count=getCount(kmer, rkmer); 937 counts.set(i-k+1, count); 938 }else{ 939 counts.set(i-k+1, 0); 940 } 941 } 942 } 943 944 assert((counts.size==0 && bases.length<k) || counts.size==bases.length-k+1) : bases.length+", "+k+", "+counts.size; 945 assert(clen==counts.size); 946 947 return valid; 948 } 949 950 /** Returns number of valid kmers */ fillKmers(byte[] bases, LongList kmers)951 public int fillKmers(byte[] bases, LongList kmers){ 952 final int blen=bases.length; 953 if(blen<k){return 0;} 954 final int min=k-1; 955 final int shift=2*k; 956 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 957 long kmer=0; 958 int len=0; 959 int valid=0; 960 961 kmers.clear(); 962 963 /* Loop through the bases, maintaining a forward kmer via bitshifts */ 964 for(int i=0; i<blen; i++){ 965 final byte b=bases[i]; 966 assert(b>=0) : Arrays.toString(bases); 967 final long x=AminoAcid.baseToNumber[b]; 968 kmer=((kmer<<2)|x)&mask; 969 if(x<0){ 970 len=0; 971 kmer=0; 972 }else{len++;} 973 if(i>=min){ 974 if(len>=k){ 975 kmers.add(kmer); 976 valid++; 977 }else{ 978 kmers.add(-1); 979 } 980 } 981 } 982 return valid; 983 } 984 fillCounts(LongList kmers, IntList counts)985 public void fillCounts(LongList kmers, IntList counts){ 986 counts.clear(); 987 for(int i=0; i<kmers.size; i++){ 988 long kmer=kmers.get(i); 989 if(kmer>=0){ 990 long rkmer=rcomp(kmer); 991 int count=getCount(kmer, rkmer); 992 counts.add(count); 993 }else{ 994 counts.add(0); 995 } 996 } 997 } 998 999 1000 /*--------------------------------------------------------------*/ 1001 /*---------------- Helper Methods ----------------*/ 1002 /*--------------------------------------------------------------*/ 1003 1004 @Override regenerate(final int limit)1005 public long regenerate(final int limit){ 1006 long sum=0; 1007 for(AbstractKmerTable akt : tables){ 1008 sum+=akt.regenerate(limit); 1009 } 1010 return sum; 1011 } 1012 getTableForKey(long key)1013 public HashArray1D getTableForKey(long key){ 1014 return (HashArray1D) tables[kmerToWay(key)]; 1015 } 1016 1017 @Override getTable(int tnum)1018 public HashArray1D getTable(int tnum){ 1019 return (HashArray1D) tables[tnum]; 1020 } 1021 1022 @Override fillHistogram(int histMax)1023 public long[] fillHistogram(int histMax) { 1024 return HistogramMaker.fillHistogram(tables, histMax); 1025 } 1026 1027 @Override countGC(long[] gcCounts, int max)1028 public void countGC(long[] gcCounts, int max) { 1029 for(AbstractKmerTable set : tables){ 1030 set.countGC(gcCounts, max); 1031 } 1032 } 1033 1034 @Override initializeOwnership()1035 public void initializeOwnership(){ 1036 OwnershipThread.initialize(tables); 1037 } 1038 1039 @Override clearOwnership()1040 public void clearOwnership(){ 1041 OwnershipThread.clear(tables); 1042 } 1043 rightmostKmer(final ByteBuilder bb)1044 public long rightmostKmer(final ByteBuilder bb){ 1045 return rightmostKmer(bb.array, bb.length()); 1046 } 1047 rightmostKmer(final byte[] bases, final int blen)1048 public long rightmostKmer(final byte[] bases, final int blen){ 1049 if(blen<k){return -1;} 1050 final int shift=2*k; 1051 final int shift2=shift-2; 1052 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 1053 long kmer=0; 1054 long rkmer=0; 1055 int len=0; 1056 1057 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts, to get the rightmost kmer */ 1058 { 1059 for(int i=blen-k; i<blen; i++){ 1060 final byte b=bases[i]; 1061 final long x=AminoAcid.baseToNumber[b]; 1062 final long x2=AminoAcid.baseToComplementNumber[b]; 1063 kmer=((kmer<<2)|x)&mask; 1064 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 1065 if(x<0){ 1066 len=0; 1067 kmer=rkmer=0; 1068 }else{len++;} 1069 if(verbose){outstream.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 1070 } 1071 } 1072 1073 if(len<k){return -1;} 1074 else{assert(len==k);} 1075 return kmer; 1076 } 1077 leftmostKmer(final ByteBuilder bb)1078 public long leftmostKmer(final ByteBuilder bb){ 1079 return leftmostKmer(bb.array, bb.length()); 1080 } 1081 leftmostKmer(final byte[] bases, final int blen)1082 public long leftmostKmer(final byte[] bases, final int blen){ 1083 if(blen<k){return -1;} 1084 final int shift=2*k; 1085 final int shift2=shift-2; 1086 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 1087 long kmer=0; 1088 long rkmer=0; 1089 int len=0; 1090 1091 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts, to get the leftmost kmer */ 1092 { 1093 for(int i=0; i<k; i++){ 1094 final byte b=bases[i]; 1095 final long x=AminoAcid.baseToNumber[b]; 1096 final long x2=AminoAcid.baseToComplementNumber[b]; 1097 kmer=((kmer<<2)|x)&mask; 1098 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 1099 if(x<0){ 1100 len=0; 1101 kmer=rkmer=0; 1102 }else{len++;} 1103 if(verbose){outstream.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 1104 } 1105 } 1106 1107 if(len<k){return -1;} 1108 else{assert(len==k);} 1109 return kmer; 1110 } 1111 doubleClaim(final ByteBuilder bb, final int id)1112 public boolean doubleClaim(final ByteBuilder bb, final int id){ 1113 return doubleClaim(bb.array, bb.length(), id); 1114 } 1115 1116 /** Ensures there can be only one owner. */ doubleClaim(final byte[] bases, final int blength, final int id)1117 public boolean doubleClaim(final byte[] bases, final int blength, final int id){ 1118 boolean success=claim(bases, blength, id, true); 1119 if(verbose){outstream.println("success1="+success+", id="+id+", blength="+blength);} 1120 if(!success){return false;} 1121 success=claim(bases, blength, id+CLAIM_OFFSET, true); 1122 if(verbose){outstream.println("success2="+success+", id="+id+", blength="+blength);} 1123 return success; 1124 } 1125 claim(final ByteBuilder bb, final int id, final boolean exitEarly)1126 public boolean claim(final ByteBuilder bb, final int id, final boolean exitEarly){ 1127 return claim(bb.array, bb.length(), id, exitEarly); 1128 } 1129 calcCoverage(final byte[] bases, final int blength)1130 public float calcCoverage(final byte[] bases, final int blength){ 1131 final int shift=2*k; 1132 final int shift2=shift-2; 1133 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 1134 long kmer=0, rkmer=0; 1135 int len=0; 1136 long sum=0, max=0; 1137 int kmers=0; 1138 1139 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ 1140 for(int i=0; i<blength; i++){ 1141 final byte b=bases[i]; 1142 final long x=AminoAcid.baseToNumber[b]; 1143 final long x2=AminoAcid.baseToComplementNumber[b]; 1144 kmer=((kmer<<2)|x)&mask; 1145 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 1146 if(x<0){ 1147 len=0; 1148 kmer=rkmer=0; 1149 }else{len++;} 1150 if(len>=k){ 1151 int count=getCount(kmer, rkmer); 1152 sum+=count; 1153 max=Tools.max(count, max); 1154 kmers++; 1155 } 1156 } 1157 return sum==0 ? 0 : sum/(float)kmers; 1158 } 1159 calcCoverage(final Contig contig)1160 public float calcCoverage(final Contig contig){ 1161 final byte[] bases=contig.bases; 1162 if(bases.length<k){return 0;} 1163 final int shift=2*k; 1164 final int shift2=shift-2; 1165 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 1166 long kmer=0, rkmer=0; 1167 int len=0; 1168 long sum=0; 1169 int max=0, min=Integer.MAX_VALUE; 1170 int kmers=0; 1171 1172 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ 1173 for(int i=0; i<bases.length; i++){ 1174 final byte b=bases[i]; 1175 final long x=AminoAcid.baseToNumber[b]; 1176 final long x2=AminoAcid.baseToComplementNumber[b]; 1177 kmer=((kmer<<2)|x)&mask; 1178 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 1179 if(x<0){ 1180 len=0; 1181 kmer=rkmer=0; 1182 }else{len++;} 1183 if(len>=k){ 1184 int count=getCount(kmer, rkmer); 1185 sum+=count; 1186 max=Tools.max(count, max); 1187 min=Tools.min(count, min); 1188 kmers++; 1189 } 1190 } 1191 contig.coverage=sum==0 ? 0 : sum/(float)kmers; 1192 contig.maxCov=max; 1193 contig.minCov=sum==0 ? 0 : min; 1194 return contig.coverage; 1195 } 1196 claim(final byte[] bases, final int blength, final int id, boolean exitEarly)1197 public boolean claim(final byte[] bases, final int blength, final int id, boolean exitEarly){ 1198 if(verbose){outstream.println("Thread "+id+" claim start.");} 1199 final int shift=2*k; 1200 final int shift2=shift-2; 1201 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 1202 long kmer=0, rkmer=0; 1203 int len=0; 1204 boolean success=true; 1205 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ 1206 for(int i=0; i<blength && success; i++){ 1207 final byte b=bases[i]; 1208 final long x=AminoAcid.baseToNumber[b]; 1209 final long x2=AminoAcid.baseToComplementNumber[b]; 1210 kmer=((kmer<<2)|x)&mask; 1211 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 1212 if(x<0){ 1213 len=0; 1214 kmer=rkmer=0; 1215 }else{len++;} 1216 if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 1217 if(len>=k){ 1218 success=claim(kmer, rkmer, id/*, rid, i*/); 1219 success=(success || !exitEarly); 1220 } 1221 } 1222 return success; 1223 } 1224 claim(final long kmer, final long rkmer, final int id )1225 public boolean claim(final long kmer, final long rkmer, final int id/*, final long rid, final int pos*/){ 1226 //TODO: rid and pos are just for debugging. 1227 final long key=toValue(kmer, rkmer); 1228 final int way=kmerToWay(key); 1229 final AbstractKmerTable table=tables[way]; 1230 final int count=table.getValue(key); 1231 assert(count==-1 || count>0) : count; 1232 // if(verbose /*|| true*/){outstream.println("Count="+count+".");} 1233 if(count<0){return true;} 1234 assert(count>0) : count; 1235 final int owner=table.setOwner(key, id); 1236 if(verbose){outstream.println("owner="+owner+".");} 1237 // assert(owner==id) : id+", "+owner+", "+rid+", "+pos; 1238 return owner==id; 1239 } 1240 release(ByteBuilder bb, final int id)1241 public void release(ByteBuilder bb, final int id){ 1242 release(bb.array, bb.length(), id); 1243 } 1244 release(final byte[] bases, final int blength, final int id)1245 public void release(final byte[] bases, final int blength, final int id){ 1246 if(verbose /*|| true*/){outstream.println("*Thread "+id+" release start.");} 1247 final int shift=2*k; 1248 final int shift2=shift-2; 1249 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 1250 long kmer=0, rkmer=0; 1251 int len=0; 1252 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ 1253 for(int i=0; i<blength; i++){ 1254 final byte b=bases[i]; 1255 final long x=AminoAcid.baseToNumber[b]; 1256 final long x2=AminoAcid.baseToComplementNumber[b]; 1257 kmer=((kmer<<2)|x)&mask; 1258 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 1259 if(x<0){ 1260 len=0; 1261 kmer=rkmer=0; 1262 }else{len++;} 1263 if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 1264 if(len>=k){ 1265 release(kmer, rkmer, id); 1266 } 1267 } 1268 } 1269 release(final long kmer, final long rkmer, final int id)1270 public boolean release(final long kmer, final long rkmer, final int id){ 1271 return release(toValue(kmer, rkmer), id); 1272 } 1273 release(final long key, final int id)1274 public boolean release(final long key, final int id){ 1275 final int way=kmerToWay(key); 1276 final AbstractKmerTable table=tables[way]; 1277 final int count=table.getValue(key); 1278 // if(verbose /*|| true*/){outstream.println("Count="+count+".");} 1279 if(count<1){return true;} 1280 return table.clearOwner(key, id); 1281 } 1282 findOwner(ByteBuilder bb, final int id)1283 public int findOwner(ByteBuilder bb, final int id){ 1284 return findOwner(bb.array, bb.length(), id); 1285 } 1286 findOwner(final byte[] bases, final int blength, final int id)1287 public int findOwner(final byte[] bases, final int blength, final int id){ 1288 final int shift=2*k; 1289 final int shift2=shift-2; 1290 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 1291 long kmer=0, rkmer=0; 1292 int len=0; 1293 int maxOwner=-1; 1294 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ 1295 for(int i=0; i<blength; i++){ 1296 final byte b=bases[i]; 1297 final long x=AminoAcid.baseToNumber[b]; 1298 final long x2=AminoAcid.baseToComplementNumber[b]; 1299 kmer=((kmer<<2)|x)&mask; 1300 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 1301 if(x<0){ 1302 len=0; 1303 kmer=rkmer=0; 1304 }else{len++;} 1305 if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 1306 if(len>=k){ 1307 int owner=findOwner(kmer, rkmer); 1308 maxOwner=Tools.max(owner, maxOwner); 1309 if(maxOwner>id){break;} 1310 } 1311 } 1312 return maxOwner; 1313 } 1314 findOwner(final long kmer)1315 public int findOwner(final long kmer){ 1316 return findOwner(kmer, rcomp(kmer)); 1317 } 1318 findOwner(final long kmer, final long rkmer)1319 public int findOwner(final long kmer, final long rkmer){ 1320 final long key=toValue(kmer, rkmer); 1321 final int way=kmerToWay(key); 1322 final AbstractKmerTable table=tables[way]; 1323 final int count=table.getValue(key); 1324 if(count<0){return -1;} 1325 final int owner=table.getOwner(key); 1326 return owner; 1327 } 1328 getCount(long kmer, long rkmer)1329 public int getCount(long kmer, long rkmer){ 1330 long key=toValue(kmer, rkmer); 1331 int way=kmerToWay(key); 1332 return tables[way].getValue(key); 1333 } 1334 getCount(long key)1335 public int getCount(long key){ 1336 int way=kmerToWay(key); 1337 return tables[way].getValue(key); 1338 } 1339 1340 /*--------------------------------------------------------------*/ 1341 /*---------------- Fill Counts ----------------*/ 1342 /*--------------------------------------------------------------*/ 1343 fillRightCounts(long kmer, long rkmer, int[] counts, long mask, int shift2)1344 public int fillRightCounts(long kmer, long rkmer, int[] counts, long mask, int shift2){ 1345 if(FAST_FILL && MASK_CORE && k>2/*((k&1)==1)*/){ 1346 return fillRightCounts_fast(kmer, rkmer, counts, mask, shift2); 1347 }else{ 1348 return fillRightCounts_safe(kmer, rkmer, counts, mask, shift2); 1349 } 1350 } 1351 fillRightCounts_fast(final long kmer0, final long rkmer0, int[] counts, long mask, int shift2)1352 public int fillRightCounts_fast(final long kmer0, final long rkmer0, int[] counts, 1353 long mask, int shift2){ 1354 // assert((k&1)==1) : k; 1355 assert(MASK_CORE); 1356 final long kmer=(kmer0<<2)&mask; 1357 final long rkmer=(rkmer0>>>2); 1358 final long a=kmer&coreMask, b=rkmer&coreMask; 1359 1360 int max=-1, maxPos=0; 1361 if(a==b){ 1362 return fillRightCounts_safe(kmer0, rkmer0, counts, mask, shift2); 1363 }else if(a>b){ 1364 // return fillRightCounts_safe(kmer0, rkmer0, counts, mask, shift2); 1365 final long core=a; 1366 final int way=kmerToWay(core); 1367 // final AbstractKmerTable table=tables[way]; 1368 final HashArray table=(HashArray) tables[way]; 1369 final int cell=table.kmerToCell(kmer); 1370 for(int i=0; i<=3; i++){ 1371 final long key=kmer|i; 1372 final int count=Tools.max(0, table.getValue(key, cell)); 1373 // final int count=Tools.max(0, table.getValue(key)); 1374 counts[i]=count; 1375 if(count>max){ 1376 max=count; 1377 maxPos=i; 1378 } 1379 } 1380 }else{ 1381 // return fillRightCounts_safe(kmer0, rkmer0, counts, mask, shift2); 1382 //Use a higher increment for the left bits 1383 final long core=b; 1384 final int way=kmerToWay(core); 1385 // final AbstractKmerTable table=tables[way]; 1386 final HashArray table=(HashArray) tables[way]; 1387 final int cell=table.kmerToCell(rkmer); 1388 final long incr=1L<<(2*(k-1)); 1389 long j=3*incr; 1390 for(int i=0; i<=3; i++, j-=incr){ 1391 final long key=rkmer|j; 1392 final int count=Tools.max(0, table.getValue(key, cell)); 1393 // final int count=Tools.max(0, table.getValue(key)); 1394 counts[i]=count; 1395 if(count>max){ 1396 max=count; 1397 maxPos=i; 1398 } 1399 } 1400 } 1401 return maxPos; 1402 } 1403 1404 //TODO: Change this to take advantage of coreMask 1405 //Requires special handling of core palindromes. 1406 //Thus it would be easiest to just handle odd kmers, and K is normally 31 anyway. fillRightCounts_safe(long kmer, long rkmer, int[] counts, long mask, int shift2)1407 public int fillRightCounts_safe(long kmer, long rkmer, int[] counts, long mask, int shift2){ 1408 assert(kmer==rcomp(rkmer)); 1409 if(verbose){outstream.println("fillRightCounts: "+toText(kmer)+", "+toText(rkmer));} 1410 kmer=(kmer<<2)&mask; 1411 rkmer=(rkmer>>>2); 1412 int max=-1, maxPos=0; 1413 1414 for(int i=0; i<=3; i++){ 1415 long kmer2=kmer|((long)i); 1416 long rkmer2=rkmer|(((long)AminoAcid.numberToComplement[i])<<shift2); 1417 if(verbose){outstream.println("kmer: "+toText(kmer2)+", "+toText(rkmer2));} 1418 assert(kmer2==(kmer2&mask)); 1419 assert(rkmer2==(rkmer2&mask)); 1420 assert(kmer2==rcomp(rkmer2)); 1421 long key=toValue(kmer2, rkmer2); 1422 int way=kmerToWay(key); 1423 int count=tables[way].getValue(key); 1424 assert(count==NOT_PRESENT || count>=0); 1425 count=Tools.max(count, 0); 1426 counts[i]=count; 1427 if(count>max){ 1428 max=count; 1429 maxPos=i; 1430 } 1431 } 1432 return maxPos; 1433 } 1434 1435 /** For KmerCompressor. */ fillRightCountsRcompOnly(long kmer, long rkmer, int[] counts, long mask, int shift2)1436 public int fillRightCountsRcompOnly(long kmer, long rkmer, int[] counts, long mask, int shift2){ 1437 assert(kmer==rcomp(rkmer)); 1438 if(verbose){outstream.println("fillRightCounts: "+toText(kmer)+", "+toText(rkmer));} 1439 kmer=(kmer<<2)&mask; 1440 rkmer=(rkmer>>>2); 1441 int max=-1, maxPos=0; 1442 1443 for(int i=0; i<=3; i++){ 1444 long kmer2=kmer|((long)i); 1445 long rkmer2=rkmer|(((long)AminoAcid.numberToComplement[i])<<shift2); 1446 if(verbose){outstream.println("kmer: "+toText(kmer2)+", "+toText(rkmer2));} 1447 assert(kmer2==(kmer2&mask)); 1448 assert(rkmer2==(rkmer2&mask)); 1449 assert(kmer2==rcomp(rkmer2)); 1450 long key=rkmer2; 1451 int way=kmerToWay(key); 1452 int count=tables[way].getValue(key); 1453 assert(count==NOT_PRESENT || count>=0); 1454 count=Tools.max(count, 0); 1455 counts[i]=count; 1456 if(count>max){ 1457 max=count; 1458 maxPos=i; 1459 } 1460 } 1461 return maxPos; 1462 } 1463 fillLeftCounts(long kmer, long rkmer, int[] counts, long mask, int shift2)1464 public int fillLeftCounts(long kmer, long rkmer, int[] counts, long mask, int shift2){ 1465 if(FAST_FILL && MASK_CORE && k>2/*((k&1)==1)*/){ 1466 return fillLeftCounts_fast(kmer, rkmer, counts, mask, shift2); 1467 }else{ 1468 return fillLeftCounts_safe(kmer, rkmer, counts, mask, shift2); 1469 } 1470 } 1471 fillLeftCounts_fast(final long kmer0, final long rkmer0, int[] counts, long mask, int shift2)1472 public int fillLeftCounts_fast(final long kmer0, final long rkmer0, int[] counts, 1473 long mask, int shift2){ 1474 // assert((k&1)==1) : k; 1475 assert(MASK_CORE); 1476 final long rkmer=(rkmer0<<2)&mask; 1477 final long kmer=(kmer0>>>2); 1478 final long a=kmer&coreMask, b=rkmer&coreMask; 1479 1480 int max=-1, maxPos=0; 1481 if(a==b){ 1482 return fillLeftCounts_safe(kmer0, rkmer0, counts, mask, shift2); 1483 }else if(a>b){ 1484 // return fillLeftCounts_safe(kmer0, rkmer0, counts, mask, shift2); 1485 1486 final long core=a; 1487 final int way=kmerToWay(core); 1488 // final AbstractKmerTable table=tables[way]; 1489 final HashArray table=(HashArray) tables[way]; 1490 final int cell=table.kmerToCell(kmer); 1491 final long incr=1L<<(2*(k-1)); 1492 long j=0;//Must be declared as a long, outside of loop 1493 for(int i=0; i<=3; i++, j+=incr){ 1494 // final long rkmer2=rkmer|((long)AminoAcid.numberToComplement[i]); 1495 // final long kmer2=kmer|(((long)i)<<shift2); 1496 // final long key2=toValue(rkmer2, kmer2); 1497 // int way2=kmerToWay(key2); 1498 // int count2=tables[way2].getValue(key2); 1499 // count2=Tools.max(count2, 0); 1500 1501 final long key=kmer|j; 1502 final int count=Tools.max(0, table.getValue(key, cell)); 1503 // int count=Tools.max(0, table.getValue(key)); 1504 1505 // assert(way==way2) : i+", "+way+", "+way2; 1506 // assert(key==key2) : i+", "+Long.toBinaryString(kmer)+", "+ 1507 // Long.toBinaryString(key)+", "+Long.toBinaryString(key2)+ 1508 // ", "+Long.toBinaryString(j)+", "+Long.toBinaryString(incr); 1509 // assert(count==count2) : i+", "+count+", "+count2; 1510 1511 counts[i]=count; 1512 if(count>max){ 1513 max=count; 1514 maxPos=i; 1515 } 1516 } 1517 return maxPos; 1518 }else{ 1519 // return fillLeftCounts_safe(kmer0, rkmer0, counts, mask, shift2); 1520 final long core=b; 1521 final int way=kmerToWay(core); 1522 // final AbstractKmerTable table=tables[way]; 1523 final HashArray table=(HashArray) tables[way]; 1524 final int cell=table.kmerToCell(rkmer); 1525 for(int i=0, j=3; i<=3; i++, j--){ 1526 final long key=rkmer|j; 1527 final int count=Tools.max(0, table.getValue(key, cell)); 1528 // final int count=Tools.max(0, table.getValue(key)); 1529 1530 counts[i]=count; 1531 if(count>max){ 1532 max=count; 1533 maxPos=i; 1534 } 1535 } 1536 return maxPos; 1537 } 1538 } 1539 fillLeftCounts_safe(final long kmer0, final long rkmer0, int[] counts, long mask, int shift2)1540 public int fillLeftCounts_safe(final long kmer0, final long rkmer0, int[] counts, long mask, int shift2){ 1541 assert(kmer0==rcomp(rkmer0)); 1542 if(verbose){outstream.println("fillLeftCounts: "+toText(kmer0)+", "+toText(rkmer0));} 1543 long rkmer=(rkmer0<<2)&mask; 1544 long kmer=(kmer0>>>2); 1545 int max=-1, maxPos=0; 1546 // assert(false) : shift2+", "+k; 1547 for(int i=0; i<=3; i++){ 1548 final long rkmer2=rkmer|((long)AminoAcid.numberToComplement[i]); 1549 final long kmer2=kmer|(((long)i)<<shift2); 1550 if(verbose){outstream.println("kmer: "+toText(kmer2)+", "+toText(rkmer2));} 1551 assert(kmer2==(kmer2&mask)); 1552 assert(rkmer2==(rkmer2&mask)); 1553 assert(kmer2==rcomp(rkmer2)) : "\n"+"kmer: \t"+toText(rcomp(rkmer2))+", "+toText(rcomp(kmer2)); 1554 long key=toValue(rkmer2, kmer2); 1555 int way=kmerToWay(key); 1556 int count=tables[way].getValue(key); 1557 assert(count==NOT_PRESENT || count>=0); 1558 count=Tools.max(count, 0); 1559 counts[i]=count; 1560 if(count>max){ 1561 max=count; 1562 maxPos=i; 1563 } 1564 } 1565 return maxPos; 1566 } 1567 1568 /*--------------------------------------------------------------*/ 1569 /*---------------- Printing Methods ----------------*/ 1570 /*--------------------------------------------------------------*/ 1571 1572 @Override dumpKmersAsBytes(String fname, int mincount, int maxcount, boolean printTime, AtomicLong remaining)1573 public boolean dumpKmersAsBytes(String fname, int mincount, int maxcount, boolean printTime, AtomicLong remaining){ 1574 if(fname==null){return false;} 1575 Timer t=new Timer(); 1576 1577 ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, append, true); 1578 bsw.start(); 1579 for(AbstractKmerTable set : tables){ 1580 set.dumpKmersAsBytes(bsw, k, mincount, maxcount, remaining); 1581 } 1582 bsw.poisonAndWait(); 1583 1584 t.stop(); 1585 if(printTime){outstream.println("Kmer Dump Time: \t"+t);} 1586 return bsw.errorState; 1587 } 1588 1589 @Override dumpKmersAsBytes_MT(String fname, int mincount, int maxcount, boolean printTime, AtomicLong remaining)1590 public boolean dumpKmersAsBytes_MT(String fname, int mincount, int maxcount, boolean printTime, AtomicLong remaining){ 1591 final int threads=Tools.min(Shared.threads(), tables.length); 1592 if(threads<3 || DumpThread.NUM_THREADS==1){return dumpKmersAsBytes(fname, mincount, maxcount, printTime, remaining);} 1593 1594 if(fname==null){return false;} 1595 Timer t=new Timer(); 1596 1597 ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, append, true); 1598 bsw.start(); 1599 DumpThread.dump(k, mincount, maxcount, tables, bsw, remaining); 1600 bsw.poisonAndWait(); 1601 1602 t.stop(); 1603 if(printTime){outstream.println("Kmer Dump Time: \t"+t);} 1604 return bsw.errorState; 1605 } 1606 1607 /*--------------------------------------------------------------*/ 1608 /*---------------- Recall Methods ----------------*/ 1609 /*--------------------------------------------------------------*/ 1610 rcomp(long kmer)1611 private final long rcomp(long kmer){return AminoAcid.reverseComplementBinaryFast(kmer, k);} toText(long kmer)1612 private final StringBuilder toText(long kmer){return AbstractKmerTable.toText(kmer, k);} 1613 1614 /*--------------------------------------------------------------*/ 1615 /*---------------- Static Methods ----------------*/ 1616 /*--------------------------------------------------------------*/ 1617 1618 /** 1619 * Transforms a kmer into a canonical value stored in the table. Expected to be inlined. 1620 * @param kmer Forward kmer 1621 * @param rkmer Reverse kmer 1622 * @return Canonical value 1623 */ toValue(long kmer, long rkmer)1624 public final long toValue(long kmer, long rkmer){ 1625 // long value=(rcomp ? Tools.max(kmer, rkmer) : kmer); 1626 // return value; 1627 if(!rcomp){return kmer;} 1628 final long a=kmer&coreMask, b=rkmer&coreMask; 1629 return a>b ? kmer : a<b ? rkmer : Tools.max(kmer, rkmer); 1630 } 1631 1632 /*--------------------------------------------------------------*/ 1633 /*---------------- Final Primitives ----------------*/ 1634 /*--------------------------------------------------------------*/ 1635 1636 @Override kbig()1637 public int kbig(){return k;} 1638 @Override filterMemory(int pass)1639 public long filterMemory(int pass){return ((pass&1)==0) ? filterMemory0 : filterMemory1;} 1640 @Override ecco()1641 public boolean ecco(){return ecco;} 1642 @Override qtrimLeft()1643 public boolean qtrimLeft(){return qtrimLeft;} 1644 @Override qtrimRight()1645 public boolean qtrimRight(){return qtrimRight;} 1646 @Override minAvgQuality()1647 public float minAvgQuality(){return minAvgQuality;} 1648 @Override tableMemory()1649 public long tableMemory(){return tableMemory;} 1650 @Override estimatedKmerCapacity()1651 public long estimatedKmerCapacity(){return estimatedKmerCapacity;} 1652 @Override ways()1653 public int ways(){return ways;} 1654 @Override rcomp()1655 public boolean rcomp(){return rcomp;} 1656 kmerToWay(final long kmer)1657 public final int kmerToWay(final long kmer){ 1658 final int way=(int)((kmer&coreMask)%ways); 1659 return way; 1660 } 1661 1662 /** Hold kmers. A kmer X such that X%WAYS=Y will be stored in tables[Y] */ 1663 private AbstractKmerTable[] tables; 1664 tables()1665 public AbstractKmerTable[] tables(){return tables;} 1666 1667 public long filterMemoryOverride=0; 1668 1669 public final int tableType; //AbstractKmerTable.ARRAY1D; 1670 1671 private final int bytesPerKmer; 1672 1673 private final long usableMemory; 1674 private final long filterMemory0; 1675 private final long filterMemory1; 1676 private final long tableMemory; 1677 private final long estimatedKmerCapacity; 1678 1679 /** Number of tables (and threads, during loading) */ 1680 private final boolean prealloc; 1681 1682 /** Number of tables (and threads, during loading) */ 1683 public final int ways; 1684 1685 /** Normal kmer length */ 1686 public final int k; 1687 /** k-1; used in some expressions */ 1688 public final int k2; 1689 1690 public final long coreMask; 1691 1692 /** Look for reverse-complements as well as forward kmers. Default: true */ 1693 private final boolean rcomp; 1694 1695 /** Quality-trim the left side */ 1696 public final boolean qtrimLeft; 1697 /** Quality-trim the right side */ 1698 public final boolean qtrimRight; 1699 /** Trim bases at this quality or below. Default: 4 */ 1700 public final float trimq; 1701 /** Error rate for trimming (derived from trimq) */ 1702 private final float trimE; 1703 1704 /** Throw away reads below this average quality after trimming. Default: 0 */ 1705 public final float minAvgQuality; 1706 /** If positive, calculate average quality from the first X bases only. Default: 0 */ 1707 public final int minAvgQualityBases; 1708 1709 /** Ignore kmers with probability of correctness less than this */ 1710 public final float minProb; 1711 1712 /** Correct via overlap */ 1713 private final boolean ecco; 1714 1715 /** Attempt to merge via overlap prior to counting kmers */ 1716 private final boolean merge; 1717 1718 /*--------------------------------------------------------------*/ 1719 /*---------------- Walker ----------------*/ 1720 /*--------------------------------------------------------------*/ 1721 walk()1722 public Walker walk(){ 1723 return new WalkerKST(); 1724 } 1725 1726 public class WalkerKST extends Walker { 1727 WalkerKST()1728 WalkerKST(){ 1729 w=tables[0].walk(); 1730 } 1731 1732 /** 1733 * Fills this object with the next key and value. 1734 * @return True if successful. 1735 */ next()1736 public boolean next(){ 1737 if(w==null){return false;} 1738 if(w.next()){return true;} 1739 if(tnum<tables.length){tnum++;} 1740 w=(tnum<tables.length ? tables[tnum].walk() : null); 1741 return w==null ? false : w.next(); 1742 } 1743 kmer()1744 public long kmer(){return w.kmer();} value()1745 public int value(){return w.value();} 1746 1747 private Walker w=null; 1748 1749 /** current table number */ 1750 private int tnum=0; 1751 } 1752 1753 } 1754