1 package sketch; 2 3 import java.io.File; 4 import java.io.PrintStream; 5 import java.util.ArrayList; 6 import java.util.Arrays; 7 8 import dna.AminoAcid; 9 import fileIO.FileFormat; 10 import fileIO.ReadWrite; 11 import jgi.BBMerge; 12 import jgi.TranslateSixFrames; 13 import prok.CallGenes; 14 import prok.GeneCaller; 15 import prok.Orf; 16 import prok.ProkObject; 17 import shared.Parse; 18 import shared.Tools; 19 import stream.ConcurrentReadInputStream; 20 import stream.Read; 21 import structures.EntropyTracker; 22 import structures.ListNum; 23 import tax.TaxNode; 24 import tax.TaxTree; 25 26 /** 27 * Creates MinHashSketches rapidly. 28 * 29 * @author Brian Bushnell 30 * @date July 6, 2016 31 * 32 */ 33 public class SketchMakerMini extends SketchObject { 34 35 /*--------------------------------------------------------------*/ 36 /*---------------- Initialization ----------------*/ 37 /*--------------------------------------------------------------*/ 38 SketchMakerMini(SketchTool tool_, int mode_, DisplayParams params)39 public SketchMakerMini(SketchTool tool_, int mode_, DisplayParams params){ 40 this(tool_, mode_, params.minEntropy, params.minProb, params.minQual); 41 } 42 43 /** 44 * Constructor. 45 */ SketchMakerMini(SketchTool tool_, int mode_, float minEntropy_, float minProb_, byte minQual_)46 public SketchMakerMini(SketchTool tool_, int mode_, float minEntropy_, float minProb_, byte minQual_){ 47 48 tool=tool_; 49 mode=mode_; 50 minProb=minProb_; 51 minQual=minQual_; 52 53 aminoShift=AminoAcid.AMINO_SHIFT; 54 if(!aminoOrTranslate()){ 55 shift=2*k; 56 shift2=shift-2; 57 mask=(shift>63 ? -1L : ~((-1L)<<shift)); //Conditional allows K=32 58 }else{ 59 shift=aminoShift*k; 60 shift2=shift-aminoShift; 61 mask=(shift>63 ? -1L : ~((-1L)<<shift)); 62 } 63 64 if(AUTOSIZE && (mode==ONE_SKETCH || mode==PER_FILE)){ 65 heap=new SketchHeap(Tools.max(tool.stTargetSketchSize, (int)(80000*Tools.mid(1, AUTOSIZE_FACTOR, 32))), tool.minKeyOccuranceCount, tool.trackCounts); 66 }else if(AUTOSIZE_LINEAR && (mode==ONE_SKETCH || mode==PER_FILE)){ 67 heap=new SketchHeap(Tools.max(tool.stTargetSketchSize, (int)(10000000*Tools.mid(0.1, 2*AUTOSIZE_LINEAR_DENSITY, 0.00001))), 68 tool.minKeyOccuranceCount, tool.trackCounts); 69 }else{ 70 heap=new SketchHeap(tool.stTargetSketchSize, tool.minKeyOccuranceCount, tool.trackCounts); 71 } 72 73 if(minEntropy_>0){ 74 eTracker=new EntropyTracker(entropyK, k, (amino || translate), minEntropy_, true); 75 }else{ 76 eTracker=null; 77 } 78 79 if(translate || processSSU){ 80 gCaller=CallGenes.makeGeneCaller(pgm); 81 }else{ 82 gCaller=null; 83 } 84 } 85 86 /*--------------------------------------------------------------*/ 87 /*---------------- Outer Methods ----------------*/ 88 /*--------------------------------------------------------------*/ 89 90 /** Create read streams and process all data */ toSketches(final String fname, float samplerate, long reads)91 public ArrayList<Sketch> toSketches(final String fname, float samplerate, long reads){ 92 heap.clear(false); //123 93 final String simpleName; 94 95 final FileFormat ffin1, ffin2; 96 if(fname.indexOf('#')>=0 && FileFormat.isFastq(ReadWrite.rawExtension(fname)) && !new File(fname).exists()){ 97 ffin1=FileFormat.testInput(fname.replaceFirst("#", "1"), FileFormat.FASTQ, null, true, true); 98 ffin2=FileFormat.testInput(fname.replaceFirst("#", "2"), FileFormat.FASTQ, null, true, true); 99 }else{ 100 ffin1=FileFormat.testInput(fname, FileFormat.FASTA, null, true, true); 101 ffin2=null; 102 } 103 104 //Create a read input stream 105 final ConcurrentReadInputStream cris; 106 { 107 simpleName=ffin1.simpleName(); 108 heap.setFname(simpleName); 109 cris=ConcurrentReadInputStream.getReadInputStream(reads, true, ffin1, ffin2, null, null); 110 if(samplerate!=1){cris.setSampleRate(samplerate, sampleseed);} 111 cris.start(); //Start the stream 112 if(verbose){outstream.println("Started cris");} 113 } 114 if(mode==ONE_SKETCH || mode==PER_FILE){ 115 if(heap.name0()==null){heap.setName0(simpleName);} 116 } 117 ArrayList<Sketch> sketches=processInner(cris); 118 119 errorState|=ReadWrite.closeStream(cris); 120 sketchesMade++; 121 return sketches; 122 } 123 124 /*--------------------------------------------------------------*/ 125 /*---------------- Inner Methods ----------------*/ 126 /*--------------------------------------------------------------*/ 127 128 /** Iterate through the reads */ processInner(ConcurrentReadInputStream cris)129 ArrayList<Sketch> processInner(ConcurrentReadInputStream cris){ 130 assert(heap.size()==0); 131 ArrayList<Sketch> sketches=new ArrayList<Sketch>(mode==ONE_SKETCH || mode==PER_FILE ? 1 : 8); 132 133 //Grab the first ListNum of reads 134 ListNum<Read> ln=cris.nextList(); 135 //Grab the actual read list from the ListNum 136 ArrayList<Read> reads=(ln!=null ? ln.list : null); 137 138 //As long as there is a nonempty read list... 139 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 140 // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access 141 142 //Loop through each read in the list 143 for(int idx=0; idx<reads.size(); idx++){ 144 final Read r1=reads.get(idx); 145 final Read r2=r1.mate; 146 147 processReadPair(r1, r2); 148 if(mode!=ONE_SKETCH && mode!=PER_FILE){ 149 if(heap!=null && heap.size()>0 && heap.maxLen()>=Tools.max(1, minSketchSize)){ 150 int size=heap.size(); 151 Sketch sketch=new Sketch(heap, false, tool.trackCounts, null); 152 assert(sketch.keys.length>0) : sketch.keys.length+", "+size; 153 sketch.loadSSU(); 154 sketches.add(sketch); 155 sketchesMade++; 156 } 157 if(heap!=null){heap.clear(false);} 158 } 159 } 160 161 //Notify the input stream that the list was used 162 cris.returnList(ln); 163 // if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access 164 165 //Fetch a new list 166 ln=cris.nextList(); 167 reads=(ln!=null ? ln.list : null); 168 } 169 170 //Notify the input stream that the final list was used 171 if(ln!=null){ 172 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); 173 } 174 175 if(mode==ONE_SKETCH || mode==PER_FILE){ 176 Sketch sketch=new Sketch(heap, false, tool.trackCounts, null); 177 sketch.loadSSU(); 178 sketches.add(sketch); 179 sketchesMade++; 180 } 181 heap.clear(true); 182 return sketches; 183 } 184 processReadPair(Read r1, Read r2)185 void processReadPair(Read r1, Read r2){ 186 //Track the initial length for statistics 187 final int initialLength1=r1.length(); 188 final int initialLength2=r1.mateLength(); 189 190 //Increment counters 191 readsProcessed+=r1.pairCount(); 192 basesProcessed+=initialLength1+initialLength2; 193 194 if(mode!=ONE_SKETCH && mode!=PER_FILE){ 195 int expectedSize=toSketchSize(initialLength1+initialLength2, -1, -1, targetSketchSize); 196 if(heap==null || heap.capacity()<expectedSize){heap=new SketchHeap(expectedSize, tool.minKeyOccuranceCount, tool.trackCounts);} 197 } 198 199 if(tool.mergePairs && r2!=null){ 200 final int insert=BBMerge.findOverlapStrict(r1, r2, false); 201 if(insert>0){ 202 heap.genomeSequences++; 203 r2.reverseComplement(); 204 r1=r1.joinRead(insert); 205 r2=null; 206 } 207 } 208 209 processRead(r1); 210 if(r2!=null){processRead(r2);} 211 212 if(heap.name0()==null){ 213 heap.setName0(r1.id); 214 } 215 216 TaxNode tn=null; 217 if(heap.taxID<0 && r1.length()>800){ 218 if(taxtree!=null){ 219 try { 220 tn=taxtree.parseNodeFromHeader(r1.id, true); 221 } catch (Throwable e) {} 222 if(tn!=null){ 223 heap.taxID=tn.id; 224 if(heap.taxName()==null){ 225 heap.setTaxName(tn.name); 226 } 227 } 228 // System.err.println("A) "+heap.taxID+r1.id); 229 }else{ 230 heap.taxID=TaxTree.parseHeaderStatic(r1.id); 231 // System.err.println("B) "+heap.taxID+r1.id); 232 } 233 } 234 assert(heap.taxID<0 || heap.taxName()!=null || taxtree==null) : heap.taxID+", "+heap.taxName()+", "+heap.name()+", "+tn; 235 } 236 237 public void processRead(final Read r){ 238 if(amino){ 239 processReadAmino(r); 240 }else if(translate){ 241 processReadTranslated(r); 242 }else{ 243 processReadNucleotide(r); 244 } 245 } 246 247 public void processReadTranslated(final Read r){ 248 assert(!r.aminoacid()); 249 final ArrayList<Read> prots; 250 if(sixframes){ 251 if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){ 252 Orf orf=gCaller.makeRna(r.id, r.bases, ProkObject.r16S);//TODO: allow 18S also 253 if(orf!=null && orf.length()>=min_SSU_len){ 254 assert(orf.is16S()); 255 if(orf.is16S() && orf.length()>=heap.r16SLen()){heap.set16S(CallGenes.fetch(orf, r).bases);} 256 } 257 //TODO: Add 18S. 258 } 259 prots=TranslateSixFrames.toFrames(r, true, false, 6); 260 }else{ 261 ArrayList<Orf> list; 262 list=gCaller.callGenes(r); 263 prots=CallGenes.translate(r, list); 264 if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){ 265 for(Orf orf : list){ 266 if(orf.is16S() && orf.length()>=min_SSU_len && orf.length()>=heap.r16SLen()){ 267 heap.set16S(CallGenes.fetch(orf, r).bases); 268 break; 269 } 270 } 271 } 272 } 273 if(prots!=null){ 274 for(Read p : prots){ 275 processReadAmino(p); 276 } 277 } 278 } 279 processReadNucleotide(final Read r)280 void processReadNucleotide(final Read r){ 281 if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){ 282 Orf orf=gCaller.makeRna(r.id, r.bases, ProkObject.r16S);//TODO: 18S 283 if(orf!=null && orf.length()>=min_SSU_len){ 284 assert(orf.start>=0 && orf.stop<r.length()) : r.length()+"\n"+orf; 285 assert(orf.is16S()); 286 if(orf.is16S() && orf.length()>=heap.r16SLen()){heap.set16S(CallGenes.fetch(orf, r).bases);} 287 } 288 //TODO: Add 18S. 289 } 290 291 final byte[] bases=r.bases; 292 final byte[] quals=r.quality; 293 final long[] baseCounts=heap.baseCounts(true); 294 long kmer=0; 295 long rkmer=0; 296 int len=0; 297 assert(!r.aminoacid()); 298 299 final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists()); 300 final long min=minHashValue; 301 heap.genomeSizeBases+=r.length(); 302 heap.genomeSequences++; 303 if(eTracker!=null){eTracker.clear();} 304 305 // assert(false) : minProb+", "+minQual+", "+(quals==null); 306 307 if(quals==null || (minProb<=0 && minQual<2)){ 308 // System.err.println("A"); 309 for(int i=0; i<bases.length; i++){ 310 // System.err.println("B: len="+len); 311 byte b=bases[i]; 312 long x=AminoAcid.baseToNumber[b]; 313 long x2=AminoAcid.baseToComplementNumber[b]; 314 315 kmer=((kmer<<2)|x)&mask; 316 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 317 if(eTracker!=null){eTracker.add(b);} 318 if(x<0){ 319 len=0; 320 rkmer=0; 321 }else{ 322 len++; 323 baseCounts[(int)x]++; 324 } 325 326 // System.err.println("\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n" 327 // +AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n" 328 // +len+", "+(char)b+", "+x+", "+x2+"\n"); 329 330 if(len>=k){ 331 kmersProcessed++; 332 heap.genomeSizeKmers++; 333 // heap.probSum++; //Note really necessary for fasta data 334 if(eTracker==null || eTracker.passes()){ 335 // System.err.println("Pass.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString()); 336 337 // assert(kmer==AminoAcid.reverseComplementBinaryFast(rkmer, k)) : 338 // "\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n" 339 // +AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n" 340 // +len+", "+(char)b+", "+x+", "+x2; 341 342 final long hashcode=hash(kmer, rkmer); 343 // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash); 344 if(hashcode>min){ 345 if(noBlacklist){ 346 heap.add(hashcode); 347 }else{ 348 heap.checkAndAdd(hashcode); 349 } 350 } 351 }else{ 352 // System.err.println("Fail.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString()+"\n"+r.toFastq()+"\n"+eTracker); 353 // assert(false); 354 } 355 } 356 } 357 }else{ 358 int zeroQualityKmers=0; 359 int positiveQualityKmers=0; 360 361 float prob=1; 362 for(int i=0; i<bases.length; i++){ 363 final byte b=bases[i]; 364 final long x=AminoAcid.baseToNumber[b]; 365 final long x2=AminoAcid.baseToComplementNumber[b]; 366 367 kmer=((kmer<<2)|x)&mask; 368 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 369 if(eTracker!=null){eTracker.add(b);} 370 371 final byte q=quals[i]; 372 {//Quality-related stuff 373 assert(q>=0) : Arrays.toString(quals)+"\n"+minProb+", "+minQual; 374 // if(x>=0){ 375 // if(q>0){ 376 // positiveQualityBases++; 377 // }else{ 378 // zeroQualityBases++; 379 // } 380 // } 381 prob=prob*align2.QualityTools.PROB_CORRECT[q]; 382 if(len>k){ 383 byte oldq=quals[i-k]; 384 prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; 385 } 386 if(x<0 || q<minQual){ 387 len=0; 388 kmer=rkmer=0; 389 prob=1; 390 }else{ 391 len++; 392 baseCounts[(int)x]++; 393 } 394 } 395 396 if(len>=k){ 397 kmersProcessed++; 398 if(prob>=minProb){ 399 heap.genomeSizeKmers++; 400 heap.probSum+=prob; 401 if(eTracker==null || eTracker.passes()){ 402 final long hashcode=hash(kmer, rkmer); 403 // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash); 404 if(hashcode>min){ 405 if(noBlacklist){ 406 heap.add(hashcode); 407 }else{ 408 heap.checkAndAdd(hashcode); 409 } 410 } 411 }else{ 412 // System.err.println("Fail.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString()); 413 } 414 positiveQualityKmers++; 415 }else if(q<=2){ 416 zeroQualityKmers++; 417 } 418 } 419 420 //This version is slow but calculates depth better. 421 // if(len>=k){ 422 // kmersProcessed++; 423 // heap.genomeSizeKmers++; 424 // final long hash=hash(kmer, rkmer); 425 // // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash); 426 // if(hash>min){ 427 // if(prob>=minProb || (!heap.setMode && heap.contains(hash))){ 428 // if(noBlacklist){ 429 // heap.add(hash); 430 // }else{ 431 // heap.checkAndAdd(hash); 432 // } 433 // } 434 // } 435 // } 436 } 437 if(minProb>0 && zeroQualityKmers>100 && positiveQualityKmers==0){ 438 if(looksLikePacBio(r)){ 439 synchronized(this){ 440 minProb=0; 441 } 442 processReadNucleotide(r); 443 } 444 } 445 } 446 // assert(false); 447 } 448 looksLikePacBio(Read r)449 boolean looksLikePacBio(Read r){ 450 if(r.length()<302 || r.mate!=null){return false;} 451 if(r.quality==null){ 452 int x=Parse.parseZmw(r.id); 453 return x>=0; 454 } 455 int positive=0; 456 int zero=0; 457 int ns=0; 458 for(int i=0; i<r.bases.length; i++){ 459 byte b=r.bases[i]; 460 byte q=r.quality[i]; 461 if(b=='N'){ns++;} 462 else if(q==0 || q==2){ 463 zero++; 464 }else{ 465 positive++; 466 } 467 } 468 return zero>=r.length()/2 && positive==0; 469 } 470 processReadAmino(final Read r)471 void processReadAmino(final Read r){ 472 final byte[] bases=r.bases; 473 long kmer=0; 474 int len=0; 475 assert(r.aminoacid()); 476 477 final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists()); 478 final long min=minHashValue; 479 heap.genomeSizeBases+=r.length(); 480 heap.genomeSequences++; 481 482 for(int i=0; i<bases.length; i++){ 483 byte b=bases[i]; 484 long x=AminoAcid.acidToNumberNoStops[b]; 485 kmer=((kmer<<aminoShift)|x)&mask; 486 // if(eTracker!=null){eTracker.add(b);} 487 488 if(x<0){len=0;}else{len++;} 489 if(len>=k){ 490 kmersProcessed++; 491 heap.genomeSizeKmers++; 492 // if(eTracker==null || eTracker.passes()){ 493 // assert(false) : (eTracker==null)+", "+eTracker.cutoff()+", "+eTracker.calcEntropy()+", "+r; 494 long hashcode=hash(kmer, kmer); 495 if(hashcode>min){ 496 if(noBlacklist){ 497 heap.add(hashcode); 498 }else{ 499 heap.checkAndAdd(hashcode); 500 } 501 } 502 // } 503 } 504 } 505 } 506 processReadAmino_old_no_entropy(final Read r)507 void processReadAmino_old_no_entropy(final Read r){ 508 final byte[] bases=r.bases; 509 long kmer=0; 510 int len=0; 511 assert(r.aminoacid()); 512 513 final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists()); 514 final long min=minHashValue; 515 heap.genomeSizeBases+=r.length(); 516 heap.genomeSequences++; 517 518 for(int i=0; i<bases.length; i++){ 519 byte b=bases[i]; 520 long x=AminoAcid.acidToNumberNoStops[b]; 521 kmer=((kmer<<aminoShift)|x)&mask; 522 if(x<0){len=0;}else{len++;} 523 if(len>=k){ 524 kmersProcessed++; 525 heap.genomeSizeKmers++; 526 long hashcode=hash(kmer, kmer); 527 if(hashcode>min){ 528 if(noBlacklist){ 529 heap.add(hashcode); 530 }else{ 531 heap.checkAndAdd(hashcode); 532 } 533 } 534 } 535 } 536 } 537 toSketch(int minCount)538 public Sketch toSketch(int minCount){ 539 Sketch sketch=null; 540 if(heap!=null && heap.size()>0){ 541 try { 542 sketch=new Sketch(heap, false, tool.trackCounts, null, minCount); 543 } catch (Throwable e) { 544 // TODO Auto-generated catch block 545 e.printStackTrace(); 546 } 547 heap.clear(false); 548 } 549 return sketch; 550 } 551 add(SketchMakerMini smm)552 public void add(SketchMakerMini smm){ 553 heap.add(smm.heap); 554 readsProcessed+=smm.readsProcessed; 555 basesProcessed+=smm.basesProcessed; 556 kmersProcessed+=smm.kmersProcessed; 557 sketchesMade+=smm.sketchesMade; 558 pacBioDetected|=smm.pacBioDetected; 559 } 560 561 /** True only if this thread has completed successfully */ 562 boolean success=false; 563 564 SketchHeap heap; 565 566 final int aminoShift; 567 final int shift; 568 final int shift2; 569 final long mask; 570 final EntropyTracker eTracker; 571 final GeneCaller gCaller; 572 minEntropy()573 public float minEntropy() { 574 // TODO Auto-generated method stub 575 return eTracker==null ? -1 : eTracker.cutoff(); 576 } 577 578 /*--------------------------------------------------------------*/ 579 /*---------------- Fields ----------------*/ 580 /*--------------------------------------------------------------*/ 581 582 /** Number of reads processed */ 583 protected long readsProcessed=0; 584 /** Number of bases processed */ 585 protected long basesProcessed=0; 586 /** Number of bases processed */ 587 protected long kmersProcessed=0; 588 /** Number of sketches started */ 589 protected long sketchesMade=0; 590 minProb()591 float minProb() {return minProb;} minQual()592 byte minQual() {return minQual;} 593 public boolean pacBioDetected=false; 594 private float minProb; 595 private byte minQual; 596 597 /*--------------------------------------------------------------*/ 598 /*---------------- Final Fields ----------------*/ 599 /*--------------------------------------------------------------*/ 600 601 private final SketchTool tool; 602 final int mode; 603 604 /*--------------------------------------------------------------*/ 605 /*---------------- Common Fields ----------------*/ 606 /*--------------------------------------------------------------*/ 607 608 /** Print status messages to this output stream */ 609 private PrintStream outstream=System.err; 610 /** Print verbose messages */ 611 public static boolean verbose=false; 612 /** True if an error was encountered */ 613 public boolean errorState=false; 614 615 } 616