1 package jgi; 2 3 import java.io.File; 4 import java.io.PrintStream; 5 import java.util.ArrayList; 6 import java.util.Locale; 7 import java.util.Random; 8 9 import fileIO.ByteFile; 10 import fileIO.ByteFile1; 11 import fileIO.ByteFile2; 12 import fileIO.FileFormat; 13 import fileIO.ReadWrite; 14 import fileIO.TextStreamWriter; 15 import kmer.AbstractKmerTable; 16 import kmer.HashArray1D; 17 import kmer.HashForest; 18 import kmer.KmerTable; 19 import kmer.ScheduleMaker; 20 import shared.Parse; 21 import shared.Parser; 22 import shared.PreParser; 23 import shared.Shared; 24 import shared.Timer; 25 import shared.Tools; 26 import stream.ConcurrentGenericReadInputStream; 27 import stream.ConcurrentReadInputStream; 28 import stream.FASTQ; 29 import stream.FastaReadInputStream; 30 import stream.Read; 31 import structures.ListNum; 32 33 /** 34 * @author Brian Bushnell 35 * @date Mar 24, 2014 36 * 37 */ 38 public class CalcUniqueness { 39 40 41 /*--------------------------------------------------------------*/ 42 /*---------------- Initialization ----------------*/ 43 /*--------------------------------------------------------------*/ 44 45 main(String[] args)46 public static void main(String[] args){ 47 Timer t=new Timer(); 48 CalcUniqueness x=new CalcUniqueness(args); 49 x.process(t); 50 51 //Close the print stream if it was redirected 52 Shared.closeStream(x.outstream); 53 } 54 CalcUniqueness(String[] args)55 public CalcUniqueness(String[] args){ 56 57 {//Preparse block for help, config files, and outstream 58 PreParser pp=new PreParser(args, getClass(), false); 59 args=pp.args; 60 outstream=pp.outstream; 61 } 62 63 boolean setInterleaved=false; //Whether it was explicitly set. 64 65 Shared.capBuffers(4); 66 ReadWrite.USE_UNPIGZ=true; 67 68 int k_=25; 69 70 Parser parser=new Parser(); 71 for(int i=0; i<args.length; i++){ 72 String arg=args[i]; 73 String[] split=arg.split("="); 74 String a=split[0].toLowerCase(); 75 String b=split.length>1 ? split[1] : null; 76 77 if(parser.parse(arg, a, b)){ 78 //do nothing 79 }else if(a.equals("printlastbin") || a.equals("plb")){ 80 printLastBin=Parse.parseBoolean(b); 81 }else if(a.equals("verbose")){ 82 verbose=Parse.parseBoolean(b); 83 ByteFile1.verbose=verbose; 84 ByteFile2.verbose=verbose; 85 stream.FastaReadInputStream.verbose=verbose; 86 stream.FastqReadInputStream.verbose=verbose; 87 ConcurrentGenericReadInputStream.verbose=verbose; 88 ReadWrite.verbose=verbose; 89 }else if(a.equals("cumulative")){ 90 cumulative=Parse.parseBoolean(b); 91 }else if(a.equals("offset")){ 92 singleOffset=Integer.parseInt(b); 93 }else if(a.equals("percent") || a.equals("percents")){ 94 showPercents=Parse.parseBoolean(b); 95 }else if(a.equals("count") || a.equals("counts")){ 96 showCounts=Parse.parseBoolean(b); 97 }else if(a.equals("minprob") || a.equals("percents")){ 98 minprob=Float.parseFloat(b); 99 }else if(a.equals("k")){ 100 k_=Integer.parseInt(b); 101 }else if(a.equals("fixpeaks") || a.equals("fixspikes") || a.equals("fs")){ 102 fixSpikes=Parse.parseBoolean(b); 103 }else if(a.equals("bin") || a.equals("interval")){ 104 interval=Parse.parseKMG(b); 105 }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){ 106 parser.in1=arg; 107 }else{ 108 // System.err.println("Unknown parameter "+args[i]); 109 // assert(false) : "Unknown parameter "+args[i]; 110 throw new RuntimeException("Unknown parameter "+args[i]); 111 } 112 } 113 114 {//Process parser fields 115 Parser.processQuality(); 116 minAverageQuality=parser.minAvgQuality; 117 minAverageQualityBases=parser.minAvgQualityBases; 118 119 maxReads=parser.maxReads; 120 samplerate=parser.samplerate; 121 sampleseed=parser.sampleseed; 122 123 overwrite=parser.overwrite; 124 append=parser.append; 125 testsize=parser.testsize; 126 127 setInterleaved=parser.setInterleaved; 128 129 in1=parser.in1; 130 in2=parser.in2; 131 132 out=parser.out1; 133 134 extin=parser.extin; 135 extout=parser.extout; 136 } 137 setSampleSeed(-1); 138 139 k=k_; 140 k2=k-1; 141 assert(k>0 && k<32) : "k="+k+"; valid range is 1-31"; 142 143 if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ 144 in2=in1.replace("#", "2"); 145 in1=in1.replace("#", "1"); 146 } 147 if(in2!=null){ 148 if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");} 149 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; 150 } 151 152 assert(FastaReadInputStream.settingsOK()); 153 154 if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} 155 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ 156 ByteFile.FORCE_MODE_BF2=true; 157 } 158 159 if(out==null){ 160 out="stdout.txt"; 161 } 162 163 if(!setInterleaved){ 164 assert(in1!=null && out!=null) : "\nin1="+in1+"\nin2="+in2+"\nout="+out+"\n"; 165 if(in2!=null){ //If there are 2 input streams. 166 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; 167 outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); 168 } 169 } 170 171 if(!Tools.testOutputFiles(overwrite, append, false, out)){ 172 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out+"\n"); 173 } 174 175 ffout=FileFormat.testOutput(out, FileFormat.TEXT, extout, false, overwrite, append, false); 176 177 ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true); 178 ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true); 179 180 keySets=new AbstractKmerTable[WAYS]; 181 182 //Initialize tables 183 ScheduleMaker scheduleMaker=new ScheduleMaker(WAYS, 12, false, 1); 184 int[] schedule=scheduleMaker.makeSchedule(); 185 for(int j=0; j<WAYS; j++){ 186 if(useForest){ 187 keySets[j]=new HashForest(initialSize, true, false); 188 }else if(useTable){ 189 keySets[j]=new KmerTable(initialSize, true); 190 }else if(useArray){ 191 // keySets[j]=new HashArray1D(initialSize, -1, -1L, true); //TODO: Set maxSize 192 keySets[j]=new HashArray1D(schedule, -1L); 193 }else{ 194 throw new RuntimeException("Must use forest, table, or array data structure."); 195 } 196 } 197 198 } 199 200 /*--------------------------------------------------------------*/ 201 /*---------------- Inner Class ----------------*/ 202 /*--------------------------------------------------------------*/ 203 204 private class Counter{ 205 Counter(int mask_)206 Counter(int mask_){ 207 mask=mask_; 208 } 209 incrementQuality(Read r)210 void incrementQuality(Read r){ 211 qualCounts++; 212 double q=r.avgQualityByProbabilityDouble(true, r.length()); 213 quality+=q; 214 double p=r.probabilityErrorFree(true, r.length()); 215 perfectProb+=p; 216 } 217 increment(final long kmer)218 void increment(final long kmer){ 219 if(kmer<0){return;} 220 AbstractKmerTable table=keySets[(int)(kmer%WAYS)]; 221 int count=table.getValue(kmer); 222 if(count<1){ 223 table.set(kmer, mask); 224 misses++; 225 cmisses++; 226 }else if((count&mask)==0){ 227 table.set(kmer, count|mask); 228 misses++; 229 cmisses++; 230 }else{ 231 hits++; 232 chits++; 233 } 234 } 235 reset()236 void reset(){ 237 prevPercent=percent(); 238 prevHits=hits; 239 prevMisses=misses; 240 241 hits=misses=0; 242 quality=0; 243 perfectProb=0; 244 qualCounts=0; 245 } 246 averageQuality()247 public double averageQuality() { 248 return qualCounts<1 ? 0 : quality/qualCounts; 249 } 250 averagePerfectProb()251 public double averagePerfectProb() { 252 return qualCounts<1 ? 0 : 100*perfectProb/qualCounts; 253 } 254 percent()255 double percent(){ 256 final long sum=hits()+misses(), prevSum=prevHits+prevMisses; 257 if(sum==0){return 0;} 258 double percent=misses()*100.0/sum; 259 if(cumulative || !fixSpikes || prevSum==0){return percent;} 260 assert(!cumulative && fixSpikes); 261 return Tools.min(percent, prevPercent+0.2); 262 } 263 percentS()264 String percentS(){ 265 return String.format(Locale.ROOT, "%.3f",percent()); 266 } 267 hits()268 long hits(){return cumulative ? chits : hits;} misses()269 long misses(){return cumulative ? cmisses : misses;} 270 271 final int mask; 272 273 double perfectProb; 274 double quality; 275 long qualCounts; 276 277 /** Per-interval hash hits */ 278 long hits=0; 279 /** Per-interval hash misses */ 280 long misses=0; 281 282 /** Cumulative hash hits */ 283 long chits=0; 284 /** Cumulative hash misses */ 285 long cmisses=0; 286 287 long prevHits=0; 288 long prevMisses=0; 289 double prevPercent=0; 290 } 291 292 /*--------------------------------------------------------------*/ 293 /*---------------- Primary Method ----------------*/ 294 /*--------------------------------------------------------------*/ 295 process(Timer t)296 void process(Timer t){ 297 298 final ConcurrentReadInputStream cris; 299 { 300 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null); 301 cris.setSampleRate(samplerate, sampleseed); 302 if(verbose){System.err.println("Started cris");} 303 cris.start(); //4567 304 } 305 final boolean paired=cris.paired(); 306 if(verbose){System.err.println("Input is "+(paired ? "paired" : "unpaired"));} 307 308 TextStreamWriter tsw=null; 309 if(out!=null){ 310 tsw=new TextStreamWriter(ffout); 311 tsw.start(); 312 tsw.print("#count"); 313 if(showPercents){ 314 tsw.print("\tfirst\trand"); 315 if(paired){tsw.print("\tr1_first\tr1_rand\tr2_first\tr2_rand\tpair");} 316 } 317 if(showCounts){ 318 tsw.print("\tfirst_cnt\trand_cnt"); 319 if(paired){tsw.print("\tr1_first_cnt\tr1_rand_cnt\tr2_first_cnt\tr2_rand_cnt\tpair_cnt");} 320 } 321 if(showQuality){ 322 tsw.print("\tavg_quality\tperfect_prob"); 323 } 324 tsw.print("\n"); 325 } 326 327 //Counters for overall data statistics 328 long pairsProcessed=0; 329 long readsProcessed=0; 330 long basesProcessed=0; 331 332 //Counter for display intervals 333 long remaining=interval; 334 335 final StringBuilder sb=new StringBuilder(1024); 336 337 { 338 //Fetch initial list 339 ListNum<Read> ln=cris.nextList(); 340 ArrayList<Read> reads=(ln!=null ? ln.list : null); 341 342 if(reads!=null && !reads.isEmpty()){ 343 Read r=reads.get(0); 344 assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired()); 345 } 346 347 /* Process 1 list of reads per loop iteration */ 348 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 349 /* Process 1 read per loop iteration */ 350 for(Read r1 : reads){ 351 if(minAverageQuality<1 || r1.avgQualityFirstNBases(minAverageQualityBases)>=minAverageQuality){ 352 final Read r2=r1.mate; 353 final byte[] bases1=(r1==null ? null : r1.bases); 354 final byte[] bases2=(r2==null ? null : r2.bases); 355 final byte[] quals1=(r1==null ? null : r1.quality); 356 final byte[] quals2=(r2==null ? null : r2.quality); 357 final int length1=(bases1==null ? 0 : bases1.length); 358 final int length2=(bases2==null ? 0 : bases2.length); 359 360 pairsProcessed++; 361 362 /* Process read 1 */ 363 if(r1!=null){ 364 365 bothCounterFirst.incrementQuality(r1); 366 367 readsProcessed++; 368 basesProcessed+=length1; 369 370 if(length1>=k){ 371 if(length1>=k+singleOffset){//Fixed kmer 372 final long kmer=toKmer(bases1, quals1, singleOffset, k); 373 r1CounterFirst.increment(kmer); 374 bothCounterFirst.increment(kmer); 375 } 376 {//Random kmer 377 final long kmer=toKmer(bases1, quals1, randy.nextInt(length1-k2), k); 378 r1CounterRand.increment(kmer); 379 bothCounterRand.increment(kmer); 380 } 381 } 382 } 383 384 /* Process read 2 */ 385 if(r2!=null){ 386 387 bothCounterFirst.incrementQuality(r2); 388 389 readsProcessed++; 390 basesProcessed+=length2; 391 392 if(length2>=k){ 393 if(length2>=k+singleOffset){//Fixed kmer 394 final long kmer=toKmer(bases2, quals2, singleOffset, k); 395 r2CounterFirst.increment(kmer); 396 bothCounterFirst.increment(kmer); 397 } 398 {//Random kmer 399 final long kmer=toKmer(bases2, quals2, randy.nextInt(length2-k2), k); 400 r2CounterRand.increment(kmer); 401 bothCounterRand.increment(kmer); 402 } 403 } 404 } 405 406 /* Process pair */ 407 if(r1!=null && r2!=null){ 408 409 if(length1>k+PAIR_OFFSET && length2>k+PAIR_OFFSET){ 410 final long kmer1=toKmer(bases1, quals1, PAIR_OFFSET, k); 411 final long kmer2=toKmer(bases2, quals2, PAIR_OFFSET, k); 412 if(kmer1!=-1 && kmer2!=-1){ 413 final long kmer=(~((-1L)>>2))|((kmer1<<(2*(31-k)))^(kmer2)); 414 assert(kmer>=0) : k+", "+kmer1+", "+kmer2+", "+kmer; 415 {//Pair kmer 416 pairCounter.increment(kmer); 417 } 418 } 419 } 420 } 421 422 remaining--; 423 if(remaining<=0){ 424 425 printCountsToBuffer(sb, pairsProcessed, paired); 426 427 if(tsw!=null){tsw.print(sb.toString());} 428 429 //Reset things 430 sb.setLength(0); 431 remaining=interval; 432 } 433 } 434 } 435 436 //Fetch a new list 437 cris.returnList(ln); 438 ln=cris.nextList(); 439 reads=(ln!=null ? ln.list : null); 440 } 441 if(ln!=null){//Return final list 442 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); 443 } 444 } 445 446 if(remaining<interval && printLastBin){ 447 448 printCountsToBuffer(sb, pairsProcessed, paired); 449 450 if(tsw!=null){tsw.print(sb.toString());} 451 452 //Reset things 453 sb.setLength(0); 454 remaining=interval; 455 } 456 457 458 //Close things 459 errorState|=ReadWrite.closeStream(cris); 460 if(tsw!=null){ 461 tsw.poisonAndWait(); 462 errorState|=tsw.errorState; 463 } 464 465 t.stop(); 466 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); 467 468 if(testsize){ 469 long bytesProcessed=(new File(in1).length()+(in2==null ? 0 : new File(in2).length())); 470 double xpnano=bytesProcessed/(double)(t.elapsed); 471 String xpstring=(bytesProcessed<100000 ? ""+bytesProcessed : bytesProcessed<100000000 ? (bytesProcessed/1000)+"k" : (bytesProcessed/1000000)+"m"); 472 while(xpstring.length()<8){xpstring=" "+xpstring;} 473 outstream.println("Bytes Processed: "+xpstring+" \t"+String.format(Locale.ROOT, "%.2fm bytes/sec", xpnano*1000)); 474 } 475 476 if(errorState){ 477 throw new RuntimeException("CalcUniqueness terminated in an error state; the output may be corrupt."); 478 } 479 } 480 481 printCountsToBuffer(StringBuilder sb, long pairsProcessed, boolean paired)482 private void printCountsToBuffer(StringBuilder sb, long pairsProcessed, boolean paired){ 483 484 //Display data for the last interval 485 sb.append(pairsProcessed); 486 487 if(showPercents){ 488 sb.append('\t'); 489 sb.append(bothCounterFirst.percentS()); 490 sb.append('\t'); 491 sb.append(bothCounterRand.percentS()); 492 if(paired){ 493 sb.append('\t'); 494 sb.append(r1CounterFirst.percentS()); 495 sb.append('\t'); 496 sb.append(r1CounterRand.percentS()); 497 sb.append('\t'); 498 sb.append(r2CounterFirst.percentS()); 499 sb.append('\t'); 500 sb.append(r2CounterRand.percentS()); 501 sb.append('\t'); 502 sb.append(pairCounter.percentS()); 503 } 504 } 505 506 if(showCounts){ 507 sb.append('\t'); 508 sb.append(bothCounterFirst.misses()); 509 sb.append('\t'); 510 sb.append(bothCounterRand.misses()); 511 if(paired){ 512 sb.append('\t'); 513 sb.append(r1CounterFirst.misses()); 514 sb.append('\t'); 515 sb.append(r1CounterRand.misses()); 516 sb.append('\t'); 517 sb.append(r2CounterFirst.misses()); 518 sb.append('\t'); 519 sb.append(r2CounterRand.misses()); 520 sb.append('\t'); 521 sb.append(pairCounter.misses()); 522 } 523 } 524 525 if(showQuality){ 526 sb.append('\t').append(String.format(Locale.ROOT, "%.2f", bothCounterFirst.averageQuality())); 527 sb.append('\t').append(String.format(Locale.ROOT, "%.2f", bothCounterFirst.averagePerfectProb())); 528 } 529 530 sb.append('\n'); 531 532 bothCounterFirst.reset(); 533 bothCounterRand.reset(); 534 r1CounterFirst.reset(); 535 r1CounterRand.reset(); 536 r2CounterFirst.reset(); 537 r2CounterRand.reset(); 538 pairCounter.reset(); 539 } 540 541 542 /*--------------------------------------------------------------*/ 543 /*---------------- Helper Methods ----------------*/ 544 /*--------------------------------------------------------------*/ 545 546 /** 547 * Generate a kmer from specified start location 548 * @param bases 549 * @param start 550 * @param klen kmer length 551 * @return kmer 552 */ toKmer(final byte[] bases, byte[] quals, final int start, final int klen)553 private final long toKmer(final byte[] bases, byte[] quals, final int start, final int klen){ 554 if(minprob>0 && quals!=null){ 555 float prob=toProb(quals, start, klen); 556 if(prob<minprob){return -1;} 557 } 558 final int stop=start+klen; 559 assert(stop<=bases.length); 560 long kmer=0; 561 562 for(int i=start; i<stop; i++){ 563 final byte b=bases[i]; 564 final long x=Dedupe.baseToNumber[b]; 565 kmer=((kmer<<2)|x); 566 } 567 return kmer; 568 } 569 570 /** 571 * Generate the probability a kmer is error-free, from specified start location 572 * @param quals 573 * @param start 574 * @param klen kmer length 575 * @return kmer 576 */ toProb(final byte[] quals, final int start, final int klen)577 private final static float toProb(final byte[] quals, final int start, final int klen){ 578 final int stop=start+klen; 579 assert(stop<=quals.length); 580 float prob=1f; 581 582 for(int i=start; i<stop; i++){ 583 final byte q=quals[i]; 584 float pq=probCorrect[q]; 585 prob*=pq; 586 } 587 return prob; 588 } 589 590 /*--------------------------------------------------------------*/ 591 setSampleSeed(long seed)592 public void setSampleSeed(long seed){ 593 randy=Shared.threadLocalRandom(seed); 594 } 595 596 /*--------------------------------------------------------------*/ 597 /*---------------- Fields ----------------*/ 598 /*--------------------------------------------------------------*/ 599 600 private String in1=null; 601 private String in2=null; 602 603 private String out=null; 604 605 private String extin=null; 606 private String extout=null; 607 608 /*--------------------------------------------------------------*/ 609 610 //Counters for hashtable hits and misses of different kmers 611 private final Counter r1CounterFirst=new Counter(1); 612 private final Counter r1CounterRand=new Counter(2); 613 private final Counter r2CounterFirst=new Counter(4); 614 private final Counter r2CounterRand=new Counter(8); 615 private final Counter pairCounter=new Counter(16); 616 617 private final Counter bothCounterFirst=new Counter(32); 618 private final Counter bothCounterRand=new Counter(64); 619 620 /*--------------------------------------------------------------*/ 621 622 623 private long maxReads=-1; 624 private float samplerate=1f; 625 private long sampleseed=-1; 626 627 private long interval=25000; 628 private float minprob=0; 629 private float minAverageQuality=0; 630 private int minAverageQualityBases=20; 631 private int singleOffset=0; 632 private boolean cumulative=false; 633 private boolean showPercents=true; 634 private boolean showCounts=false; 635 private boolean printLastBin=false; 636 private boolean showQuality=true; 637 private boolean fixSpikes=false; 638 639 private final int k, k2; 640 private static final int WAYS=31; 641 private static final int PAIR_OFFSET=10; 642 643 /** Initial size of data structures */ 644 private int initialSize=512000; 645 646 /** Hold kmers. A kmer X such that X%WAYS=Y will be stored in keySets[Y] */ 647 private final AbstractKmerTable[] keySets; 648 649 /*--------------------------------------------------------------*/ 650 651 private final FileFormat ffin1; 652 private final FileFormat ffin2; 653 654 private final FileFormat ffout; 655 656 657 /*--------------------------------------------------------------*/ 658 659 private PrintStream outstream=System.err; 660 public static boolean verbose=false; 661 public boolean errorState=false; 662 private boolean overwrite=false; 663 private boolean append=false; 664 private boolean testsize=false; 665 666 private static final boolean useForest=false, useTable=false, useArray=true; 667 668 private Random randy; 669 670 private static final float[] probCorrect= 671 {0.0000f, 0.2501f, 0.3690f, 0.4988f, 0.6019f, 0.6838f, 0.7488f, 0.8005f, 0.8415f, 0.8741f, 0.9000f, 0.9206f, 0.9369f, 0.9499f, 672 0.9602f, 0.9684f, 0.9749f, 0.9800f, 0.9842f, 0.9874f, 0.9900f, 0.9921f, 0.9937f, 0.9950f, 0.9960f, 0.9968f, 0.9975f, 0.9980f, 673 0.9984f, 0.9987f, 0.9990f, 0.9992f, 0.9994f, 0.9995f, 0.9996f, 0.9997f, 0.9997f, 0.9998f, 0.9998f, 0.9999f, 0.9999f, 0.9999f, 674 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 675 0.9999f, 0.9999f, 0.9999f, 0.9999f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f}; 676 677 678 } 679