1 package hiseq; 2 3 import java.io.File; 4 import java.io.PrintStream; 5 import java.util.ArrayList; 6 import java.util.Locale; 7 import java.util.Random; 8 9 import fileIO.ByteFile; 10 import fileIO.FileFormat; 11 import fileIO.ReadWrite; 12 import fileIO.TextStreamWriter; 13 import jgi.Dedupe; 14 import kmer.AbstractKmerTable; 15 import kmer.HashArray1D; 16 import kmer.ScheduleMaker; 17 import shared.Parse; 18 import shared.Parser; 19 import shared.PreParser; 20 import shared.Shared; 21 import shared.Timer; 22 import shared.Tools; 23 import shared.TrimRead; 24 import stream.ConcurrentReadInputStream; 25 import stream.ConcurrentReadOutputStream; 26 import stream.FASTQ; 27 import stream.FastaReadInputStream; 28 import stream.Read; 29 import structures.ListNum; 30 31 /** 32 * Analyzes a flow cell for low-quality areas. 33 * Removes reads in the low-quality areas. 34 * 35 * @author Brian Bushnell 36 * @date August 31, 2016 37 * 38 */ 39 public class AnalyzeFlowCell { 40 41 /*--------------------------------------------------------------*/ 42 /*---------------- Initialization ----------------*/ 43 /*--------------------------------------------------------------*/ 44 45 /** 46 * Code entrance from the command line. 47 * @param args Command line arguments 48 */ main(String[] args)49 public static void main(String[] args){ 50 Timer t=new Timer(); 51 AnalyzeFlowCell x=new AnalyzeFlowCell(args); 52 x.process(t); 53 54 //Close the print stream if it was redirected 55 Shared.closeStream(x.outstream); 56 } 57 58 /** 59 * Constructor. 60 * @param args Command line arguments 61 */ AnalyzeFlowCell(String[] args)62 public AnalyzeFlowCell(String[] args){ 63 64 {//Preparse block for help, config files, and outstream 65 PreParser pp=new PreParser(args, getClass(), false); 66 args=pp.args; 67 outstream=pp.outstream; 68 } 69 70 //Set shared static variables 71 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; 72 ReadWrite.MAX_ZIP_THREADS=Shared.threads(); 73 74 Parser parser=parse(args); 75 76 if(gToN || discardG){MicroTile.TRACK_CYCLES=true;} 77 78 {//Process parser fields 79 Parser.processQuality(); 80 81 maxReads=parser.maxReads; 82 83 overwrite=parser.overwrite; 84 append=parser.append; 85 setInterleaved=parser.setInterleaved; 86 87 in1=parser.in1; 88 in2=parser.in2; 89 qfin1=parser.qfin1; 90 qfin2=parser.qfin2; 91 92 out1=parser.out1; 93 out2=parser.out2; 94 qfout1=parser.qfout1; 95 qfout2=parser.qfout2; 96 97 extin=parser.extin; 98 extout=parser.extout; 99 100 101 trimq=parser.trimq; 102 trimE=parser.trimE(); 103 minlen=parser.minReadLength; 104 trimLeft=parser.qtrimLeft; 105 trimRight=parser.qtrimRight; 106 } 107 108 checkFiles(); 109 110 //Create output FileFormat objects 111 ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false); 112 ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, false); 113 ffoutbad=FileFormat.testOutput(outbad, FileFormat.FASTQ, extout, true, overwrite, append, false); 114 115 //Create input FileFormat objects 116 ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true); 117 ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true); 118 } 119 120 /*--------------------------------------------------------------*/ 121 /*---------------- Initialization Helpers ----------------*/ 122 /*--------------------------------------------------------------*/ 123 parse(String[] args)124 private Parser parse(String[] args){ 125 //Create a parser object 126 Parser parser=new Parser(); 127 parser.qtrimRight=trimRight; 128 parser.trimq=trimq; 129 parser.minReadLength=minlen; 130 131 //Parse each argument 132 for(int i=0; i<args.length; i++){ 133 String arg=args[i]; 134 135 //Break arguments into their constituent parts, in the form of "a=b" 136 String[] split=arg.split("="); 137 String a=split[0].toLowerCase(); 138 String b=split.length>1 ? split[1] : null; 139 140 if(a.equals("verbose")){ 141 verbose=Parse.parseBoolean(b); 142 }else if(a.equals("seed")){ 143 seed=Long.parseLong(b); 144 }else if(a.equals("divisor") || a.equals("size")){ 145 Tile.xSize=Tile.ySize=Integer.parseInt(b); 146 }else if(a.equals("xdivisor") || a.equals("xsize")){ 147 Tile.xSize=Integer.parseInt(b); 148 }else if(a.equals("ydivisor") || a.equals("ysize")){ 149 Tile.ySize=Integer.parseInt(b); 150 }else if(a.equals("target")){ 151 targetAverageReads=Integer.parseInt(b); 152 }else if(a.equals("dump")){ 153 dump=b; 154 }else if(a.equals("indump") || a.equals("ind") || a.equals("dumpin")){ 155 dumpIn=b; 156 }else if(a.equals("pound")){ 157 pound=Parse.parseBoolean(b); 158 }else if(a.equals("loadkmers") || a.equals("usekmers")){ 159 loadKmers=Parse.parseBoolean(b); 160 }else if(a.equals("lqo") || a.equals("lowqualityonly")){ 161 discardOnlyLowQuality=Parse.parseBoolean(b); 162 }else if(a.equals("dl") || a.equals("discardlevel")){ 163 discardLevel=Integer.parseInt(b); 164 }else if(a.equals("outbad") || a.equals("outb") || a.equals("outtoss") || a.equals("outt") || a.equals("outunwanted") || a.equals("outu")){ 165 outbad=b; 166 } 167 168 else if(a.equals("deviations") || a.equals("d")){ 169 qDeviations=uDeviations=eDeviations=gDeviations=Float.parseFloat(b); 170 }else if(a.equals("qdeviations") || a.equals("qd") || a.equals("dq")){ 171 qDeviations=Float.parseFloat(b); 172 }else if(a.equals("udeviations") || a.equals("ud") || a.equals("du")){ 173 uDeviations=Float.parseFloat(b); 174 }else if(a.equals("edeviations") || a.equals("ed") || a.equals("de")){ 175 eDeviations=Float.parseFloat(b); 176 }else if(a.equals("gdeviations") || a.equals("gd") || a.equals("dg")){ 177 gDeviations=Float.parseFloat(b); 178 } 179 180 else if(a.equals("qfraction") || a.equals("qf")){ 181 qualFraction=Float.parseFloat(b); 182 }else if(a.equals("ufraction") || a.equals("uf")){ 183 uniqueFraction=Float.parseFloat(b); 184 }else if(a.equals("efraction") || a.equals("ef")){ 185 errorFreeFraction=Float.parseFloat(b); 186 }else if(a.equals("gfraction") || a.equals("gf")){ 187 gFraction=Float.parseFloat(b); 188 } 189 190 else if(a.equals("qabsolute") || a.equals("qa")){ 191 qualAbs=Float.parseFloat(b); 192 }else if(a.equals("uabsolute") || a.equals("ua")){ 193 uniqueAbs=Float.parseFloat(b); 194 }else if(a.equals("eabsolute") || a.equals("ea")){ 195 errorFreeAbs=Float.parseFloat(b); 196 }else if(a.equals("gabsolute") || a.equals("ga")){ 197 gAbs=Float.parseFloat(b); 198 } 199 200 else if(a.equals("gton")){ 201 gToN=Parse.parseBoolean(b); 202 }else if(a.equals("discardg")){ 203 discardG=Parse.parseBoolean(b); 204 } 205 206 else if(a.equals("minpolyg")){ 207 MicroTile.MIN_POLY_G=Integer.parseInt(b); 208 }else if(a.equals("trackcycles")){ 209 MicroTile.TRACK_CYCLES=Parse.parseBoolean(b); 210 } 211 212 else if(a.equals("parse_flag_goes_here")){ 213 //Set a variable here 214 }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser 215 //do nothing 216 }else{ 217 outstream.println("Unknown parameter "+args[i]); 218 assert(false) : "Unknown parameter "+args[i]; 219 // throw new RuntimeException("Unknown parameter "+args[i]); 220 } 221 } 222 return parser; 223 } 224 checkFiles()225 private void checkFiles(){ 226 doPoundReplacement(); 227 adjustInterleaving(); 228 checkFileExistence(); 229 checkStatics(); 230 } 231 doPoundReplacement()232 private void doPoundReplacement(){ 233 //Do input file # replacement 234 if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ 235 in2=in1.replace("#", "2"); 236 in1=in1.replace("#", "1"); 237 } 238 239 //Do output file # replacement 240 if(out1!=null && out2==null && out1.indexOf('#')>-1){ 241 out2=out1.replace("#", "2"); 242 out1=out1.replace("#", "1"); 243 } 244 245 //Ensure there is an input file 246 if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} 247 248 //Ensure out2 is not set without out1 249 if(out1==null && out2!=null){throw new RuntimeException("Error - cannot define out2 without defining out1.");} 250 } 251 checkFileExistence()252 private void checkFileExistence(){ 253 254 //Ensure output files can be written 255 if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, outbad, dump)){ 256 outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2+", "+outbad); 257 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+", "+outbad+", "+dump+"\n"); 258 } 259 260 //Ensure input files can be read 261 if(!Tools.testInputFiles(false, true, in1, in2)){ 262 throw new RuntimeException("\nCan't read some input files.\n"); 263 } 264 265 //Ensure that no file was specified multiple times 266 if(!Tools.testForDuplicateFiles(true, in1, in2, out1, out2, outbad, dump)){ 267 throw new RuntimeException("\nSome file names were specified multiple times.\n"); 268 } 269 } 270 adjustInterleaving()271 private void adjustInterleaving(){ 272 //Adjust interleaved detection based on the number of input files 273 if(in2!=null){ 274 if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");} 275 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; 276 } 277 278 //Adjust interleaved settings based on number of output files 279 if(!setInterleaved){ 280 assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n"; 281 if(in2!=null){ //If there are 2 input streams. 282 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; 283 outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); 284 }else{ //There is one input stream. 285 if(out2!=null){ 286 FASTQ.FORCE_INTERLEAVED=true; 287 FASTQ.TEST_INTERLEAVED=false; 288 outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); 289 } 290 } 291 } 292 } 293 checkStatics()294 private static void checkStatics(){ 295 //Adjust the number of threads for input file reading 296 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ 297 ByteFile.FORCE_MODE_BF2=true; 298 } 299 300 assert(FastaReadInputStream.settingsOK()); 301 } 302 303 /*--------------------------------------------------------------*/ 304 /*---------------- Outer Methods ----------------*/ 305 /*--------------------------------------------------------------*/ 306 307 /** Create read streams and process all data */ process(Timer t)308 public void process(Timer t){ 309 310 //Reset counters 311 readsProcessed=0; 312 basesProcessed=0; 313 314 if(dumpIn==null){ 315 flowcell=new FlowCell(); 316 if(loadKmers){loadKmers();} 317 fillTiles(); 318 keySets=null; 319 }else{ 320 flowcell=new FlowCell(dumpIn); 321 322 if(flowcell.avgReads<targetAverageReads){ 323 flowcell=flowcell.widen(targetAverageReads); 324 } 325 326 avgQuality=flowcell.avgQuality; 327 avgUnique=flowcell.avgUnique; 328 avgErrorFree=flowcell.avgErrorFree; 329 avgG=flowcell.avgG; 330 stdQuality=flowcell.stdQuality; 331 stdUnique=flowcell.stdUnique; 332 stdErrorFree=flowcell.stdErrorFree; 333 stdG=flowcell.stdG; 334 335 long readsToDiscard=markTiles(flowcell.toList(), flowcell.avgReads); 336 } 337 processReads(t); 338 } 339 340 /** Create read streams and process all data */ loadKmers()341 void loadKmers(){ 342 Timer t2=new Timer(); 343 outstream.print("Loading kmers: \t"); 344 345 //Create a read input stream 346 final ConcurrentReadInputStream cris; 347 { 348 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2); 349 cris.start(); //Start the stream 350 if(verbose){outstream.println("Started cris");} 351 } 352 boolean paired=cris.paired(); 353 // if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));} 354 355 //Process the read stream 356 loadKmersInner(cris); 357 358 if(verbose){outstream.println("Finished; closing streams.");} 359 360 //Close the read streams 361 errorState|=ReadWrite.closeStreams(cris); 362 363 t2.stop(); 364 outstream.println(t2); 365 } 366 367 /** Create read streams and process all data */ fillTiles()368 void fillTiles(){ 369 370 //Create a read input stream 371 final ConcurrentReadInputStream cris; 372 { 373 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2); 374 cris.start(); //Start the stream 375 if(verbose){outstream.println("Started cris");} 376 } 377 boolean paired=cris.paired(); 378 // if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));} 379 380 //Process the read stream 381 fillTilesInner(cris); 382 383 if(verbose){outstream.println("Finished; closing streams.");} 384 385 //Close the read streams 386 errorState|=ReadWrite.closeStreams(cris); 387 } 388 389 /** Create read streams and process all data */ processReads(Timer t)390 void processReads(Timer t){ 391 392 if(ffout1!=null || ffoutbad!=null){ 393 Timer t2=new Timer(); 394 outstream.print("Filtering reads:\t"); 395 396 //Create a read input stream 397 final ConcurrentReadInputStream cris; 398 { 399 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2); 400 cris.start(); //Start the stream 401 if(verbose){outstream.println("Started cris");} 402 } 403 boolean paired=cris.paired(); 404 // if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));} 405 406 //Optionally read output streams 407 final ConcurrentReadOutputStream ros, rosb; 408 final int buff=4; 409 if(ffout1!=null){ 410 ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, false); 411 ros.start(); //Start the stream 412 }else{ros=null;} 413 414 if(ffoutbad!=null){ 415 rosb=ConcurrentReadOutputStream.getStream(ffoutbad, null, null, null, buff, null, false); 416 rosb.start(); //Start the stream 417 }else{rosb=null;} 418 419 //Process the read stream 420 processInner(cris, ros, rosb); 421 422 if(verbose){outstream.println("Finished; closing streams.");} 423 424 //Close the read streams 425 errorState|=ReadWrite.closeStreams(cris, ros, rosb); 426 427 t2.stop(); 428 outstream.println(t2); 429 } 430 431 //Report timing and results 432 { 433 t.stop(); 434 lastReadsOut=readsProcessed-readsDiscarded; 435 outstream.println(); 436 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); 437 438 if(ffout1!=null || ffoutbad!=null){ 439 440 String rpstring=Tools.padKM(readsDiscarded, 8); 441 String bpstring=Tools.padKM(basesDiscarded, 8); 442 String gpstring=Tools.padKM(gsTransformedToN, 8); 443 outstream.println(); 444 outstream.println("Reads Discarded: "+rpstring+" \t"+String.format(Locale.ROOT, "%.3f%%", readsDiscarded*100.0/readsProcessed)); 445 outstream.println("Bases Discarded: "+bpstring+" \t"+String.format(Locale.ROOT, "%.3f%%", basesDiscarded*100.0/basesProcessed)); 446 if(gToN){outstream.println("Gs Masked By N: "+gpstring+" \t"+String.format(Locale.ROOT, "%.3f%%", gsTransformedToN*100.0/basesProcessed));} 447 outstream.println(); 448 } 449 } 450 451 //Throw an exception of there was an error in a thread 452 if(errorState){ 453 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); 454 } 455 } 456 457 /** Iterate through the reads */ processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros, final ConcurrentReadOutputStream rosb)458 void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros, final ConcurrentReadOutputStream rosb){ 459 460 //Do anything necessary prior to processing 461 readsProcessed=0; 462 basesProcessed=0; 463 464 { 465 //Grab the first ListNum of reads 466 ListNum<Read> ln=cris.nextList(); 467 //Grab the actual read list from the ListNum 468 ArrayList<Read> reads=(ln!=null ? ln.list : null); 469 470 //Check to ensure pairing is as expected 471 if(reads!=null && !reads.isEmpty()){ 472 Read r=reads.get(0); 473 assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired()); 474 } 475 476 //As long as there is a nonempty read list... 477 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 478 if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} 479 480 ArrayList<Read> keepList=new ArrayList<Read>(reads.size()); 481 ArrayList<Read> tossList=new ArrayList<Read>(4); 482 483 //Loop through each read in the list 484 for(int idx=0; idx<reads.size(); idx++){ 485 final Read r1=reads.get(idx); 486 final Read r2=r1.mate; 487 488 //Track the initial length for statistics 489 final int initialLength1=r1.length(); 490 final int initialLength2=(r1.mateLength()); 491 492 //Increment counters 493 readsProcessed+=r1.pairCount(); 494 basesProcessed+=initialLength1+initialLength2; 495 496 boolean keep=processReadPair(r1, r2); 497 if(keep){ 498 keepList.add(r1); 499 }else{ 500 tossList.add(r1); 501 readsDiscarded+=r1.pairCount(); 502 basesDiscarded+=initialLength1+initialLength2; 503 } 504 } 505 506 //Output reads to the output stream 507 if(ros!=null){ros.add(keepList, ln.id);} 508 if(rosb!=null){rosb.add(tossList, ln.id);} 509 510 //Notify the input stream that the list was used 511 cris.returnList(ln); 512 if(verbose){outstream.println("Returned a list.");} 513 514 //Fetch a new list 515 ln=cris.nextList(); 516 reads=(ln!=null ? ln.list : null); 517 } 518 519 //Notify the input stream that the final list was used 520 if(ln!=null){ 521 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); 522 } 523 } 524 525 //Do anything necessary after processing 526 527 } 528 529 /** Iterate through the reads */ loadKmersInner(final ConcurrentReadInputStream cris)530 public void loadKmersInner(final ConcurrentReadInputStream cris){ 531 532 keySets=new AbstractKmerTable[WAYS]; 533 534 //Initialize tables 535 ScheduleMaker scheduleMaker=new ScheduleMaker(WAYS, 12, false, 0.8); 536 int[] schedule=scheduleMaker.makeSchedule(); 537 for(int j=0; j<WAYS; j++){ 538 keySets[j]=new HashArray1D(schedule, -1L); 539 } 540 // for(int j=0; j<WAYS; j++){ 541 // keySets[j]=new HashArray1D(512000, -1, -1L, true); //TODO: Set maxSize 542 // } 543 //Do anything necessary prior to processing 544 545 { 546 //Grab the first ListNum of reads 547 ListNum<Read> ln=cris.nextList(); 548 //Grab the actual read list from the ListNum 549 ArrayList<Read> reads=(ln!=null ? ln.list : null); 550 551 //Check to ensure pairing is as expected 552 if(reads!=null && !reads.isEmpty()){ 553 Read r=reads.get(0); 554 assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired()); 555 } 556 557 //As long as there is a nonempty read list... 558 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 559 if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} 560 561 //Loop through each read in the list 562 for(int idx=0; idx<reads.size(); idx++){ 563 final Read r1=reads.get(idx); 564 final Read r2=r1.mate; 565 566 //Track the initial length for statistics 567 final int initialLength1=r1.length(); 568 final int initialLength2=(r1.mateLength()); 569 570 if(initialLength1>=k && randy.nextBoolean()){ 571 final long kmer=toKmer(r1.bases, r1.quality, randy.nextInt(initialLength1-k2), k); 572 if(kmer>=0){ 573 AbstractKmerTable table=keySets[(int)(kmer%WAYS)]; 574 table.increment(kmer, 1); 575 } 576 } 577 578 if(initialLength2>=k && randy.nextBoolean()){ 579 final long kmer=toKmer(r2.bases, r2.quality, randy.nextInt(initialLength2-k2), k); 580 if(kmer>=0){ 581 AbstractKmerTable table=keySets[(int)(kmer%WAYS)]; 582 table.increment(kmer, 1); 583 } 584 } 585 } 586 587 //Notify the input stream that the list was used 588 cris.returnList(ln); 589 if(verbose){outstream.println("Returned a list.");} 590 591 //Fetch a new list 592 ln=cris.nextList(); 593 reads=(ln!=null ? ln.list : null); 594 } 595 596 //Notify the input stream that the final list was used 597 if(ln!=null){ 598 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); 599 } 600 } 601 602 //Do anything necessary after processing 603 } 604 605 /** Iterate through the reads */ fillTilesInner(final ConcurrentReadInputStream cris)606 public void fillTilesInner(final ConcurrentReadInputStream cris){ 607 608 609 Timer t2=new Timer(); 610 outstream.print("Filling tiles: \t"); 611 612 613 { 614 //Grab the first ListNum of reads 615 ListNum<Read> ln=cris.nextList(); 616 //Grab the actual read list from the ListNum 617 ArrayList<Read> reads=(ln!=null ? ln.list : null); 618 619 //Check to ensure pairing is as expected 620 if(reads!=null && !reads.isEmpty()){ 621 Read r=reads.get(0); 622 assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired()); 623 } 624 625 //As long as there is a nonempty read list... 626 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 627 if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} 628 629 //Loop through each read in the list 630 for(int idx=0; idx<reads.size(); idx++){ 631 final Read r1=reads.get(idx); 632 final Read r2=r1.mate; 633 634 //Track the initial length for statistics 635 final int initialLength1=r1.length(); 636 final int initialLength2=(r1.mateLength()); 637 638 //Increment counters 639 readsProcessed+=r1.pairCount(); 640 basesProcessed+=initialLength1+initialLength2; 641 642 final MicroTile mt=flowcell.getMicroTile(r1.id); 643 644 if(loadKmers){ 645 if(initialLength1>=k){ 646 final long kmer=toKmer(r1.bases, r1.quality, randy.nextInt(initialLength1-k2), k); 647 if(kmer>=0){ 648 AbstractKmerTable table=keySets[(int)(kmer%WAYS)]; 649 if(table.getValue(kmer)>0){mt.hits++;} 650 else{mt.misses++;} 651 }else{mt.misses++;} 652 } 653 654 if(initialLength1>=k){ 655 final long kmer=toKmer(r1.bases, r1.quality, randy.nextInt(initialLength1-k2), k); 656 if(kmer>=0){ 657 AbstractKmerTable table=keySets[(int)(kmer%WAYS)]; 658 if(table.getValue(kmer)>0){mt.hits++;} 659 else{mt.misses++;} 660 }else{mt.misses++;} 661 } 662 } 663 664 mt.add(r1); 665 mt.add(r2); 666 } 667 668 //Notify the input stream that the list was used 669 cris.returnList(ln); 670 if(verbose){outstream.println("Returned a list.");} 671 672 //Fetch a new list 673 ln=cris.nextList(); 674 reads=(ln!=null ? ln.list : null); 675 } 676 677 //Notify the input stream that the final list was used 678 if(ln!=null){ 679 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); 680 } 681 } 682 683 t2.stop(); 684 outstream.println(t2); 685 686 ArrayList<MicroTile> mtList=flowcell.calcStats(); 687 if(flowcell.avgReads<targetAverageReads){ 688 flowcell=flowcell.widen(targetAverageReads); 689 mtList=flowcell.toList(); 690 } 691 avgQuality=flowcell.avgQuality; 692 avgUnique=flowcell.avgUnique; 693 avgErrorFree=flowcell.avgErrorFree; 694 avgG=flowcell.avgG; 695 696 stdQuality=flowcell.stdQuality; 697 stdUnique=flowcell.stdUnique; 698 stdErrorFree=flowcell.stdErrorFree; 699 stdG=flowcell.stdG; 700 701 long readsToDiscard=markTiles(mtList, flowcell.avgReads); 702 703 if(dump!=null){ 704 TextStreamWriter tsw=new TextStreamWriter(dump, overwrite, append, false); 705 tsw.start(); 706 707 tsw.println("#xSize\t"+Tile.xSize); 708 tsw.println("#ySize\t"+Tile.ySize); 709 tsw.println("#reads\t"+String.format(Locale.ROOT, "%d", flowcell.readsProcessed)); 710 tsw.println("#avgReads\t"+String.format(Locale.ROOT, "%.1f", flowcell.avgReads)); 711 712 tsw.println("#avgQuality\t"+String.format(Locale.ROOT, "%.3f", avgQuality)); 713 tsw.println("#avgUnique\t"+String.format(Locale.ROOT, "%.3f", avgUnique)); 714 tsw.println("#avgErrorFree\t"+String.format(Locale.ROOT, "%.3f", avgErrorFree)); 715 tsw.println("#avgG\t"+String.format(Locale.ROOT, "%.3f", avgG)); 716 717 tsw.println("#stdQuality\t"+String.format(Locale.ROOT, "%.5f", stdQuality)); 718 tsw.println("#stdUnique\t"+String.format(Locale.ROOT, "%.5f", stdUnique)); 719 tsw.println("#stdErrorFree\t"+String.format(Locale.ROOT, "%.5f", stdErrorFree)); 720 tsw.println("#stdG\t"+String.format(Locale.ROOT, "%.5f", stdG)); 721 722 tsw.println((pound ? "#" : "") + "lane\ttile\tx1\tx2\ty1\ty2\treads\tunique\tquality\tprobErrorFree\tdiscard"); 723 724 for(Lane lane : flowcell.lanes){ 725 if(lane!=null){ 726 for(Tile tile : lane.tiles){ 727 if(tile!=null){ 728 tsw.print(tile.toString()); 729 } 730 } 731 } 732 } 733 tsw.poisonAndWait(); 734 } 735 } 736 737 /*--------------------------------------------------------------*/ 738 /*---------------- Inner Methods ----------------*/ 739 /*--------------------------------------------------------------*/ 740 processReadPair(final Read r1, final Read r2)741 boolean processReadPair(final Read r1, final Read r2){ 742 boolean passes=processReadPair_inner(r1, r2); 743 if(passes){return true;} 744 if(trimq>0){ 745 TrimRead.trimFast(r1, trimLeft, trimRight, trimq, trimE, 0); 746 if(r2!=null){TrimRead.trimFast(r2, trimLeft, trimRight, trimq, trimE, 0);} 747 return r1.length()>=minlen && (r2==null || r2.length()>=minlen); 748 }else{ 749 return false; 750 } 751 } 752 753 /** 754 * Process a single read pair. 755 * @param r1 Read 1 756 * @param r2 Read 2 (may be null) 757 * @return True if the reads should be kept, false if they should be discarded. 758 */ processReadPair_inner(final Read r1, final Read r2)759 boolean processReadPair_inner(final Read r1, final Read r2){ 760 761 MicroTile mt=flowcell.getMicroTile(r1.id); 762 if(mt==null){ 763 if(!warned){ 764 outstream.println("\nWarning - a read was found with no corresponding MicroTile:\n"+r1.id); 765 warned=true; 766 } 767 return true; 768 } 769 if(mt.discard<discardLevel){return true;} 770 if(!discardOnlyLowQuality){return false;} 771 int len1=r1.length(), len2=r1.mateLength(); 772 if(len1>0){ 773 double qual=r1.avgQualityByProbabilityDouble(true, len1); 774 double prob=100*r1.probabilityErrorFree(true, len1); 775 if(qual<avgQuality-qDeviations*stdQuality){return false;} 776 if(prob<avgErrorFree-eDeviations*stdErrorFree){return false;} 777 if(discardG && shouldDiscardG(r1, mt)){return false;} 778 if(gToN){gsTransformedToN+=doGToN(r1, mt);} 779 } 780 if(len2>0){ 781 double qual=r2.avgQualityByProbabilityDouble(true, len2); 782 double prob=100*r2.probabilityErrorFree(true, len2); 783 if(qual<avgQuality-qDeviations*stdQuality){return false;} 784 if(prob<avgErrorFree-eDeviations*stdErrorFree){return false;} 785 if(discardG && shouldDiscardG(r2, mt)){return false;} 786 if(gToN){gsTransformedToN+=doGToN(r2, mt);} 787 } 788 return true; 789 } 790 shouldDiscardG(Read r, MicroTile mt)791 private boolean shouldDiscardG(Read r, MicroTile mt){ 792 final byte[] bases=r.bases; 793 final float[] gArray=mt.tracker.cycleAverages[2]; 794 795 final float thresh=(float)(avgG+Tools.max(gDeviations*stdG, avgG*gFraction, gAbs)); 796 for(int i=0; i<bases.length; i++){ 797 byte b=bases[i]; 798 if(b=='G' && gArray[i]>thresh){ 799 return true; 800 } 801 } 802 return false; 803 } 804 doGToN(Read r, MicroTile mt)805 private int doGToN(Read r, MicroTile mt){ 806 final byte[] bases=r.bases; 807 final byte[] quals=r.quality; 808 final float[] gArray=mt.tracker.cycleAverages[2]; 809 810 final float thresh=(float)(avgG+Tools.max(gDeviations*stdG, avgG*gFraction, gAbs)); 811 int changes=0; 812 for(int i=0; i<bases.length; i++){ 813 byte b=bases[i]; 814 if(b=='G' && gArray[i]>thresh){ 815 bases[i]='N'; 816 changes++; 817 if(quals!=null){quals[i]=0;} 818 } 819 } 820 return changes; 821 } 822 823 /*--------------------------------------------------------------*/ 824 /*---------------- Helper Methods ----------------*/ 825 /*--------------------------------------------------------------*/ 826 827 /** 828 * Generate a kmer from specified start location 829 * @param bases 830 * @param start 831 * @param klen kmer length 832 * @return kmer 833 */ toKmer(final byte[] bases, byte[] quals, final int start, final int klen)834 private static final long toKmer(final byte[] bases, byte[] quals, final int start, final int klen){ 835 // if(minprob>0 && quals!=null){ 836 // float prob=toProb(quals, start, klen); 837 // if(prob<minprob){return -1;} 838 // } 839 final int stop=start+klen; 840 assert(stop<=bases.length) : klen+", "+bases.length; 841 long kmer=0; 842 843 for(int i=start; i<stop; i++){ 844 final byte b=bases[i]; 845 final long x=Dedupe.baseToNumber[b]; 846 kmer=((kmer<<2)|x); 847 } 848 return kmer; 849 } 850 markTiles(ArrayList<MicroTile> mtList, double avgReads)851 private final long markTiles(ArrayList<MicroTile> mtList, double avgReads){ 852 for(MicroTile mt : mtList){ 853 mt.discard=0; 854 } 855 long readsToDiscard=0; 856 857 cDiscards=qDiscards=eDiscards=uDiscards=mtDiscards=gDiscards=mtRetained=0; 858 859 for(MicroTile mt : mtList){ 860 double q=mt.averageQuality(); 861 double e=mt.percentErrorFree(); 862 double u=mt.uniquePercent(); 863 double g=mt.maxG(); 864 865 double dq=avgQuality-q; 866 double de=avgErrorFree-e; 867 double du=u-avgUnique; 868 double dg=g-avgG; 869 870 if(mt.readCount<10 && mt.readCount<0.02f*avgReads){ 871 mt.discard++; 872 cDiscards++; 873 } 874 875 if(dq>qDeviations*stdQuality && dq>avgQuality*qualFraction && dq>qualAbs){ 876 mt.discard++; 877 qDiscards++; 878 } 879 if(de>eDeviations*stdErrorFree && de>avgErrorFree*errorFreeFraction && de>errorFreeAbs){ 880 mt.discard++; 881 eDiscards++; 882 } 883 if(avgUnique>2 && avgUnique<98){ 884 if(du>uDeviations*stdUnique && du>avgUnique*uniqueFraction && du>uniqueAbs){ 885 mt.discard++; 886 uDiscards++; 887 } 888 } 889 890 //Or mode 891 // if(dq>qDeviations*stdQuality || dq>avgQuality*qualFraction || dq>qualAbs){ 892 // mt.discard++; 893 // qDiscards++; 894 // } 895 // if(de>eDeviations*stdErrorFree || de>avgErrorFree*errorFreeFraction || de>errorFreeAbs){ 896 // mt.discard++; 897 // eDiscards++; 898 // } 899 // if(avgUnique>2 && avgUnique<98){ 900 // if(du>uDeviations*stdUnique || du>avgUnique*uniqueFraction || du>uniqueAbs){ 901 // mt.discard++; 902 // uDiscards++; 903 // } 904 // } 905 906 if((discardG || gToN) && (dg>gDeviations*stdG && dg>avgG*gFraction && dg>gAbs)){ 907 mt.discard++; 908 gDiscards++; 909 } 910 if(mt.discard>0){ 911 mtDiscards++; 912 readsToDiscard+=mt.readCount; 913 } 914 else{mtRetained++;} 915 } 916 917 outstream.println(); 918 outstream.println("Flagged "+mtDiscards+" of "+(mtDiscards+mtRetained)+" micro-tiles, containing "+readsToDiscard+" reads:"); 919 outstream.println(uDiscards+" exceeded uniqueness thresholds."); 920 outstream.println(qDiscards+" exceeded quality thresholds."); 921 outstream.println(eDiscards+" exceeded error-free probability thresholds."); 922 outstream.println(gDiscards+" contained G spikes."); 923 outstream.println(cDiscards+" had too few reads to calculate statistics."); 924 outstream.println(); 925 926 return readsToDiscard; 927 } 928 929 /*--------------------------------------------------------------*/ 930 /*---------------- Fields ----------------*/ 931 /*--------------------------------------------------------------*/ 932 933 /** Primary input file path */ 934 private String in1=null; 935 /** Secondary input file path */ 936 private String in2=null; 937 938 private String qfin1=null; 939 private String qfin2=null; 940 941 /** Primary output file path */ 942 private String out1=null; 943 /** Secondary output file path */ 944 private String out2=null; 945 946 /** Discard output file path */ 947 private String outbad=null; 948 949 private String qfout1=null; 950 private String qfout2=null; 951 952 /** Override input file extension */ 953 private String extin=null; 954 /** Override output file extension */ 955 private String extout=null; 956 957 private boolean pound=true; 958 private String dump=null; 959 private String dumpIn=null; 960 961 /*--------------------------------------------------------------*/ 962 963 /** Number of reads processed */ 964 public long readsProcessed=0; 965 /** Number of bases processed */ 966 public long basesProcessed=0; 967 968 /** Number of reads discarded */ 969 public long readsDiscarded=0; 970 /** Number of bases discarded */ 971 public long basesDiscarded=0; 972 973 protected long cDiscards=0; 974 protected long uDiscards=0; 975 protected long qDiscards=0; 976 protected long eDiscards=0; 977 protected long gDiscards=0; 978 protected long mtDiscards=0; 979 protected long mtRetained=0; 980 981 protected long gsTransformedToN=0; 982 983 /** Quit after processing this many input reads; -1 means no limit */ 984 private long maxReads=-1; 985 986 /** Whether interleaved was explicitly set. */ 987 private boolean setInterleaved=false; 988 989 /** Hold kmers. A kmer X such that X%WAYS=Y will be stored in keySets[Y] */ 990 private AbstractKmerTable[] keySets; 991 992 private int targetAverageReads=800; 993 994 private float minprob=0; 995 private static final int WAYS=31; 996 private static final int k=31, k2=30; 997 private long seed=-1; 998 999 private final Random randy=Shared.threadLocalRandom(seed); 1000 private FlowCell flowcell; 1001 1002 // private int[] avgQualityArray; 1003 // private int[] avgUniqueArray; 1004 1005 private long minCountToUse=0; 1006 1007 private float qDeviations=2f; 1008 private float uDeviations=1.5f; 1009 private float eDeviations=2f; 1010 private float gDeviations=2f; 1011 1012 private float qualFraction=0.01f; 1013 private float uniqueFraction=0.01f; 1014 private float errorFreeFraction=0.01f; 1015 private float gFraction=0.1f; 1016 1017 private float qualAbs=1f; 1018 private float uniqueAbs=1f; 1019 private float errorFreeAbs=1f; 1020 private float gAbs=0.05f; 1021 1022 private double avgQuality; 1023 private double avgUnique; 1024 private double avgErrorFree; 1025 private double avgG; 1026 1027 private double stdQuality; 1028 private double stdUnique; 1029 private double stdErrorFree; 1030 private double stdG; 1031 1032 private boolean loadKmers=true; 1033 private boolean discardOnlyLowQuality=true; 1034 private int discardLevel=1; 1035 private boolean gToN=false; 1036 private boolean discardG=false; 1037 1038 private int minlen=30; 1039 private float trimq=-1; 1040 private final float trimE; 1041 private boolean trimLeft=false; 1042 private boolean trimRight=true; 1043 1044 private boolean warned=false; 1045 1046 /*--------------------------------------------------------------*/ 1047 /*---------------- Final Fields ----------------*/ 1048 /*--------------------------------------------------------------*/ 1049 1050 /** Primary input file */ 1051 private final FileFormat ffin1; 1052 /** Secondary input file */ 1053 private final FileFormat ffin2; 1054 1055 /** Primary output file */ 1056 private final FileFormat ffout1; 1057 /** Secondary output file */ 1058 private final FileFormat ffout2; 1059 1060 /** Output for discarded reads */ 1061 private final FileFormat ffoutbad; 1062 1063 /*--------------------------------------------------------------*/ 1064 /*---------------- Common Fields ----------------*/ 1065 /*--------------------------------------------------------------*/ 1066 1067 /** Number of reads output in the last run */ 1068 public static long lastReadsOut; 1069 /** Print status messages to this output stream */ 1070 private PrintStream outstream=System.err; 1071 /** Print verbose messages */ 1072 public static boolean verbose=false; 1073 /** True if an error was encountered */ 1074 public boolean errorState=false; 1075 /** Overwrite existing output files */ 1076 private boolean overwrite=false; 1077 /** Append to existing output files */ 1078 private boolean append=false; 1079 /** This flag has no effect on singlethreaded programs */ 1080 private final boolean ordered=false; 1081 1082 } 1083