1 package icecream; 2 3 import java.io.PrintStream; 4 import java.util.ArrayList; 5 import java.util.Arrays; 6 import java.util.Random; 7 import java.util.concurrent.atomic.AtomicLong; 8 9 import dna.AminoAcid; 10 import fileIO.ByteFile; 11 import fileIO.ByteStreamWriter; 12 import fileIO.FileFormat; 13 import fileIO.ReadWrite; 14 import shared.Parse; 15 import shared.Parser; 16 import shared.PreParser; 17 import shared.ReadStats; 18 import shared.Shared; 19 import shared.Timer; 20 import shared.Tools; 21 import stream.ConcurrentReadInputStream; 22 import stream.ConcurrentReadOutputStream; 23 import stream.FASTQ; 24 import stream.FastaReadInputStream; 25 import stream.Read; 26 import stream.SamLine; 27 import structures.ByteBuilder; 28 import structures.IntList; 29 import structures.ListNum; 30 31 /** 32 * Generates chimeric PacBio reads containing inverted repeats 33 * due to missing adapters. 34 * 35 * @author Brian Bushnell 36 * @date June 8, 2019 37 * 38 */ 39 public class IceCreamMaker { 40 41 /*--------------------------------------------------------------*/ 42 /*---------------- Initialization ----------------*/ 43 /*--------------------------------------------------------------*/ 44 45 /** 46 * Code entrance from the command line. 47 * @param args Command line arguments 48 */ main(String[] args)49 public static void main(String[] args){ 50 //Start a timer immediately upon code entrance. 51 Timer t=new Timer(); 52 53 //Create an instance of this class 54 IceCreamMaker x=new IceCreamMaker(args); 55 56 //Run the object 57 x.process(t); 58 59 //Close the print stream if it was redirected 60 Shared.closeStream(x.outstream); 61 } 62 63 /** 64 * Constructor. 65 * @param args Command line arguments 66 */ IceCreamMaker(String[] args)67 public IceCreamMaker(String[] args){ 68 69 {//Preparse block for help, config files, and outstream 70 PreParser pp=new PreParser(args, getClass(), false); 71 args=pp.args; 72 outstream=pp.outstream; 73 } 74 75 //Set shared static variables prior to parsing 76 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; 77 ReadWrite.MAX_ZIP_THREADS=Shared.threads(); 78 FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false; 79 Shared.FAKE_QUAL=8; 80 // Shared.FASTA_WRAP=511; 81 SamLine.SET_FROM_OK=true; 82 83 {//Parse the arguments 84 final Parser parser=parse(args); 85 Parser.processQuality(); 86 87 maxReads=parser.maxReads; 88 overwrite=ReadStats.overwrite=parser.overwrite; 89 append=ReadStats.append=parser.append; 90 91 in1=parser.in1; 92 extin=parser.extin; 93 94 out1=parser.out1; 95 qfout1=parser.qfout1; 96 extout=parser.extout; 97 } 98 99 validateParams(); 100 fixExtensions(); //Add or remove .gz or .bz2 as needed 101 checkFileExistence(); //Ensure files can be read and written 102 checkStatics(); //Adjust file-related static fields as needed for this program 103 104 //Create output FileFormat objects 105 ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false); 106 107 //Create input FileFormat objects 108 ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true); 109 110 ffIdHist=FileFormat.testOutput(outIdHist, FileFormat.TXT, null, false, overwrite, append, false); 111 112 insThresh=insFraction; 113 delThresh=delFraction+insThresh; 114 } 115 116 /*--------------------------------------------------------------*/ 117 /*---------------- Initialization Helpers ----------------*/ 118 /*--------------------------------------------------------------*/ 119 120 /** Parse arguments from the command line */ parse(String[] args)121 private Parser parse(String[] args){ 122 123 //Create a parser object 124 Parser parser=new Parser(); 125 126 //Set any necessary Parser defaults here 127 //parser.foo=bar; 128 129 //Parse each argument 130 for(int i=0; i<args.length; i++){ 131 String arg=args[i]; 132 133 //Break arguments into their constituent parts, in the form of "a=b" 134 String[] split=arg.split("="); 135 String a=split[0].toLowerCase(); 136 String b=split.length>1 ? split[1] : null; 137 if(b!=null && b.equalsIgnoreCase("null")){b=null;} 138 139 if(a.equals("verbose")){ 140 verbose=Parse.parseBoolean(b); 141 } 142 143 else if(a.equals("idhist")){ 144 outIdHist=b; 145 } 146 147 else if(a.equals("minlength") || a.equals("minlen")){ 148 minMoleculeLength=Parse.parseIntKMG(b); 149 }else if(a.equals("maxlength") || a.equals("maxlen")){ 150 maxMoleculeLength=Parse.parseIntKMG(b); 151 }else if(a.equals("length") || a.equals("len")){ 152 minMoleculeLength=maxMoleculeLength=Parse.parseIntKMG(b); 153 } 154 155 else if(a.equals("minmovie") || a.equals("minmov")){ 156 minMovie=Parse.parseIntKMG(b); 157 }else if(a.equals("maxmovie") || a.equals("maxmov")){ 158 maxMovie=Parse.parseIntKMG(b); 159 }else if(a.equals("movie") || a.equals("mov")){ 160 minMovie=maxMovie=Parse.parseIntKMG(b); 161 } 162 163 else if(a.equals("missingrate") || a.equals("missing")){ 164 missingRate=Double.parseDouble(b); 165 assert(missingRate<=1); 166 }else if(a.equals("hiddenrate") || a.equals("hidden")){ 167 hiddenRate=Double.parseDouble(b); 168 assert(hiddenRate<=1); 169 }else if(a.equals("bothends")){ 170 allowBothEndsBad=Parse.parseBoolean(b); 171 assert(false) : "TODO"; 172 } 173 174 else if(a.equals("gc")){ 175 genomeGC=(float)Double.parseDouble(b); 176 assert(genomeGC>=0 && genomeGC<=1); 177 }else if(a.equals("genomesize")){ 178 genomeSize=Parse.parseKMG(b); 179 }else if(a.equals("addns") || a.equals("ns")){ 180 addNs=Parse.parseBoolean(b); 181 }else if(a.equals("seed")){ 182 seed=Long.parseLong(b); 183 }else if(a.equals("zmws") || a.equals("maxzmws") || a.equals("reads")){ 184 maxZMWs=Parse.parseIntKMG(b); 185 }else if(a.equals("ccs")){ 186 makeCCS=Parse.parseBoolean(b); 187 } 188 189 else if(a.equals("invertedrepeatrate") || a.equals("invertrepeatrate") || a.equals("irrate")){ 190 invertedRepeatRate=Double.parseDouble(b); 191 assert(invertedRepeatRate>=0); 192 }else if(a.equals("invertedrepeatminlen") || a.equals("invertrepeatminlen") || a.equals("irminlen")){ 193 invertedRepeatMinLength=Parse.parseIntKMG(b); 194 }else if(a.equals("invertedrepeatmaxlen") || a.equals("invertrepeatmaxlen") || a.equals("irmaxlen")){ 195 invertedRepeatMaxLength=Parse.parseIntKMG(b); 196 }else if(a.equals("invertedrepeatlen") || a.equals("invertrepeatlen") || a.equals("irlen")){ 197 invertedRepeatMinLength=invertedRepeatMaxLength=Parse.parseIntKMG(b); 198 } 199 200 else if(a.equals("miner") || a.equals("minerrorrate")){ 201 minErrorRate=(float)Double.parseDouble(b); 202 assert(minErrorRate>=0 && minErrorRate<=1); 203 }else if(a.equals("maxer") || a.equals("maxerrorrate")){ 204 maxErrorRate=(float)Double.parseDouble(b); 205 assert(maxErrorRate>=0 && maxErrorRate<=1); 206 }else if(a.equals("er") || a.equals("errorrate")){ 207 minErrorRate=maxErrorRate=(float)Double.parseDouble(b); 208 assert(minErrorRate>=0 && minErrorRate<=1); 209 } 210 211 else if(a.equals("minid") || a.equals("minidentity")){ 212 maxErrorRate=1-(float)Double.parseDouble(b); 213 assert(maxErrorRate>=0 && maxErrorRate<=1); 214 }else if(a.equals("maxid") || a.equals("maxidentity")){ 215 minErrorRate=1-(float)Double.parseDouble(b); 216 assert(minErrorRate>=0 && minErrorRate<=1); 217 }else if(a.equals("id") || a.equals("identity")){ 218 minErrorRate=maxErrorRate=1-(float)Double.parseDouble(b); 219 assert(minErrorRate>=0 && minErrorRate<=1); 220 }else if(a.equals("adderrors")){ 221 addErrors=Parse.parseBoolean(b); 222 }else if(a.equals("ref")){ 223 parser.in1=b; 224 } 225 226 else if(a.equals("parse_flag_goes_here")){ 227 long fake_variable=Parse.parseKMG(b); 228 //Set a variable here 229 }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser 230 //do nothing 231 }else{ 232 outstream.println("Unknown parameter "+args[i]); 233 assert(false) : "Unknown parameter "+args[i]; 234 } 235 } 236 237 return parser; 238 } 239 240 /** Add or remove .gz or .bz2 as needed */ fixExtensions()241 private void fixExtensions(){ 242 in1=Tools.fixExtension(in1); 243 } 244 245 /** Ensure files can be read and written */ checkFileExistence()246 private void checkFileExistence(){ 247 //Ensure output files can be written 248 if(!Tools.testOutputFiles(overwrite, append, false, out1, outIdHist)){ 249 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n"); 250 } 251 252 //Ensure input files can be read 253 if(!Tools.testInputFiles(false, true, in1)){ 254 throw new RuntimeException("\nCan't read some input files.\n"); 255 } 256 257 //Ensure that no file was specified multiple times 258 if(!Tools.testForDuplicateFiles(true, in1, out1, outIdHist)){ 259 throw new RuntimeException("\nSome file names were specified multiple times.\n"); 260 } 261 } 262 263 /** Adjust file-related static fields as needed for this program */ checkStatics()264 private static void checkStatics(){ 265 //Adjust the number of threads for input file reading 266 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ 267 ByteFile.FORCE_MODE_BF2=true; 268 } 269 270 assert(FastaReadInputStream.settingsOK()); 271 } 272 273 /** Ensure parameter ranges are within bounds and required parameters are set */ validateParams()274 private boolean validateParams(){ 275 assert(minMoleculeLength>0 && minMoleculeLength<=maxMoleculeLength) : minMoleculeLength+", "+maxMoleculeLength; 276 assert(minMovie>0 && minMovie<=maxMovie) : minMovie+", "+maxMovie; 277 278 assert(missingRate>=0 && missingRate<=1) : missingRate; 279 assert(hiddenRate>=0 && hiddenRate<=1) : hiddenRate; 280 assert(genomeGC>=0 && genomeGC<=1) : genomeGC; 281 assert(in1!=null || genomeSize>maxMoleculeLength) : genomeSize; 282 assert(in1!=null || invertedRepeatMaxLength*2<genomeSize) : genomeSize; 283 assert(maxZMWs>0) : "zmsw="+maxZMWs+"; please set to a positive number."; 284 285 assert(invertedRepeatRate>=0) : invertedRepeatRate; 286 assert(invertedRepeatMinLength>0 && invertedRepeatMinLength<=invertedRepeatMaxLength) : invertedRepeatMinLength+", "+invertedRepeatMaxLength; 287 288 assert(minErrorRate>=0 && minErrorRate<=maxErrorRate) : minErrorRate+", "+maxErrorRate; 289 return true; 290 } 291 292 /*--------------------------------------------------------------*/ 293 /*---------------- Outer Methods ----------------*/ 294 /*--------------------------------------------------------------*/ 295 296 /** Create read streams and process all data */ process(Timer t)297 void process(Timer t){ 298 299 //Turn off read validation in the input threads to increase speed 300 final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR; 301 Read.VALIDATE_IN_CONSTRUCTOR=true; 302 303 //Create a read input stream 304 final ConcurrentReadInputStream cris=makeCris(); 305 306 //Optionally create a read output stream 307 final ConcurrentReadOutputStream ros=makeCros(false); 308 309 //Reset counters 310 readsProcessed=readsOut=0; 311 basesProcessed=basesOut=0; 312 313 Random randy=Shared.threadLocalRandom(seed); 314 if(cris==null){ 315 ref=genSynthGenome(randy); 316 }else{ 317 ref=loadData(cris, randy); 318 } 319 320 if(invertedRepeatRate>0){ 321 addInvertedRepeats(ref, randy); 322 } 323 324 //Process the reads in separate threads 325 spawnThreads(cris, ros); 326 327 if(verbose){outstream.println("Finished; closing streams.");} 328 329 //Write anything that was accumulated by ReadStats 330 errorState|=ReadStats.writeAll(); 331 //Close the read streams 332 errorState|=ReadWrite.closeStreams(cris, ros); 333 334 writeIdHist(); 335 336 //Reset read validation 337 Read.VALIDATE_IN_CONSTRUCTOR=vic; 338 339 //Report timing and results 340 t.stop(); 341 outstream.println(Tools.timeReadsBasesProcessed(t, readsOut, basesOut, 8)); 342 // outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut, basesOut, 8, false)); 343 344 //Throw an exception of there was an error in a thread 345 if(errorState){ 346 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); 347 } 348 } 349 writeIdHist()350 private void writeIdHist(){ 351 if(ffIdHist==null){return;} 352 ByteStreamWriter bsw=new ByteStreamWriter(ffIdHist); 353 bsw.start(); 354 bsw.print("#Identity\tCount\n".getBytes()); 355 for(int i=0; i<idHist.length; i++){ 356 bsw.print(i*100.0/(ID_BINS-1), 3).print('\t').println(idHist[i]); 357 } 358 errorState|=bsw.poisonAndWait(); 359 } 360 361 /** Create a Read Input Stream */ makeCris()362 private ConcurrentReadInputStream makeCris(){ 363 if(ffin1==null){return null;} 364 ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null); 365 cris.start(); //Start the stream 366 if(verbose){outstream.println("Started cris");} 367 return cris; 368 } 369 370 /** Create a Read Output Stream */ makeCros(boolean pairedInput)371 private ConcurrentReadOutputStream makeCros(boolean pairedInput){ 372 if(ffout1==null){return null;} 373 374 //Set output buffer size 375 final int buff=4; 376 377 final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream( 378 ffout1, null, qfout1, null, buff, null, false); 379 ros.start(); //Start the stream 380 return ros; 381 } 382 383 /** Spawn process threads */ spawnThreads(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros)384 private void spawnThreads(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){ 385 386 //Do anything necessary prior to processing 387 388 //Determine how many threads may be used 389 final int threads=Shared.threads(); 390 391 //Fill a list with ProcessThreads 392 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); 393 for(int i=0; i<threads; i++){ 394 alpt.add(new ProcessThread(ros, i, nextZmwID, seed)); 395 } 396 397 //Start the threads 398 for(ProcessThread pt : alpt){ 399 pt.start(); 400 } 401 402 //Wait for threads to finish 403 waitForThreads(alpt); 404 405 //Do anything necessary after processing 406 407 } 408 409 /** Wait until all worker threads are finished, then return */ waitForThreads(ArrayList<ProcessThread> alpt)410 private void waitForThreads(ArrayList<ProcessThread> alpt){ 411 412 //Wait for completion of all threads 413 boolean success=true; 414 for(ProcessThread pt : alpt){ 415 416 //Wait until this thread has terminated 417 while(pt.getState()!=Thread.State.TERMINATED){ 418 try { 419 //Attempt a join operation 420 pt.join(); 421 } catch (InterruptedException e) { 422 //Potentially handle this, if it is expected to occur 423 e.printStackTrace(); 424 } 425 } 426 427 //Accumulate per-thread statistics 428 readsOut+=pt.readsOutT; 429 basesOut+=pt.basesOutT; 430 Tools.add(idHist, pt.idHistT); 431 432 success&=pt.success; 433 } 434 435 //Track whether any threads failed 436 if(!success){errorState=true;} 437 } 438 439 /*--------------------------------------------------------------*/ 440 /*---------------- Inner Methods ----------------*/ 441 /*--------------------------------------------------------------*/ 442 randomBase(Random randy)443 private byte randomBase(Random randy) { 444 float rGC=randy.nextFloat(); 445 if(rGC>=genomeGC){//AT 446 return (byte)(randy.nextBoolean() ? 'A' : 'T'); 447 }else{//GC 448 return (byte)(randy.nextBoolean() ? 'G' : 'C'); 449 } 450 } 451 randomLength(int min, int max, Random randy)452 private static int randomLength(int min, int max, Random randy) { 453 if(min==max){return min;} 454 int range=max-min+1; 455 int x=min+Tools.min(randy.nextInt(range), randy.nextInt(range)); 456 // System.err.println(x+", "+min+", "+max); 457 // new Exception().printStackTrace(); 458 // System.err.println(randy.getClass()); 459 // System.err.println(randy.nextLong()); 460 return x; 461 } 462 randomRate(float min, float max, Random randy)463 private static float randomRate(float min, float max, Random randy) { 464 if(min==max){return min;} 465 float range=max-min; 466 // float a=(randy.nextFloat()+randy.nextFloat()); 467 float b=(randy.nextFloat()+randy.nextFloat()); 468 float c=(1.6f*randy.nextFloat()+0.4f*randy.nextFloat()); 469 float x=min+range*0.5f*Tools.min(b, c); 470 // assert(false) : "x="+x+", a="+a+", b="+b+", c="+c+", min="+min+", max="+max; 471 // System.err.println(x+", "+min+", "+max); 472 return x; 473 } 474 genSynthGenome(Random randy)475 private byte[] genSynthGenome(Random randy){ 476 assert(genomeSize<=MAX_GENOME_LENGTH) : genomeSize; 477 byte[] array=new byte[(int)genomeSize]; 478 for(int i=0; i<genomeSize; i++){ 479 array[i]=randomBase(randy); 480 } 481 return array; 482 } 483 loadData(ConcurrentReadInputStream cris, Random randy)484 private byte[] loadData(ConcurrentReadInputStream cris, Random randy){ 485 486 ByteBuilder bb=new ByteBuilder(); 487 488 //Grab the first ListNum of reads 489 ListNum<Read> ln=cris.nextList(); 490 491 //As long as there is a nonempty read list... 492 while(ln!=null && ln.size()>0){ 493 // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access 494 495 for(Read r : ln){ 496 497 // System.err.println("Fetched read len="+r.length());//123 498 499 //Increment counters 500 readsProcessed+=r.pairCount(); 501 basesProcessed+=r.pairLength(); 502 503 504 505 // System.err.println("Filter: addNs="+addNs+", (r.length()>maxMoleculeLength && (invertedRepeatRate<=0 || r.length()>2*invertedRepeatMaxLength);//123 506 507 //Filter disabled; it causes short sequences to be ignored. 508 //Optional filter criteria 509 // if(!addNs || (r.length()>maxMoleculeLength && (invertedRepeatRate<=0 || r.length()>2*invertedRepeatMaxLength))){ 510 final byte[] bases=r.bases; 511 512 if(addNs){ 513 if(bb.length()>0){bb.append('N');} 514 }else{ 515 for(int i=0; i<bases.length; i++){ 516 if(!AminoAcid.isFullyDefined(bases[i])){ 517 bases[i]=randomBase(randy); 518 } 519 } 520 } 521 522 if(bb.length()+bases.length<=MAX_GENOME_LENGTH){ 523 bb.append(bases); 524 // System.err.println("Appended "+r.length());//123 525 }else{ 526 // System.err.println("Appending partial.");//123 527 for(byte b : bases){ 528 bb.append(b); 529 if(bb.length>=MAX_GENOME_LENGTH){ 530 cris.returnList(ln.id, true); 531 // System.err.println("Returning partial "+bb.length);//123 532 return bb.toBytes(); 533 } 534 } 535 } 536 // } 537 } 538 539 //Notify the input stream that the list was used 540 cris.returnList(ln); 541 // if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access 542 543 //Fetch a new list 544 ln=cris.nextList(); 545 } 546 547 //Notify the input stream that the final list was used 548 if(ln!=null){ 549 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); 550 } 551 552 // System.err.println("Returning full "+bb.length);//123 553 return bb.toBytes(); 554 } 555 addInvertedRepeats(byte[] bases, Random randy)556 private void addInvertedRepeats(byte[] bases, Random randy){ 557 558 long added=0; 559 long toAdd=(long) (bases.length*invertedRepeatRate); 560 while(added<toAdd){ 561 int len=randomLength(invertedRepeatMinLength, invertedRepeatMaxLength, randy); 562 int start=randy.nextInt(bases.length-2*len); 563 int stop=start+len; 564 boolean OK=true; 565 for(int i=0; i<len && OK; i++){ 566 OK&=(bases[start+i]!='N' && bases[stop+i]!='N'); 567 } 568 if(OK){ 569 for(int i=0; i<len; i++){ 570 byte b=bases[stop-i-1]; 571 bases[stop+i]=AminoAcid.baseToComplementExtended[b]; 572 } 573 added+=2*len; 574 // System.err.println("added="+added+"/"+toAdd); 575 }else{ 576 // 577 } 578 } 579 580 } 581 582 /*--------------------------------------------------------------*/ 583 /*---------------- Inner Classes ----------------*/ 584 /*--------------------------------------------------------------*/ 585 586 /** */ 587 private class ProcessThread extends Thread { 588 589 //Constructor ProcessThread(final ConcurrentReadOutputStream ros_, final int tid_, final AtomicLong nextZmwID_, final long seed)590 ProcessThread(final ConcurrentReadOutputStream ros_, final int tid_, 591 final AtomicLong nextZmwID_, final long seed){ 592 ros=ros_; 593 tid=tid_; 594 atomicZmwID=nextZmwID_; 595 // assert(false) : randy.getClass()+", "+randy.nextLong(); 596 } 597 598 //Called by start() 599 @Override run()600 public void run(){ 601 //Do anything necessary prior to processing 602 randy=Shared.threadLocalRandom(seed<0 ? seed : seed+(tid+1)*tid*1000L); 603 604 //Process the reads 605 processInner(); 606 607 //Do anything necessary after processing 608 609 //Indicate successful exit status 610 success=true; 611 } 612 613 /** Iterate through the reads */ processInner()614 void processInner(){ 615 616 //As long as there is a nonempty read list... 617 for(long generated=atomicZmwID.getAndAdd(readsPerList); generated<maxZMWs; 618 generated=atomicZmwID.getAndAdd(readsPerList)){ 619 // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access 620 621 long toGenerate=Tools.min(readsPerList, maxZMWs-generated); 622 623 ArrayList<Read> reads=generateList((int)toGenerate, generated); 624 625 //Output reads to the output stream 626 if(ros!=null){ros.add(reads, 0);} 627 } 628 } 629 630 /** Generate the next list of reads */ generateList(int toGenerate, long nextID)631 private ArrayList<Read> generateList(int toGenerate, long nextID){ 632 633 //Grab the actual read list from the ListNum 634 final ArrayList<Read> reads=new ArrayList<Read>(toGenerate); 635 636 //Loop through each read in the list 637 for(int i=0; i<toGenerate; i++, nextID++){ 638 ArrayList<Read> zmw=generateZMW(nextID); 639 if(zmw==null){ 640 i--; 641 nextID--; 642 }else{reads.addAll(zmw);} 643 } 644 645 return reads; 646 } 647 median(ArrayList<ReadBuilder> list)648 private ReadBuilder median(ArrayList<ReadBuilder> list){ 649 if(list.size()<3){return null;} 650 IntList lengths=new IntList(list.size()-2); 651 652 for(int i=1; i<list.size()-1; i++){ 653 lengths.add(list.get(i).length()); 654 } 655 lengths.sort(); 656 int median=lengths.get((lengths.size-1)/2); 657 658 for(int i=1; i<list.size()-1; i++){ 659 ReadBuilder rb=list.get(i); 660 if(rb.length()==median){ 661 return rb; 662 } 663 } 664 665 assert(false); 666 return null; 667 } 668 669 /** 670 * Generate a single read. 671 */ generateZMW(final long nextID)672 private ArrayList<Read> generateZMW(final long nextID){ 673 674 final int movieLength=randomLength(minMovie, maxMovie, randy); 675 final float errorRate=randomRate(minErrorRate, maxErrorRate, randy); 676 677 ArrayList<ReadBuilder> baseCalls=baseCallAllPasses(movieLength, errorRate, nextID); 678 if(baseCalls==null){ 679 // System.err.println(movieLength+", "+errorRate+", "+nextID);//123 680 return null; 681 } 682 683 final boolean missing=randy.nextFloat()<missingRate; 684 if(missing){ 685 final int missingMod=randy.nextInt(2); 686 ArrayList<ReadBuilder> temp=new ArrayList<ReadBuilder>(); 687 688 ReadBuilder current=null; 689 for(int i=0; i<baseCalls.size(); i++){ 690 ReadBuilder rb=baseCalls.get(i); 691 assert(rb.subreads==1); 692 assert(rb.missing==0); 693 assert(rb.adapters==0); 694 if(current!=null && (i&1)==missingMod){ 695 current.add(rb); 696 current.missing++; 697 assert(current.subreads>1); 698 assert(current.missing>0); 699 temp.add(current); 700 current=null; 701 }else{ 702 if(current!=null){temp.add(current);} 703 current=rb; 704 } 705 } 706 if(current!=null){temp.add(current);} 707 baseCalls=temp; 708 } 709 710 if(makeCCS){ 711 ReadBuilder median=median(baseCalls); 712 if(median==null){return null;} 713 baseCalls.clear(); 714 baseCalls.add(median); 715 }else if(hiddenRate>0){ 716 ArrayList<ReadBuilder> temp=new ArrayList<ReadBuilder>(); 717 718 ReadBuilder current=null; 719 for(int i=0; i<baseCalls.size(); i++){ 720 ReadBuilder rb=baseCalls.get(i); 721 assert(rb.adapters==0); 722 if(current!=null && randy.nextFloat()<hiddenRate){ 723 current.add(baseCallAdapter(errorRate)); 724 current.add(rb); 725 assert(current.adapters>0); 726 assert(current.subreads>1); 727 }else{ 728 if(current!=null){temp.add(current);} 729 current=rb; 730 assert(current.adapters==0); 731 } 732 } 733 if(current!=null){temp.add(current);} 734 baseCalls=temp; 735 } 736 737 738 ArrayList<Read> reads=new ArrayList<Read>(); 739 for(ReadBuilder rb : baseCalls){ 740 Read r=rb.toRead(); 741 readsOutT+=r.pairCount(); 742 basesOutT+=r.length(); 743 reads.add(r); 744 } 745 746 idHistT[(int)((1-errorRate)*(ID_BINS-1)+0.5f)]++; 747 return reads; 748 } 749 750 private ArrayList<ReadBuilder> baseCallAllPasses(final int movieLength, final float errorRate, long zmw){ 751 byte[] frag=null; 752 753 for(int i=0; i<10 && frag==null; i++){//retry several times 754 frag=fetchBases(ref); 755 } 756 757 if(frag==null){ 758 // System.err.println("Failed baseCallAllPasses("+movieLength+", "+errorRate+", "+zmw);//123 759 return null; 760 } 761 762 ArrayList<ReadBuilder> list=new ArrayList<ReadBuilder>(); 763 int movieRemaining=movieLength; 764 int moviePos=0; 765 assert(frag.length>0) : frag.length; 766 int start=randy.nextInt(frag.length); 767 while(movieRemaining>0){ 768 ReadBuilder rb=baseCallOnePass(frag, errorRate, start, moviePos, movieRemaining, zmw); 769 list.add(rb); 770 start=0; 771 int elapsed=rb.length()+adapterLen; 772 moviePos+=elapsed; 773 movieRemaining-=elapsed; 774 AminoAcid.reverseComplementBasesInPlace(frag); 775 } 776 return list; 777 } 778 779 /** Call bases for one pass */ 780 private ReadBuilder baseCallOnePass(final byte[] frag, final float errorRate, final int start, final int movieStart, int movieRemaining, long zmw){ 781 final float mult=1/(1-errorRate); 782 ByteBuilder bb=new ByteBuilder(); 783 int fpos=start; 784 for(; fpos<frag.length && movieRemaining>0; fpos++){ 785 float f=randy.nextFloat(); 786 byte b=frag[fpos]; 787 if(f>=errorRate){ 788 bb.append(b); 789 movieRemaining--; 790 }else{ 791 f=mult*(1-f); 792 if(f<insThresh){//ins 793 b=AminoAcid.numberToBase[randy.nextInt(4)]; 794 bb.append(b); 795 fpos--; 796 movieRemaining--; 797 }else if(f<delThresh){//del 798 799 }else{//sub 800 int x=AminoAcid.baseToNumber[b]+randy.nextInt(3)+1; 801 b=AminoAcid.numberToBase[x&3]; 802 bb.append(b); 803 movieRemaining--; 804 } 805 } 806 } 807 808 float passes=(fpos-start)*1.0f/frag.length; 809 ReadBuilder rb=new ReadBuilder(bb, passes, movieStart, zmw); 810 rb.errorRate=errorRate; 811 return rb; 812 } 813 814 /** Call bases for an adapter sequence pass */ 815 private ReadBuilder baseCallAdapter(final float errorRate){ 816 ReadBuilder rb=baseCallOnePass(pacbioAdapter, errorRate, 0, 0, 999999, 0); 817 rb.passes=0; 818 rb.fullPasses=0; 819 rb.subreads=0; 820 rb.adapters=1; 821 return rb; 822 } 823 824 private byte[] fetchBases(byte[] source){ 825 826 final int len=randomLength(minMoleculeLength, maxMoleculeLength, randy); 827 int start=len>=source.length ? 0 : randy.nextInt(source.length-len); 828 int stop=Tools.min(start+len, source.length); 829 830 // System.err.println("fetchBases(len="+len+", slen="+source.length+", start="+start+", stop="+stop);//123 831 832 for(int i=start; i<stop; i++){ 833 if(!AminoAcid.isFullyDefined(source[i])){ 834 return null; 835 } 836 } 837 if(stop-start<1){return null;} 838 byte[] frag=Arrays.copyOfRange(source, start, stop); 839 if(randy.nextBoolean()){AminoAcid.reverseComplementBasesInPlace(frag);} 840 return frag; 841 } 842 843 /** Number of reads retained by this thread */ 844 protected long readsOutT=0; 845 /** Number of bases retained by this thread */ 846 protected long basesOutT=0; 847 848 /** True only if this thread has completed successfully */ 849 boolean success=false; 850 851 private final AtomicLong atomicZmwID; 852 private final int readsPerList=Shared.bufferLen(); 853 854 /** Random number source */ 855 private Random randy; 856 857 /** Random number source */ 858 private final long[] idHistT=new long[ID_BINS]; 859 860 /** Shared output stream */ 861 private final ConcurrentReadOutputStream ros; 862 /** Thread ID */ 863 final int tid; 864 } 865 866 /*--------------------------------------------------------------*/ 867 868 869 870 /*--------------------------------------------------------------*/ 871 /*---------------- Fields ----------------*/ 872 /*--------------------------------------------------------------*/ 873 874 /** Primary input file path */ 875 private String in1=null; 876 877 /** Primary output file path */ 878 private String out1=null; 879 880 /** Primary output file path */ 881 private String outIdHist=null; 882 883 private String qfout1=null; 884 885 /** Override input file extension */ 886 private String extin=null; 887 /** Override output file extension */ 888 private String extout=null; 889 890 /*--------------------------------------------------------------*/ 891 892 /** */ 893 private int minMoleculeLength=500; 894 /** */ 895 private int maxMoleculeLength=10000; 896 /** */ 897 private int minMovie=500; 898 /** */ 899 private int maxMovie=40000; 900 901 /** */ 902 private double missingRate=0.0; 903 /** */ 904 private double hiddenRate=0.0; 905 /** */ 906 private boolean allowBothEndsBad=false; 907 908 /** */ 909 private float genomeGC=0.6f; 910 /** */ 911 private long genomeSize=10000000; 912 /** */ 913 private boolean addNs=true; 914 915 /** */ 916 private double invertedRepeatRate=0.0; 917 /** */ 918 private int invertedRepeatMinLength=100; 919 /** */ 920 private int invertedRepeatMaxLength=10000; 921 922 /** */ 923 private float minErrorRate=0.05f; 924 /** */ 925 private float maxErrorRate=0.25f; 926 /** */ 927 private boolean addErrors=true; 928 /** One read per ZMW */ 929 private boolean makeCCS=false; 930 931 /** */ 932 private long seed=-1; 933 934 private long[] idHist=new long[ID_BINS]; 935 936 //These should add to 1 937 private float insFraction=0.40f; 938 private float delFraction=0.35f; 939 private float subFraction=0.25f; 940 941 private final float insThresh; 942 private final float delThresh; 943 944 /*--------------------------------------------------------------*/ 945 946 /** Number of reads processed */ 947 protected long readsProcessed=0; 948 /** Number of bases processed */ 949 protected long basesProcessed=0; 950 951 /** Number of reads retained */ 952 protected long readsOut=0; 953 /** Number of bases retained */ 954 protected long basesOut=0; 955 956 /** Quit after processing this many INPUT reads */ 957 private long maxReads=-1; 958 959 /** Quit after generating this many OUTPUT zmws */ 960 private long maxZMWs=-1; 961 962 /** Reference genome, max 2Gbp */ 963 private byte[] ref; 964 965 private AtomicLong nextZmwID=new AtomicLong(0); 966 967 /*--------------------------------------------------------------*/ 968 /*---------------- Final Fields ----------------*/ 969 /*--------------------------------------------------------------*/ 970 971 /** Primary input file */ 972 private final FileFormat ffin1; 973 974 /** Primary output file */ 975 private final FileFormat ffout1; 976 977 /** Primary output file */ 978 private final FileFormat ffIdHist; 979 980 /*--------------------------------------------------------------*/ 981 /*---------------- Constants ----------------*/ 982 /*--------------------------------------------------------------*/ 983 984 private static final int ID_BINS=201; 985 986 private static final long MAX_GENOME_LENGTH=2000000000; 987 988 public static final byte[] pacbioAdapter="ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT".getBytes(); 989 public static final int adapterLen=pacbioAdapter.length; 990 991 /*--------------------------------------------------------------*/ 992 /*---------------- Common Fields ----------------*/ 993 /*--------------------------------------------------------------*/ 994 995 /** Print status messages to this output stream */ 996 private PrintStream outstream=System.err; 997 /** Print verbose messages */ 998 public static boolean verbose=false; 999 /** True if an error was encountered */ 1000 public boolean errorState=false; 1001 /** Overwrite existing output files */ 1002 private boolean overwrite=false; 1003 /** Append to existing output files */ 1004 private boolean append=false; 1005 1006 } 1007