1 package icecream;
2 
3 import java.io.PrintStream;
4 import java.util.ArrayList;
5 import java.util.Arrays;
6 import java.util.Random;
7 import java.util.concurrent.atomic.AtomicLong;
8 
9 import dna.AminoAcid;
10 import fileIO.ByteFile;
11 import fileIO.ByteStreamWriter;
12 import fileIO.FileFormat;
13 import fileIO.ReadWrite;
14 import shared.Parse;
15 import shared.Parser;
16 import shared.PreParser;
17 import shared.ReadStats;
18 import shared.Shared;
19 import shared.Timer;
20 import shared.Tools;
21 import stream.ConcurrentReadInputStream;
22 import stream.ConcurrentReadOutputStream;
23 import stream.FASTQ;
24 import stream.FastaReadInputStream;
25 import stream.Read;
26 import stream.SamLine;
27 import structures.ByteBuilder;
28 import structures.IntList;
29 import structures.ListNum;
30 
31 /**
32  * Generates chimeric PacBio reads containing inverted repeats
33  * due to missing adapters.
34  *
35  * @author Brian Bushnell
36  * @date June 8, 2019
37  *
38  */
39 public class IceCreamMaker {
40 
41 	/*--------------------------------------------------------------*/
42 	/*----------------        Initialization        ----------------*/
43 	/*--------------------------------------------------------------*/
44 
45 	/**
46 	 * Code entrance from the command line.
47 	 * @param args Command line arguments
48 	 */
main(String[] args)49 	public static void main(String[] args){
50 		//Start a timer immediately upon code entrance.
51 		Timer t=new Timer();
52 
53 		//Create an instance of this class
54 		IceCreamMaker x=new IceCreamMaker(args);
55 
56 		//Run the object
57 		x.process(t);
58 
59 		//Close the print stream if it was redirected
60 		Shared.closeStream(x.outstream);
61 	}
62 
63 	/**
64 	 * Constructor.
65 	 * @param args Command line arguments
66 	 */
IceCreamMaker(String[] args)67 	public IceCreamMaker(String[] args){
68 
69 		{//Preparse block for help, config files, and outstream
70 			PreParser pp=new PreParser(args, getClass(), false);
71 			args=pp.args;
72 			outstream=pp.outstream;
73 		}
74 
75 		//Set shared static variables prior to parsing
76 		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
77 		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
78 		FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false;
79 		Shared.FAKE_QUAL=8;
80 //		Shared.FASTA_WRAP=511;
81 		SamLine.SET_FROM_OK=true;
82 
83 		{//Parse the arguments
84 			final Parser parser=parse(args);
85 			Parser.processQuality();
86 
87 			maxReads=parser.maxReads;
88 			overwrite=ReadStats.overwrite=parser.overwrite;
89 			append=ReadStats.append=parser.append;
90 
91 			in1=parser.in1;
92 			extin=parser.extin;
93 
94 			out1=parser.out1;
95 			qfout1=parser.qfout1;
96 			extout=parser.extout;
97 		}
98 
99 		validateParams();
100 		fixExtensions(); //Add or remove .gz or .bz2 as needed
101 		checkFileExistence(); //Ensure files can be read and written
102 		checkStatics(); //Adjust file-related static fields as needed for this program
103 
104 		//Create output FileFormat objects
105 		ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
106 
107 		//Create input FileFormat objects
108 		ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
109 
110 		ffIdHist=FileFormat.testOutput(outIdHist, FileFormat.TXT, null, false, overwrite, append, false);
111 
112 		insThresh=insFraction;
113 		delThresh=delFraction+insThresh;
114 	}
115 
116 	/*--------------------------------------------------------------*/
117 	/*----------------    Initialization Helpers    ----------------*/
118 	/*--------------------------------------------------------------*/
119 
120 	/** Parse arguments from the command line */
parse(String[] args)121 	private Parser parse(String[] args){
122 
123 		//Create a parser object
124 		Parser parser=new Parser();
125 
126 		//Set any necessary Parser defaults here
127 		//parser.foo=bar;
128 
129 		//Parse each argument
130 		for(int i=0; i<args.length; i++){
131 			String arg=args[i];
132 
133 			//Break arguments into their constituent parts, in the form of "a=b"
134 			String[] split=arg.split("=");
135 			String a=split[0].toLowerCase();
136 			String b=split.length>1 ? split[1] : null;
137 			if(b!=null && b.equalsIgnoreCase("null")){b=null;}
138 
139 			if(a.equals("verbose")){
140 				verbose=Parse.parseBoolean(b);
141 			}
142 
143 			else if(a.equals("idhist")){
144 				outIdHist=b;
145 			}
146 
147 			else if(a.equals("minlength") || a.equals("minlen")){
148 				minMoleculeLength=Parse.parseIntKMG(b);
149 			}else if(a.equals("maxlength") || a.equals("maxlen")){
150 				maxMoleculeLength=Parse.parseIntKMG(b);
151 			}else if(a.equals("length") || a.equals("len")){
152 				minMoleculeLength=maxMoleculeLength=Parse.parseIntKMG(b);
153 			}
154 
155 			else if(a.equals("minmovie") || a.equals("minmov")){
156 				minMovie=Parse.parseIntKMG(b);
157 			}else if(a.equals("maxmovie") || a.equals("maxmov")){
158 				maxMovie=Parse.parseIntKMG(b);
159 			}else if(a.equals("movie") || a.equals("mov")){
160 				minMovie=maxMovie=Parse.parseIntKMG(b);
161 			}
162 
163 			else if(a.equals("missingrate") || a.equals("missing")){
164 				missingRate=Double.parseDouble(b);
165 				assert(missingRate<=1);
166 			}else if(a.equals("hiddenrate") || a.equals("hidden")){
167 				hiddenRate=Double.parseDouble(b);
168 				assert(hiddenRate<=1);
169 			}else if(a.equals("bothends")){
170 				allowBothEndsBad=Parse.parseBoolean(b);
171 				assert(false) : "TODO";
172 			}
173 
174 			else if(a.equals("gc")){
175 				genomeGC=(float)Double.parseDouble(b);
176 				assert(genomeGC>=0 && genomeGC<=1);
177 			}else if(a.equals("genomesize")){
178 				genomeSize=Parse.parseKMG(b);
179 			}else if(a.equals("addns") || a.equals("ns")){
180 				addNs=Parse.parseBoolean(b);
181 			}else if(a.equals("seed")){
182 				seed=Long.parseLong(b);
183 			}else if(a.equals("zmws") || a.equals("maxzmws") || a.equals("reads")){
184 				maxZMWs=Parse.parseIntKMG(b);
185 			}else if(a.equals("ccs")){
186 				makeCCS=Parse.parseBoolean(b);
187 			}
188 
189 			else if(a.equals("invertedrepeatrate") || a.equals("invertrepeatrate") || a.equals("irrate")){
190 				invertedRepeatRate=Double.parseDouble(b);
191 				assert(invertedRepeatRate>=0);
192 			}else if(a.equals("invertedrepeatminlen") || a.equals("invertrepeatminlen") || a.equals("irminlen")){
193 				invertedRepeatMinLength=Parse.parseIntKMG(b);
194 			}else if(a.equals("invertedrepeatmaxlen") || a.equals("invertrepeatmaxlen") || a.equals("irmaxlen")){
195 				invertedRepeatMaxLength=Parse.parseIntKMG(b);
196 			}else if(a.equals("invertedrepeatlen") || a.equals("invertrepeatlen") || a.equals("irlen")){
197 				invertedRepeatMinLength=invertedRepeatMaxLength=Parse.parseIntKMG(b);
198 			}
199 
200 			else if(a.equals("miner") || a.equals("minerrorrate")){
201 				minErrorRate=(float)Double.parseDouble(b);
202 				assert(minErrorRate>=0 && minErrorRate<=1);
203 			}else if(a.equals("maxer") || a.equals("maxerrorrate")){
204 				maxErrorRate=(float)Double.parseDouble(b);
205 				assert(maxErrorRate>=0 && maxErrorRate<=1);
206 			}else if(a.equals("er") || a.equals("errorrate")){
207 				minErrorRate=maxErrorRate=(float)Double.parseDouble(b);
208 				assert(minErrorRate>=0 && minErrorRate<=1);
209 			}
210 
211 			else if(a.equals("minid") || a.equals("minidentity")){
212 				maxErrorRate=1-(float)Double.parseDouble(b);
213 				assert(maxErrorRate>=0 && maxErrorRate<=1);
214 			}else if(a.equals("maxid") || a.equals("maxidentity")){
215 				minErrorRate=1-(float)Double.parseDouble(b);
216 				assert(minErrorRate>=0 && minErrorRate<=1);
217 			}else if(a.equals("id") || a.equals("identity")){
218 				minErrorRate=maxErrorRate=1-(float)Double.parseDouble(b);
219 				assert(minErrorRate>=0 && minErrorRate<=1);
220 			}else if(a.equals("adderrors")){
221 				addErrors=Parse.parseBoolean(b);
222 			}else if(a.equals("ref")){
223 				parser.in1=b;
224 			}
225 
226 			else if(a.equals("parse_flag_goes_here")){
227 				long fake_variable=Parse.parseKMG(b);
228 				//Set a variable here
229 			}else if(parser.parse(arg, a, b)){//Parse standard flags in the parser
230 				//do nothing
231 			}else{
232 				outstream.println("Unknown parameter "+args[i]);
233 				assert(false) : "Unknown parameter "+args[i];
234 			}
235 		}
236 
237 		return parser;
238 	}
239 
240 	/** Add or remove .gz or .bz2 as needed */
fixExtensions()241 	private void fixExtensions(){
242 		in1=Tools.fixExtension(in1);
243 	}
244 
245 	/** Ensure files can be read and written */
checkFileExistence()246 	private void checkFileExistence(){
247 		//Ensure output files can be written
248 		if(!Tools.testOutputFiles(overwrite, append, false, out1, outIdHist)){
249 			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n");
250 		}
251 
252 		//Ensure input files can be read
253 		if(!Tools.testInputFiles(false, true, in1)){
254 			throw new RuntimeException("\nCan't read some input files.\n");
255 		}
256 
257 		//Ensure that no file was specified multiple times
258 		if(!Tools.testForDuplicateFiles(true, in1, out1, outIdHist)){
259 			throw new RuntimeException("\nSome file names were specified multiple times.\n");
260 		}
261 	}
262 
263 	/** Adjust file-related static fields as needed for this program */
checkStatics()264 	private static void checkStatics(){
265 		//Adjust the number of threads for input file reading
266 		if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
267 			ByteFile.FORCE_MODE_BF2=true;
268 		}
269 
270 		assert(FastaReadInputStream.settingsOK());
271 	}
272 
273 	/** Ensure parameter ranges are within bounds and required parameters are set */
validateParams()274 	private boolean validateParams(){
275 		assert(minMoleculeLength>0 && minMoleculeLength<=maxMoleculeLength) : minMoleculeLength+", "+maxMoleculeLength;
276 		assert(minMovie>0 && minMovie<=maxMovie) : minMovie+", "+maxMovie;
277 
278 		assert(missingRate>=0 && missingRate<=1) : missingRate;
279 		assert(hiddenRate>=0 && hiddenRate<=1) : hiddenRate;
280 		assert(genomeGC>=0 && genomeGC<=1) : genomeGC;
281 		assert(in1!=null || genomeSize>maxMoleculeLength) : genomeSize;
282 		assert(in1!=null || invertedRepeatMaxLength*2<genomeSize) : genomeSize;
283 		assert(maxZMWs>0) : "zmsw="+maxZMWs+"; please set to a positive number.";
284 
285 		assert(invertedRepeatRate>=0) : invertedRepeatRate;
286 		assert(invertedRepeatMinLength>0 && invertedRepeatMinLength<=invertedRepeatMaxLength) : invertedRepeatMinLength+", "+invertedRepeatMaxLength;
287 
288 		assert(minErrorRate>=0 && minErrorRate<=maxErrorRate) : minErrorRate+", "+maxErrorRate;
289 		return true;
290 	}
291 
292 	/*--------------------------------------------------------------*/
293 	/*----------------         Outer Methods        ----------------*/
294 	/*--------------------------------------------------------------*/
295 
296 	/** Create read streams and process all data */
process(Timer t)297 	void process(Timer t){
298 
299 		//Turn off read validation in the input threads to increase speed
300 		final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
301 		Read.VALIDATE_IN_CONSTRUCTOR=true;
302 
303 		//Create a read input stream
304 		final ConcurrentReadInputStream cris=makeCris();
305 
306 		//Optionally create a read output stream
307 		final ConcurrentReadOutputStream ros=makeCros(false);
308 
309 		//Reset counters
310 		readsProcessed=readsOut=0;
311 		basesProcessed=basesOut=0;
312 
313 		Random randy=Shared.threadLocalRandom(seed);
314 		if(cris==null){
315 			ref=genSynthGenome(randy);
316 		}else{
317 			ref=loadData(cris, randy);
318 		}
319 
320 		if(invertedRepeatRate>0){
321 			addInvertedRepeats(ref, randy);
322 		}
323 
324 		//Process the reads in separate threads
325 		spawnThreads(cris, ros);
326 
327 		if(verbose){outstream.println("Finished; closing streams.");}
328 
329 		//Write anything that was accumulated by ReadStats
330 		errorState|=ReadStats.writeAll();
331 		//Close the read streams
332 		errorState|=ReadWrite.closeStreams(cris, ros);
333 
334 		writeIdHist();
335 
336 		//Reset read validation
337 		Read.VALIDATE_IN_CONSTRUCTOR=vic;
338 
339 		//Report timing and results
340 		t.stop();
341 		outstream.println(Tools.timeReadsBasesProcessed(t, readsOut, basesOut, 8));
342 //		outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut, basesOut, 8, false));
343 
344 		//Throw an exception of there was an error in a thread
345 		if(errorState){
346 			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
347 		}
348 	}
349 
writeIdHist()350 	private void writeIdHist(){
351 		if(ffIdHist==null){return;}
352 		ByteStreamWriter bsw=new ByteStreamWriter(ffIdHist);
353 		bsw.start();
354 		bsw.print("#Identity\tCount\n".getBytes());
355 		for(int i=0; i<idHist.length; i++){
356 			bsw.print(i*100.0/(ID_BINS-1), 3).print('\t').println(idHist[i]);
357 		}
358 		errorState|=bsw.poisonAndWait();
359 	}
360 
361 	/** Create a Read Input Stream */
makeCris()362 	private ConcurrentReadInputStream makeCris(){
363 		if(ffin1==null){return null;}
364 		ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null);
365 		cris.start(); //Start the stream
366 		if(verbose){outstream.println("Started cris");}
367 		return cris;
368 	}
369 
370 	/** Create a Read Output Stream */
makeCros(boolean pairedInput)371 	private ConcurrentReadOutputStream makeCros(boolean pairedInput){
372 		if(ffout1==null){return null;}
373 
374 		//Set output buffer size
375 		final int buff=4;
376 
377 		final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(
378 				ffout1, null, qfout1, null, buff, null, false);
379 		ros.start(); //Start the stream
380 		return ros;
381 	}
382 
383 	/** Spawn process threads */
spawnThreads(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros)384 	private void spawnThreads(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
385 
386 		//Do anything necessary prior to processing
387 
388 		//Determine how many threads may be used
389 		final int threads=Shared.threads();
390 
391 		//Fill a list with ProcessThreads
392 		ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads);
393 		for(int i=0; i<threads; i++){
394 			alpt.add(new ProcessThread(ros, i, nextZmwID, seed));
395 		}
396 
397 		//Start the threads
398 		for(ProcessThread pt : alpt){
399 			pt.start();
400 		}
401 
402 		//Wait for threads to finish
403 		waitForThreads(alpt);
404 
405 		//Do anything necessary after processing
406 
407 	}
408 
409 	/** Wait until all worker threads are finished, then return */
waitForThreads(ArrayList<ProcessThread> alpt)410 	private void waitForThreads(ArrayList<ProcessThread> alpt){
411 
412 		//Wait for completion of all threads
413 		boolean success=true;
414 		for(ProcessThread pt : alpt){
415 
416 			//Wait until this thread has terminated
417 			while(pt.getState()!=Thread.State.TERMINATED){
418 				try {
419 					//Attempt a join operation
420 					pt.join();
421 				} catch (InterruptedException e) {
422 					//Potentially handle this, if it is expected to occur
423 					e.printStackTrace();
424 				}
425 			}
426 
427 			//Accumulate per-thread statistics
428 			readsOut+=pt.readsOutT;
429 			basesOut+=pt.basesOutT;
430 			Tools.add(idHist, pt.idHistT);
431 
432 			success&=pt.success;
433 		}
434 
435 		//Track whether any threads failed
436 		if(!success){errorState=true;}
437 	}
438 
439 	/*--------------------------------------------------------------*/
440 	/*----------------         Inner Methods        ----------------*/
441 	/*--------------------------------------------------------------*/
442 
randomBase(Random randy)443 	private byte randomBase(Random randy) {
444 		float rGC=randy.nextFloat();
445 		if(rGC>=genomeGC){//AT
446 			return (byte)(randy.nextBoolean() ? 'A' : 'T');
447 		}else{//GC
448 			return (byte)(randy.nextBoolean() ? 'G' : 'C');
449 		}
450 	}
451 
randomLength(int min, int max, Random randy)452 	private static int randomLength(int min, int max, Random randy) {
453 		if(min==max){return min;}
454 		int range=max-min+1;
455 		int x=min+Tools.min(randy.nextInt(range), randy.nextInt(range));
456 //		System.err.println(x+", "+min+", "+max);
457 //		new Exception().printStackTrace();
458 //		System.err.println(randy.getClass());
459 //		System.err.println(randy.nextLong());
460 		return x;
461 	}
462 
randomRate(float min, float max, Random randy)463 	private static float randomRate(float min, float max, Random randy) {
464 		if(min==max){return min;}
465 		float range=max-min;
466 //		float a=(randy.nextFloat()+randy.nextFloat());
467 		float b=(randy.nextFloat()+randy.nextFloat());
468 		float c=(1.6f*randy.nextFloat()+0.4f*randy.nextFloat());
469 		float x=min+range*0.5f*Tools.min(b, c);
470 //		assert(false) : "x="+x+", a="+a+", b="+b+", c="+c+", min="+min+", max="+max;
471 //		System.err.println(x+", "+min+", "+max);
472 		return x;
473 	}
474 
genSynthGenome(Random randy)475 	private byte[] genSynthGenome(Random randy){
476 		assert(genomeSize<=MAX_GENOME_LENGTH) : genomeSize;
477 		byte[] array=new byte[(int)genomeSize];
478 		for(int i=0; i<genomeSize; i++){
479 			array[i]=randomBase(randy);
480 		}
481 		return array;
482 	}
483 
loadData(ConcurrentReadInputStream cris, Random randy)484 	private byte[] loadData(ConcurrentReadInputStream cris, Random randy){
485 
486 		ByteBuilder bb=new ByteBuilder();
487 
488 		//Grab the first ListNum of reads
489 		ListNum<Read> ln=cris.nextList();
490 
491 		//As long as there is a nonempty read list...
492 		while(ln!=null && ln.size()>0){
493 //			if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access
494 
495 			for(Read r : ln){
496 
497 //				System.err.println("Fetched read len="+r.length());//123
498 
499 				//Increment counters
500 				readsProcessed+=r.pairCount();
501 				basesProcessed+=r.pairLength();
502 
503 
504 
505 //				System.err.println("Filter: addNs="+addNs+", (r.length()>maxMoleculeLength && (invertedRepeatRate<=0 || r.length()>2*invertedRepeatMaxLength);//123
506 
507 				//Filter disabled; it causes short sequences to be ignored.
508 				//Optional filter criteria
509 //				if(!addNs || (r.length()>maxMoleculeLength && (invertedRepeatRate<=0 || r.length()>2*invertedRepeatMaxLength))){
510 					final byte[] bases=r.bases;
511 
512 					if(addNs){
513 						if(bb.length()>0){bb.append('N');}
514 					}else{
515 						for(int i=0; i<bases.length; i++){
516 							if(!AminoAcid.isFullyDefined(bases[i])){
517 								bases[i]=randomBase(randy);
518 							}
519 						}
520 					}
521 
522 					if(bb.length()+bases.length<=MAX_GENOME_LENGTH){
523 						bb.append(bases);
524 //						System.err.println("Appended "+r.length());//123
525 					}else{
526 //						System.err.println("Appending partial.");//123
527 						for(byte b : bases){
528 							bb.append(b);
529 							if(bb.length>=MAX_GENOME_LENGTH){
530 								cris.returnList(ln.id, true);
531 //								System.err.println("Returning partial "+bb.length);//123
532 								return bb.toBytes();
533 							}
534 						}
535 					}
536 //				}
537 			}
538 
539 			//Notify the input stream that the list was used
540 			cris.returnList(ln);
541 //			if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access
542 
543 			//Fetch a new list
544 			ln=cris.nextList();
545 		}
546 
547 		//Notify the input stream that the final list was used
548 		if(ln!=null){
549 			cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
550 		}
551 
552 //		System.err.println("Returning full "+bb.length);//123
553 		return bb.toBytes();
554 	}
555 
addInvertedRepeats(byte[] bases, Random randy)556 	private void addInvertedRepeats(byte[] bases, Random randy){
557 
558 		long added=0;
559 		long toAdd=(long) (bases.length*invertedRepeatRate);
560 		while(added<toAdd){
561 			int len=randomLength(invertedRepeatMinLength, invertedRepeatMaxLength, randy);
562 			int start=randy.nextInt(bases.length-2*len);
563 			int stop=start+len;
564 			boolean OK=true;
565 			for(int i=0; i<len && OK; i++){
566 				OK&=(bases[start+i]!='N' && bases[stop+i]!='N');
567 			}
568 			if(OK){
569 				for(int i=0; i<len; i++){
570 					byte b=bases[stop-i-1];
571 					bases[stop+i]=AminoAcid.baseToComplementExtended[b];
572 				}
573 				added+=2*len;
574 //				System.err.println("added="+added+"/"+toAdd);
575 			}else{
576 //
577 			}
578 		}
579 
580 	}
581 
582 	/*--------------------------------------------------------------*/
583 	/*----------------         Inner Classes        ----------------*/
584 	/*--------------------------------------------------------------*/
585 
586 	/** */
587 	private class ProcessThread extends Thread {
588 
589 		//Constructor
ProcessThread(final ConcurrentReadOutputStream ros_, final int tid_, final AtomicLong nextZmwID_, final long seed)590 		ProcessThread(final ConcurrentReadOutputStream ros_, final int tid_,
591 				final AtomicLong nextZmwID_, final long seed){
592 			ros=ros_;
593 			tid=tid_;
594 			atomicZmwID=nextZmwID_;
595 //			assert(false) : randy.getClass()+", "+randy.nextLong();
596 		}
597 
598 		//Called by start()
599 		@Override
run()600 		public void run(){
601 			//Do anything necessary prior to processing
602 			randy=Shared.threadLocalRandom(seed<0 ? seed : seed+(tid+1)*tid*1000L);
603 
604 			//Process the reads
605 			processInner();
606 
607 			//Do anything necessary after processing
608 
609 			//Indicate successful exit status
610 			success=true;
611 		}
612 
613 		/** Iterate through the reads */
processInner()614 		void processInner(){
615 
616 			//As long as there is a nonempty read list...
617 			for(long generated=atomicZmwID.getAndAdd(readsPerList); generated<maxZMWs;
618 					generated=atomicZmwID.getAndAdd(readsPerList)){
619 //				if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access
620 
621 				long toGenerate=Tools.min(readsPerList, maxZMWs-generated);
622 
623 				ArrayList<Read> reads=generateList((int)toGenerate, generated);
624 
625 				//Output reads to the output stream
626 				if(ros!=null){ros.add(reads, 0);}
627 			}
628 		}
629 
630 		/** Generate the next list of reads */
generateList(int toGenerate, long nextID)631 		private ArrayList<Read> generateList(int toGenerate, long nextID){
632 
633 			//Grab the actual read list from the ListNum
634 			final ArrayList<Read> reads=new ArrayList<Read>(toGenerate);
635 
636 			//Loop through each read in the list
637 			for(int i=0; i<toGenerate; i++, nextID++){
638 				ArrayList<Read> zmw=generateZMW(nextID);
639 				if(zmw==null){
640 					i--;
641 					nextID--;
642 				}else{reads.addAll(zmw);}
643 			}
644 
645 			return reads;
646 		}
647 
median(ArrayList<ReadBuilder> list)648 		private ReadBuilder median(ArrayList<ReadBuilder> list){
649 			if(list.size()<3){return null;}
650 			IntList lengths=new IntList(list.size()-2);
651 
652 			for(int i=1; i<list.size()-1; i++){
653 				lengths.add(list.get(i).length());
654 			}
655 			lengths.sort();
656 			int median=lengths.get((lengths.size-1)/2);
657 
658 			for(int i=1; i<list.size()-1; i++){
659 				ReadBuilder rb=list.get(i);
660 				if(rb.length()==median){
661 					return rb;
662 				}
663 			}
664 
665 			assert(false);
666 			return null;
667 		}
668 
669 		/**
670 		 * Generate a single read.
671 		 */
generateZMW(final long nextID)672 		private ArrayList<Read> generateZMW(final long nextID){
673 
674 			final int movieLength=randomLength(minMovie, maxMovie, randy);
675 			final float errorRate=randomRate(minErrorRate, maxErrorRate, randy);
676 
677 			ArrayList<ReadBuilder> baseCalls=baseCallAllPasses(movieLength, errorRate, nextID);
678 			if(baseCalls==null){
679 //				System.err.println(movieLength+", "+errorRate+", "+nextID);//123
680 				return null;
681 			}
682 
683 			final boolean missing=randy.nextFloat()<missingRate;
684 			if(missing){
685 				final int missingMod=randy.nextInt(2);
686 				ArrayList<ReadBuilder> temp=new ArrayList<ReadBuilder>();
687 
688 				ReadBuilder current=null;
689 				for(int i=0; i<baseCalls.size(); i++){
690 					ReadBuilder rb=baseCalls.get(i);
691 					assert(rb.subreads==1);
692 					assert(rb.missing==0);
693 					assert(rb.adapters==0);
694 					if(current!=null && (i&1)==missingMod){
695 						current.add(rb);
696 						current.missing++;
697 						assert(current.subreads>1);
698 						assert(current.missing>0);
699 						temp.add(current);
700 						current=null;
701 					}else{
702 						if(current!=null){temp.add(current);}
703 						current=rb;
704 					}
705 				}
706 				if(current!=null){temp.add(current);}
707 				baseCalls=temp;
708 			}
709 
710 			if(makeCCS){
711 				ReadBuilder median=median(baseCalls);
712 				if(median==null){return null;}
713 				baseCalls.clear();
714 				baseCalls.add(median);
715 			}else if(hiddenRate>0){
716 				ArrayList<ReadBuilder> temp=new ArrayList<ReadBuilder>();
717 
718 				ReadBuilder current=null;
719 				for(int i=0; i<baseCalls.size(); i++){
720 					ReadBuilder rb=baseCalls.get(i);
721 					assert(rb.adapters==0);
722 					if(current!=null && randy.nextFloat()<hiddenRate){
723 						current.add(baseCallAdapter(errorRate));
724 						current.add(rb);
725 						assert(current.adapters>0);
726 						assert(current.subreads>1);
727 					}else{
728 						if(current!=null){temp.add(current);}
729 						current=rb;
730 						assert(current.adapters==0);
731 					}
732 				}
733 				if(current!=null){temp.add(current);}
734 				baseCalls=temp;
735 			}
736 
737 
738 			ArrayList<Read> reads=new ArrayList<Read>();
739 			for(ReadBuilder rb : baseCalls){
740 				Read r=rb.toRead();
741 				readsOutT+=r.pairCount();
742 				basesOutT+=r.length();
743 				reads.add(r);
744 			}
745 
746 			idHistT[(int)((1-errorRate)*(ID_BINS-1)+0.5f)]++;
747 			return reads;
748 		}
749 
750 		private ArrayList<ReadBuilder> baseCallAllPasses(final int movieLength, final float errorRate, long zmw){
751 			byte[] frag=null;
752 
753 			for(int i=0; i<10 && frag==null; i++){//retry several times
754 				frag=fetchBases(ref);
755 			}
756 
757 			if(frag==null){
758 //				System.err.println("Failed baseCallAllPasses("+movieLength+", "+errorRate+", "+zmw);//123
759 				return null;
760 			}
761 
762 			ArrayList<ReadBuilder> list=new ArrayList<ReadBuilder>();
763 			int movieRemaining=movieLength;
764 			int moviePos=0;
765 			assert(frag.length>0) : frag.length;
766 			int start=randy.nextInt(frag.length);
767 			while(movieRemaining>0){
768 				ReadBuilder rb=baseCallOnePass(frag, errorRate, start, moviePos, movieRemaining, zmw);
769 				list.add(rb);
770 				start=0;
771 				int elapsed=rb.length()+adapterLen;
772 				moviePos+=elapsed;
773 				movieRemaining-=elapsed;
774 				AminoAcid.reverseComplementBasesInPlace(frag);
775 			}
776 			return list;
777 		}
778 
779 		/** Call bases for one pass */
780 		private ReadBuilder baseCallOnePass(final byte[] frag, final float errorRate, final int start, final int movieStart, int movieRemaining, long zmw){
781 			final float mult=1/(1-errorRate);
782 			ByteBuilder bb=new ByteBuilder();
783 			int fpos=start;
784 			for(; fpos<frag.length && movieRemaining>0; fpos++){
785 				float f=randy.nextFloat();
786 				byte b=frag[fpos];
787 				if(f>=errorRate){
788 					bb.append(b);
789 					movieRemaining--;
790 				}else{
791 					f=mult*(1-f);
792 					if(f<insThresh){//ins
793 						b=AminoAcid.numberToBase[randy.nextInt(4)];
794 						bb.append(b);
795 						fpos--;
796 						movieRemaining--;
797 					}else if(f<delThresh){//del
798 
799 					}else{//sub
800 						int x=AminoAcid.baseToNumber[b]+randy.nextInt(3)+1;
801 						b=AminoAcid.numberToBase[x&3];
802 						bb.append(b);
803 						movieRemaining--;
804 					}
805 				}
806 			}
807 
808 			float passes=(fpos-start)*1.0f/frag.length;
809 			ReadBuilder rb=new ReadBuilder(bb, passes, movieStart, zmw);
810 			rb.errorRate=errorRate;
811 			return rb;
812 		}
813 
814 		/** Call bases for an adapter sequence pass */
815 		private ReadBuilder baseCallAdapter(final float errorRate){
816 			ReadBuilder rb=baseCallOnePass(pacbioAdapter, errorRate, 0, 0, 999999, 0);
817 			rb.passes=0;
818 			rb.fullPasses=0;
819 			rb.subreads=0;
820 			rb.adapters=1;
821 			return rb;
822 		}
823 
824 		private byte[] fetchBases(byte[] source){
825 
826 			final int len=randomLength(minMoleculeLength, maxMoleculeLength, randy);
827 			int start=len>=source.length ? 0 : randy.nextInt(source.length-len);
828 			int stop=Tools.min(start+len, source.length);
829 
830 //			System.err.println("fetchBases(len="+len+", slen="+source.length+", start="+start+", stop="+stop);//123
831 
832 			for(int i=start; i<stop; i++){
833 				if(!AminoAcid.isFullyDefined(source[i])){
834 					return null;
835 				}
836 			}
837 			if(stop-start<1){return null;}
838 			byte[] frag=Arrays.copyOfRange(source, start, stop);
839 			if(randy.nextBoolean()){AminoAcid.reverseComplementBasesInPlace(frag);}
840 			return frag;
841 		}
842 
843 		/** Number of reads retained by this thread */
844 		protected long readsOutT=0;
845 		/** Number of bases retained by this thread */
846 		protected long basesOutT=0;
847 
848 		/** True only if this thread has completed successfully */
849 		boolean success=false;
850 
851 		private final AtomicLong atomicZmwID;
852 		private final int readsPerList=Shared.bufferLen();
853 
854 		/** Random number source */
855 		private Random randy;
856 
857 		/** Random number source */
858 		private final long[] idHistT=new long[ID_BINS];
859 
860 		/** Shared output stream */
861 		private final ConcurrentReadOutputStream ros;
862 		/** Thread ID */
863 		final int tid;
864 	}
865 
866 	/*--------------------------------------------------------------*/
867 
868 
869 
870 	/*--------------------------------------------------------------*/
871 	/*----------------            Fields            ----------------*/
872 	/*--------------------------------------------------------------*/
873 
874 	/** Primary input file path */
875 	private String in1=null;
876 
877 	/** Primary output file path */
878 	private String out1=null;
879 
880 	/** Primary output file path */
881 	private String outIdHist=null;
882 
883 	private String qfout1=null;
884 
885 	/** Override input file extension */
886 	private String extin=null;
887 	/** Override output file extension */
888 	private String extout=null;
889 
890 	/*--------------------------------------------------------------*/
891 
892 	/**  */
893 	private int minMoleculeLength=500;
894 	/**  */
895 	private int maxMoleculeLength=10000;
896 	/**  */
897 	private int minMovie=500;
898 	/**  */
899 	private int maxMovie=40000;
900 
901 	/**  */
902 	private double missingRate=0.0;
903 	/**  */
904 	private double hiddenRate=0.0;
905 	/**  */
906 	private boolean allowBothEndsBad=false;
907 
908 	/**  */
909 	private float genomeGC=0.6f;
910 	/**  */
911 	private long genomeSize=10000000;
912 	/**  */
913 	private boolean addNs=true;
914 
915 	/**  */
916 	private double invertedRepeatRate=0.0;
917 	/**  */
918 	private int invertedRepeatMinLength=100;
919 	/**  */
920 	private int invertedRepeatMaxLength=10000;
921 
922 	/**  */
923 	private float minErrorRate=0.05f;
924 	/**  */
925 	private float maxErrorRate=0.25f;
926 	/**  */
927 	private boolean addErrors=true;
928 	/** One read per ZMW */
929 	private boolean makeCCS=false;
930 
931 	/** */
932 	private long seed=-1;
933 
934 	private long[] idHist=new long[ID_BINS];
935 
936 	//These should add to 1
937 	private float insFraction=0.40f;
938 	private float delFraction=0.35f;
939 	private float subFraction=0.25f;
940 
941 	private final float insThresh;
942 	private final float delThresh;
943 
944 	/*--------------------------------------------------------------*/
945 
946 	/** Number of reads processed */
947 	protected long readsProcessed=0;
948 	/** Number of bases processed */
949 	protected long basesProcessed=0;
950 
951 	/** Number of reads retained */
952 	protected long readsOut=0;
953 	/** Number of bases retained */
954 	protected long basesOut=0;
955 
956 	/** Quit after processing this many INPUT reads */
957 	private long maxReads=-1;
958 
959 	/** Quit after generating this many OUTPUT zmws */
960 	private long maxZMWs=-1;
961 
962 	/** Reference genome, max 2Gbp */
963 	private byte[] ref;
964 
965 	private AtomicLong nextZmwID=new AtomicLong(0);
966 
967 	/*--------------------------------------------------------------*/
968 	/*----------------         Final Fields         ----------------*/
969 	/*--------------------------------------------------------------*/
970 
971 	/** Primary input file */
972 	private final FileFormat ffin1;
973 
974 	/** Primary output file */
975 	private final FileFormat ffout1;
976 
977 	/** Primary output file */
978 	private final FileFormat ffIdHist;
979 
980 	/*--------------------------------------------------------------*/
981 	/*----------------          Constants           ----------------*/
982 	/*--------------------------------------------------------------*/
983 
984 	private static final int ID_BINS=201;
985 
986 	private static final long MAX_GENOME_LENGTH=2000000000;
987 
988 	public static final byte[] pacbioAdapter="ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT".getBytes();
989 	public static final int adapterLen=pacbioAdapter.length;
990 
991 	/*--------------------------------------------------------------*/
992 	/*----------------        Common Fields         ----------------*/
993 	/*--------------------------------------------------------------*/
994 
995 	/** Print status messages to this output stream */
996 	private PrintStream outstream=System.err;
997 	/** Print verbose messages */
998 	public static boolean verbose=false;
999 	/** True if an error was encountered */
1000 	public boolean errorState=false;
1001 	/** Overwrite existing output files */
1002 	private boolean overwrite=false;
1003 	/** Append to existing output files */
1004 	private boolean append=false;
1005 
1006 }
1007