1 package hiseq;
2 
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6 import java.util.Locale;
7 import java.util.Random;
8 
9 import fileIO.ByteFile;
10 import fileIO.FileFormat;
11 import fileIO.ReadWrite;
12 import fileIO.TextStreamWriter;
13 import jgi.Dedupe;
14 import kmer.AbstractKmerTable;
15 import kmer.HashArray1D;
16 import kmer.ScheduleMaker;
17 import shared.Parse;
18 import shared.Parser;
19 import shared.PreParser;
20 import shared.Shared;
21 import shared.Timer;
22 import shared.Tools;
23 import shared.TrimRead;
24 import stream.ConcurrentReadInputStream;
25 import stream.ConcurrentReadOutputStream;
26 import stream.FASTQ;
27 import stream.FastaReadInputStream;
28 import stream.Read;
29 import structures.ListNum;
30 
31 /**
32  * Analyzes a flow cell for low-quality areas.
33  * Removes reads in the low-quality areas.
34  *
35  * @author Brian Bushnell
36  * @date August 31, 2016
37  *
38  */
39 public class AnalyzeFlowCell {
40 
41 	/*--------------------------------------------------------------*/
42 	/*----------------        Initialization        ----------------*/
43 	/*--------------------------------------------------------------*/
44 
45 	/**
46 	 * Code entrance from the command line.
47 	 * @param args Command line arguments
48 	 */
main(String[] args)49 	public static void main(String[] args){
50 		Timer t=new Timer();
51 		AnalyzeFlowCell x=new AnalyzeFlowCell(args);
52 		x.process(t);
53 
54 		//Close the print stream if it was redirected
55 		Shared.closeStream(x.outstream);
56 	}
57 
58 	/**
59 	 * Constructor.
60 	 * @param args Command line arguments
61 	 */
AnalyzeFlowCell(String[] args)62 	public AnalyzeFlowCell(String[] args){
63 
64 		{//Preparse block for help, config files, and outstream
65 			PreParser pp=new PreParser(args, getClass(), false);
66 			args=pp.args;
67 			outstream=pp.outstream;
68 		}
69 
70 		//Set shared static variables
71 		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
72 		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
73 
74 		Parser parser=parse(args);
75 
76 		if(gToN || discardG){MicroTile.TRACK_CYCLES=true;}
77 
78 		{//Process parser fields
79 			Parser.processQuality();
80 
81 			maxReads=parser.maxReads;
82 
83 			overwrite=parser.overwrite;
84 			append=parser.append;
85 			setInterleaved=parser.setInterleaved;
86 
87 			in1=parser.in1;
88 			in2=parser.in2;
89 			qfin1=parser.qfin1;
90 			qfin2=parser.qfin2;
91 
92 			out1=parser.out1;
93 			out2=parser.out2;
94 			qfout1=parser.qfout1;
95 			qfout2=parser.qfout2;
96 
97 			extin=parser.extin;
98 			extout=parser.extout;
99 
100 
101 			trimq=parser.trimq;
102 			trimE=parser.trimE();
103 			minlen=parser.minReadLength;
104 			trimLeft=parser.qtrimLeft;
105 			trimRight=parser.qtrimRight;
106 		}
107 
108 		checkFiles();
109 
110 		//Create output FileFormat objects
111 		ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
112 		ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, false);
113 		ffoutbad=FileFormat.testOutput(outbad, FileFormat.FASTQ, extout, true, overwrite, append, false);
114 
115 		//Create input FileFormat objects
116 		ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
117 		ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
118 	}
119 
120 	/*--------------------------------------------------------------*/
121 	/*----------------    Initialization Helpers    ----------------*/
122 	/*--------------------------------------------------------------*/
123 
parse(String[] args)124 	private Parser parse(String[] args){
125 		//Create a parser object
126 		Parser parser=new Parser();
127 		parser.qtrimRight=trimRight;
128 		parser.trimq=trimq;
129 		parser.minReadLength=minlen;
130 
131 		//Parse each argument
132 		for(int i=0; i<args.length; i++){
133 			String arg=args[i];
134 
135 			//Break arguments into their constituent parts, in the form of "a=b"
136 			String[] split=arg.split("=");
137 			String a=split[0].toLowerCase();
138 			String b=split.length>1 ? split[1] : null;
139 
140 			if(a.equals("verbose")){
141 				verbose=Parse.parseBoolean(b);
142 			}else if(a.equals("seed")){
143 				seed=Long.parseLong(b);
144 			}else if(a.equals("divisor") || a.equals("size")){
145 				Tile.xSize=Tile.ySize=Integer.parseInt(b);
146 			}else if(a.equals("xdivisor") || a.equals("xsize")){
147 				Tile.xSize=Integer.parseInt(b);
148 			}else if(a.equals("ydivisor") || a.equals("ysize")){
149 				Tile.ySize=Integer.parseInt(b);
150 			}else if(a.equals("target")){
151 				targetAverageReads=Integer.parseInt(b);
152 			}else if(a.equals("dump")){
153 				dump=b;
154 			}else if(a.equals("indump") || a.equals("ind") || a.equals("dumpin")){
155 				dumpIn=b;
156 			}else if(a.equals("pound")){
157 				pound=Parse.parseBoolean(b);
158 			}else if(a.equals("loadkmers") || a.equals("usekmers")){
159 				loadKmers=Parse.parseBoolean(b);
160 			}else if(a.equals("lqo") || a.equals("lowqualityonly")){
161 				discardOnlyLowQuality=Parse.parseBoolean(b);
162 			}else if(a.equals("dl") || a.equals("discardlevel")){
163 				discardLevel=Integer.parseInt(b);
164 			}else if(a.equals("outbad") || a.equals("outb") || a.equals("outtoss") || a.equals("outt") || a.equals("outunwanted") || a.equals("outu")){
165 				outbad=b;
166 			}
167 
168 			else if(a.equals("deviations") || a.equals("d")){
169 				qDeviations=uDeviations=eDeviations=gDeviations=Float.parseFloat(b);
170 			}else if(a.equals("qdeviations") || a.equals("qd") || a.equals("dq")){
171 				qDeviations=Float.parseFloat(b);
172 			}else if(a.equals("udeviations") || a.equals("ud") || a.equals("du")){
173 				uDeviations=Float.parseFloat(b);
174 			}else if(a.equals("edeviations") || a.equals("ed") || a.equals("de")){
175 				eDeviations=Float.parseFloat(b);
176 			}else if(a.equals("gdeviations") || a.equals("gd") || a.equals("dg")){
177 				gDeviations=Float.parseFloat(b);
178 			}
179 
180 			else if(a.equals("qfraction") || a.equals("qf")){
181 				qualFraction=Float.parseFloat(b);
182 			}else if(a.equals("ufraction") || a.equals("uf")){
183 				uniqueFraction=Float.parseFloat(b);
184 			}else if(a.equals("efraction") || a.equals("ef")){
185 				errorFreeFraction=Float.parseFloat(b);
186 			}else if(a.equals("gfraction") || a.equals("gf")){
187 				gFraction=Float.parseFloat(b);
188 			}
189 
190 			else if(a.equals("qabsolute") || a.equals("qa")){
191 				qualAbs=Float.parseFloat(b);
192 			}else if(a.equals("uabsolute") || a.equals("ua")){
193 				uniqueAbs=Float.parseFloat(b);
194 			}else if(a.equals("eabsolute") || a.equals("ea")){
195 				errorFreeAbs=Float.parseFloat(b);
196 			}else if(a.equals("gabsolute") || a.equals("ga")){
197 				gAbs=Float.parseFloat(b);
198 			}
199 
200 			else if(a.equals("gton")){
201 				gToN=Parse.parseBoolean(b);
202 			}else if(a.equals("discardg")){
203 				discardG=Parse.parseBoolean(b);
204 			}
205 
206 			else if(a.equals("minpolyg")){
207 				MicroTile.MIN_POLY_G=Integer.parseInt(b);
208 			}else if(a.equals("trackcycles")){
209 				MicroTile.TRACK_CYCLES=Parse.parseBoolean(b);
210 			}
211 
212 			else if(a.equals("parse_flag_goes_here")){
213 				//Set a variable here
214 			}else if(parser.parse(arg, a, b)){//Parse standard flags in the parser
215 				//do nothing
216 			}else{
217 				outstream.println("Unknown parameter "+args[i]);
218 				assert(false) : "Unknown parameter "+args[i];
219 				//				throw new RuntimeException("Unknown parameter "+args[i]);
220 			}
221 		}
222 		return parser;
223 	}
224 
checkFiles()225 	private void checkFiles(){
226 		doPoundReplacement();
227 		adjustInterleaving();
228 		checkFileExistence();
229 		checkStatics();
230 	}
231 
doPoundReplacement()232 	private void doPoundReplacement(){
233 		//Do input file # replacement
234 		if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
235 			in2=in1.replace("#", "2");
236 			in1=in1.replace("#", "1");
237 		}
238 
239 		//Do output file # replacement
240 		if(out1!=null && out2==null && out1.indexOf('#')>-1){
241 			out2=out1.replace("#", "2");
242 			out1=out1.replace("#", "1");
243 		}
244 
245 		//Ensure there is an input file
246 		if(in1==null){throw new RuntimeException("Error - at least one input file is required.");}
247 
248 		//Ensure out2 is not set without out1
249 		if(out1==null && out2!=null){throw new RuntimeException("Error - cannot define out2 without defining out1.");}
250 	}
251 
checkFileExistence()252 	private void checkFileExistence(){
253 
254 		//Ensure output files can be written
255 		if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, outbad, dump)){
256 			outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2+", "+outbad);
257 			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+", "+outbad+", "+dump+"\n");
258 		}
259 
260 		//Ensure input files can be read
261 		if(!Tools.testInputFiles(false, true, in1, in2)){
262 			throw new RuntimeException("\nCan't read some input files.\n");
263 		}
264 
265 		//Ensure that no file was specified multiple times
266 		if(!Tools.testForDuplicateFiles(true, in1, in2, out1, out2, outbad, dump)){
267 			throw new RuntimeException("\nSome file names were specified multiple times.\n");
268 		}
269 	}
270 
adjustInterleaving()271 	private void adjustInterleaving(){
272 		//Adjust interleaved detection based on the number of input files
273 		if(in2!=null){
274 			if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
275 			FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
276 		}
277 
278 		//Adjust interleaved settings based on number of output files
279 		if(!setInterleaved){
280 			assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
281 			if(in2!=null){ //If there are 2 input streams.
282 				FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
283 				outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
284 			}else{ //There is one input stream.
285 				if(out2!=null){
286 					FASTQ.FORCE_INTERLEAVED=true;
287 					FASTQ.TEST_INTERLEAVED=false;
288 					outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
289 				}
290 			}
291 		}
292 	}
293 
checkStatics()294 	private static void checkStatics(){
295 		//Adjust the number of threads for input file reading
296 		if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
297 			ByteFile.FORCE_MODE_BF2=true;
298 		}
299 
300 		assert(FastaReadInputStream.settingsOK());
301 	}
302 
303 	/*--------------------------------------------------------------*/
304 	/*----------------         Outer Methods        ----------------*/
305 	/*--------------------------------------------------------------*/
306 
307 	/** Create read streams and process all data */
process(Timer t)308 	public void process(Timer t){
309 
310 		//Reset counters
311 		readsProcessed=0;
312 		basesProcessed=0;
313 
314 		if(dumpIn==null){
315 			flowcell=new FlowCell();
316 			if(loadKmers){loadKmers();}
317 			fillTiles();
318 			keySets=null;
319 		}else{
320 			flowcell=new FlowCell(dumpIn);
321 
322 			if(flowcell.avgReads<targetAverageReads){
323 				flowcell=flowcell.widen(targetAverageReads);
324 			}
325 
326 			avgQuality=flowcell.avgQuality;
327 			avgUnique=flowcell.avgUnique;
328 			avgErrorFree=flowcell.avgErrorFree;
329 			avgG=flowcell.avgG;
330 			stdQuality=flowcell.stdQuality;
331 			stdUnique=flowcell.stdUnique;
332 			stdErrorFree=flowcell.stdErrorFree;
333 			stdG=flowcell.stdG;
334 
335 			long readsToDiscard=markTiles(flowcell.toList(), flowcell.avgReads);
336 		}
337 		processReads(t);
338 	}
339 
340 	/** Create read streams and process all data */
loadKmers()341 	void loadKmers(){
342 		Timer t2=new Timer();
343 		outstream.print("Loading kmers:  \t");
344 
345 		//Create a read input stream
346 		final ConcurrentReadInputStream cris;
347 		{
348 			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2);
349 			cris.start(); //Start the stream
350 			if(verbose){outstream.println("Started cris");}
351 		}
352 		boolean paired=cris.paired();
353 //		if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
354 
355 		//Process the read stream
356 		loadKmersInner(cris);
357 
358 		if(verbose){outstream.println("Finished; closing streams.");}
359 
360 		//Close the read streams
361 		errorState|=ReadWrite.closeStreams(cris);
362 
363 		t2.stop();
364 		outstream.println(t2);
365 	}
366 
367 	/** Create read streams and process all data */
fillTiles()368 	void fillTiles(){
369 
370 		//Create a read input stream
371 		final ConcurrentReadInputStream cris;
372 		{
373 			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2);
374 			cris.start(); //Start the stream
375 			if(verbose){outstream.println("Started cris");}
376 		}
377 		boolean paired=cris.paired();
378 //		if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
379 
380 		//Process the read stream
381 		fillTilesInner(cris);
382 
383 		if(verbose){outstream.println("Finished; closing streams.");}
384 
385 		//Close the read streams
386 		errorState|=ReadWrite.closeStreams(cris);
387 	}
388 
389 	/** Create read streams and process all data */
processReads(Timer t)390 	void processReads(Timer t){
391 
392 		if(ffout1!=null || ffoutbad!=null){
393 			Timer t2=new Timer();
394 			outstream.print("Filtering reads:\t");
395 
396 			//Create a read input stream
397 			final ConcurrentReadInputStream cris;
398 			{
399 				cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2);
400 				cris.start(); //Start the stream
401 				if(verbose){outstream.println("Started cris");}
402 			}
403 			boolean paired=cris.paired();
404 			//		if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
405 
406 			//Optionally read output streams
407 			final ConcurrentReadOutputStream ros, rosb;
408 			final int buff=4;
409 			if(ffout1!=null){
410 				ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, false);
411 				ros.start(); //Start the stream
412 			}else{ros=null;}
413 
414 			if(ffoutbad!=null){
415 				rosb=ConcurrentReadOutputStream.getStream(ffoutbad, null, null, null, buff, null, false);
416 				rosb.start(); //Start the stream
417 			}else{rosb=null;}
418 
419 			//Process the read stream
420 			processInner(cris, ros, rosb);
421 
422 			if(verbose){outstream.println("Finished; closing streams.");}
423 
424 			//Close the read streams
425 			errorState|=ReadWrite.closeStreams(cris, ros, rosb);
426 
427 			t2.stop();
428 			outstream.println(t2);
429 		}
430 
431 		//Report timing and results
432 		{
433 			t.stop();
434 			lastReadsOut=readsProcessed-readsDiscarded;
435 			outstream.println();
436 			outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8));
437 
438 			if(ffout1!=null || ffoutbad!=null){
439 
440 				String rpstring=Tools.padKM(readsDiscarded, 8);
441 				String bpstring=Tools.padKM(basesDiscarded, 8);
442 				String gpstring=Tools.padKM(gsTransformedToN, 8);
443 				outstream.println();
444 				outstream.println("Reads Discarded:    "+rpstring+" \t"+String.format(Locale.ROOT, "%.3f%%", readsDiscarded*100.0/readsProcessed));
445 				outstream.println("Bases Discarded:    "+bpstring+" \t"+String.format(Locale.ROOT, "%.3f%%", basesDiscarded*100.0/basesProcessed));
446 				if(gToN){outstream.println("Gs Masked By N:     "+gpstring+" \t"+String.format(Locale.ROOT, "%.3f%%", gsTransformedToN*100.0/basesProcessed));}
447 				outstream.println();
448 			}
449 		}
450 
451 		//Throw an exception of there was an error in a thread
452 		if(errorState){
453 			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
454 		}
455 	}
456 
457 	/** Iterate through the reads */
processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros, final ConcurrentReadOutputStream rosb)458 	void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros, final ConcurrentReadOutputStream rosb){
459 
460 		//Do anything necessary prior to processing
461 		readsProcessed=0;
462 		basesProcessed=0;
463 
464 		{
465 			//Grab the first ListNum of reads
466 			ListNum<Read> ln=cris.nextList();
467 			//Grab the actual read list from the ListNum
468 			ArrayList<Read> reads=(ln!=null ? ln.list : null);
469 
470 			//Check to ensure pairing is as expected
471 			if(reads!=null && !reads.isEmpty()){
472 				Read r=reads.get(0);
473 				assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
474 			}
475 
476 			//As long as there is a nonempty read list...
477 			while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
478 				if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
479 
480 				ArrayList<Read> keepList=new ArrayList<Read>(reads.size());
481 				ArrayList<Read> tossList=new ArrayList<Read>(4);
482 
483 				//Loop through each read in the list
484 				for(int idx=0; idx<reads.size(); idx++){
485 					final Read r1=reads.get(idx);
486 					final Read r2=r1.mate;
487 
488 					//Track the initial length for statistics
489 					final int initialLength1=r1.length();
490 					final int initialLength2=(r1.mateLength());
491 
492 					//Increment counters
493 					readsProcessed+=r1.pairCount();
494 					basesProcessed+=initialLength1+initialLength2;
495 
496 					boolean keep=processReadPair(r1, r2);
497 					if(keep){
498 						keepList.add(r1);
499 					}else{
500 						tossList.add(r1);
501 						readsDiscarded+=r1.pairCount();
502 						basesDiscarded+=initialLength1+initialLength2;
503 					}
504 				}
505 
506 				//Output reads to the output stream
507 				if(ros!=null){ros.add(keepList, ln.id);}
508 				if(rosb!=null){rosb.add(tossList, ln.id);}
509 
510 				//Notify the input stream that the list was used
511 				cris.returnList(ln);
512 				if(verbose){outstream.println("Returned a list.");}
513 
514 				//Fetch a new list
515 				ln=cris.nextList();
516 				reads=(ln!=null ? ln.list : null);
517 			}
518 
519 			//Notify the input stream that the final list was used
520 			if(ln!=null){
521 				cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
522 			}
523 		}
524 
525 		//Do anything necessary after processing
526 
527 	}
528 
529 	/** Iterate through the reads */
loadKmersInner(final ConcurrentReadInputStream cris)530 	public void loadKmersInner(final ConcurrentReadInputStream cris){
531 
532 		keySets=new AbstractKmerTable[WAYS];
533 
534 		//Initialize tables
535 		ScheduleMaker scheduleMaker=new ScheduleMaker(WAYS, 12, false, 0.8);
536 		int[] schedule=scheduleMaker.makeSchedule();
537 		for(int j=0; j<WAYS; j++){
538 			keySets[j]=new HashArray1D(schedule, -1L);
539 		}
540 //		for(int j=0; j<WAYS; j++){
541 //			keySets[j]=new HashArray1D(512000, -1, -1L, true); //TODO: Set maxSize
542 //		}
543 		//Do anything necessary prior to processing
544 
545 		{
546 			//Grab the first ListNum of reads
547 			ListNum<Read> ln=cris.nextList();
548 			//Grab the actual read list from the ListNum
549 			ArrayList<Read> reads=(ln!=null ? ln.list : null);
550 
551 			//Check to ensure pairing is as expected
552 			if(reads!=null && !reads.isEmpty()){
553 				Read r=reads.get(0);
554 				assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
555 			}
556 
557 			//As long as there is a nonempty read list...
558 			while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
559 				if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
560 
561 				//Loop through each read in the list
562 				for(int idx=0; idx<reads.size(); idx++){
563 					final Read r1=reads.get(idx);
564 					final Read r2=r1.mate;
565 
566 					//Track the initial length for statistics
567 					final int initialLength1=r1.length();
568 					final int initialLength2=(r1.mateLength());
569 
570 					if(initialLength1>=k && randy.nextBoolean()){
571 						final long kmer=toKmer(r1.bases, r1.quality, randy.nextInt(initialLength1-k2), k);
572 						if(kmer>=0){
573 							AbstractKmerTable table=keySets[(int)(kmer%WAYS)];
574 							table.increment(kmer, 1);
575 						}
576 					}
577 
578 					if(initialLength2>=k && randy.nextBoolean()){
579 						final long kmer=toKmer(r2.bases, r2.quality, randy.nextInt(initialLength2-k2), k);
580 						if(kmer>=0){
581 							AbstractKmerTable table=keySets[(int)(kmer%WAYS)];
582 							table.increment(kmer, 1);
583 						}
584 					}
585 				}
586 
587 				//Notify the input stream that the list was used
588 				cris.returnList(ln);
589 				if(verbose){outstream.println("Returned a list.");}
590 
591 				//Fetch a new list
592 				ln=cris.nextList();
593 				reads=(ln!=null ? ln.list : null);
594 			}
595 
596 			//Notify the input stream that the final list was used
597 			if(ln!=null){
598 				cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
599 			}
600 		}
601 
602 		//Do anything necessary after processing
603 	}
604 
605 	/** Iterate through the reads */
fillTilesInner(final ConcurrentReadInputStream cris)606 	public void fillTilesInner(final ConcurrentReadInputStream cris){
607 
608 
609 		Timer t2=new Timer();
610 		outstream.print("Filling tiles:  \t");
611 
612 
613 		{
614 			//Grab the first ListNum of reads
615 			ListNum<Read> ln=cris.nextList();
616 			//Grab the actual read list from the ListNum
617 			ArrayList<Read> reads=(ln!=null ? ln.list : null);
618 
619 			//Check to ensure pairing is as expected
620 			if(reads!=null && !reads.isEmpty()){
621 				Read r=reads.get(0);
622 				assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
623 			}
624 
625 			//As long as there is a nonempty read list...
626 			while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
627 				if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
628 
629 				//Loop through each read in the list
630 				for(int idx=0; idx<reads.size(); idx++){
631 					final Read r1=reads.get(idx);
632 					final Read r2=r1.mate;
633 
634 					//Track the initial length for statistics
635 					final int initialLength1=r1.length();
636 					final int initialLength2=(r1.mateLength());
637 
638 					//Increment counters
639 					readsProcessed+=r1.pairCount();
640 					basesProcessed+=initialLength1+initialLength2;
641 
642 					final MicroTile mt=flowcell.getMicroTile(r1.id);
643 
644 					if(loadKmers){
645 						if(initialLength1>=k){
646 							final long kmer=toKmer(r1.bases, r1.quality, randy.nextInt(initialLength1-k2), k);
647 							if(kmer>=0){
648 								AbstractKmerTable table=keySets[(int)(kmer%WAYS)];
649 								if(table.getValue(kmer)>0){mt.hits++;}
650 								else{mt.misses++;}
651 							}else{mt.misses++;}
652 						}
653 
654 						if(initialLength1>=k){
655 							final long kmer=toKmer(r1.bases, r1.quality, randy.nextInt(initialLength1-k2), k);
656 							if(kmer>=0){
657 								AbstractKmerTable table=keySets[(int)(kmer%WAYS)];
658 								if(table.getValue(kmer)>0){mt.hits++;}
659 								else{mt.misses++;}
660 							}else{mt.misses++;}
661 						}
662 					}
663 
664 					mt.add(r1);
665 					mt.add(r2);
666 				}
667 
668 				//Notify the input stream that the list was used
669 				cris.returnList(ln);
670 				if(verbose){outstream.println("Returned a list.");}
671 
672 				//Fetch a new list
673 				ln=cris.nextList();
674 				reads=(ln!=null ? ln.list : null);
675 			}
676 
677 			//Notify the input stream that the final list was used
678 			if(ln!=null){
679 				cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
680 			}
681 		}
682 
683 		t2.stop();
684 		outstream.println(t2);
685 
686 		ArrayList<MicroTile> mtList=flowcell.calcStats();
687 		if(flowcell.avgReads<targetAverageReads){
688 			flowcell=flowcell.widen(targetAverageReads);
689 			mtList=flowcell.toList();
690 		}
691 		avgQuality=flowcell.avgQuality;
692 		avgUnique=flowcell.avgUnique;
693 		avgErrorFree=flowcell.avgErrorFree;
694 		avgG=flowcell.avgG;
695 
696 		stdQuality=flowcell.stdQuality;
697 		stdUnique=flowcell.stdUnique;
698 		stdErrorFree=flowcell.stdErrorFree;
699 		stdG=flowcell.stdG;
700 
701 		long readsToDiscard=markTiles(mtList, flowcell.avgReads);
702 
703 		if(dump!=null){
704 			TextStreamWriter tsw=new TextStreamWriter(dump, overwrite, append, false);
705 			tsw.start();
706 
707 			tsw.println("#xSize\t"+Tile.xSize);
708 			tsw.println("#ySize\t"+Tile.ySize);
709 			tsw.println("#reads\t"+String.format(Locale.ROOT, "%d", flowcell.readsProcessed));
710 			tsw.println("#avgReads\t"+String.format(Locale.ROOT, "%.1f", flowcell.avgReads));
711 
712 			tsw.println("#avgQuality\t"+String.format(Locale.ROOT, "%.3f", avgQuality));
713 			tsw.println("#avgUnique\t"+String.format(Locale.ROOT, "%.3f", avgUnique));
714 			tsw.println("#avgErrorFree\t"+String.format(Locale.ROOT, "%.3f", avgErrorFree));
715 			tsw.println("#avgG\t"+String.format(Locale.ROOT, "%.3f", avgG));
716 
717 			tsw.println("#stdQuality\t"+String.format(Locale.ROOT, "%.5f", stdQuality));
718 			tsw.println("#stdUnique\t"+String.format(Locale.ROOT, "%.5f", stdUnique));
719 			tsw.println("#stdErrorFree\t"+String.format(Locale.ROOT, "%.5f", stdErrorFree));
720 			tsw.println("#stdG\t"+String.format(Locale.ROOT, "%.5f", stdG));
721 
722 			tsw.println((pound ? "#" : "") + "lane\ttile\tx1\tx2\ty1\ty2\treads\tunique\tquality\tprobErrorFree\tdiscard");
723 
724 			for(Lane lane : flowcell.lanes){
725 				if(lane!=null){
726 					for(Tile tile : lane.tiles){
727 						if(tile!=null){
728 							tsw.print(tile.toString());
729 						}
730 					}
731 				}
732 			}
733 			tsw.poisonAndWait();
734 		}
735 	}
736 
737 	/*--------------------------------------------------------------*/
738 	/*----------------         Inner Methods        ----------------*/
739 	/*--------------------------------------------------------------*/
740 
processReadPair(final Read r1, final Read r2)741 	boolean processReadPair(final Read r1, final Read r2){
742 		boolean passes=processReadPair_inner(r1, r2);
743 		if(passes){return true;}
744 		if(trimq>0){
745 			TrimRead.trimFast(r1, trimLeft, trimRight, trimq, trimE, 0);
746 			if(r2!=null){TrimRead.trimFast(r2, trimLeft, trimRight, trimq, trimE, 0);}
747 			return r1.length()>=minlen && (r2==null || r2.length()>=minlen);
748 		}else{
749 			return false;
750 		}
751 	}
752 
753 	/**
754 	 * Process a single read pair.
755 	 * @param r1 Read 1
756 	 * @param r2 Read 2 (may be null)
757 	 * @return True if the reads should be kept, false if they should be discarded.
758 	 */
processReadPair_inner(final Read r1, final Read r2)759 	boolean processReadPair_inner(final Read r1, final Read r2){
760 
761 		MicroTile mt=flowcell.getMicroTile(r1.id);
762 		if(mt==null){
763 			if(!warned){
764 				outstream.println("\nWarning - a read was found with no corresponding MicroTile:\n"+r1.id);
765 				warned=true;
766 			}
767 			return true;
768 		}
769 		if(mt.discard<discardLevel){return true;}
770 		if(!discardOnlyLowQuality){return false;}
771 		int len1=r1.length(), len2=r1.mateLength();
772 		if(len1>0){
773 			double qual=r1.avgQualityByProbabilityDouble(true, len1);
774 			double prob=100*r1.probabilityErrorFree(true, len1);
775 			if(qual<avgQuality-qDeviations*stdQuality){return false;}
776 			if(prob<avgErrorFree-eDeviations*stdErrorFree){return false;}
777 			if(discardG && shouldDiscardG(r1, mt)){return false;}
778 			if(gToN){gsTransformedToN+=doGToN(r1, mt);}
779 		}
780 		if(len2>0){
781 			double qual=r2.avgQualityByProbabilityDouble(true, len2);
782 			double prob=100*r2.probabilityErrorFree(true, len2);
783 			if(qual<avgQuality-qDeviations*stdQuality){return false;}
784 			if(prob<avgErrorFree-eDeviations*stdErrorFree){return false;}
785 			if(discardG && shouldDiscardG(r2, mt)){return false;}
786 			if(gToN){gsTransformedToN+=doGToN(r2, mt);}
787 		}
788 		return true;
789 	}
790 
shouldDiscardG(Read r, MicroTile mt)791 	private boolean shouldDiscardG(Read r, MicroTile mt){
792 		final byte[] bases=r.bases;
793 		final float[] gArray=mt.tracker.cycleAverages[2];
794 
795 		final float thresh=(float)(avgG+Tools.max(gDeviations*stdG, avgG*gFraction, gAbs));
796 		for(int i=0; i<bases.length; i++){
797 			byte b=bases[i];
798 			if(b=='G' && gArray[i]>thresh){
799 				return true;
800 			}
801 		}
802 		return false;
803 	}
804 
doGToN(Read r, MicroTile mt)805 	private int doGToN(Read r, MicroTile mt){
806 		final byte[] bases=r.bases;
807 		final byte[] quals=r.quality;
808 		final float[] gArray=mt.tracker.cycleAverages[2];
809 
810 		final float thresh=(float)(avgG+Tools.max(gDeviations*stdG, avgG*gFraction, gAbs));
811 		int changes=0;
812 		for(int i=0; i<bases.length; i++){
813 			byte b=bases[i];
814 			if(b=='G' && gArray[i]>thresh){
815 				bases[i]='N';
816 				changes++;
817 				if(quals!=null){quals[i]=0;}
818 			}
819 		}
820 		return changes;
821 	}
822 
823 	/*--------------------------------------------------------------*/
824 	/*----------------        Helper Methods        ----------------*/
825 	/*--------------------------------------------------------------*/
826 
827 	/**
828 	 * Generate a kmer from specified start location
829 	 * @param bases
830 	 * @param start
831 	 * @param klen kmer length
832 	 * @return kmer
833 	 */
toKmer(final byte[] bases, byte[] quals, final int start, final int klen)834 	private static final long toKmer(final byte[] bases, byte[] quals, final int start, final int klen){
835 //		if(minprob>0 && quals!=null){
836 //			float prob=toProb(quals, start, klen);
837 //			if(prob<minprob){return -1;}
838 //		}
839 		final int stop=start+klen;
840 		assert(stop<=bases.length) : klen+", "+bases.length;
841 		long kmer=0;
842 
843 		for(int i=start; i<stop; i++){
844 			final byte b=bases[i];
845 			final long x=Dedupe.baseToNumber[b];
846 			kmer=((kmer<<2)|x);
847 		}
848 		return kmer;
849 	}
850 
markTiles(ArrayList<MicroTile> mtList, double avgReads)851 	private final long markTiles(ArrayList<MicroTile> mtList, double avgReads){
852 		for(MicroTile mt : mtList){
853 			mt.discard=0;
854 		}
855 		long readsToDiscard=0;
856 
857 		cDiscards=qDiscards=eDiscards=uDiscards=mtDiscards=gDiscards=mtRetained=0;
858 
859 		for(MicroTile mt : mtList){
860 			double q=mt.averageQuality();
861 			double e=mt.percentErrorFree();
862 			double u=mt.uniquePercent();
863 			double g=mt.maxG();
864 
865 			double dq=avgQuality-q;
866 			double de=avgErrorFree-e;
867 			double du=u-avgUnique;
868 			double dg=g-avgG;
869 
870 			if(mt.readCount<10 && mt.readCount<0.02f*avgReads){
871 				mt.discard++;
872 				cDiscards++;
873 			}
874 
875 			if(dq>qDeviations*stdQuality && dq>avgQuality*qualFraction && dq>qualAbs){
876 				mt.discard++;
877 				qDiscards++;
878 			}
879 			if(de>eDeviations*stdErrorFree && de>avgErrorFree*errorFreeFraction && de>errorFreeAbs){
880 				mt.discard++;
881 				eDiscards++;
882 			}
883 			if(avgUnique>2 && avgUnique<98){
884 				if(du>uDeviations*stdUnique && du>avgUnique*uniqueFraction && du>uniqueAbs){
885 					mt.discard++;
886 					uDiscards++;
887 				}
888 			}
889 
890 			//Or mode
891 //			if(dq>qDeviations*stdQuality || dq>avgQuality*qualFraction || dq>qualAbs){
892 //				mt.discard++;
893 //				qDiscards++;
894 //			}
895 //			if(de>eDeviations*stdErrorFree || de>avgErrorFree*errorFreeFraction || de>errorFreeAbs){
896 //				mt.discard++;
897 //				eDiscards++;
898 //			}
899 //			if(avgUnique>2 && avgUnique<98){
900 //				if(du>uDeviations*stdUnique || du>avgUnique*uniqueFraction || du>uniqueAbs){
901 //					mt.discard++;
902 //					uDiscards++;
903 //				}
904 //			}
905 
906 			if((discardG || gToN) && (dg>gDeviations*stdG && dg>avgG*gFraction && dg>gAbs)){
907 				mt.discard++;
908 				gDiscards++;
909 			}
910 			if(mt.discard>0){
911 				mtDiscards++;
912 				readsToDiscard+=mt.readCount;
913 			}
914 			else{mtRetained++;}
915 		}
916 
917 		outstream.println();
918 		outstream.println("Flagged "+mtDiscards+" of "+(mtDiscards+mtRetained)+" micro-tiles, containing "+readsToDiscard+" reads:");
919 		outstream.println(uDiscards+" exceeded uniqueness thresholds.");
920 		outstream.println(qDiscards+" exceeded quality thresholds.");
921 		outstream.println(eDiscards+" exceeded error-free probability thresholds.");
922 		outstream.println(gDiscards+" contained G spikes.");
923 		outstream.println(cDiscards+" had too few reads to calculate statistics.");
924 		outstream.println();
925 
926 		return readsToDiscard;
927 	}
928 
929 	/*--------------------------------------------------------------*/
930 	/*----------------            Fields            ----------------*/
931 	/*--------------------------------------------------------------*/
932 
933 	/** Primary input file path */
934 	private String in1=null;
935 	/** Secondary input file path */
936 	private String in2=null;
937 
938 	private String qfin1=null;
939 	private String qfin2=null;
940 
941 	/** Primary output file path */
942 	private String out1=null;
943 	/** Secondary output file path */
944 	private String out2=null;
945 
946 	/** Discard output file path */
947 	private String outbad=null;
948 
949 	private String qfout1=null;
950 	private String qfout2=null;
951 
952 	/** Override input file extension */
953 	private String extin=null;
954 	/** Override output file extension */
955 	private String extout=null;
956 
957 	private boolean pound=true;
958 	private String dump=null;
959 	private String dumpIn=null;
960 
961 	/*--------------------------------------------------------------*/
962 
963 	/** Number of reads processed */
964 	public long readsProcessed=0;
965 	/** Number of bases processed */
966 	public long basesProcessed=0;
967 
968 	/** Number of reads discarded */
969 	public long readsDiscarded=0;
970 	/** Number of bases discarded */
971 	public long basesDiscarded=0;
972 
973 	protected long cDiscards=0;
974 	protected long uDiscards=0;
975 	protected long qDiscards=0;
976 	protected long eDiscards=0;
977 	protected long gDiscards=0;
978 	protected long mtDiscards=0;
979 	protected long mtRetained=0;
980 
981 	protected long gsTransformedToN=0;
982 
983 	/** Quit after processing this many input reads; -1 means no limit */
984 	private long maxReads=-1;
985 
986 	/** Whether interleaved was explicitly set. */
987 	private boolean setInterleaved=false;
988 
989 	/** Hold kmers.  A kmer X such that X%WAYS=Y will be stored in keySets[Y] */
990 	private AbstractKmerTable[] keySets;
991 
992 	private int targetAverageReads=800;
993 
994 	private float minprob=0;
995 	private static final int WAYS=31;
996 	private static final int k=31, k2=30;
997 	private long seed=-1;
998 
999 	private final Random randy=Shared.threadLocalRandom(seed);
1000 	private FlowCell flowcell;
1001 
1002 //	private int[] avgQualityArray;
1003 //	private int[] avgUniqueArray;
1004 
1005 	private long minCountToUse=0;
1006 
1007 	private float qDeviations=2f;
1008 	private float uDeviations=1.5f;
1009 	private float eDeviations=2f;
1010 	private float gDeviations=2f;
1011 
1012 	private float qualFraction=0.01f;
1013 	private float uniqueFraction=0.01f;
1014 	private float errorFreeFraction=0.01f;
1015 	private float gFraction=0.1f;
1016 
1017 	private float qualAbs=1f;
1018 	private float uniqueAbs=1f;
1019 	private float errorFreeAbs=1f;
1020 	private float gAbs=0.05f;
1021 
1022 	private double avgQuality;
1023 	private double avgUnique;
1024 	private double avgErrorFree;
1025 	private double avgG;
1026 
1027 	private double stdQuality;
1028 	private double stdUnique;
1029 	private double stdErrorFree;
1030 	private double stdG;
1031 
1032 	private boolean loadKmers=true;
1033 	private boolean discardOnlyLowQuality=true;
1034 	private int discardLevel=1;
1035 	private boolean gToN=false;
1036 	private boolean discardG=false;
1037 
1038 	private int minlen=30;
1039 	private float trimq=-1;
1040 	private final float trimE;
1041 	private boolean trimLeft=false;
1042 	private boolean trimRight=true;
1043 
1044 	private boolean warned=false;
1045 
1046 	/*--------------------------------------------------------------*/
1047 	/*----------------         Final Fields         ----------------*/
1048 	/*--------------------------------------------------------------*/
1049 
1050 	/** Primary input file */
1051 	private final FileFormat ffin1;
1052 	/** Secondary input file */
1053 	private final FileFormat ffin2;
1054 
1055 	/** Primary output file */
1056 	private final FileFormat ffout1;
1057 	/** Secondary output file */
1058 	private final FileFormat ffout2;
1059 
1060 	/** Output for discarded reads */
1061 	private final FileFormat ffoutbad;
1062 
1063 	/*--------------------------------------------------------------*/
1064 	/*----------------        Common Fields         ----------------*/
1065 	/*--------------------------------------------------------------*/
1066 
1067 	/** Number of reads output in the last run */
1068 	public static long lastReadsOut;
1069 	/** Print status messages to this output stream */
1070 	private PrintStream outstream=System.err;
1071 	/** Print verbose messages */
1072 	public static boolean verbose=false;
1073 	/** True if an error was encountered */
1074 	public boolean errorState=false;
1075 	/** Overwrite existing output files */
1076 	private boolean overwrite=false;
1077 	/** Append to existing output files */
1078 	private boolean append=false;
1079 	/** This flag has no effect on singlethreaded programs */
1080 	private final boolean ordered=false;
1081 
1082 }
1083