1 package jgi;
2 
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.BitSet;
8 import java.util.HashMap;
9 import java.util.HashSet;
10 import java.util.Locale;
11 import java.util.concurrent.ArrayBlockingQueue;
12 import java.util.concurrent.atomic.AtomicLongArray;
13 
14 import cardinality.CardinalityTracker;
15 import dna.AminoAcid;
16 import dna.Data;
17 import fileIO.ByteFile;
18 import fileIO.ByteStreamWriter;
19 import fileIO.FileFormat;
20 import fileIO.ReadWrite;
21 import fileIO.TextStreamWriter;
22 import kmer.AbstractKmerTable;
23 import kmer.ScheduleMaker;
24 import shared.KillSwitch;
25 import shared.Parse;
26 import shared.Parser;
27 import shared.PreParser;
28 import shared.ReadStats;
29 import shared.Shared;
30 import shared.Timer;
31 import shared.Tools;
32 import shared.TrimRead;
33 import stream.ConcurrentReadInputStream;
34 import stream.ConcurrentReadOutputStream;
35 import stream.FASTQ;
36 import stream.FastaReadInputStream;
37 import stream.Read;
38 import stream.SamLine;
39 import structures.IntList;
40 import structures.ListNum;
41 
42 /**
43  * Separates or trims reads based on matching kmers in a reference.
44  * Supports arbitrary K and inexact matches.
45  * Supercedes BBDukF by allowing simultaneous filtering, left- and right-trimming with different references.
46  * @author Brian Bushnell
47  * @date Aug 30, 2013
48  *
49  */
50 public class BBDuk2 {
51 
52 	/*--------------------------------------------------------------*/
53 	/*----------------        Initialization        ----------------*/
54 	/*--------------------------------------------------------------*/
55 
56 	/**
57 	 * Code entrance from the command line.
58 	 * @param args Command line arguments
59 	 */
main(String[] args)60 	public static void main(String[] args){
61 		//Create a new BBDuk2 instance
62 		BBDuk2 x=new BBDuk2(args);
63 
64 		///And run it
65 		x.process();
66 
67 		//Close the print stream if it was redirected
68 		Shared.closeStream(outstream);
69 	}
70 
71 	/**
72 	 * Constructor.
73 	 * @param args Command line arguments
74 	 */
BBDuk2(String[] args)75 	public BBDuk2(String[] args){
76 
77 		{//Preparse block for help, config files, and outstream
78 			PreParser pp=new PreParser(args, getClass(), true);
79 			args=pp.args;
80 			outstream=pp.outstream;
81 		}
82 
83 		/* Set global defaults */
84 		ReadWrite.ZIPLEVEL=2;
85 		ReadWrite.USE_UNPIGZ=true;
86 		ReadWrite.USE_PIGZ=true;
87 
88 		if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
89 			ByteFile.FORCE_MODE_BF2=true;
90 		}
91 
92 		/* Initialize local variables with defaults */
93 		boolean setOut=false, setOutb=false;
94 		boolean ktrimRight_=false, ktrimLeft_=false, ktrimN_=false, ktrimExclusive_=false, kfilter_=false;
95 		boolean findBestMatch_=false;
96 		boolean addTrimmedToBad_=true;
97 		boolean rcomp_=true;
98 		boolean forbidNs_=false;
99 		boolean useForest_=false, useTable_=false, useArray_=true, prealloc_=false;
100 		int k_=27, kbig_=-1;
101 		int mink_=-1;
102 		int ways_=-1; //Currently disabled
103 		int maxBadKmers_=0;
104 		long skipreads_=0;
105 		byte TRIM_SYMBOL_='N';
106 		boolean kmaskLowercase_=false;
107 
108 		Parser parser=new Parser();
109 		parser.trimq=6;
110 		parser.minAvgQuality=0;
111 		parser.minReadLength=10;
112 		parser.maxReadLength=Integer.MAX_VALUE;
113 		parser.minLenFraction=0f;
114 		parser.requireBothBad=false;
115 		parser.maxNs=-1;
116 		boolean trimByOverlap_=false, useQualityForOverlap_=false, strictOverlap_=true;
117 		boolean trimPairsEvenly_=false;
118 		boolean ordered_=false;
119 		int minoverlap_=-1, mininsert_=-1;
120 		int restrictLeft_=0, restrictRight_=0, speed_=0, qSkip_=1;
121 		boolean printNonZeroOnly_=true;
122 		boolean rename_=false, useRefNames_=false;
123 		boolean skipr1_=false, skipr2_=false;
124 		boolean ecc_=false;
125 		float minBaseFrequency_=0;
126 
127 
128 		String[] literal=null;
129 		String[] ref=null;
130 
131 		scaffoldNames.add(""); //Necessary so that the first real scaffold gets an id of 1, not zero
132 		scaffoldLengths.add(0);
133 
134 		/* Parse arguments */
135 		for(int i=0; i<args.length; i++){
136 			final String arg=args[i];
137 			String[] split=arg.split("=");
138 			String a=split[0].toLowerCase();
139 			String b=split.length>1 ? split[1] : null;
140 
141 			if(Parser.parseZip(arg, a, b)){
142 				//do nothing
143 			}else if(Parser.parseHist(arg, a, b)){
144 				//do nothing
145 			}else if(Parser.parseCommonStatic(arg, a, b)){
146 				//do nothing
147 			}else if(Parser.parseQualityAdjust(arg, a, b)){
148 				//do nothing
149 			}else if(Parser.parseQuality(arg, a, b)){
150 				//do nothing
151 			}else if(Parser.parseFasta(arg, a, b)){
152 				//do nothing
153 			}else if(parser.parseInterleaved(arg, a, b)){
154 				//do nothing
155 			}else if(parser.parseTrim(arg, a, b)){
156 				//do nothing
157 			}else if(parser.parseCommon(arg, a, b)){
158 				//do nothing
159 			}else if(parser.parseCardinality(arg, a, b)){
160 				//do nothing
161 			}else if(a.equals("in") || a.equals("in1")){
162 				in1=b;
163 			}else if(a.equals("in2")){
164 				in2=b;
165 			}else if(a.equals("qfin") || a.equals("qfin1")){
166 				qfin1=b;
167 			}else if(a.equals("qfin2")){
168 				qfin2=b;
169 			}else if(a.equals("out") || a.equals("out1") || a.equals("outu") || a.equals("outu1") || a.equals("outnonmatch") ||
170 					a.equals("outnonmatch1") || a.equals("outunnmatch") || a.equals("outunmatch1") || a.equals("outunnmatched") || a.equals("outunmatched1")){
171 				out1=b;
172 				setOut=true;
173 			}else if(a.equals("out2") || a.equals("outu2") || a.equals("outnonmatch2") || a.equals("outunmatch2") ||
174 					a.equals("outnonmatched2") || a.equals("outunmatched2")){
175 				out2=b;
176 			}else if(a.equals("outb") || a.equals("outm") || a.equals("outb1") || a.equals("outm1") || a.equals("outbad") ||
177 					a.equals("outbad1") || a.equals("outmatch") || a.equals("outmatch1")){
178 				outb1=b;
179 				setOut=true;
180 			}else if(a.equals("outb2") || a.equals("outm2") || a.equals("outbad2") || a.equals("outmatch2")){
181 				outb2=b;
182 			}else if(a.equals("outs") || a.equals("outsingle")){
183 				outsingle=b;
184 			}else if(a.equals("stats") || a.equals("scafstats")){
185 				outstats=b;
186 			}else if(a.equals("refstats")){
187 				outrefstats=b;
188 			}else if(a.equals("rpkm") || a.equals("fpkm") || a.equals("cov") || a.equals("coverage")){
189 				outrpkm=b;
190 			}else if(a.equals("sam") || a.equals("bam")){
191 				samFile=b;
192 			}else if(a.equals("duk") || a.equals("outduk")){
193 				outduk=b;
194 			}else if(a.equals("rqc")){
195 				outrqc=b;
196 			}else if(a.equals("ref")){
197 				ref=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
198 			}else if(a.equals("filterref") || a.equals("fref")){
199 				refFilter=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
200 			}else if(a.equals("maskref") || a.equals("mref") || a.equals("nref")){
201 				refMask=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
202 			}else if(a.equals("rightref") || a.equals("trref") || a.equals("rref")){
203 				refRight=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
204 			}else if(a.equals("leftref") || a.equals("tlref") || a.equals("lref")){
205 				refLeft=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
206 			}else if(a.equals("literal")){
207 				literal=(b==null) ? null : b.split(",");
208 //				assert(false) : b+", "+Arrays.toString(literal);
209 			}else if(a.equals("filterliteral") || a.equals("fliteral")){
210 				literalFilter=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
211 			}else if(a.equals("maskliteral") || a.equals("mliteral") || a.equals("nliteral")){
212 				literalMask=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
213 			}else if(a.equals("rightliteral") || a.equals("trliteral") || a.equals("rliteral")){
214 				literalRight=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
215 			}else if(a.equals("leftliteral") || a.equals("tlliteral") || a.equals("lliteral")){
216 				literalLeft=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
217 			}else if(a.equals("forest")){
218 				useForest_=Parse.parseBoolean(b);
219 				if(useForest_){useTable_=useArray_=false;}
220 			}else if(a.equals("table")){
221 				useTable_=Parse.parseBoolean(b);
222 				if(useTable_){useForest_=useArray_=false;}
223 			}else if(a.equals("array")){
224 				useArray_=Parse.parseBoolean(b);
225 				if(useArray_){useTable_=useForest_=false;}
226 			}else if(a.equals("ways")){
227 				ways_=Integer.parseInt(b);
228 			}else if(a.equals("ordered") || a.equals("ord")){
229 				ordered_=Parse.parseBoolean(b);
230 				System.err.println("Set ORDERED to "+ordered_);
231 			}else if(a.equals("skipr1")){
232 				skipr1_=Parse.parseBoolean(b);
233 			}else if(a.equals("skipr2")){
234 				skipr2_=Parse.parseBoolean(b);
235 			}else if(a.equals("k")){
236 				assert(b!=null) : "\nThe k key needs an integer value greater than 0, such as k=27\n";
237 				k_=Integer.parseInt(b);
238 				if(k_>31){
239 					kbig_=k_;
240 					k_=31;
241 				}else{
242 					kbig_=-1;
243 				}
244 				assert(k_>0 && k_<32) : "k must be at least 1; default is 27.";
245 			}else if(a.equals("mink") || a.equals("kmin")){
246 				mink_=Integer.parseInt(b);
247 				assert(mink_<0 || (mink_>0 && mink_<32)) : "kmin must be between 1 and 31; default is 4, negative numbers disable it.";
248 			}else if(a.equals("useshortkmers") || a.equals("shortkmers") || a.equals("usk")){
249 				useShortKmers=Parse.parseBoolean(b);
250 			}else if(a.equals("trimextra") || a.equals("trimpad") || a.equals("tp")){
251 				trimPad=Integer.parseInt(b);
252 			}else if(a.equals("hdist") || a.equals("hammingdistance")){
253 				hammingDistance=Integer.parseInt(b);
254 				assert(hammingDistance>=0 && hammingDistance<=4) : "hamming distance must be between 0 and 4; default is 0.";
255 			}else if(a.equals("qhdist") || a.equals("queryhammingdistance")){
256 				qHammingDistance=Integer.parseInt(b);
257 				assert(qHammingDistance>=0 && qHammingDistance<=4) : "hamming distance must be between 0 and 4; default is 0.";
258 			}else if(a.equals("edits") || a.equals("edist") || a.equals("editdistance")){
259 				editDistance=Integer.parseInt(b);
260 				assert(editDistance>=0 && editDistance<=3) : "edit distance must be between 0 and 3; default is 0.";
261 			}else if(a.equals("hdist2") || a.equals("hammingdistance2")){
262 				hammingDistance2=Integer.parseInt(b);
263 				assert(hammingDistance2>=0 && hammingDistance2<=4) : "hamming distance must be between 0 and 3; default is 0.";
264 			}else if(a.equals("qhdist2") || a.equals("queryhammingdistance2")){
265 				qHammingDistance2=Integer.parseInt(b);
266 				assert(qHammingDistance2>=0 && qHammingDistance2<=4) : "hamming distance must be between 0 and 3; default is 0.";
267 			}else if(a.equals("edits2") || a.equals("edist2") || a.equals("editdistance2")){
268 				editDistance2=Integer.parseInt(b);
269 				assert(editDistance2>=0 && editDistance2<=3) : "edit distance must be between 0 and 2; default is 0.";
270 			}else if(a.equals("maxskip") || a.equals("maxrskip") || a.equals("mxs")){
271 				maxSkip=Integer.parseInt(b);
272 			}else if(a.equals("minskip") || a.equals("minrskip") || a.equals("mns")){
273 				minSkip=Integer.parseInt(b);
274 			}else if(a.equals("skip") || a.equals("refskip") || a.equals("rskip")){
275 				minSkip=maxSkip=Integer.parseInt(b);
276 			}else if(a.equals("qskip")){
277 				qSkip_=Integer.parseInt(b);
278 			}else if(a.equals("speed")){
279 				speed_=Integer.parseInt(b);
280 				assert(speed_>=0 && speed_<=15) : "Speed range is 0 to 15.  Value: "+speed_;
281 			}else if(a.equals("skipreads")){
282 				skipreads_=Parse.parseKMG(b);
283 			}else if(a.equals("maxbadkmers") || a.equals("mbk")){
284 				maxBadKmers_=Integer.parseInt(b);
285 			}else if(a.equals("minhits") || a.equals("minkmerhits") || a.equals("mkh")){
286 				maxBadKmers_=Integer.parseInt(b)-1;
287 			}else if(a.equals("showspeed") || a.equals("ss")){
288 				showSpeed=Parse.parseBoolean(b);
289 			}else if(a.equals("verbose")){
290 				assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
291 				assert(WAYS>1) : "WAYS=1 is for debug mode.";
292 //				verbose=Parse.parseBoolean(b); //123
293 				if(verbose){outstream=System.err;} //For some reason System.out does not print in verbose mode.
294 			}else if(a.equals("mm") || a.equals("maskmiddle")){
295 				maskMiddle=Parse.parseBoolean(b);
296 			}else if(a.equals("rcomp")){
297 				rcomp_=Parse.parseBoolean(b);
298 			}else if(a.equals("forbidns") || a.equals("forbidn") || a.equals("fn")){
299 				forbidNs_=Parse.parseBoolean(b);
300 			}else if(a.equals("findbestmatch") || a.equals("fbm")){
301 				findBestMatch_=Parse.parseBoolean(b);
302 			}else if(a.equals("kfilter")){
303 				kfilter_=Parse.parseBoolean(b);
304 			}else if(a.equals("kmask") || a.equals("mask")){
305 				if("lc".equalsIgnoreCase(b) || "lowercase".equalsIgnoreCase(b)){
306 					kmaskLowercase_=true;
307 					ktrimN_=true;
308 				}else{
309 					if(b==null){b="";}
310 					if(b.length()==1 && !b.equalsIgnoreCase("t") && !b.equalsIgnoreCase("f")){
311 						ktrimN_=true;
312 						TRIM_SYMBOL_=(byte)b.charAt(0);
313 					}else{
314 						ktrimN_=Parse.parseBoolean(b);
315 					}
316 				}
317 			}else if(a.equals("ktrim")){
318 				throw new RuntimeException("BBDuk2 does not need the ktrim flag.  " +
319 						"It will trim according to which references are specified - for example, if lref is set, " +
320 						"it will do left-trimming.  To specify masking modes, use the kmask flag; " +
321 						"e.g. kmask=lowercase or kmask=X.");
322 //				if(b==null){b="";}
323 //				if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){ktrimLeft_=true;}
324 //				else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){ktrimRight_=true;}
325 //				else if(b.equalsIgnoreCase("n") || b.equalsIgnoreCase("mask")){ktrimN_=true;}
326 //				else if(b.length()==1 && !b.equalsIgnoreCase("t") && !b.equalsIgnoreCase("f")){
327 //					ktrimN_=true;
328 //					TRIM_SYMBOL_=(byte)b.charAt(0);
329 //				}else{
330 //					boolean x=Parse.parseBoolean(b);
331 //					if(!x){
332 //						ktrimRight_=ktrimLeft_=false;
333 //					}else{
334 //						throw new RuntimeException("\nInvalid setting for ktrim: "+b+"\nvalues must be f (false), l (left), r (right), or n");
335 //					}
336 //				}
337 			}else if(a.equals("ktrimright")){
338 				ktrimRight_=Parse.parseBoolean(b);
339 				ktrimLeft_=ktrimN_=!(ktrimRight_);
340 			}else if(a.equals("ktrimleft")){
341 				ktrimLeft_=Parse.parseBoolean(b);
342 				ktrimRight_=ktrimN_=!(ktrimLeft_);
343 			}else if(a.equals("ktrimn")){
344 				ktrimN_=Parse.parseBoolean(b);
345 				ktrimLeft_=ktrimRight_=!(ktrimN_);
346 			}else if(a.equals("ktrimexclusive")){
347 				ktrimExclusive_=Parse.parseBoolean(b);
348 			}else if(a.equals("tbo") || a.equals("trimbyoverlap")){
349 				trimByOverlap_=Parse.parseBoolean(b);
350 			}else if(a.equals("strictoverlap")){
351 				strictOverlap_=Parse.parseBoolean(b);
352 			}else if(a.equals("usequality")){
353 				useQualityForOverlap_=Parse.parseBoolean(b);
354 			}else if(a.equals("tpe") || a.equals("tbe") || a.equals("trimpairsevenly")){
355 				trimPairsEvenly_=Parse.parseBoolean(b);
356 			}else if(a.equals("ottm") || a.equals("outputtrimmedtomatch")){
357 				addTrimmedToBad_=Parse.parseBoolean(b);
358 			}else if(a.equals("minoverlap")){
359 				minoverlap_=Integer.parseInt(b);
360 			}else if(a.equals("mininsert")){
361 				mininsert_=Integer.parseInt(b);
362 			}else if(a.equals("prealloc") || a.equals("preallocate")){
363 				if(b==null || b.length()<1 || Character.isLetter(b.charAt(0))){
364 					prealloc_=Parse.parseBoolean(b);
365 				}else{
366 					preallocFraction=Tools.max(0, Double.parseDouble(b));
367 					prealloc_=(preallocFraction>0);
368 				}
369 			}else if(a.equals("restrictleft")){
370 				restrictLeft_=Integer.parseInt(b);
371 			}else if(a.equals("restrictright")){
372 				restrictRight_=Integer.parseInt(b);
373 			}else if(a.equals("statscolumns") || a.equals("columns") || a.equals("cols")){
374 				STATS_COLUMNS=Integer.parseInt(b);
375 				assert(STATS_COLUMNS==3 || STATS_COLUMNS==5) : "statscolumns bust be either 3 or 5. Invalid value: "+STATS_COLUMNS;
376 			}else if(a.equals("nzo") || a.equals("nonzeroonly")){
377 				printNonZeroOnly_=Parse.parseBoolean(b);
378 			}else if(a.equals("rename")){
379 				rename_=Parse.parseBoolean(b);
380 			}else if(a.equals("refnames") || a.equals("userefnames")){
381 				useRefNames_=Parse.parseBoolean(b);
382 			}else if(a.equals("initialsize")){
383 				initialSize=Parse.parseIntKMG(b);
384 			}else if(a.equals("dump")){
385 				dump=b;
386 			}else if(a.equals("entropyk") || a.equals("ek")){
387 				entropyK=Integer.parseInt(b);
388 			}else if(a.equals("entropywindow") || a.equals("ew")){
389 				entropyWindow=Integer.parseInt(b);
390 			}else if(a.equals("minentropy") || a.equals("entropy") || a.equals("entropyfilter")){
391 				entropyCutoff=Float.parseFloat(b);
392 			}else if(a.equals("verifyentropy")){
393 				verifyEntropy=Parse.parseBoolean(b);
394 			}else if(a.equals("minbasefrequency")){
395 				minBaseFrequency_=Float.parseFloat(b);
396 			}else if(a.equals("ecco") || a.equals("ecc")){
397 				ecc_=Parse.parseBoolean(b);
398 			}else if(a.equals("copyundefined") || a.equals("cu")){
399 				REPLICATE_AMBIGUOUS=Parse.parseBoolean(b);
400 			}else if(a.equals("path")){
401 				Data.setPath(b);
402 			}else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
403 				in1=args[i];
404 			}else if(i==1 && out1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
405 				out1=args[i];
406 				setOut=true;
407 			}else if(i==2 && ref==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
408 				ref=(new File(args[i]).exists() ? new String[] {args[i]} : args[i].split(","));
409 			}else{
410 				throw new RuntimeException("Unknown parameter "+args[i]);
411 			}
412 		}
413 
414 		if(hammingDistance2==-1){hammingDistance2=hammingDistance;}
415 		if(qHammingDistance2==-1){qHammingDistance2=qHammingDistance;}
416 		if(editDistance2==-1){editDistance2=editDistance;}
417 		minBaseFrequency=minBaseFrequency_;
418 
419 		{//Process parser fields
420 			Parser.processQuality();
421 
422 			maxReads=parser.maxReads;
423 			samplerate=parser.samplerate;
424 			sampleseed=parser.sampleseed;
425 			recalibrateQuality=parser.recalibrateQuality;
426 
427 			overwrite=ReadStats.overwrite=parser.overwrite;
428 			append=ReadStats.append=parser.append;
429 //			testsize=parser.testsize;
430 //			trimBadSequence=parser.trimBadSequence;
431 //			breakLength=parser.breakLength;
432 
433 			forceTrimModulo=parser.forceTrimModulo;
434 			forceTrimLeft=parser.forceTrimLeft;
435 			forceTrimRight=parser.forceTrimRight;
436 			forceTrimRight2=parser.forceTrimRight2;
437 			qtrimLeft=parser.qtrimLeft;
438 			qtrimRight=parser.qtrimRight;
439 			trimq=parser.trimq;
440 			trimE=parser.trimE();
441 			minLenFraction=parser.minLenFraction;
442 			minAvgQuality=parser.minAvgQuality;
443 			minAvgQualityBases=parser.minAvgQualityBases;
444 			chastityFilter=parser.chastityFilter;
445 			failBadBarcodes=parser.failBadBarcodes;
446 			removeBadBarcodes=parser.removeBadBarcodes;
447 			failIfNoBarcode=parser.failIfNoBarcode;
448 			barcodes=parser.barcodes;
449 			minReadLength=parser.minReadLength;
450 			maxReadLength=parser.maxReadLength;
451 			maxNs=parser.maxNs;
452 			minConsecutiveBases=parser.minConsecutiveBases;
453 //			untrim=parser.untrim;
454 //			minTrimLength=(parser.minTrimLength>=0 ? parser.minTrimLength : minTrimLength);
455 //			requireBothBad=parser.requireBothBad;
456 			removePairsIfEitherBad=!parser.requireBothBad;
457 
458 			minGC=parser.minGC;
459 			maxGC=parser.maxGC;
460 			filterGC=(minGC>0 || maxGC<1);
461 			usePairGC=parser.usePairGC;
462 
463 			loglog=(parser.loglog ? CardinalityTracker.makeTracker(parser) : null);
464 
465 			THREADS=Shared.threads();
466 		}
467 
468 		if(prealloc_){
469 			System.err.println("Note - if this program runs out of memory, please disable the prealloc flag.");
470 		}
471 
472 		if(minoverlap_>=0){
473 			minOverlap=Tools.max(minoverlap_, 1);
474 			minOverlap0=Tools.min(minOverlap0, minOverlap);
475 		}
476 
477 		if(mininsert_>=0){
478 			minInsert=Tools.max(mininsert_, 1);
479 			minInsert0=Tools.min(minInsert0, minInsert);
480 		}
481 
482 		if(refFilter!=null || literalFilter!=null){kfilter_=true;}
483 		if(refMask!=null || literalMask!=null){ktrimN_=true;}
484 		if(refRight!=null || literalRight!=null){ktrimRight_=true;}
485 		if(refLeft!=null || literalLeft!=null){ktrimLeft_=true;}
486 
487 		if(ref!=null){
488 			if(!ktrimN_ && !ktrimRight_ && !ktrimLeft_){kfilter_=true;}
489 			if(kfilter_ && refFilter==null){
490 				refFilter=ref;
491 			}else if(ktrimN_ && refMask==null){
492 				refMask=ref;
493 			}else if(ktrimRight_ && refRight==null){
494 				refRight=ref;
495 			}else if(ktrimLeft_ && refLeft==null){
496 				refLeft=ref;
497 			}
498 		}
499 		if(literal!=null){
500 			if(!ktrimN_ && !ktrimRight_ && !ktrimLeft_){kfilter_=true;}
501 			if(kfilter_ && literalFilter==null){
502 				literalFilter=literal;
503 			}else if(ktrimN_ && literalMask==null){
504 				literalMask=literal;
505 			}else if(ktrimRight_ && literalRight==null){
506 				literalRight=literal;
507 			}else if(ktrimLeft_ && literalLeft==null){
508 				literalLeft=literal;
509 			}
510 		}
511 
512 		kfilter_=(refFilter!=null || literalFilter!=null);
513 		ktrimN_=(refMask!=null || literalMask!=null);
514 		ktrimRight_=(refRight!=null || literalRight!=null);
515 		ktrimLeft_=(refLeft!=null || literalLeft!=null);
516 
517 		kfilter=kfilter_;
518 		ktrimN=ktrimN_;
519 		ktrimRight=ktrimRight_;
520 		ktrimLeft=ktrimLeft_;
521 		ktrimExclusive=ktrimExclusive_;
522 		findBestMatch=findBestMatch_;
523 
524 		if(refFilter!=null){
525 			for(String s : refFilter){refNames.add(s);}
526 		}
527 		if(literalFilter!=null){refNames.add("literal");}
528 		refScafCounts=new int[refNames.size()];
529 
530 		/* Set final variables; post-process and validate argument combinations */
531 
532 		useForest=useForest_;
533 		useTable=useTable_;
534 		useArray=useArray_;
535 		hammingDistance=Tools.max(editDistance, hammingDistance);
536 		hammingDistance2=Tools.max(editDistance2, hammingDistance2);
537 		minSkip=Tools.max(1, Tools.min(minSkip, maxSkip));
538 		maxSkip=Tools.max(minSkip, maxSkip);
539 		addTrimmedToBad=addTrimmedToBad_;
540 		rcomp=rcomp_;
541 		forbidNs=(forbidNs_ || hammingDistance<1);
542 		trimSymbol=TRIM_SYMBOL_;
543 		kmaskLowercase=kmaskLowercase_;
544 		skipreads=skipreads_;
545 		trimByOverlap=trimByOverlap_;
546 		useQualityForOverlap=useQualityForOverlap_;
547 		strictOverlap=strictOverlap_;
548 		trimPairsEvenly=trimPairsEvenly_;
549 		ordered=ordered_;
550 		restrictLeft=Tools.max(restrictLeft_, 0);
551 		restrictRight=Tools.max(restrictRight_, 0);
552 		printNonZeroOnly=printNonZeroOnly_;
553 		rename=rename_;
554 		useRefNames=useRefNames_;
555 		speed=speed_;
556 		qSkip=qSkip_;
557 		noAccel=(speed<1 && qSkip<2);
558 		skipR1=skipr1_;
559 		skipR2=skipr2_;
560 		ecc=ecc_;
561 
562 		if(strictOverlap){
563 			maxRatio=0.05f;
564 			ratioMargin=9f;
565 			ratioOffset=0.5f;
566 			efilterRatio=3.5f;
567 			efilterOffset=0.05f;
568 			pfilterRatio=0.001f;
569 			meeFilter=15f;
570 		}else{
571 			maxRatio=0.10f;
572 			ratioMargin=5f;
573 			ratioOffset=0.4f;
574 			efilterRatio=6f;
575 			efilterOffset=0.05f;
576 			pfilterRatio=0.00005f;
577 			meeFilter=999999999;
578 		}
579 
580 		MAKE_QUALITY_HISTOGRAM=ReadStats.COLLECT_QUALITY_STATS;
581 		MAKE_QUALITY_ACCURACY=ReadStats.COLLECT_QUALITY_ACCURACY;
582 		MAKE_MATCH_HISTOGRAM=ReadStats.COLLECT_MATCH_STATS;
583 		MAKE_BASE_HISTOGRAM=ReadStats.COLLECT_BASE_STATS;
584 		MAKE_EHIST=ReadStats.COLLECT_ERROR_STATS;
585 		MAKE_INDELHIST=ReadStats.COLLECT_INDEL_STATS;
586 		MAKE_LHIST=ReadStats.COLLECT_LENGTH_STATS;
587 		MAKE_GCHIST=ReadStats.COLLECT_GC_STATS;
588 		MAKE_IDHIST=ReadStats.COLLECT_IDENTITY_STATS;
589 
590 		{
591 			long usableMemory;
592 			long tableMemory;
593 			long numTables=Tools.max(1, (kfilter ? 1 : 0)+(ktrimLeft ? 1 : 0)+(ktrimRight ? 1 : 0)+(ktrimN ? 1 : 0));
594 
595 			{
596 				long memory=Runtime.getRuntime().maxMemory();
597 				double xmsRatio=Shared.xmsRatio();
598 				usableMemory=(long)Tools.max(((memory-96000000-(20*400000 /* for atomic arrays */))*(xmsRatio>0.97 ? 0.82 : 0.72)), memory*0.45);
599 				tableMemory=(long)(usableMemory*.95);
600 			}
601 
602 			if(initialSize<1){
603 				final long memOverWays=tableMemory/(12*WAYS*numTables);
604 				final double mem2=(prealloc_ ? preallocFraction : 1)*tableMemory;
605 				initialSize=(prealloc_ || memOverWays<initialSizeDefault ? (int)Tools.min(2142000000, (long)(mem2/(12*WAYS*numTables))) : initialSizeDefault);
606 				if(initialSize!=initialSizeDefault){
607 					System.err.println("Initial size set to "+initialSize);
608 				}
609 			}
610 		}
611 
612 		if(ktrimLeft_ || ktrimRight_ || ktrimN_){
613 			if(kbig_>k_){
614 				System.err.println("***********************   WARNING   ***********************");
615 				System.err.println("WARNING: When kmer-trimming, the maximum value of K is "+k_+".");
616 				System.err.println("K has been reduced from "+kbig_+" to "+k_+".");
617 				System.err.println("***********************************************************");
618 				kbig_=k_;
619 			}
620 		}
621 
622 		if((speed>0 || qSkip>1) && kbig_>k_){
623 			System.err.println("***********************   WARNING   ***********************");
624 			System.err.println("WARNING: When speed>0 or qskip>1, the maximum value of K is "+k_+".");
625 			System.err.println("K has been reduced from "+kbig_+" to "+k_+".");
626 			System.err.println("***********************************************************");
627 			kbig_=k_;
628 		}
629 
630 		if((speed>0 && qSkip>1) || (qSkip>1 && maxSkip>1) || (speed>0 && maxSkip>1)){
631 			System.err.println("WARNING: It is not recommended to use more than one of 'qskip', 'speed', and 'rskip/maxskip' together.");
632 			System.err.println("qskip="+qSkip+", speed="+speed+", maxskip="+maxSkip);
633 		}
634 
635 		k=k_;
636 		k2=k-1;
637 		kbig=kbig_;
638 		if(kbig>k){
639 			minSkip=maxSkip=0;
640 			if(maskMiddle){
641 				System.err.println("maskMiddle was disabled because kbig>k");
642 				maskMiddle=false;
643 			}
644 		}
645 		mink=Tools.min((mink_<1 ? 6 : mink_), k);
646 		maxBadKmers=maxBadKmers_;
647 		if(mink_>0 && mink_<k){useShortKmers=true;}
648 		if(useShortKmers){
649 			if(maskMiddle){
650 				System.err.println("maskMiddle was disabled because useShortKmers=true");
651 				maskMiddle=false;
652 			}
653 		}
654 		assert(findBestMatch==false || kfilter==false || kbig<=k) : "K must be less than 32 in 'findBestMatch' mode";
655 
656 		assert(!useShortKmers || ktrimRight || ktrimLeft || ktrimN) : "\nSetting mink or useShortKmers also requires setting a ktrim mode, such as 'r', 'l', or 'n'\n";
657 
658 		middleMask=maskMiddle ? ~(3L<<(2*(k/2))) : -1L;
659 
660 		if(kfilter || ktrimN || ktrimRight || ktrimLeft){
661 			System.err.println("k="+k);
662 			if(kbig>k){System.err.println("kbig="+kbig);}
663 			if(mink<k && (ktrimN || ktrimRight || ktrimLeft)){System.err.println("mink="+mink);}
664 			if(maskMiddle){System.err.println("maskMiddle="+maskMiddle);}
665 			if(hammingDistance>0){System.err.println("hamming distance="+hammingDistance);}
666 			if(editDistance>0){System.err.println("edit distance="+editDistance);}
667 		}
668 		if(kfilter){printFilterPlan("kfiltering", refFilter, literalFilter);}
669 		if(ktrimN){printFilterPlan(((char)trimSymbol)+"-masking", refMask, literalMask);}
670 		if(ktrimRight){printFilterPlan("right-ktrimming", refRight, literalRight);}
671 		if(ktrimLeft){printFilterPlan("left-ktrimming", refLeft, literalLeft);}
672 		if(qtrimRight || qtrimLeft){
673 			System.err.println("quality-trimming "+((qtrimRight && qtrimLeft) ? "both ends" : qtrimRight ? "right end" : " left end")+" to Q"+trimq);
674 		}
675 		System.err.println();
676 
677 		hitCounts=(outduk==null ? null : new long[HITCOUNT_LEN+1]);
678 
679 
680 		/* Adjust I/O settings and filenames */
681 
682 		assert(FastaReadInputStream.settingsOK());
683 
684 		if(in1==null){throw new RuntimeException("Error - at least one input file is required.");}
685 
686 		if(in1!=null && in1.contains("#") && !new File(in1).exists()){
687 			int pound=in1.lastIndexOf('#');
688 			String a=in1.substring(0, pound);
689 			String b=in1.substring(pound+1);
690 			in1=a+1+b;
691 			in2=a+2+b;
692 		}
693 		if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;}
694 		if(in2!=null){
695 			if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");}
696 			FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
697 		}
698 
699 		if(qfin1!=null && qfin1.contains("#") && in2!=null && !new File(qfin1).exists()){
700 			int pound=qfin1.lastIndexOf('#');
701 			String a=qfin1.substring(0, pound);
702 			String b=qfin1.substring(pound+1);
703 			qfin1=a+1+b;
704 			qfin2=a+2+b;
705 		}
706 
707 		if(out1!=null && out1.contains("#")){
708 			int pound=out1.lastIndexOf('#');
709 			String a=out1.substring(0, pound);
710 			String b=out1.substring(pound+1);
711 			out1=a+1+b;
712 			out2=a+2+b;
713 		}
714 
715 		if(outb1!=null && outb1.contains("#")){
716 			int pound=outb1.lastIndexOf('#');
717 			String a=outb1.substring(0, pound);
718 			String b=outb1.substring(pound+1);
719 			outb1=a+1+b;
720 			outb2=a+2+b;
721 		}
722 
723 		if((out2!=null || outb2!=null) && (in1!=null && in2==null)){
724 			if(!FASTQ.FORCE_INTERLEAVED){System.err.println("Forcing interleaved input because paired output was specified for a single input file.");}
725 			FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=true;
726 		}
727 
728 		if(!setOut){
729 			System.err.println("No output stream specified.  To write to stdout, please specify 'out=stdout.fq' or similar.");
730 //			out1="stdout";
731 //			outstream=System.err;
732 //			out2=null;
733 			out1=out2=null;
734 		}else if("stdout".equalsIgnoreCase(out1) || "standarddout".equalsIgnoreCase(out1)){
735 			out1="stdout.fq";
736 			outstream=System.err;
737 			out2=null;
738 		}
739 
740 		if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, outb1, outb2, outsingle, outstats, outrpkm, outduk, outrqc, outrefstats)){
741 			throw new RuntimeException("\nCan't write to some output files; overwrite="+overwrite+"\n");
742 		}
743 		if(!Tools.testInputFiles(false, true, in1, in2, qfin1, qfin2)){
744 			throw new RuntimeException("\nCan't read some input files.\n");
745 		}
746 		if(!Tools.testInputFiles(true, true, refFilter, refMask, refRight, refLeft)){
747 			throw new RuntimeException("\nCan't read from some reference files.\n");
748 		}
749 		if(!Tools.testForDuplicateFiles(true, in1, in2, qfin1, qfin2, out1, out2, outb1, outb2, outsingle, outstats, outrpkm, outduk, outrqc, outrefstats)){
750 			throw new RuntimeException("\nSome file names were specified multiple times.\n");
751 		}
752 
753 		assert(THREADS>0) : "THREADS must be greater than 0.";
754 
755 		assert(in1==null || in1.toLowerCase().startsWith("stdin") || in1.toLowerCase().startsWith("standardin") || new File(in1).exists()) : "Can't find "+in1;
756 		assert(in2==null || in2.toLowerCase().startsWith("stdin") || in2.toLowerCase().startsWith("standardin") || new File(in2).exists()) : "Can't find "+in2;
757 
758 		if(!(kfilter || ktrimN || ktrimRight || ktrimLeft || qtrimLeft || qtrimRight || minAvgQuality>0 || maxNs>=0 || trimByOverlap ||
759 				MAKE_QUALITY_HISTOGRAM || MAKE_MATCH_HISTOGRAM || MAKE_BASE_HISTOGRAM || MAKE_QUALITY_ACCURACY ||
760 				MAKE_EHIST || MAKE_INDELHIST || MAKE_LHIST || MAKE_GCHIST || MAKE_IDHIST ||
761 				forceTrimLeft>0 || forceTrimRight>0 || forceTrimModulo>0 || minBaseFrequency>0 || recalibrateQuality)){
762 			System.err.println("NOTE: No reference files specified, no trimming mode, no min avg quality, no histograms - read sequences will not be changed.");
763 		}
764 
765 		if(recalibrateQuality){
766 			SamLine.SET_FROM_OK=true;//TODO:  Should ban other operations
767 		}
768 
769 		if(ref!=null){
770 			for(String s0 : ref){
771 				assert(s0!=null) : "Specified a null reference.";
772 				String s=s0.toLowerCase();
773 				assert(s==null || s.startsWith("stdin") || s.startsWith("standardin") || new File(s0).exists()) : "Can't find "+s0;
774 			}
775 		}
776 
777 		//Initialize tables
778 		final int tableType=(useForest ? AbstractKmerTable.FOREST1D : useTable ? AbstractKmerTable.TABLE : useArray ? AbstractKmerTable.ARRAY1D : 0);
779 		ScheduleMaker scheduleMaker=new ScheduleMaker(WAYS, 12, prealloc_, (prealloc_ ? preallocFraction : 0.7));
780 		int[] schedule=scheduleMaker.makeSchedule();
781 		if(kfilter){
782 			filterMaps=AbstractKmerTable.preallocate(WAYS, tableType, schedule, -1L);
783 		}else{filterMaps=null;}
784 		if(ktrimN){
785 			maskMaps=AbstractKmerTable.preallocate(WAYS, tableType, schedule, -1L);
786 		}else{maskMaps=null;}
787 		if(ktrimRight){
788 			trimRightMaps=AbstractKmerTable.preallocate(WAYS, tableType, schedule, -1L);
789 		}else{trimRightMaps=null;}
790 		if(ktrimLeft){
791 			trimLeftMaps=AbstractKmerTable.preallocate(WAYS, tableType, schedule, -1L);
792 		}else{trimLeftMaps=null;}
793 
794 		//Initialize entropy
795 		calcEntropy=(entropyCutoff>0);
796 		if(calcEntropy){
797 			assert(entropyWindow>0 && entropyCutoff>=0 && entropyCutoff<=1);
798 			entropy=new double[entropyWindow+2];
799 			final double mult=1d/entropyWindow;
800 			for(int i=0; i<entropy.length; i++){
801 				double pk=i*mult;
802 				entropy[i]=pk*Math.log(pk);
803 			}
804 			entropyMult=-1/Math.log(entropyWindow);
805 			entropyKmerspace=(1<<(2*entropyK));
806 		}else{
807 			entropy=null;
808 			entropyMult=0;
809 			entropyKmerspace=1;
810 		}
811 	}
812 
813 
printFilterPlan(String action, String[] refs, String[] lits)814 	private static void printFilterPlan(String action, String[] refs, String[] lits){
815 		String and=(refs!=null && lits!=null ? " and " : "");
816 		String s1=(lits==null || lits.length!=1 ? "s" : "");
817 		String s2=(refs==null || refs.length!=1 ? "s" : "");
818 		System.err.println(action+" using "+(lits!=null ? lits.length+" literal"+s1 : "")+and+(refs!=null ? refs.length+" reference"+s2 : "")+".");
819 	}
820 
821 
822 	/*--------------------------------------------------------------*/
823 	/*----------------         Outer Methods        ----------------*/
824 	/*--------------------------------------------------------------*/
825 
826 
process()827 	public void process(){
828 
829 		if(recalibrateQuality){
830 			if(samFile!=null){
831 				CalcTrueQuality.main2(new String[] {"in="+samFile, "showstats=f"});
832 			}
833 			CalcTrueQuality.initializeMatrices();
834 		}
835 
836 		/* Check for output file collisions */
837 		if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, outb1, outb2, outstats, outrpkm, outduk, outrqc, outrefstats)){
838 			throw new RuntimeException("One or more output files were duplicate or could not be written to.  Check the names or set the 'overwrite=true' flag.");
839 		}
840 
841 		/* Start overall timer */
842 		Timer t=new Timer();
843 
844 //		boolean dq0=FASTQ.DETECT_QUALITY;
845 //		boolean ti0=FASTQ.TEST_INTERLEAVED;
846 //		int rbl0=Shared.bufferLen();;
847 //		FASTQ.DETECT_QUALITY=false;
848 //		FASTQ.TEST_INTERLEAVED=false;
849 //		Shared.setBufferLen(16;
850 
851 		process2(t.time1);
852 
853 //		FASTQ.DETECT_QUALITY=dq0;
854 //		FASTQ.TEST_INTERLEAVED=ti0;
855 //		Shared.setBufferLen(rbl0;
856 
857 		/* Stop timer and calculate speed statistics */
858 		t.stop();
859 
860 
861 		if(showSpeed){
862 			outstream.println();
863 			outstream.println(Tools.timeReadsBasesProcessed(t, readsIn, basesIn, 8));
864 		}
865 
866 		/* Throw an exception if errors were detected */
867 		if(errorState){
868 			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
869 		}
870 	}
871 
872 
process2(long startTime)873 	public void process2(long startTime){
874 
875 		/* Start phase timer */
876 		Timer t=new Timer();
877 
878 		if(DISPLAY_PROGRESS){
879 			outstream.println("Initial:");
880 			Shared.printMemory();
881 			outstream.println();
882 		}
883 
884 		/* Fill tables with reference kmers */
885 		{
886 			final boolean oldTI=FASTQ.TEST_INTERLEAVED; //TODO: This needs to be changed to a non-static field, or somehow 'read mode' and 'ref mode' need to be distinguished.
887 			final boolean oldFI=FASTQ.FORCE_INTERLEAVED;
888 			final boolean oldSplit=FastaReadInputStream.SPLIT_READS;
889 			final int oldML=FastaReadInputStream.MIN_READ_LEN;
890 
891 			FASTQ.TEST_INTERLEAVED=false;
892 			FASTQ.FORCE_INTERLEAVED=false;
893 			FastaReadInputStream.SPLIT_READS=false;
894 			FastaReadInputStream.MIN_READ_LEN=1;
895 
896 			if(kfilter){
897 				storedKmersFilter=spawnLoadThreads(refFilter, literalFilter, filterMaps, true);
898 			}
899 			if(ktrimN){
900 				storedKmersMask=spawnLoadThreads(refMask, literalMask, maskMaps, false);
901 			}
902 			if(ktrimRight){
903 				storedKmersRight=spawnLoadThreads(refRight, literalRight, trimRightMaps, false);
904 			}
905 			if(ktrimLeft){
906 				storedKmersLeft=spawnLoadThreads(refLeft, literalLeft, trimLeftMaps, false);
907 			}
908 
909 			FASTQ.TEST_INTERLEAVED=oldTI;
910 			FASTQ.FORCE_INTERLEAVED=oldFI;
911 			FastaReadInputStream.SPLIT_READS=oldSplit;
912 			FastaReadInputStream.MIN_READ_LEN=oldML;
913 
914 			if(useRefNames){toRefNames();}
915 			t.stop();
916 		}
917 
918 		{
919 			long ram=freeMemory();
920 			ALLOW_LOCAL_ARRAYS=(scaffoldNames!=null && Tools.max(THREADS, 1)*3*8*scaffoldNames.size()<ram*5);
921 		}
922 
923 		/* Dump kmers to text */
924 		if(dump!=null && filterMaps!=null){
925 			ByteStreamWriter bsw=new ByteStreamWriter(dump, overwrite, false, true);
926 			bsw.start();
927 			for(AbstractKmerTable set : filterMaps){
928 				set.dumpKmersAsBytes(bsw, k, 0, Integer.MAX_VALUE, null);
929 			}
930 			bsw.poisonAndWait();
931 		}
932 
933 		final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
934 		Read.VALIDATE_IN_CONSTRUCTOR=THREADS<4;
935 
936 		/* Do kmer matching of input reads */
937 		spawnProcessThreads(t);
938 
939 		Read.VALIDATE_IN_CONSTRUCTOR=vic;
940 
941 		/* Write legacy duk statistics (which requires tables) */
942 		writeDuk(System.nanoTime()-startTime);
943 
944 		/* Unload kmers to save memory */
945 		if(RELEASE_TABLES){unloadKmers();}
946 
947 		/* Write statistics to files */
948 		writeStats();
949 		writeRPKM();
950 		writeRefStats();
951 		writeRqc();
952 
953 		/* Unload sequence data to save memory */
954 		if(RELEASE_TABLES){unloadScaffolds();}
955 
956 		outstream.println("\nInput:                  \t"+readsIn+" reads \t\t"+basesIn+" bases.");
957 
958 		if(kfilter){
959 			outstream.println("Contaminants:           \t"+readsKFiltered+" reads ("+toPercent(readsKFiltered, readsIn)+") \t"+
960 					basesKFiltered+" bases ("+toPercent(basesKFiltered, basesIn)+")");
961 			outstream.flush();
962 		}
963 		if(qtrimLeft || qtrimRight){
964 			outstream.println("QTrimmed:               \t"+readsQTrimmed+" reads ("+toPercent(readsQTrimmed, readsIn)+") \t"+
965 					basesQTrimmed+" bases ("+toPercent(basesQTrimmed, basesIn)+")");
966 		}
967 		if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){
968 			outstream.println("FTrimmed:               \t"+readsFTrimmed+" reads ("+toPercent(readsFTrimmed, readsIn)+") \t"+
969 					basesFTrimmed+" bases ("+toPercent(basesFTrimmed, basesIn)+")");
970 		}
971 		if(ktrimLeft || ktrimRight || ktrimN){
972 			outstream.println("KTrimmed:               \t"+readsKTrimmed+" reads ("+toPercent(readsKTrimmed, readsIn)+") \t"+
973 					basesKTrimmed+" bases ("+toPercent(basesKTrimmed, basesIn)+")");
974 		}
975 		if(trimByOverlap){
976 			outstream.println("Trimmed by overlap:     \t"+readsTrimmedByOverlap+" reads ("+toPercent(readsTrimmedByOverlap, readsIn)+") \t"+
977 					basesTrimmedByOverlap+" bases ("+toPercent(basesTrimmedByOverlap, basesIn)+")");
978 		}
979 		if(filterGC){
980 			outstream.println("Filtered by GC:         \t"+badGcReads+" reads ("+toPercent(badGcReads, readsIn)+") \t"+
981 					badGcBases+" bases ("+toPercent(badGcBases, basesIn)+")");
982 		}
983 		if(minAvgQuality>0 || maxNs>=0 || minBaseFrequency>0 || chastityFilter || removeBadBarcodes){
984 			outstream.println("Low quality discards:   \t"+readsQFiltered+" reads ("+toPercent(readsQFiltered, readsIn)+") \t"+
985 					basesQFiltered+" bases ("+toPercent(basesQFiltered, basesIn)+")");
986 		}
987 		if(calcEntropy){
988 			outstream.println("Low entropy discards:   \t"+readsEFiltered+" reads ("+toPercent(readsEFiltered, readsIn)+") \t"+
989 					basesEFiltered+" bases ("+toPercent(basesEFiltered, basesIn)+")");
990 		}
991 
992 		final long readsRemoved=readsIn-readsOut;
993 		final long basesRemoved=basesIn-basesOut;
994 
995 		outstream.println("Total Removed:          \t"+readsRemoved+" reads ("+toPercent(readsRemoved, readsIn)+") \t"+
996 				basesRemoved+" bases ("+toPercent(basesRemoved, basesIn)+")");
997 
998 		outstream.println("Result:                 \t"+readsOut+" reads ("+toPercent(readsOut, readsIn)+") \t"+
999 				basesOut+" bases ("+toPercent(basesOut, basesIn)+")");
1000 
1001 		if(loglog!=null){
1002 			outstream.println("Unique "+loglog.k+"-mers:         \t"+loglog.cardinality());
1003 		}
1004 	}
1005 
toPercent(long numerator, long denominator)1006 	private static String toPercent(long numerator, long denominator){
1007 		if(denominator<1){return "0.00%";}
1008 		return String.format(Locale.ROOT, "%.2f%%",numerator*100.0/denominator);
1009 	}
1010 
1011 	/**
1012 	 * Clear stored kmers.
1013 	 */
unloadKmers()1014 	public void unloadKmers(){
1015 		for(AbstractKmerTable[] akta : new AbstractKmerTable[][] {filterMaps, maskMaps, trimRightMaps, trimLeftMaps}){
1016 			if(akta!=null){
1017 				for(int i=0; i<akta.length; i++){akta[i]=null;}
1018 			}
1019 		}
1020 	}
1021 
1022 	/**
1023 	 * Clear stored sequence data.
1024 	 */
unloadScaffolds()1025 	public void unloadScaffolds(){
1026 		if(scaffoldNames!=null && !scaffoldNames.isEmpty()){
1027 			scaffoldNames.clear();
1028 			scaffoldNames.trimToSize();
1029 		}
1030 		scaffoldReadCounts=null;
1031 		scaffoldBaseCounts=null;
1032 		hitCounts=null;
1033 		scaffoldLengths=null;
1034 	}
1035 
1036 	/**
1037 	 * Write statistics about how many reads matched each reference scaffold.
1038 	 */
writeStats()1039 	private void writeStats(){
1040 		if(outstats==null){return;}
1041 		final TextStreamWriter tsw=new TextStreamWriter(outstats, overwrite, false, false);
1042 		tsw.start();
1043 
1044 		long rsum=0, bsum=0;
1045 
1046 		/* Create StringCount list of scaffold names and hitcounts */
1047 		ArrayList<StringCount> list=new ArrayList<StringCount>();
1048 		for(int i=1; i<scaffoldNames.size(); i++){
1049 			final long num1=scaffoldReadCounts.get(i), num2=scaffoldBaseCounts.get(i);
1050 			if(num1>0 || !printNonZeroOnly){
1051 				rsum+=num1;
1052 				bsum+=num2;
1053 				final String s=scaffoldNames.get(i);
1054 				final int len=scaffoldLengths.get(i);
1055 				final StringCount sn=new StringCount(s, len, num1, num2);
1056 				list.add(sn);
1057 			}
1058 		}
1059 		Shared.sort(list);
1060 		final double rmult=100.0/(readsIn>0 ? readsIn : 1);
1061 		final double bmult=100.0/(basesIn>0 ? basesIn : 1);
1062 
1063 		tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n");
1064 
1065 		if(STATS_COLUMNS==3){
1066 			tsw.print(String.format(Locale.ROOT, "#Total\t%d\n",readsIn));
1067 			tsw.print(String.format(Locale.ROOT, "#Matched\t%d\t%.5f%%\n",rsum,rmult*rsum));
1068 			tsw.print("#Name\tReads\tReadsPct\n");
1069 			for(int i=0; i<list.size(); i++){
1070 				StringCount sn=list.get(i);
1071 				tsw.print(String.format(Locale.ROOT, "%s\t%d\t%.5f%%\n",sn.name,sn.reads,(sn.reads*rmult)));
1072 			}
1073 		}else{
1074 			tsw.print(String.format(Locale.ROOT, "#Total\t%d\t%d\n",readsIn,basesIn));
1075 			tsw.print(String.format(Locale.ROOT, "#Matched\t%d\t%.5f%%\n",rsum,rmult*rsum,bsum,bsum*bmult));
1076 			tsw.print("#Name\tReads\tReadsPct\tBases\tBasesPct\n");
1077 			for(int i=0; i<list.size(); i++){
1078 				StringCount sn=list.get(i);
1079 				tsw.print(String.format(Locale.ROOT, "%s\t%d\t%.5f%%\t%d\t%.5f%%\n",sn.name,sn.reads,(sn.reads*rmult),sn.bases,(sn.bases*bmult)));
1080 			}
1081 		}
1082 		tsw.poisonAndWait();
1083 	}
1084 
1085 	/**
1086 	 * Write RPKM statistics.
1087 	 */
writeRPKM()1088 	private void writeRPKM(){
1089 		if(outrpkm==null){return;}
1090 		final TextStreamWriter tsw=new TextStreamWriter(outrpkm, overwrite, false, false);
1091 		tsw.start();
1092 
1093 		/* Count mapped reads */
1094 		long mapped=0;
1095 		for(int i=0; i<scaffoldReadCounts.length(); i++){
1096 			mapped+=scaffoldReadCounts.get(i);
1097 		}
1098 
1099 		/* Print header */
1100 		tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n");
1101 		tsw.print(String.format(Locale.ROOT, "#Reads\t%d\n",readsIn));
1102 		tsw.print(String.format(Locale.ROOT, "#Mapped\t%d\n",mapped));
1103 		tsw.print(String.format(Locale.ROOT, "#RefSequences\t%d\n",Tools.max(0, scaffoldNames.size()-1)));
1104 		tsw.print("#Name\tLength\tBases\tCoverage\tReads\tRPKM\n");
1105 
1106 		final float mult=1000000000f/Tools.max(1, mapped);
1107 
1108 		/* Print data */
1109 		for(int i=1; i<scaffoldNames.size(); i++){
1110 			final long reads=scaffoldReadCounts.get(i);
1111 			final long bases=scaffoldBaseCounts.get(i);
1112 			final String s=scaffoldNames.get(i);
1113 			final int len=scaffoldLengths.get(i);
1114 			final double invlen=1.0/Tools.max(1, len);
1115 			final double mult2=mult*invlen;
1116 			if(reads>0 || !printNonZeroOnly){
1117 				tsw.print(String.format(Locale.ROOT, "%s\t%d\t%d\t%.4f\t%d\t%.4f\n",s,len,bases,bases*invlen,reads,reads*mult2));
1118 			}
1119 		}
1120 		tsw.poisonAndWait();
1121 	}
1122 
1123 	/**
1124 	 * Write statistics on a per-reference basis.
1125 	 */
writeRefStats()1126 	private void writeRefStats(){
1127 		if(outrefstats==null){return;}
1128 		final TextStreamWriter tsw=new TextStreamWriter(outrefstats, overwrite, false, false);
1129 		tsw.start();
1130 
1131 		/* Count mapped reads */
1132 		long mapped=0;
1133 		for(int i=0; i<scaffoldReadCounts.length(); i++){
1134 			mapped+=scaffoldReadCounts.get(i);
1135 		}
1136 
1137 		final int numRefs=refNames.size();
1138 		long[] refReadCounts=new long[numRefs];
1139 		long[] refBaseCounts=new long[numRefs];
1140 		long[] refLengths=new long[numRefs];
1141 
1142 		for(int r=0, s=1; r<numRefs; r++){
1143 			final int lim=s+refScafCounts[r];
1144 			while(s<lim){
1145 				refReadCounts[r]+=scaffoldReadCounts.get(s);
1146 				refBaseCounts[r]+=scaffoldBaseCounts.get(s);
1147 				refLengths[r]+=scaffoldLengths.get(s);
1148 				s++;
1149 			}
1150 		}
1151 
1152 		/* Print header */
1153 		tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n");
1154 		tsw.print(String.format(Locale.ROOT, "#Reads\t%d\n",readsIn));
1155 		tsw.print(String.format(Locale.ROOT, "#Mapped\t%d\n",mapped));
1156 		tsw.print(String.format(Locale.ROOT, "#References\t%d\n",Tools.max(0, refNames.size())));
1157 		tsw.print("#Name\tLength\tScaffolds\tBases\tCoverage\tReads\tRPKM\n");
1158 
1159 		final float mult=1000000000f/Tools.max(1, mapped);
1160 
1161 		/* Print data */
1162 		for(int i=0; i<refNames.size(); i++){
1163 			final long reads=refReadCounts[i];
1164 			final long bases=refBaseCounts[i];
1165 			final long len=refLengths[i];
1166 			final int scafs=refScafCounts[i];
1167 			final String name=ReadWrite.stripToCore(refNames.get(i));
1168 			final double invlen=1.0/Tools.max(1, len);
1169 			final double mult2=mult*invlen;
1170 			if(reads>0 || !printNonZeroOnly){
1171 				tsw.print(String.format(Locale.ROOT, "%s\t%d\t%d\t%d\t%.4f\t%d\t%.4f\n",name,len,scafs,bases,bases*invlen,reads,reads*mult2));
1172 			}
1173 		}
1174 		tsw.poisonAndWait();
1175 	}
1176 
1177 	/**
1178 	 * Write processing statistics in DUK's format.
1179 	 * @param time Elapsed time, nanoseconds
1180 	 */
writeDuk(long time)1181 	private void writeDuk(long time){
1182 		if(outduk==null){return;}
1183 		final TextStreamWriter tsw=new TextStreamWriter(outduk, overwrite, false, false);
1184 		tsw.start();
1185 		tsw.println(dukString(time, refFilter));
1186 		tsw.poisonAndWait();
1187 	}
1188 
1189 	/**
1190 	 * Write RQCFilter stats.
1191 	 * @param time Elapsed time, nanoseconds
1192 	 */
writeRqc()1193 	private void writeRqc(){
1194 		if(outrqc==null){return;}
1195 		addToRqcMap();
1196 		if(outrqc.endsWith("hashmap")){return;}
1197 		final TextStreamWriter tsw=new TextStreamWriter(outrqc, overwrite, false, false);
1198 		tsw.start();
1199 		tsw.println(rqcString());
1200 		tsw.poisonAndWait();
1201 	}
1202 
rqcString()1203 	public static String rqcString(){
1204 		if(RQC_MAP==null){return null;}
1205 		StringBuilder sb=new StringBuilder();
1206 
1207 		String[] keys=new String[] {"inputReads", "inputBases", "qtrimmedReads", "qtrimmedBases", "qfilteredReads", "qfilteredBases",
1208 				"ktrimmedReads", "ktrimmedBases", "kfilteredReads", "kfilteredBases", "outputReads", "outputBases"};
1209 
1210 		for(String key : keys){
1211 			String value=RQC_MAP.get(key);
1212 			if(value!=null){
1213 				sb.append(key+"="+value+"\n");
1214 			}
1215 		}
1216 
1217 		return sb.toString();
1218 	}
1219 
addToRqcMap()1220 	private void addToRqcMap(){
1221 		putRqc("inputReads", readsIn, false);
1222 		putRqc("inputBases", basesIn, false);
1223 		if(qtrimLeft || qtrimRight){
1224 			putRqc("qtrimmedReads", readsQTrimmed, false);
1225 			putRqc("qtrimmedBases", basesQTrimmed, false);
1226 		}
1227 		putRqc("qfilteredReads", readsQFiltered, false);
1228 		putRqc("qfilteredBases", basesQFiltered, false);
1229 
1230 		if(ktrimLeft || ktrimRight || ktrimN){
1231 			putRqc("ktrimmedReads", readsKTrimmed, true);
1232 			putRqc("ktrimmedBases", basesKTrimmed, true);
1233 		}else{
1234 			putRqc("kfilteredReads", readsKFiltered, false);
1235 			putRqc("kfilteredBases", basesKFiltered, false);
1236 		}
1237 		putRqc("outputReads", readsOut, true);
1238 		putRqc("outputBases", basesOut, true);
1239 	}
1240 
putRqc(String key, long value, boolean evict)1241 	private static void putRqc(String key, long value, boolean evict){putRqc(key, value+"", evict);}
1242 
putRqc(String key, String value, boolean evict)1243 	private static void putRqc(String key, String value, boolean evict){
1244 		if(RQC_MAP==null){RQC_MAP=new HashMap<String,String>();}
1245 		if(evict || !RQC_MAP.containsKey(key)){RQC_MAP.put(key, value);}
1246 	}
1247 
1248 	/**
1249 	 * Helper method; formats statistics to be duk-compatible
1250 	 * @param time Elapsed time, nanoseconds
1251 	 * @return duk output string
1252 	 */
dukString(long time, String[] ref)1253 	private String dukString(long time, String[] ref){
1254 		StringBuilder sb=new StringBuilder();
1255 		sb.append("##INPUT PARAMETERS##\n");
1256 		sb.append("#Reference file:	"+(ref==null || ref.length<1 ? null : ref.length==1 ? ref[0] : Arrays.toString(ref))+"\n");
1257 		sb.append("#Query file:	"+in1+(in2==null ? "" : ","+in2)+"\n");
1258 		sb.append("#Not matched reads file:	"+out1+(out2==null ? "" : ","+out2)+"\n");
1259 		sb.append("#Matched reads file:	"+outb1+(outb2==null ? "" : ","+outb2)+"\n");
1260 		sb.append("#Output file (duk):	"+outduk+"\n");
1261 		sb.append("#Output file (stats):	"+outstats+"\n");
1262 		sb.append("#Mer size:	"+k+"\n");
1263 		long size=0;
1264 		if(kfilter){
1265 			for(AbstractKmerTable x : filterMaps){size+=x.size();}
1266 		}
1267 		if(ktrimN){
1268 			for(AbstractKmerTable x : maskMaps){size+=x.size();}
1269 		}
1270 		if(ktrimRight){
1271 			for(AbstractKmerTable x : trimRightMaps){size+=x.size();}
1272 		}
1273 		if(ktrimLeft){
1274 			for(AbstractKmerTable x : trimLeftMaps){size+=x.size();}
1275 		}
1276 		sb.append("#Avg step size:	"+String.format(Locale.ROOT, "%.1f", refKmers/(double)(Tools.max(1, size)))+"\n");
1277 		sb.append("#Cut off:	"+maxBadKmers+"\n");
1278 		sb.append("#Mask middle:	"+maskMiddle+"\n");
1279 		sb.append("#Quality trim:	"+((qtrimLeft || qtrimRight) ? trimq : "false")+"\n");
1280 		sb.append("\n");
1281 
1282 		sb.append("##REFERENCE STAT##\n");
1283 		sb.append("#Total Reads:	"+refReads+"\n");
1284 		sb.append("#Total Bases:	"+refBases+"\n");
1285 		sb.append("#Total kmers:	"+refKmers+"\n");
1286 		sb.append("#Total stored kmers:	"+size+"\n");
1287 		sb.append("\n");
1288 
1289 		sb.append("## ELAPSED TIME##\n");
1290 		sb.append("# Time:	"+String.format(Locale.ROOT, "%.2f", time/1000000000.0)+" seconds\n");
1291 		sb.append("\n");
1292 
1293 		sb.append("##QUERY FILE STAT##\n");
1294 		sb.append("# Total number of reads:	"+readsIn+"\n");
1295 		sb.append("# Total number of matched reads:	"+readsKFiltered+"\n");
1296 		sb.append("# Match ratio:	"+String.format(Locale.ROOT, "%.6f", readsKFiltered*1.0/readsIn)+"\n");
1297 		sb.append("\n");
1298 
1299 		sb.append("##P-VALUE##\n");
1300 		sb.append("#Avg number of Kmer for each read:	"+((basesIn/(Tools.max(readsIn, 1)))-k)+"\n");
1301 //		sb.append("# P value for the given threshold 1 is 4.05231e-14\n"); //duk prints a P value; not sure what it means
1302 		sb.append("\n");
1303 
1304 		sb.append("## Histogram of kmer occurance for reads with at least one occurance ##\n");
1305 		sb.append("#NumOcc\tNumReads\tPercentage\n");
1306 
1307 		long sum=Tools.sum(hitCounts);
1308 		double mult=100.0/(sum<1 ? 1 : sum);
1309 		for(int i=0; i<hitCounts.length; i++){
1310 			long x=hitCounts[i];
1311 			if(x>0){
1312 				sb.append(i).append('\t').append(x).append('\t').append(String.format(Locale.ROOT, "%.4f",(x*mult))).append('\n');
1313 			}
1314 		}
1315 
1316 		return sb.toString();
1317 	}
1318 
1319 	/**
1320 	 * Fills the scaffold names array with reference names.
1321 	 */
toRefNames()1322 	private void toRefNames(){
1323 		final int numRefs=refNames.size();
1324 		for(int r=0, s=1; r<numRefs; r++){
1325 			final int scafs=refScafCounts[r];
1326 			final int lim=s+scafs;
1327 			final String name=ReadWrite.stripToCore(refNames.get(r));
1328 //			System.err.println("r="+r+", s="+s+", scafs="+scafs+", lim="+lim+", name="+name);
1329 			while(s<lim){
1330 //				System.err.println(r+", "+s+". Setting "+scaffoldNames.get(s)+" -> "+name);
1331 				scaffoldNames.set(s, name);
1332 				s++;
1333 			}
1334 		}
1335 	}
1336 
1337 
1338 	/*--------------------------------------------------------------*/
1339 	/*----------------         Inner Methods        ----------------*/
1340 	/*--------------------------------------------------------------*/
1341 
1342 
1343 	/**
1344 	 * Fills tables with kmers from references, using multiple LoadThread.
1345 	 * @return Number of kmers stored.
1346 	 */
spawnLoadThreads(String[] ref, String[] literal, AbstractKmerTable[] maps, boolean countRef)1347 	private long spawnLoadThreads(String[] ref, String[] literal, AbstractKmerTable[] maps, boolean countRef){
1348 		Timer t=new Timer();
1349 		if((ref==null || ref.length<1) && (literal==null || literal.length<1)){return 0;}
1350 		long added=0;
1351 
1352 		/* Create load threads */
1353 		LoadThread[] loaders=new LoadThread[WAYS];
1354 		for(int i=0; i<loaders.length; i++){
1355 			loaders[i]=new LoadThread(i, maps[i]);
1356 			loaders[i].start();
1357 		}
1358 
1359 		/* For each reference file... */
1360 		int refNum=0;
1361 		if(ref!=null){
1362 			for(String refname : ref){
1363 
1364 				/* Start an input stream */
1365 				FileFormat ff=FileFormat.testInput(refname, FileFormat.FASTA, null, false, true);
1366 				ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1L, false, ff, null, null, null, Shared.USE_MPI, true);
1367 				cris.start(); //4567
1368 				ListNum<Read> ln=cris.nextList();
1369 				ArrayList<Read> reads=(ln!=null ? ln.list : null);
1370 
1371 				/* Iterate through read lists from the input stream */
1372 				while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
1373 					{
1374 						/* Assign a unique ID number to each scaffold */
1375 						ArrayList<Read> reads2=new ArrayList<Read>(reads);
1376 						for(Read r1 : reads2){
1377 							final Read r2=r1.mate;
1378 							final Integer id=scaffoldNames.size();
1379 							if(countRef){refScafCounts[refNum]++;}
1380 							scaffoldNames.add(r1.id==null ? id.toString() : r1.id);
1381 							int len=r1.length();
1382 							r1.obj=id;
1383 							if(r2!=null){
1384 								r2.obj=id;
1385 								len+=r2.length();
1386 							}
1387 							scaffoldLengths.add(len);
1388 						}
1389 
1390 						if(REPLICATE_AMBIGUOUS){
1391 							reads2=Tools.replicateAmbiguous(reads2, Tools.min(k, mink));
1392 						}
1393 
1394 						/* Send a pointer to the read list to each LoadThread */
1395 						for(LoadThread lt : loaders){
1396 							boolean b=true;
1397 							while(b){
1398 								try {
1399 									lt.queue.put(reads2);
1400 									b=false;
1401 								} catch (InterruptedException e) {
1402 									//TODO:  This will hang due to still-running threads.
1403 									throw new RuntimeException(e);
1404 								}
1405 							}
1406 						}
1407 					}
1408 
1409 					/* Dispose of the old list and fetch a new one */
1410 					cris.returnList(ln);
1411 					ln=cris.nextList();
1412 					reads=(ln!=null ? ln.list : null);
1413 				}
1414 				/* Cleanup */
1415 				cris.returnList(ln);
1416 				errorState|=ReadWrite.closeStream(cris);
1417 				refNum++;
1418 			}
1419 		}
1420 
1421 		/* If there are literal sequences to use as references */
1422 		if(literal!=null){
1423 			ArrayList<Read> list=new ArrayList<Read>(literal.length);
1424 			if(verbose){System.err.println("Adding literals "+Arrays.toString(literal));}
1425 
1426 			/* Assign a unique ID number to each literal sequence */
1427 			for(int i=0; i<literal.length; i++){
1428 				final Integer id=scaffoldNames.size();
1429 				final Read r=new Read(literal[i].getBytes(), null, id);
1430 				if(countRef){refScafCounts[refNum]++;}
1431 				scaffoldNames.add(id.toString());
1432 				scaffoldLengths.add(r.length());
1433 				r.obj=id;
1434 				list.add(r);
1435 			}
1436 
1437 			if(REPLICATE_AMBIGUOUS){
1438 				list=Tools.replicateAmbiguous(list, Tools.min(k, mink));
1439 			}
1440 
1441 			/* Send a pointer to the read list to each LoadThread */
1442 			for(LoadThread lt : loaders){
1443 				boolean b=true;
1444 				while(b){
1445 					try {
1446 						lt.queue.put(list);
1447 						b=false;
1448 					} catch (InterruptedException e) {
1449 						//TODO:  This will hang due to still-running threads.
1450 						throw new RuntimeException(e);
1451 					}
1452 				}
1453 			}
1454 		}
1455 
1456 		/* Signal loaders to terminate */
1457 		for(LoadThread lt : loaders){
1458 			boolean b=true;
1459 			while(b){
1460 				try {
1461 					lt.queue.put(POISON);
1462 					b=false;
1463 				} catch (InterruptedException e) {
1464 					//TODO:  This will hang due to still-running threads.
1465 					throw new RuntimeException(e);
1466 				}
1467 			}
1468 		}
1469 
1470 		/* Wait for loaders to die, and gather statistics */
1471 		boolean success=true;
1472 		for(LoadThread lt : loaders){
1473 			while(lt.getState()!=Thread.State.TERMINATED){
1474 				try {
1475 					lt.join();
1476 				} catch (InterruptedException e) {
1477 					// TODO Auto-generated catch block
1478 					e.printStackTrace();
1479 				}
1480 			}
1481 			added+=lt.added;
1482 			refKmers+=lt.refKmersT;
1483 			refBases+=lt.refBasesT;
1484 			refReads+=lt.refReadsT;
1485 			success&=lt.success;
1486 		}
1487 		if(!success){KillSwitch.kill("Failed loading ref kmers; aborting.");}
1488 
1489 		//Correct statistics for number of threads, since each thread processes all reference data
1490 		refKmers/=WAYS;
1491 		refBases/=WAYS;
1492 		refReads/=WAYS;
1493 
1494 		scaffoldReadCounts=new AtomicLongArray(scaffoldNames.size());
1495 		scaffoldBaseCounts=new AtomicLongArray(scaffoldNames.size());
1496 
1497 		t.stop();
1498 		if(DISPLAY_PROGRESS){
1499 			outstream.println("Added "+added+" kmers; time: \t"+t);
1500 			Shared.printMemory();
1501 			outstream.println();
1502 		}
1503 
1504 		if(verbose){
1505 			TextStreamWriter tsw=new TextStreamWriter("stdout", false, false, false, FileFormat.TEXT);
1506 			tsw.start();
1507 
1508 			if(kfilter){
1509 				tsw.println("kfilter tables:");
1510 				for(AbstractKmerTable x : filterMaps){x.dumpKmersAsText(tsw, k, 1, Integer.MAX_VALUE);}
1511 			}
1512 			if(ktrimN){
1513 				tsw.println("ktrimN tables:");
1514 				for(AbstractKmerTable x : maskMaps){x.dumpKmersAsText(tsw, k, 1, Integer.MAX_VALUE);}
1515 			}
1516 			if(ktrimRight){
1517 				tsw.println("ktrimRight tables:");
1518 				for(AbstractKmerTable x : trimRightMaps){x.dumpKmersAsText(tsw, k, 1, Integer.MAX_VALUE);}
1519 			}
1520 			if(ktrimLeft){
1521 				tsw.println("ktrimLeft tables:");
1522 				for(AbstractKmerTable x : trimLeftMaps){x.dumpKmersAsText(tsw, k, 1, Integer.MAX_VALUE);}
1523 			}
1524 
1525 			tsw.poisonAndWait();
1526 		}
1527 
1528 		return added;
1529 	}
1530 
1531 	/**
1532 	 * Match reads against reference kmers, using multiple ProcessThread.
1533 	 * @param t
1534 	 */
spawnProcessThreads(Timer t)1535 	private void spawnProcessThreads(Timer t){
1536 		t.start();
1537 
1538 		/* Create read input stream */
1539 		final ConcurrentReadInputStream cris;
1540 		final boolean paired;
1541 		{
1542 			FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
1543 			FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
1544 			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, ff1.samOrBam(), ff1, ff2, qfin1, qfin2);
1545 			cris.setSampleRate(samplerate, sampleseed);
1546 			cris.start(); //4567
1547 			paired=cris.paired();
1548 			if(!ff1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
1549 		}
1550 
1551 		/* Create read output streams */
1552 		final ConcurrentReadOutputStream ros, rosb, ross;
1553 		if(out1!=null){
1554 			final int buff=(!ordered ? 12 : Tools.max(32, 2*Shared.threads()));
1555 			FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, append, ordered);
1556 			FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, append, ordered);
1557 			ros=ConcurrentReadOutputStream.getStream(ff1, ff2, null, null, buff, null, true);
1558 			ros.start();
1559 		}else{ros=null;}
1560 		if(outb1!=null){
1561 			final int buff=(!ordered ? 12 : Tools.max(32, 2*Shared.threads()));
1562 			FileFormat ff1=FileFormat.testOutput(outb1, FileFormat.FASTQ, null, true, overwrite, append, ordered);
1563 			FileFormat ff2=FileFormat.testOutput(outb2, FileFormat.FASTQ, null, true, overwrite, append, ordered);
1564 			rosb=ConcurrentReadOutputStream.getStream(ff1, ff2, null, null, buff, null, true);
1565 			rosb.start();
1566 		}else{rosb=null;}
1567 		if(outsingle!=null){
1568 			final int buff=(!ordered ? 12 : Tools.max(32, 2*Shared.threads()));
1569 			FileFormat ff=FileFormat.testOutput(outsingle, FileFormat.FASTQ, null, true, overwrite, append, ordered);
1570 			ross=ConcurrentReadOutputStream.getStream(ff, null, null, null, buff, null, true);
1571 			ross.start();
1572 		}else{ross=null;}
1573 		if(ros!=null || rosb!=null || ross!=null){
1574 			t.stop();
1575 			outstream.println("Started output streams:\t"+t);
1576 			t.start();
1577 		}
1578 
1579 		/* Optionally skip the first reads, since initial reads may have lower quality */
1580 		if(skipreads>0){
1581 			long skipped=0;
1582 
1583 			ListNum<Read> ln=cris.nextList();
1584 			ArrayList<Read> reads=(ln!=null ? ln.list : null);
1585 
1586 			while(skipped<skipreads && reads!=null && reads.size()>0){
1587 				skipped+=reads.size();
1588 
1589 				if(rosb!=null){rosb.add(new ArrayList<Read>(1), ln.id);}
1590 				if(ros!=null){ros.add(new ArrayList<Read>(1), ln.id);}
1591 				if(ross!=null){ross.add(new ArrayList<Read>(1), ln.id);}
1592 
1593 				cris.returnList(ln);
1594 				ln=cris.nextList();
1595 				reads=(ln!=null ? ln.list : null);
1596 			}
1597 			cris.returnList(ln);
1598 			if(reads==null || reads.isEmpty()){
1599 				ReadWrite.closeStreams(cris, ros, rosb, ross);
1600 				System.err.println("Skipped all of the reads.");
1601 				System.exit(0);
1602 			}
1603 		}
1604 
1605 		/* Create ProcessThreads */
1606 		ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(THREADS);
1607 		for(int i=0; i<THREADS; i++){alpt.add(new ProcessThread(cris, ros, rosb, ross, ALLOW_LOCAL_ARRAYS));}
1608 		for(ProcessThread pt : alpt){pt.start();}
1609 
1610 		/* Wait for threads to die, and gather statistics */
1611 		for(ProcessThread pt : alpt){
1612 			while(pt.getState()!=Thread.State.TERMINATED){
1613 				try {
1614 					pt.join();
1615 				} catch (InterruptedException e) {
1616 					// TODO Auto-generated catch block
1617 					e.printStackTrace();
1618 				}
1619 			}
1620 			readsIn+=pt.readsInT;
1621 			basesIn+=pt.basesInT;
1622 			readsOut+=pt.readsOutT;
1623 			basesOut+=pt.basesOutT;
1624 			readsKFiltered+=pt.readsKFilteredT;
1625 			basesKFiltered+=pt.basesKFilteredT;
1626 			readsQTrimmed+=pt.readsQTrimmedT;
1627 			basesQTrimmed+=pt.basesQTrimmedT;
1628 			readsFTrimmed+=pt.readsFTrimmedT;
1629 			basesFTrimmed+=pt.basesFTrimmedT;
1630 			readsKTrimmed+=pt.readsKTrimmedT;
1631 			basesKTrimmed+=pt.basesKTrimmedT;
1632 			readsTrimmedByOverlap+=pt.readsTrimmedByOverlapT;
1633 			basesTrimmedByOverlap+=pt.basesTrimmedByOverlapT;
1634 			badGcReads+=pt.badGcReadsT;
1635 			badGcBases+=pt.badGcBasesT;
1636 			readsQFiltered+=pt.readsQFilteredT;
1637 			basesQFiltered+=pt.basesQFilteredT;
1638 			readsEFiltered+=pt.readsEFilteredT;
1639 			basesEFiltered+=pt.basesEFilteredT;
1640 
1641 			if(hitCounts!=null){
1642 				for(int i=0; i<hitCounts.length; i++){hitCounts[i]+=pt.hitCountsT[i];}
1643 				pt.hitCountsT=null;
1644 			}
1645 			if(pt.scaffoldReadCountsT!=null && scaffoldReadCounts!=null){
1646 				for(int i=0; i<pt.scaffoldReadCountsT.length; i++){scaffoldReadCounts.addAndGet(i, pt.scaffoldReadCountsT[i]);}
1647 				pt.scaffoldReadCountsT=null;
1648 			}
1649 			if(pt.scaffoldBaseCountsT!=null && scaffoldBaseCounts!=null){
1650 				for(int i=0; i<pt.scaffoldBaseCountsT.length; i++){scaffoldBaseCounts.addAndGet(i, pt.scaffoldBaseCountsT[i]);}
1651 				pt.scaffoldBaseCountsT=null;
1652 			}
1653 		}
1654 
1655 		/* Shut down I/O streams; capture error status */
1656 		errorState|=ReadWrite.closeStreams(cris, ros, rosb, ross);
1657 		errorState|=ReadStats.writeAll();
1658 
1659 		t.stop();
1660 		if(showSpeed){
1661 			outstream.println("Processing time:   \t\t"+t);
1662 		}
1663 	}
1664 
1665 	/*--------------------------------------------------------------*/
1666 	/*----------------         Inner Classes        ----------------*/
1667 	/*--------------------------------------------------------------*/
1668 
1669 
1670 	/**
1671 	 * Loads kmers into a table.  Each thread handles all kmers X such that X%WAYS==tnum.
1672 	 */
1673 	private class LoadThread extends Thread{
1674 
LoadThread(final int tnum_, final AbstractKmerTable map_)1675 		public LoadThread(final int tnum_, final AbstractKmerTable map_){
1676 			tnum=tnum_;
1677 			map=map_;
1678 		}
1679 
1680 		/**
1681 		 * Get the next list of reads (or scaffolds) from the queue.
1682 		 * @return List of reads
1683 		 */
fetch()1684 		private ArrayList<Read> fetch(){
1685 			ArrayList<Read> list=null;
1686 			while(list==null){
1687 				try {
1688 					list=queue.take();
1689 				} catch (InterruptedException e) {
1690 					// TODO Auto-generated catch block
1691 					e.printStackTrace();
1692 				}
1693 			}
1694 			return list;
1695 		}
1696 
1697 		@Override
run()1698 		public void run(){
1699 			ArrayList<Read> reads=fetch();
1700 			while(reads!=POISON){
1701 				for(Read r1 : reads){
1702 					assert(r1.pairnum()==0);
1703 					final Read r2=r1.mate;
1704 
1705 					final int rblen=(r1==null ? 0 : r1.length());
1706 					final int rblen2=r1.mateLength();
1707 
1708 					added+=addToMap(r1, rblen>20000000 ? k : rblen>5000000 ? 11 : rblen>500000 ? 2 : 0);
1709 					if(r2!=null){
1710 						added+=addToMap(r2, rblen2>20000000 ? k : rblen2>5000000 ? 11 : rblen2>500000 ? 2 : 0);
1711 					}
1712 				}
1713 				reads=fetch();
1714 			}
1715 
1716 			if(map.canRebalance() && map.size()>2L*map.arrayLength()){
1717 				map.rebalance();
1718 			}
1719 			success=true;
1720 		}
1721 
1722 		/**
1723 		 * @param r The current read to process
1724 		 * @param skip Number of bases to skip between kmers
1725 		 * @return Number of kmers stored
1726 		 */
addToMap(Read r, int skip)1727 		private long addToMap(Read r, int skip){
1728 			skip=Tools.max(minSkip, Tools.min(maxSkip, skip));
1729 			final byte[] bases=r.bases;
1730 			final int shift=2*k;
1731 			final int shift2=shift-2;
1732 			final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
1733 			final long kmask=lengthMasks[k];
1734 			long kmer=0;
1735 			long rkmer=0;
1736 			long added=0;
1737 			int len=0;
1738 
1739 			if(bases!=null){
1740 				refReadsT++;
1741 				refBasesT+=bases.length;
1742 			}
1743 			if(bases==null || bases.length<k){return 0;}
1744 
1745 			final int id=(Integer)r.obj;
1746 
1747 			if(skip>1){ //Process while skipping some kmers
1748 				for(int i=0; i<bases.length; i++){
1749 					byte b=bases[i];
1750 					long x=AminoAcid.baseToNumber[b];
1751 					long x2=AminoAcid.baseToComplementNumber[b];
1752 					kmer=((kmer<<2)|x)&mask;
1753 					rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
1754 					if(x<0){len=0; rkmer=0;}else{len++;}
1755 					if(verbose){System.err.println("Scanning1 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
1756 					if(len>=k){
1757 						refKmersT++;
1758 						if(len%skip==0){
1759 							final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]);
1760 							added+=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance);
1761 							if(useShortKmers){
1762 								if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);}
1763 								if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);}
1764 							}
1765 						}
1766 					}
1767 				}
1768 			}else{ //Process all kmers
1769 				for(int i=0; i<bases.length; i++){
1770 					byte b=bases[i];
1771 					long x=AminoAcid.baseToNumber[b];
1772 					long x2=AminoAcid.baseToComplementNumber[b];
1773 					kmer=((kmer<<2)|x)&mask;
1774 					rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
1775 					if(x<0){len=0; rkmer=0;}else{len++;}
1776 					if(verbose){System.err.println("Scanning2 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
1777 					if(len>=k){
1778 						refKmersT++;
1779 						final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]);
1780 						final long atm=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance);
1781 						added+=atm;
1782 //						assert(false) : atm+", "+map.contains(toValue(kmer, rkmer, kmask));
1783 						if(useShortKmers){
1784 							if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);}
1785 							if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);}
1786 						}
1787 					}
1788 				}
1789 			}
1790 			return added;
1791 		}
1792 
1793 
1794 		/**
1795 		 * Adds short kmers on the left end of the read.
1796 		 * @param kmer Forward kmer
1797 		 * @param rkmer Reverse kmer
1798 		 * @param extraBase Base added to end in case of deletions
1799 		 * @param id Scaffold number
1800 		 * @return Number of kmers stored
1801 		 */
addToMapLeftShift(long kmer, long rkmer, final long extraBase, final int id)1802 		private long addToMapLeftShift(long kmer, long rkmer, final long extraBase, final int id){
1803 			if(verbose){System.err.println("addToMapLeftShift");}
1804 			long added=0;
1805 			for(int i=k-1; i>=mink; i--){
1806 				kmer=kmer&rightMasks[i];
1807 				rkmer=rkmer>>>2;
1808 				long x=addToMap(kmer, rkmer, i, extraBase, id, lengthMasks[i], hammingDistance2, editDistance2);
1809 				added+=x;
1810 				if(verbose){
1811 					if((toValue(kmer, rkmer, lengthMasks[i]))%WAYS==tnum){
1812 						System.err.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added left-shift kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[i], i)+"; value="+(toValue(kmer, rkmer, lengthMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+lengthMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]);
1813 						System.err.println("i="+i+"; tnum="+tnum+"; Looking for left-shift kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[i], i));
1814 						final long value=toValue(kmer, rkmer, lengthMasks[i]);
1815 						if(map.contains(value)){System.err.println("Found "+value);}
1816 					}
1817 				}
1818 			}
1819 			return added;
1820 		}
1821 
1822 
1823 		/**
1824 		 * Adds short kmers on the right end of the read.
1825 		 * @param kmer Forward kmer
1826 		 * @param rkmer Reverse kmer
1827 		 * @param id Scaffold number
1828 		 * @return Number of kmers stored
1829 		 */
addToMapRightShift(long kmer, long rkmer, final int id)1830 		private long addToMapRightShift(long kmer, long rkmer, final int id){
1831 			if(verbose){System.err.println("addToMapRightShift");}
1832 			long added=0;
1833 			for(int i=k-1; i>=mink; i--){
1834 				long extraBase=kmer&3L;
1835 				kmer=kmer>>>2;
1836 				rkmer=rkmer&rightMasks[i];
1837 //				assert(Long.numberOfLeadingZeros(kmer)>=2*(32-i)) : Long.numberOfLeadingZeros(kmer)+", "+i+", "+kmer+", "+kMasks[i];
1838 //				assert(Long.numberOfLeadingZeros(rkmer)>=2*(32-i)) : Long.numberOfLeadingZeros(rkmer)+", "+i+", "+rkmer+", "+kMasks[i];
1839 				long x=addToMap(kmer, rkmer, i, extraBase, id, lengthMasks[i], hammingDistance2, editDistance2);
1840 				added+=x;
1841 				if(verbose){
1842 					if((toValue(kmer, rkmer, lengthMasks[i]))%WAYS==tnum){
1843 						System.err.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added right-shift kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[i], i)+"; value="+(toValue(kmer, rkmer, lengthMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+lengthMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]);
1844 						System.err.println("i="+i+"; tnum="+tnum+"; Looking for right-shift kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[i], i));
1845 						final long value=toValue(kmer, rkmer, lengthMasks[i]);
1846 						if(map.contains(value)){System.err.println("Found "+value);}
1847 					}
1848 				}
1849 			}
1850 			return added;
1851 		}
1852 
1853 
1854 		/**
1855 		 * Adds this kmer to the table, including any mutations implied by editDistance or hammingDistance.
1856 		 * @param kmer Forward kmer
1857 		 * @param rkmer Reverse kmer
1858 		 * @param len Kmer length
1859 		 * @param extraBase Base added to end in case of deletions
1860 		 * @param id Scaffold number
1861 		 * @param kmask0
1862 		 * @return Number of kmers stored
1863 		 */
addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0, final int hdist, final int edist)1864 		private long addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0, final int hdist, final int edist){
1865 
1866 			assert(kmask0==lengthMasks[len]) : kmask0+", "+len+", "+lengthMasks[len]+", "+Long.numberOfTrailingZeros(kmask0)+", "+Long.numberOfTrailingZeros(lengthMasks[len]);
1867 
1868 			if(verbose){System.err.println("addToMap_A; len="+len+"; kMasks[len]="+lengthMasks[len]);}
1869 			assert((kmer&kmask0)==0);
1870 			final long added;
1871 			if(hdist==0){
1872 				final long key=toValue(kmer, rkmer, kmask0);
1873 				if(speed>0 && ((key/WAYS)&15)<speed){return 0;}
1874 				if(key%WAYS!=tnum){return 0;}
1875 				if(verbose){System.err.println("addToMap_B: "+AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+" = "+key);}
1876 				added=map.setIfNotPresent(key, id);
1877 			}else if(edist>0){
1878 //				long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]);
1879 				added=mutate(kmer, rkmer, len, id, edist, extraBase);
1880 			}else{
1881 				added=mutate(kmer, rkmer, len, id, hdist, -1);
1882 			}
1883 			if(verbose){System.err.println("addToMap added "+added+" keys.");}
1884 			return added;
1885 		}
1886 
1887 		/**
1888 		 * Mutate and store this kmer through 'dist' recursions.
1889 		 * @param kmer Forward kmer
1890 		 * @param rkmer Reverse kmer
1891 		 * @param id Scaffold number
1892 		 * @param dist Number of mutations
1893 		 * @param extraBase Base added to end in case of deletions
1894 		 * @return Number of kmers stored
1895 		 */
mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase)1896 		private long mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase){
1897 			long added=0;
1898 
1899 			final long key=toValue(kmer, rkmer, lengthMasks[len]);
1900 
1901 			if(verbose){System.err.println("mutate_A; len="+len+"; kmer="+kmer+"; rkmer="+rkmer+"; kMasks[len]="+lengthMasks[len]);}
1902 			if(key%WAYS==tnum){
1903 				if(verbose){System.err.println("mutate_B: "+AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+" = "+key);}
1904 				int x=map.setIfNotPresent(key, id);
1905 				if(verbose){System.err.println("mutate_B added "+x+" keys.");}
1906 				added+=x;
1907 				assert(map.contains(key));
1908 			}
1909 
1910 			if(dist>0){
1911 				final int dist2=dist-1;
1912 
1913 				//Sub
1914 				for(int j=0; j<4; j++){
1915 					for(int i=0; i<len; i++){
1916 						final long temp=(kmer&clearMasks[i])|setMasks[j][i];
1917 						if(temp!=kmer){
1918 							long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
1919 							added+=mutate(temp, rtemp, len, id, dist2, extraBase);
1920 						}
1921 					}
1922 				}
1923 
1924 				if(editDistance>0){
1925 					//Del
1926 					if(extraBase>=0 && extraBase<=3){
1927 						for(int i=1; i<len; i++){
1928 							final long temp=(kmer&leftMasks[i])|((kmer<<2)&rightMasks[i])|extraBase;
1929 							if(temp!=kmer){
1930 								long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
1931 								added+=mutate(temp, rtemp, len, id, dist2, -1);
1932 							}
1933 						}
1934 					}
1935 
1936 					//Ins
1937 					final long eb2=kmer&3;
1938 					for(int i=1; i<len; i++){
1939 						final long temp0=(kmer&leftMasks[i])|((kmer&rightMasks[i])>>2);
1940 						for(int j=0; j<4; j++){
1941 							final long temp=temp0|setMasks[j][i-1];
1942 							if(temp!=kmer){
1943 								long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
1944 								added+=mutate(temp, rtemp, len, id, dist2, eb2);
1945 							}
1946 						}
1947 					}
1948 				}
1949 
1950 			}
1951 
1952 			return added;
1953 		}
1954 
1955 		/*--------------------------------------------------------------*/
1956 
1957 		/** Number of kmers stored by this thread */
1958 		public long added=0;
1959 		/** Number of items encountered by this thread */
1960 		public long refKmersT=0, refReadsT=0, refBasesT=0;
1961 		/** Thread number; used to determine which kmers to store */
1962 		public final int tnum;
1963 		/** Buffer of input read lists */
1964 		public final ArrayBlockingQueue<ArrayList<Read>> queue=new ArrayBlockingQueue<ArrayList<Read>>(32);
1965 
1966 		/** Destination for storing kmers */
1967 		private final AbstractKmerTable map;
1968 
1969 		/** Completed successfully */
1970 		boolean success=false;
1971 
1972 	}
1973 
1974 	/*--------------------------------------------------------------*/
1975 
1976 	/**
1977 	 * Matches read kmers against reference kmers, performs binning and/or trimming, and writes output.
1978 	 */
1979 	private class ProcessThread extends Thread{
1980 
1981 		/**
1982 		 * Constructor
1983 		 * @param cris_ Read input stream
1984 		 * @param ros_ Unmatched read output stream (optional)
1985 		 * @param rosb_ Matched read output stream (optional)
1986 		 */
ProcessThread(ConcurrentReadInputStream cris_, ConcurrentReadOutputStream ros_, ConcurrentReadOutputStream rosb_, ConcurrentReadOutputStream ross_, boolean localArrays)1987 		public ProcessThread(ConcurrentReadInputStream cris_, ConcurrentReadOutputStream ros_, ConcurrentReadOutputStream rosb_, ConcurrentReadOutputStream ross_, boolean localArrays){
1988 			cris=cris_;
1989 			ros=ros_;
1990 			rosb=rosb_;
1991 			ross=ross_;
1992 
1993 			readstats=(MAKE_QUALITY_HISTOGRAM || MAKE_MATCH_HISTOGRAM || MAKE_BASE_HISTOGRAM || MAKE_QUALITY_ACCURACY ||
1994 					MAKE_EHIST || MAKE_INDELHIST || MAKE_LHIST || MAKE_GCHIST || MAKE_IDHIST) ?
1995 					new ReadStats() : null;
1996 
1997 			final int alen=(scaffoldNames==null ? 0 : scaffoldNames.size());
1998 
1999 			if(findBestMatch){
2000 				countArray=new int[alen];
2001 				idList=new IntList();
2002 				countList=new IntList();
2003 			}else{
2004 				countArray=null;
2005 				idList=countList=null;
2006 			}
2007 
2008 			overlapVector=(trimByOverlap ? new int[5] : null);
2009 
2010 			hitCountsT=(hitCounts==null ? null : new long[hitCounts.length]);
2011 
2012 			if(localArrays && alen>0 && alen<10000){
2013 				scaffoldReadCountsT=new long[alen];
2014 				scaffoldBaseCountsT=new long[alen];
2015 			}else{
2016 				scaffoldReadCountsT=scaffoldBaseCountsT=null;
2017 			}
2018 
2019 			if(calcEntropy){
2020 				entropyCounts=new short[entropyKmerspace];
2021 				entropyCountCounts=new short[entropyWindow+2];
2022 				entropyCountCounts[0]=(short)entropyWindow;
2023 			}else{
2024 				entropyCounts=entropyCountCounts=null;
2025 			}
2026 		}
2027 
2028 		@Override
run()2029 		public void run(){
2030 			ListNum<Read> ln=cris.nextList();
2031 			ArrayList<Read> reads=(ln!=null ? ln.list : null);
2032 			ArrayList<Read> bad=(rosb==null ? null : new ArrayList<Read>(Shared.bufferLen()));
2033 			ArrayList<Read> single=new ArrayList<Read>(Shared.bufferLen());
2034 
2035 			//While there are more reads lists...
2036 			while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
2037 
2038 				int removed=0;
2039 
2040 				//For each read (or pair) in the list...
2041 				for(int i=0; i<reads.size(); i++){
2042 					final Read r1=reads.get(i);
2043 					final Read r2=r1.mate;
2044 
2045 					if(!r1.validated()){r1.validate(true);}
2046 					if(r2!=null && !r2.validated()){r2.validate(true);}
2047 
2048 					if(readstats!=null){
2049 						if(MAKE_QUALITY_HISTOGRAM){readstats.addToQualityHistogram(r1);}
2050 						if(MAKE_BASE_HISTOGRAM){readstats.addToBaseHistogram(r1);}
2051 						if(MAKE_MATCH_HISTOGRAM){readstats.addToMatchHistogram(r1);}
2052 						if(MAKE_QUALITY_ACCURACY){readstats.addToQualityAccuracy(r1);}
2053 
2054 						if(MAKE_EHIST){readstats.addToErrorHistogram(r1);}
2055 						if(MAKE_INDELHIST){readstats.addToIndelHistogram(r1);}
2056 						if(MAKE_LHIST){readstats.addToLengthHistogram(r1);}
2057 						if(MAKE_GCHIST){readstats.addToGCHistogram(r1);}
2058 						if(MAKE_IDHIST){readstats.addToIdentityHistogram(r1);}
2059 					}
2060 
2061 					if(loglog!=null){loglog.hash(r1);}
2062 
2063 					final int initialLength1=r1.length();
2064 					final int initialLength2=r1.mateLength();
2065 
2066 					final int minlen1=(int)Tools.max(initialLength1*minLenFraction, minReadLength);
2067 					final int minlen2=(int)Tools.max(initialLength2*minLenFraction, minReadLength);
2068 
2069 					if(verbose){System.err.println("Considering read "+r1.id+" "+new String(r1.bases));}
2070 
2071 					readsInT++;
2072 					basesInT+=initialLength1;
2073 					if(r2!=null){
2074 						readsInT++;
2075 						basesInT+=initialLength2;
2076 					}
2077 
2078 					if(chastityFilter){
2079 						if(r1!=null && r1.failsChastity()){
2080 							r1.setDiscarded(true);
2081 							if(r2!=null){r2.setDiscarded(true);}
2082 						}
2083 					}
2084 
2085 					if(removeBadBarcodes){
2086 						if(r1!=null && !r1.discarded() && r1.failsBarcode(barcodes, failIfNoBarcode)){
2087 							if(failBadBarcodes){KillSwitch.kill("Invalid barcode detected: "+r1.id+"\nThis can be disabled with the flag barcodefilter=f");}
2088 							r1.setDiscarded(true);
2089 							if(r2!=null){r2.setDiscarded(true);}
2090 						}
2091 					}
2092 
2093 					if(recalibrateQuality){
2094 						if(r1!=null && !r1.discarded()){
2095 							CalcTrueQuality.recalibrate(r1);
2096 						}
2097 						if(r2!=null && !r2.discarded()){
2098 							CalcTrueQuality.recalibrate(r2);
2099 						}
2100 					}
2101 
2102 					if(filterGC && (initialLength1>0 || initialLength2>0)){
2103 						float gc1=(initialLength1>0 ? r1.gc() : -1);
2104 						float gc2=(initialLength2>0 ? r2.gc() : gc1);
2105 						if(gc1==-1){gc1=gc2;}
2106 						if(usePairGC){
2107 							final float gc;
2108 							if(r2==null){
2109 								gc=gc1;
2110 							}else{
2111 								gc=(gc1*initialLength1+gc2*initialLength2)/(initialLength1+initialLength2);
2112 							}
2113 							gc1=gc2=gc;
2114 						}
2115 						if(r1!=null && !r1.discarded() && (gc1<minGC || gc1>maxGC)){
2116 							r1.setDiscarded(true);
2117 							badGcBasesT+=initialLength1;
2118 							badGcReadsT++;
2119 						}
2120 						if(r2!=null && !r2.discarded() && (gc2<minGC || gc2>maxGC)){
2121 							r2.setDiscarded(true);
2122 							badGcBasesT+=initialLength2;
2123 							badGcReadsT++;
2124 						}
2125 					}
2126 
2127 					if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){
2128 						if(r1!=null && !r1.discarded()){
2129 							final int len=r1.length();
2130 							final int a=forceTrimLeft>0 ? forceTrimLeft : 0;
2131 							final int b0=forceTrimModulo>0 ? len-1-len%forceTrimModulo : len;
2132 							final int b1=forceTrimRight>0 ? forceTrimRight : len;
2133 							final int b2=forceTrimRight2>0 ? len-1-forceTrimRight2 : len;
2134 							final int b=Tools.min(b0, b1, b2);
2135 							final int x=TrimRead.trimToPosition(r1, a, b, 1);
2136 							basesFTrimmedT+=x;
2137 							readsFTrimmedT+=(x>0 ? 1 : 0);
2138 							if(r1.length()<minlen1){r1.setDiscarded(true);}
2139 						}
2140 						if(r2!=null && !r2.discarded()){
2141 							final int len=r2.length();
2142 							final int a=forceTrimLeft>0 ? forceTrimLeft : 0;
2143 							final int b0=forceTrimModulo>0 ? len-1-len%forceTrimModulo : len;
2144 							final int b1=forceTrimRight>0 ? forceTrimRight : len;
2145 							final int b2=forceTrimRight2>0 ? len-1-forceTrimRight2 : len;
2146 							final int b=Tools.min(b0, b1, b2);
2147 							final int x=TrimRead.trimToPosition(r2, a, b, 1);
2148 							basesFTrimmedT+=x;
2149 							readsFTrimmedT+=(x>0 ? 1 : 0);
2150 							if(r2.length()<minlen2){r2.setDiscarded(true);}
2151 						}
2152 					}
2153 
2154 					boolean remove;
2155 					if(removePairsIfEitherBad){remove=r1.discarded() || (r2!=null && r2.discarded());}
2156 					else{remove=r1.discarded() && (r2==null || r2.discarded());}
2157 
2158 					if(remove){
2159 						if(r1!=null){
2160 							basesQFilteredT+=r1.length();
2161 							readsQFilteredT++;
2162 						}
2163 						if(r2!=null){
2164 							basesQFilteredT+=r2.length();
2165 							readsQFilteredT++;
2166 						}
2167 						if(bad!=null){bad.add(r1);}
2168 					}else{
2169 
2170 						if(ecc && r1!=null && r2!=null){BBMerge.findOverlapStrict(r1, r2, true);}
2171 
2172 						//Process kmers
2173 						if(kfilter && storedKmersFilter>0){
2174 							//Do kmer matching
2175 
2176 							if(!findBestMatch){
2177 								final int a=(kbig<=k ? countSetKmers(r1, filterMaps) : countSetKmersBig(r1, filterMaps));
2178 								final int b=(kbig<=k ? countSetKmers(r2, filterMaps) : countSetKmersBig(r2, filterMaps));
2179 
2180 								if(r1!=null && a>maxBadKmers){r1.setDiscarded(true);}
2181 								if(r2!=null && b>maxBadKmers){r2.setDiscarded(true);}
2182 							}else{
2183 								final int a=findBestMatch(r1, filterMaps);
2184 								final int b=findBestMatch(r2, filterMaps);
2185 
2186 								if(r1!=null && a>0){r1.setDiscarded(true);}
2187 								if(r2!=null && b>0){r2.setDiscarded(true);}
2188 							}
2189 
2190 							if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) ||
2191 									(r1.discarded() && (r2==null || r2.discarded()))){
2192 								remove=true;
2193 								if(r1!=null){
2194 									readsKFilteredT++;
2195 									basesKFilteredT+=r1.length();
2196 								}
2197 								if(r2!=null){
2198 									readsKFilteredT++;
2199 									basesKFilteredT+=r2.length();
2200 								}
2201 								if(bad!=null){bad.add(r1);}
2202 							}
2203 
2204 						}
2205 
2206 						if(ktrimN && storedKmersMask>0){
2207 							remove=remove || ktrim0(r1, r2, bad, maskMaps, NMODE, minlen1, minlen2);
2208 						}
2209 
2210 						if(ktrimRight && storedKmersRight>0){
2211 							remove=remove || ktrim0(r1, r2, bad, trimRightMaps, RIGHTMODE, minlen1, minlen2);
2212 
2213 							if(trimPairsEvenly && xsum>0 && r2!=null && r1.length()!=r2.length()){
2214 								int x;
2215 								if(r1.length()>r2.length()){
2216 									x=TrimRead.trimToPosition(r1, 0, r2.length()-1, 1);
2217 								}else{
2218 									x=TrimRead.trimToPosition(r2, 0, r1.length()-1, 1);
2219 								}
2220 								if(rktsum<2){readsKTrimmedT++;}
2221 								basesKTrimmedT+=x;
2222 
2223 								assert(r1.length()==r2.length()) : r1.length()+", "+r2.length();
2224 							}
2225 						}
2226 
2227 						if(!remove && trimByOverlap && r2!=null && expectedErrors(r1, r2)<meeFilter){
2228 
2229 							if(aprob==null || aprob.length<r1.length()){aprob=new float[r1.length()];}
2230 							if(bprob==null || bprob.length<r2.length()){bprob=new float[r2.length()];}
2231 
2232 							//Do overlap trimming
2233 							r2.reverseComplement();
2234 //							int bestInsert=BBMergeOverlapper.mateByOverlap(r1, r2, aprob, bprob, overlapVector, minOverlap0, minOverlap,
2235 //									overlapMargin, overlapMaxMismatches0, overlapMaxMismatches, overlapMinq);
2236 							int bestInsert=BBMergeOverlapper.mateByOverlapRatio(r1, r2, aprob, bprob, overlapVector, minOverlap0, minOverlap,
2237 									minInsert0, minInsert, maxRatio, 0.12f, ratioMargin, ratioOffset, 0.95f, 0.95f, useQualityForOverlap);
2238 
2239 							if(bestInsert<minInsert){bestInsert=-1;}
2240 							boolean ambig=(overlapVector[4]==1);
2241 							final int bestBad=overlapVector[2];
2242 
2243 							if(bestInsert>0 && !ambig && r1.quality!=null && r2.quality!=null && useQualityForOverlap){
2244 								if(efilterRatio>0 && bestInsert>0 && !ambig){
2245 									float bestExpected=BBMergeOverlapper.expectedMismatches(r1, r2, bestInsert);
2246 									if((bestExpected+efilterOffset)*efilterRatio<bestBad){ambig=true;}
2247 								}
2248 								if(pfilterRatio>0 && bestInsert>0 && !ambig){
2249 									float probability=BBMergeOverlapper.probability(r1, r2, bestInsert);
2250 									if(probability<pfilterRatio){bestInsert=-1;}
2251 								}
2252 							}
2253 
2254 							r2.reverseComplement();
2255 
2256 							if(bestInsert>0 && !ambig){
2257 								if(bestInsert<r1.length()){
2258 									if(verbose){System.err.println("Overlap right trimming r1 to "+0+", "+(bestInsert-1));}
2259 									int x=TrimRead.trimToPosition(r1, 0, bestInsert-1, 1);
2260 									if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r1.bases));}
2261 									readsTrimmedByOverlapT++;
2262 									basesTrimmedByOverlapT+=x;
2263 								}
2264 								if(bestInsert<r2.length()){
2265 									if(verbose){System.err.println("Overlap right trimming r2 to "+0+", "+(bestInsert-1));}
2266 									int x=TrimRead.trimToPosition(r2, 0, bestInsert-1, 1);
2267 									if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r2.bases));}
2268 									readsTrimmedByOverlapT++;
2269 									basesTrimmedByOverlapT+=x;
2270 								}
2271 							}
2272 						}
2273 
2274 						if(ktrimLeft && storedKmersLeft>0){
2275 							remove=remove || ktrim0(r1, r2, bad, trimLeftMaps, LEFTMODE, minlen1, minlen2);
2276 						}
2277 
2278 					}
2279 
2280 					if(!remove){
2281 						//Do quality trimming
2282 
2283 						int rlen1=0, rlen2=0;
2284 						if(r1!=null){
2285 							if(qtrimLeft || qtrimRight){
2286 								int x=TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq, trimE, 1);
2287 								basesQTrimmedT+=x;
2288 								readsQTrimmedT+=(x>0 ? 1 : 0);
2289 							}
2290 							rlen1=r1.length();
2291 							if(rlen1<minlen1 || rlen1>maxReadLength){r1.setDiscarded(true);}
2292 						}
2293 						if(r2!=null){
2294 							if(qtrimLeft || qtrimRight){
2295 								int x=TrimRead.trimFast(r2, qtrimLeft, qtrimRight, trimq, trimE, 1);
2296 								basesQTrimmedT+=x;
2297 								readsQTrimmedT+=(x>0 ? 1 : 0);
2298 							}
2299 							rlen2=r2.length();
2300 							if(rlen2<minlen2 || rlen2>maxReadLength){r2.setDiscarded(true);}
2301 						}
2302 
2303 						//Discard reads if too short
2304 						if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) ||
2305 								(r1.discarded() && (r2==null || r2.discarded()))){
2306 							basesQTrimmedT+=r1.pairLength();
2307 							remove=true;
2308 							if(addTrimmedToBad && bad!=null){bad.add(r1);}
2309 						}
2310 
2311 					}
2312 
2313 					if(!remove){
2314 						//Do quality filtering
2315 
2316 						//Determine whether to discard the reads based on average quality
2317 						if(minAvgQuality>0){
2318 							if(r1!=null && r1.quality!=null && r1.avgQuality(false, minAvgQualityBases)<minAvgQuality){r1.setDiscarded(true);}
2319 							if(r2!=null && r2.quality!=null && r2.avgQuality(false, minAvgQualityBases)<minAvgQuality){r2.setDiscarded(true);}
2320 						}
2321 						//Determine whether to discard the reads based on the presence of Ns
2322 						if(maxNs>=0){
2323 							if(r1!=null && r1.countUndefined()>maxNs){r1.setDiscarded(true);}
2324 							if(r2!=null && r2.countUndefined()>maxNs){r2.setDiscarded(true);}
2325 						}
2326 						//Determine whether to discard the reads based on a lack of useful kmers
2327 						if(minConsecutiveBases>0){
2328 							if(r1!=null && !r1.discarded() && !r1.hasMinConsecutiveBases(minConsecutiveBases)){r1.setDiscarded(true);}
2329 							if(r2!=null && !r2.discarded() && !r2.hasMinConsecutiveBases(minConsecutiveBases)){r2.setDiscarded(true);}
2330 						}
2331 						//Determine whether to discard the reads based on minimum base frequency
2332 						if(minBaseFrequency>0){
2333 							if(r1!=null && r1.minBaseCount()<minBaseFrequency*r1.length()){r1.setDiscarded(true);}
2334 							if(r2!=null && r2.minBaseCount()<minBaseFrequency*r2.length()){r2.setDiscarded(true);}
2335 						}
2336 
2337 						//Discard reads if too short
2338 						if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) ||
2339 								(r1.discarded() && (r2==null || r2.discarded()))){
2340 							basesQFilteredT+=r1.pairLength();
2341 							readsQFilteredT+=r1.pairCount();
2342 							remove=true;
2343 							if(addTrimmedToBad && bad!=null){bad.add(r1);}
2344 						}
2345 					}
2346 
2347 					if(!remove && calcEntropy){
2348 						//Test entropy
2349 
2350 						if(r1!=null && !r1.discarded() && entropyCutoff>averageEntropy(r1.bases, entropyK, entropyWindow,
2351 								entropyCounts, entropyCountCounts, entropyKmerspace, verifyEntropy)){r1.setDiscarded(true);}
2352 						if(r2!=null && !r2.discarded() && entropyCutoff>averageEntropy(r2.bases, entropyK, entropyWindow,
2353 								entropyCounts, entropyCountCounts, entropyKmerspace, verifyEntropy)){r2.setDiscarded(true);}
2354 
2355 						if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) ||
2356 								(r1.discarded() && (r2==null || r2.discarded()))){
2357 							basesEFilteredT+=r1.pairLength();
2358 							readsEFilteredT+=r1.pairCount();
2359 							remove=true;
2360 							if(bad!=null){bad.add(r1);}
2361 						}
2362 					}
2363 
2364 					if(ross!=null){
2365 						if(!r1.discarded() && (r2==null || r2.discarded())){
2366 							Read clone=r1.clone();
2367 							clone.mate=null;
2368 							single.add(clone);
2369 						}else if(r2!=null && r1.discarded() && !r2.discarded()){
2370 							Read clone=r2.clone();
2371 							clone.mate=null;
2372 							single.add(clone);
2373 						}
2374 					}
2375 
2376 					if(remove){
2377 						//Evict read
2378 						removed++;
2379 						if(r2!=null){removed++;}
2380 						reads.set(i, null);
2381 //						System.err.println("X1\t"+removed);
2382 					}else{
2383 						//Track statistics
2384 
2385 						if(r1!=null){
2386 							readsOutT++;
2387 							basesOutT+=r1.length();
2388 						}
2389 						if(r2!=null){
2390 							readsOutT++;
2391 							basesOutT+=r2.length();
2392 						}
2393 //						System.err.println("X2\t"+readsOutT);
2394 					}
2395 				}
2396 
2397 				//Send matched list to matched output stream
2398 				if(rosb!=null){
2399 					rosb.add(bad, ln.id);
2400 					bad.clear();
2401 				}
2402 
2403 				//Send unmatched list to unmatched output stream
2404 				if(ros!=null){
2405 					ros.add((removed>0 ? Tools.condenseNew(reads) : reads), ln.id); //Creates a new list if old one became empty, to prevent shutting down the cris.
2406 				}
2407 
2408 				if(ross!=null){
2409 					ross.add(single, ln.id);
2410 					single.clear();
2411 				}
2412 
2413 				//Fetch a new read list
2414 				cris.returnList(ln);
2415 				ln=cris.nextList();
2416 				reads=(ln!=null ? ln.list : null);
2417 			}
2418 			cris.returnList(ln);
2419 		}
2420 
2421 		/*--------------------------------------------------------------*/
2422 		/*----------------        Helper Methods        ----------------*/
2423 		/*--------------------------------------------------------------*/
2424 
ktrim0(final Read r1, final Read r2, ArrayList<Read> bad, AbstractKmerTable[] maps, int mode, int minlen1, int minlen2)2425 		private boolean ktrim0(final Read r1, final Read r2, ArrayList<Read> bad, AbstractKmerTable[] maps, int mode, int minlen1, int minlen2){
2426 			boolean remove=false;
2427 			int rlen1=0, rlen2=0;
2428 			xsum=0;
2429 			rktsum=0;
2430 			if(r1!=null){
2431 				final int x=(mode==NMODE ? kmask(r1, maps) : ktrim(r1, maps, mode));
2432 				xsum+=x;
2433 				rktsum+=(x>0 ? 1 : 0);
2434 				rlen1=r1.length();
2435 				if(rlen1<minlen1){r1.setDiscarded(true);}
2436 			}
2437 			if(r2!=null){
2438 				final int x=(mode==NMODE ? kmask(r2, maps) : ktrim(r2, maps, mode));
2439 				xsum+=x;
2440 				rktsum+=(x>0 ? 1 : 0);
2441 				rlen2=r2.length();
2442 				if(rlen2<minlen2){r2.setDiscarded(true);}
2443 			}
2444 
2445 			if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) ||
2446 					(r1.discarded() && (r2==null || r2.discarded()))){
2447 				if(!ktrimN){
2448 					xsum+=(rlen1+rlen2);
2449 					rktsum=r1.pairCount();
2450 				}
2451 				remove=true;
2452 				if(addTrimmedToBad && bad!=null){bad.add(r1);}
2453 			}
2454 			basesKTrimmedT+=xsum;
2455 			readsKTrimmedT+=rktsum;
2456 
2457 			return remove;
2458 		}
2459 
2460 		/**
2461 		 * Transforms a kmer into all canonical values for a given Hamming distance.
2462 		 * Returns the related id stored in the tables.
2463 		 * @param kmer Forward kmer
2464 		 * @param rkmer Reverse kmer
2465 		 * @param lengthMask Bitmask with single '1' set to left of kmer
2466 		 * @param qPos Position of kmer in query
2467 		 * @param len kmer length
2468 		 * @param qHDist Hamming distance
2469 		 * @param sets Kmer hash tables
2470 		 * @return Value stored in table, or -1
2471 		 */
getValue(final long kmer, final long rkmer, final long lengthMask, final int qPos, final int len, final int qHDist, final AbstractKmerTable[] sets)2472 		private final int getValue(final long kmer, final long rkmer, final long lengthMask, final int qPos, final int len, final int qHDist, final AbstractKmerTable[] sets){
2473 			int id=getValue(kmer, rkmer, lengthMask, qPos, sets);
2474 			if(id<1 && qHDist>0){
2475 				final int qHDist2=qHDist-1;
2476 
2477 				//Sub
2478 				for(int j=0; j<4 && id<1; j++){
2479 					for(int i=0; i<len && id<1; i++){
2480 						final long temp=(kmer&clearMasks[i])|setMasks[j][i];
2481 						if(temp!=kmer){
2482 							long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
2483 							id=getValue(temp, rtemp, lengthMask, qPos, len, qHDist2, sets);
2484 						}
2485 					}
2486 				}
2487 			}
2488 			return id;
2489 		}
2490 
2491 		/**
2492 		 * Transforms a kmer into a canonical value stored in the table and search.
2493 		 * @param kmer Forward kmer
2494 		 * @param rkmer Reverse kmer
2495 		 * @param lengthMask Bitmask with single '1' set to left of kmer
2496 		 * @param qPos Position of kmer in query
2497 		 * @param sets Kmer hash tables
2498 		 * @return Value stored in table
2499 		 */
getValue(final long kmer, final long rkmer, final long lengthMask, final int qPos, final AbstractKmerTable[] sets)2500 		private final int getValue(final long kmer, final long rkmer, final long lengthMask, final int qPos, final AbstractKmerTable[] sets){
2501 			assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer;
2502 			if(qSkip>1 && (qPos%qSkip!=0)){return -1;}
2503 
2504 			final long max=(rcomp ? Tools.max(kmer, rkmer) : kmer);
2505 			final long key=(max&middleMask)|lengthMask;
2506 			if(noAccel || ((key/WAYS)&15)>=speed){
2507 				if(verbose){System.err.println("Testing key "+key);}
2508 				AbstractKmerTable set=sets[(int)(key%WAYS)];
2509 				final int id=set.getValue(key);
2510 				return id;
2511 			}
2512 			return -1;
2513 		}
2514 
2515 		/*--------------------------------------------------------------*/
2516 
2517 		/**
2518 		 * Counts the number of kmer hits for a read.
2519 		 * @param r Read to process
2520 		 * @param sets Kmer tables
2521 		 * @return Number of hits
2522 		 */
countSetKmers(final Read r, final AbstractKmerTable sets[])2523 		private final int countSetKmers(final Read r, final AbstractKmerTable sets[]){
2524 			if(r==null || r.length()<k){return 0;}
2525 			if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
2526 			final byte[] bases=r.bases;
2527 			final int minlen=k-1;
2528 			final int minlen2=(maskMiddle ? k/2 : k);
2529 			final int shift=2*k;
2530 			final int shift2=shift-2;
2531 			final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
2532 			final long kmask=lengthMasks[k];
2533 			long kmer=0;
2534 			long rkmer=0;
2535 			int found=0;
2536 			int len=0;
2537 
2538 			final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
2539 			final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
2540 
2541 			/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
2542 			for(int i=start; i<stop; i++){
2543 				byte b=bases[i];
2544 				long x=Dedupe.baseToNumber[b];
2545 				long x2=Dedupe.baseToComplementNumber[b];
2546 				kmer=((kmer<<2)|x)&mask;
2547 				rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
2548 				if(b=='N' && forbidNs){len=0; rkmer=0;}else{len++;}
2549 				if(verbose){System.err.println("Scanning6 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
2550 				if(len>=minlen2 && i>=minlen){
2551 					final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
2552 					if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);}
2553 					if(id>0){
2554 						if(verbose){System.err.println("Found = "+(found+1)+"/"+maxBadKmers);}
2555 						if(found==maxBadKmers){
2556 							if(scaffoldReadCountsT!=null){
2557 								scaffoldReadCountsT[id]++;
2558 								scaffoldBaseCountsT[id]+=bases.length;
2559 							}else{
2560 								scaffoldReadCounts.addAndGet(id, 1);
2561 								scaffoldBaseCounts.addAndGet(id, bases.length);
2562 							}
2563 							if(hitCounts==null){
2564 								return (found=found+1);
2565 							}//Early exit, but prevents generation of histogram that goes over maxBadKmers+1.
2566 						}
2567 						found++;
2568 					}
2569 				}
2570 			}
2571 
2572 			if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;}
2573 			return found;
2574 		}
2575 
2576 		/**
2577 		 * Returns the id of the sequence with the most kmer matches to this read, or -1 if none are over maxBadKmers.
2578 		 * @param r Read to process
2579 		 * @param sets Kmer tables
2580 		 * @return id of best match
2581 		 */
findBestMatch(final Read r, final AbstractKmerTable sets[])2582 		private final int findBestMatch(final Read r, final AbstractKmerTable sets[]){
2583 			idList.size=0;
2584 			if(r==null || r.length()<k){return -1;}
2585 			if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return -1;}
2586 			final byte[] bases=r.bases;
2587 			final int minlen=k-1;
2588 			final int minlen2=(maskMiddle ? k/2 : k);
2589 			final int shift=2*k;
2590 			final int shift2=shift-2;
2591 			final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
2592 			final long kmask=lengthMasks[k];
2593 			long kmer=0;
2594 			long rkmer=0;
2595 			int len=0;
2596 			int found=0;
2597 
2598 			final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
2599 			final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
2600 
2601 			/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
2602 			for(int i=start; i<stop; i++){
2603 				byte b=bases[i];
2604 				long x=Dedupe.baseToNumber[b];
2605 				long x2=Dedupe.baseToComplementNumber[b];
2606 				kmer=((kmer<<2)|x)&mask;
2607 				rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
2608 				if(b=='N' && forbidNs){len=0; rkmer=0;}else{len++;}
2609 				if(verbose){System.err.println("Scanning6 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
2610 				if(len>=minlen2 && i>=minlen){
2611 					final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
2612 					if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);}
2613 					if(id>0){
2614 						countArray[id]++;
2615 						if(countArray[id]==1){idList.add(id);}
2616 						found++;
2617 						if(verbose){System.err.println("Found = "+found+"/"+maxBadKmers);}
2618 					}
2619 				}
2620 			}
2621 
2622 			final int id, max;
2623 			if(found>maxBadKmers){
2624 				max=condenseLoose(countArray, idList, countList);
2625 				int id0=-1;
2626 				for(int i=0; i<countList.size; i++){
2627 					if(countList.get(i)==max){
2628 						id0=idList.get(i); break;
2629 					}
2630 				}
2631 				if(rename){rename(r, idList, countList);}
2632 				id=id0;
2633 			}else{
2634 				max=0;
2635 				id=-1;
2636 			}
2637 
2638 			if(found>maxBadKmers){
2639 				if(scaffoldReadCountsT!=null){
2640 					scaffoldReadCountsT[id]++;
2641 					scaffoldBaseCountsT[id]+=bases.length;
2642 				}else{
2643 					scaffoldReadCounts.addAndGet(id, 1);
2644 					scaffoldBaseCounts.addAndGet(id, bases.length);
2645 				}
2646 			}
2647 
2648 			if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;}
2649 			return id;
2650 		}
2651 
2652 		/** Estimates kmer hit counts for kmers longer than k using consecutive matches
2653 		 * @param r
2654 		 * @param sets
2655 		 * @return Number of sets of consecutive hits of exactly length kbig
2656 		 */
countSetKmersBig(final Read r, final AbstractKmerTable sets[])2657 		private final int countSetKmersBig(final Read r, final AbstractKmerTable sets[]){
2658 			if(r==null || r.length()<kbig){return 0;}
2659 			if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
2660 			assert(kbig>k);
2661 			final int sub=kbig-k-1;
2662 			assert(sub>=0) : kbig+", "+sub;
2663 			final byte[] bases=r.bases;
2664 			final int minlen=k-1;
2665 			final int minlen2=(maskMiddle ? k/2 : k);
2666 			final int shift=2*k;
2667 			final int shift2=shift-2;
2668 			final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
2669 			final long kmask=lengthMasks[k];
2670 			long kmer=0;
2671 			long rkmer=0;
2672 			int found=0;
2673 			int len=0;
2674 
2675 			int bkStart=-1;
2676 			int bkStop=-1;
2677 			int id=-1, lastId=-1;
2678 
2679 			final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
2680 			final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
2681 
2682 			/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
2683 			for(int i=start; i<stop; i++){
2684 				byte b=bases[i];
2685 				long x=Dedupe.baseToNumber[b];
2686 				long x2=Dedupe.baseToComplementNumber[b];
2687 				kmer=((kmer<<2)|x)&mask;
2688 				rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
2689 				if(b=='N' && forbidNs){len=0; rkmer=0;}else{len++;}
2690 				if(verbose){System.err.println("Scanning7 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
2691 				if(len>=minlen2 && i>=minlen){
2692 					id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
2693 					if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);}
2694 					if(id>0){
2695 						lastId=id;
2696 						if(bkStart==-1){bkStart=i;}
2697 						bkStop=i;
2698 					}else{
2699 						if(bkStart>-1){
2700 							int dif=bkStop-bkStart-sub;
2701 							bkStop=bkStart=-1;
2702 							if(dif>0){
2703 								int old=found;
2704 								found+=dif;
2705 								if(found>maxBadKmers && old<=maxBadKmers){
2706 									if(scaffoldReadCountsT!=null){
2707 										scaffoldReadCountsT[lastId]++;
2708 										scaffoldBaseCountsT[lastId]+=bases.length;
2709 									}else{
2710 										scaffoldReadCounts.addAndGet(lastId, 1);
2711 										scaffoldBaseCounts.addAndGet(lastId, bases.length);
2712 									}
2713 									if(hitCounts==null){
2714 										return found;
2715 									}//Early exit, but prevents generation of histogram that goes over maxBadKmers+1.
2716 								}
2717 							}
2718 						}
2719 					}
2720 				}
2721 			}
2722 
2723 			// This catches the case where valid kmers extend to the end of the read
2724 			if(bkStart>-1){
2725 				int dif=bkStop-bkStart-sub;
2726 				bkStop=bkStart=-1;
2727 				if(dif>0){
2728 					int old=found;
2729 					found+=dif;
2730 					if(found>maxBadKmers && old<=maxBadKmers){
2731 						if(scaffoldReadCountsT!=null){
2732 							scaffoldReadCountsT[lastId]++;
2733 							scaffoldBaseCountsT[lastId]+=bases.length;
2734 						}else{
2735 							scaffoldReadCounts.addAndGet(lastId, 1);
2736 							scaffoldBaseCounts.addAndGet(lastId, bases.length);
2737 						}
2738 					}
2739 				}
2740 			}
2741 
2742 			if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;}
2743 			return found;
2744 		}
2745 
2746 		/**
2747 		 * Trim a read to remove matching kmers and everything to their left or right.
2748 		 * @param r Read to process
2749 		 * @param sets Kmer tables
2750 		 * @return Number of bases trimmed
2751 		 */
ktrim(final Read r, final AbstractKmerTable[] sets, int mode)2752 		private final int ktrim(final Read r, final AbstractKmerTable[] sets, int mode){
2753 			assert(mode==RIGHTMODE || mode==LEFTMODE);
2754 			if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k))){return 0;}
2755 			if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
2756 			if(verbose){System.err.println("KTrimming read "+r.id);}
2757 			final byte[] bases=r.bases, quals=r.quality;
2758 			final int minlen=k-1;
2759 			final int minlen2=(maskMiddle ? k/2 : k);
2760 			final int shift=2*k;
2761 			final int shift2=shift-2;
2762 			final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
2763 			final long kmask=lengthMasks[k];
2764 			long kmer=0;
2765 			long rkmer=0;
2766 			int found=0;
2767 			int len=0;
2768 			int id0=-1; //ID of first kmer found.
2769 
2770 			int minLoc=999999999, minLocExclusive=999999999;
2771 			int maxLoc=-1, maxLocExclusive=-1;
2772 			final int initialLength=r.length();
2773 
2774 			final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
2775 			final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
2776 
2777 			//Scan for normal kmers
2778 			for(int i=start; i<stop; i++){
2779 				byte b=bases[i];
2780 				long x=Dedupe.baseToNumber[b];
2781 				long x2=Dedupe.baseToComplementNumber[b];
2782 				kmer=((kmer<<2)|x)&mask;
2783 				rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
2784 				if(b=='N' && forbidNs){len=0; rkmer=0;}else{len++;}
2785 				if(verbose){System.err.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
2786 				if(len>=minlen2 && i>=minlen){
2787 					final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
2788 					if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);}
2789 					if(id>0){
2790 						if(id0<0){id0=id;}
2791 						minLoc=Tools.min(minLoc, i-k+1);
2792 						assert(minLoc>=0);
2793 						maxLoc=i;
2794 						found++;
2795 					}
2796 				}
2797 			}
2798 
2799 			if(minLoc!=minLocExclusive){minLocExclusive=minLoc+k;}
2800 			if(maxLoc!=maxLocExclusive){maxLocExclusive=maxLoc-k;}
2801 
2802 			//If nothing was found, scan for short kmers.  Only used for trimming.
2803 			if(useShortKmers && found==0){
2804 				assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+", mink="+mink;
2805 
2806 				//Look for short kmers on left side
2807 				if(mode==LEFTMODE || mode==NMODE){
2808 					kmer=0;
2809 					rkmer=0;
2810 					len=0;
2811 					final int lim=Tools.min(k, stop);
2812 					for(int i=start; i<lim; i++){
2813 						byte b=bases[i];
2814 						long x=Dedupe.baseToNumber[b];
2815 						long x2=Dedupe.baseToComplementNumber[b];
2816 						kmer=((kmer<<2)|x)&mask;
2817 						rkmer=rkmer|(x2<<(2*len));
2818 						len++;
2819 						if(verbose){System.err.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
2820 						if(len>=mink){
2821 
2822 							if(verbose){
2823 								System.err.println("Looking for left kmer  "+AminoAcid.kmerToString(kmer, len));
2824 								System.err.println("Looking for left rkmer "+AminoAcid.kmerToString(rkmer, len));
2825 							}
2826 							final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
2827 							if(id>0){
2828 								if(id0<0){id0=id;}
2829 								if(verbose){System.err.println("Found "+kmer);}
2830 								minLoc=0;
2831 								minLocExclusive=Tools.min(minLocExclusive, i+1);
2832 								maxLoc=Tools.max(maxLoc, i);
2833 								maxLocExclusive=Tools.max(maxLocExclusive, 0);
2834 								found++;
2835 							}
2836 						}
2837 					}
2838 				}
2839 
2840 				//Look for short kmers on right side
2841 				if(mode==RIGHTMODE || mode==NMODE){
2842 					kmer=0;
2843 					rkmer=0;
2844 					len=0;
2845 					final int lim=Tools.max(-1, stop-k);
2846 					for(int i=stop-1; i>lim; i--){
2847 						byte b=bases[i];
2848 						long x=Dedupe.baseToNumber[b];
2849 						long x2=Dedupe.baseToComplementNumber[b];
2850 						kmer=kmer|(x<<(2*len));
2851 						rkmer=((rkmer<<2)|x2)&mask;
2852 						len++;
2853 						if(verbose){System.err.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
2854 						if(len>=mink){
2855 
2856 							final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
2857 							if(verbose){System.err.println("Looking for right kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; id="+id+"; kmask="+lengthMasks[len]);}
2858 							if(id>0){
2859 								if(id0<0){id0=id;}
2860 								if(verbose){System.err.println("Found "+kmer);}
2861 								minLoc=i;
2862 								minLocExclusive=Tools.min(minLocExclusive, bases.length);
2863 								maxLoc=bases.length-1;
2864 								maxLocExclusive=Tools.max(maxLocExclusive, i-1);
2865 								found++;
2866 							}
2867 						}
2868 					}
2869 				}
2870 			}
2871 
2872 
2873 			if(verbose){System.err.println("found="+found+", minLoc="+minLoc+", maxLoc="+maxLoc+", minLocExclusive="+minLocExclusive+", maxLocExclusive="+maxLocExclusive);}
2874 
2875 			if(found==0){return 0;}
2876 			assert(found>0) : "Overflow in 'found' variable.";
2877 
2878 			{//Increment counter for the scaffold whose kmer was first detected
2879 				if(scaffoldReadCountsT!=null){
2880 					scaffoldReadCountsT[id0]++;
2881 					scaffoldBaseCountsT[id0]+=bases.length;
2882 				}else{
2883 					scaffoldReadCounts.addAndGet(id0, 1);
2884 					scaffoldBaseCounts.addAndGet(id0, bases.length);
2885 				}
2886 			}
2887 
2888 			if(trimPad!=0){
2889 				maxLoc=Tools.mid(0, maxLoc+trimPad, bases.length);
2890 				minLoc=Tools.mid(0, minLoc-trimPad, bases.length);
2891 				maxLocExclusive=Tools.mid(0, maxLocExclusive+trimPad, bases.length);
2892 				minLocExclusive=Tools.mid(0, minLocExclusive-trimPad, bases.length);
2893 			}
2894 
2895 			//Old version.  No longer needed.
2896 //			if(mode==NMODE){ //Replace kmer hit zone with the trim symbol
2897 //				Arrays.fill(bases, minLoc, maxLoc+1, trimSymbol);
2898 //				if(quals!=null){Arrays.fill(quals, minLoc, maxLoc+1, (byte)0);}
2899 //				return maxLoc-minLoc+1;
2900 //			}
2901 
2902 			if(mode==LEFTMODE){ //Trim from the read start to the rightmost kmer base
2903 				if(verbose){System.err.println("Left trimming to "+(ktrimExclusive ? maxLocExclusive+1 : maxLoc+1)+", "+0);}
2904 				int x=TrimRead.trimToPosition(r, ktrimExclusive ? maxLocExclusive+1 : maxLoc+1, bases.length-1, 1);
2905 				if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r.bases));}
2906 				return x;
2907 			}else{ //Trim from the leftmost kmer base to the read stop
2908 				assert(mode==RIGHTMODE);
2909 				if(verbose){System.err.println("Right trimming to "+0+", "+(ktrimExclusive ? minLocExclusive-1 : minLoc-1));}
2910 				int x=TrimRead.trimToPosition(r, 0, ktrimExclusive ? minLocExclusive-1 : minLoc-1, 1);
2911 				if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r.bases));}
2912 				return x;
2913 			}
2914 		}
2915 
2916 
2917 		/**
2918 		 * Mask a read to cover matching kmers.
2919 		 * @param r Read to process
2920 		 * @param sets Kmer tables
2921 		 * @return Number of bases masked
2922 		 */
kmask(final Read r, final AbstractKmerTable[] sets)2923 		private final int kmask(final Read r, final AbstractKmerTable[] sets){
2924 			assert(ktrimN);
2925 			if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k))){return 0;}
2926 			if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
2927 			if(verbose){System.err.println("KMasking read "+r.id);}
2928 			final byte[] bases=r.bases, quals=r.quality;
2929 			if(bases==null || bases.length<k){return 0;}
2930 			final int minlen=k-1;
2931 			final int minlen2=(maskMiddle ? k/2 : k);
2932 			final int shift=2*k;
2933 			final int shift2=shift-2;
2934 			final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
2935 			final long kmask=lengthMasks[k];
2936 			long kmer=0;
2937 			long rkmer=0;
2938 			int found=0;
2939 			int len=0;
2940 			int id0=-1; //ID of first kmer found.
2941 
2942 			BitSet bs=new BitSet(bases.length+trimPad+1);
2943 
2944 			final int minus=k-1-trimPad;
2945 			final int plus=trimPad+1;
2946 
2947 			final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
2948 			final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
2949 
2950 			//Scan for normal kmers
2951 			for(int i=start; i<stop; i++){
2952 				byte b=bases[i];
2953 				long x=Dedupe.baseToNumber[b];
2954 				long x2=Dedupe.baseToComplementNumber[b];
2955 				kmer=((kmer<<2)|x)&mask;
2956 				rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
2957 				if(b=='N' && forbidNs){len=0; rkmer=0;}else{len++;}
2958 				if(verbose){System.err.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
2959 				if(len>=minlen2 && i>=minlen){
2960 					final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
2961 					if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);}
2962 					if(id>0){
2963 						if(id0<0){id0=id;}
2964 						if(verbose){
2965 							System.err.println("a: Found "+kmer);
2966 							System.err.println("Setting "+Tools.max(0, i-minus)+", "+(i+plus));
2967 							System.err.println("i="+i+", minus="+minus+", plus="+plus+", trimpad="+trimPad+", k="+k);
2968 						}
2969 						bs.set(Tools.max(0, i-minus), i+plus);
2970 						found++;
2971 					}
2972 				}
2973 			}
2974 
2975 			//If nothing was found, scan for short kmers.
2976 			if(useShortKmers){
2977 				assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+", mink="+mink;
2978 
2979 				//Look for short kmers on left side
2980 				{
2981 					kmer=0;
2982 					rkmer=0;
2983 					len=0;
2984 					final int lim=Tools.min(k, stop);
2985 					for(int i=start; i<lim; i++){
2986 						byte b=bases[i];
2987 						long x=Dedupe.baseToNumber[b];
2988 						long x2=Dedupe.baseToComplementNumber[b];
2989 						kmer=((kmer<<2)|x)&mask;
2990 						rkmer=rkmer|(x2<<(2*len));
2991 						len++;
2992 						if(verbose){System.err.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
2993 						if(len>=mink){
2994 
2995 							if(verbose){
2996 								System.err.println("Looking for left kmer  "+AminoAcid.kmerToString(kmer, len));
2997 								System.err.println("Looking for left rkmer "+AminoAcid.kmerToString(rkmer, len));
2998 							}
2999 							final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
3000 							if(id>0){
3001 								if(id0<0){id0=id;}
3002 								if(verbose){
3003 									System.err.println("b: Found "+kmer);
3004 									System.err.println("Setting "+0+", "+(i+plus));
3005 								}
3006 								bs.set(0, i+plus);
3007 								found++;
3008 							}
3009 						}
3010 					}
3011 				}
3012 
3013 				//Look for short kmers on right side
3014 				{
3015 					kmer=0;
3016 					rkmer=0;
3017 					len=0;
3018 					final int lim=Tools.max(-1, stop-k);
3019 					for(int i=stop-1; i>lim; i--){
3020 						byte b=bases[i];
3021 						long x=Dedupe.baseToNumber[b];
3022 						long x2=Dedupe.baseToComplementNumber[b];
3023 						kmer=kmer|(x<<(2*len));
3024 						rkmer=((rkmer<<2)|x2)&mask;
3025 						len++;
3026 						if(verbose){System.err.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
3027 						if(len>=mink){
3028 
3029 							final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
3030 							if(verbose){System.err.println("Looking for right kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; id="+id+"; kmask="+lengthMasks[len]);}
3031 							if(id>0){
3032 								if(id0<0){id0=id;}
3033 								if(verbose){
3034 									System.err.println("c: Found "+kmer);
3035 									System.err.println("Setting "+Tools.max(0, i-trimPad)+", "+bases.length);
3036 								}
3037 								bs.set(Tools.max(0, i-trimPad), bases.length);
3038 								found++;
3039 							}
3040 						}
3041 					}
3042 				}
3043 			}
3044 
3045 
3046 			if(verbose){System.err.println("found="+found+", bitset="+bs);}
3047 
3048 			if(found==0){return 0;}
3049 			assert(found>0) : "Overflow in 'found' variable.";
3050 
3051 			{//Increment counter for the scaffold whose kmer was first detected
3052 				if(scaffoldReadCountsT!=null){
3053 					scaffoldReadCountsT[id0]++;
3054 					scaffoldBaseCountsT[id0]+=bases.length;
3055 				}else{
3056 					scaffoldReadCounts.addAndGet(id0, 1);
3057 					scaffoldBaseCounts.addAndGet(id0, bases.length);
3058 				}
3059 			}
3060 
3061 			int cardinality=bs.cardinality();
3062 			assert(cardinality>0);
3063 
3064 			//Replace kmer hit zone with the trim symbol
3065 			for(int i=0; i<bases.length; i++){
3066 				if(bs.get(i)){
3067 					if(kmaskLowercase){
3068 						bases[i]=(byte)Tools.toLowerCase(bases[i]);
3069 					}else{
3070 						bases[i]=trimSymbol;
3071 						if(quals!=null && trimSymbol=='N'){quals[i]=0;}
3072 					}
3073 				}
3074 			}
3075 			return cardinality;
3076 		}
3077 
3078 		/**
3079 		 * @param r
3080 		 * @param idList
3081 		 * @param countList
3082 		 */
rename(Read r, IntList idList, IntList countList)3083 		private void rename(Read r, IntList idList, IntList countList) {
3084 			if(r==null || idList.size<1){return;}
3085 			StringBuilder sb=new StringBuilder();
3086 			if(r.id==null){sb.append(r.numericID);}
3087 			else{sb.append(r.id);}
3088 			for(int i=0; i<idList.size; i++){
3089 				int id=idList.get(i);
3090 				int count=countList.get(i);
3091 				sb.append('\t');
3092 				sb.append(scaffoldNames.get(id));
3093 				sb.append('=');
3094 				sb.append(count);
3095 			}
3096 			r.id=sb.toString();
3097 		}
3098 
3099 		/**
3100 		 * Pack a list of counts from an array to an IntList.
3101 		 * @param loose Counter array
3102 		 * @param packed Unique values
3103 		 * @param counts Counts of values
3104 		 * @return
3105 		 */
condenseLoose(int[] loose, IntList packed, IntList counts)3106 		private int condenseLoose(int[] loose, IntList packed, IntList counts){
3107 			counts.size=0;
3108 			if(packed.size<1){return 0;}
3109 
3110 			int max=0;
3111 			for(int i=0; i<packed.size; i++){
3112 				final int p=packed.get(i);
3113 				final int c=loose[p];
3114 				counts.add(c);
3115 				loose[p]=0;
3116 				max=Tools.max(max, c);
3117 			}
3118 			return max;
3119 		}
3120 
expectedErrors(Read r1, Read r2)3121 		private float expectedErrors(Read r1, Read r2){
3122 			float a=(r1==null ? 0 : r1.expectedErrors(false, -1));
3123 			float b=(r2==null ? 0 : r2.expectedErrors(false, -1));
3124 			return Tools.max(a, b);
3125 		}
3126 
3127 		/*--------------------------------------------------------------*/
3128 		/*----------------        Entropy Methods       ----------------*/
3129 		/*--------------------------------------------------------------*/
3130 
averageEntropy(final byte[] bases, final int k, final int window, final short[] counts, final short[] countCounts, final int kmerspace, boolean verify)3131 		private float averageEntropy(final byte[] bases, final int k,
3132 				final int window, final short[] counts, final short[] countCounts, final int kmerspace, boolean verify){
3133 			assert(k>0) : "k must be greater than 0";
3134 //			Arrays.fill(counts, 0);
3135 
3136 			assert(countCounts[0]==window);
3137 			if(verify){
3138 				for(int c : counts){assert(c==0);}
3139 				for(int i=1; i<countCounts.length; i++){assert(countCounts[i]==0);}
3140 			}
3141 
3142 			final int mask=(k>15 ? -1 : ~((-1)<<(2*k)));
3143 			int current=0;
3144 			//int ns=0;
3145 			int kmer=0, kmer2=0;
3146 
3147 			double entropySum=0;
3148 			int entropyMeasurements=0;
3149 
3150 			for(int i=0, i2=-window; i2<bases.length; i++, i2++){
3151 
3152 //				System.err.println("\nStart: i="+i+", current="+current+", ns="+ns+"\n"+Arrays.toString(counts)+"\n"+Arrays.toString(countCounts));
3153 
3154 				if(i<bases.length){
3155 					byte b=bases[i];
3156 					if(!AminoAcid.isFullyDefined(b)){
3157 //						ns++;
3158 						b='A';
3159 					}
3160 					final int n=Dedupe.baseToNumber[b];
3161 					kmer=((kmer<<2)|n)&mask;
3162 
3163 					if(counts[kmer]<1){
3164 						assert(counts[kmer]==0);
3165 						current++;
3166 					}
3167 					countCounts[counts[kmer]]--;
3168 					assert(countCounts[counts[kmer]]>=-1): i+", "+current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts)+"\n"+Arrays.toString(countCounts);
3169 					counts[kmer]++;
3170 					assert(counts[kmer]<=window+1) : Arrays.toString(counts)+"\n"+Arrays.toString(countCounts);
3171 					countCounts[counts[kmer]]++;
3172 					if(verify){
3173 						assert(current==Tools.cardinality(counts)) : current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts);
3174 						assert(Tools.sum(countCounts)>0 && (Tools.sum(countCounts)<=window+1)): current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(countCounts);
3175 					}
3176 
3177 //					System.err.println("Added "+kmer+"; counts["+kmer+"]="+counts[kmer]);
3178 				}
3179 
3180 				if(i2>=0){
3181 					byte b2=bases[i2];
3182 					if(!AminoAcid.isFullyDefined(b2)){
3183 //						ns--;
3184 						b2='A';
3185 					}
3186 					final int n2=Dedupe.baseToNumber[b2];
3187 					kmer2=((kmer2<<2)|n2)&mask;
3188 
3189 					countCounts[counts[kmer2]]--;
3190 					assert(countCounts[counts[kmer2]]>=0);
3191 					counts[kmer2]--;
3192 					countCounts[counts[kmer2]]++;
3193 					if(counts[kmer2]<1){
3194 						assert(counts[kmer2]==0) : Arrays.toString(counts);
3195 						current--;
3196 					}
3197 					if(verify){
3198 						assert(current==Tools.cardinality(counts)) : current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts);
3199 						assert(Tools.sum(countCounts)>=0 && (Tools.sum(countCounts)<=window)): current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(countCounts);
3200 					}
3201 
3202 //					System.err.println("Removed "+kmer2+"; count="+counts[kmer2]);
3203 				}
3204 
3205 				if(verify && i2>-1 && i<bases.length){
3206 					assert(Tools.sum(counts)==window);
3207 					assert(Tools.sum(countCounts)==window): current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(countCounts);
3208 				}
3209 
3210 				if(i2>=-1 && i<bases.length){
3211 					 float e=calcEntropy(countCounts, window, kmerspace);
3212 					 entropySum+=e;
3213 					 entropyMeasurements++;
3214 				}
3215 			}
3216 
3217 //			System.err.println(" *** ");
3218 //			System.err.println(entropySum+", "+entropyMeasurements+", "+(entropySum/(Tools.max(1, entropyMeasurements))));
3219 //			System.err.println(window+", "+k+", "+kmerspace+", "+counts.length+", "+countCounts.length);
3220 //			System.err.println(" *** ");
3221 
3222 			return (float)(entropySum/(Tools.max(1, entropyMeasurements)));
3223 		}
3224 
calcEntropy(short[] countCounts, int window, int kmerspace)3225 		private float calcEntropy(short[] countCounts, int window, int kmerspace){
3226 			double sum=0;
3227 			for(int i=1; i<countCounts.length; i++){
3228 				int cc=countCounts[i];
3229 				double pklogpk=entropy[i];
3230 				sum+=(cc*pklogpk);
3231 			}
3232 //			System.err.println("sum = "+sum);
3233 //			System.err.println("entropy = "+(sum*entropyMult));
3234 			return (float)(sum*entropyMult);
3235 		}
3236 
3237 		/*--------------------------------------------------------------*/
3238 
3239 		/** Input read stream */
3240 		private final ConcurrentReadInputStream cris;
3241 		/** Output read streams */
3242 		private final ConcurrentReadOutputStream ros, rosb, ross;
3243 
3244 		private final ReadStats readstats;
3245 		private final int[] overlapVector;
3246 		private final int[] countArray;
3247 
3248 		private final IntList idList;
3249 		private final IntList countList;
3250 
3251 		long[] hitCountsT;
3252 		long[] scaffoldReadCountsT;
3253 		long[] scaffoldBaseCountsT;
3254 
3255 		final short[] entropyCounts;
3256 		final short[] entropyCountCounts;
3257 
3258 		private float[] aprob, bprob;
3259 
3260 		private long readsInT=0;
3261 		private long basesInT=0;
3262 		private long readsOutT=0;
3263 		private long basesOutT=0;
3264 
3265 		private long readsQTrimmedT=0;
3266 		private long basesQTrimmedT=0;
3267 		private long readsFTrimmedT=0;
3268 		private long basesFTrimmedT=0;
3269 		private long readsQFilteredT=0;
3270 		private long basesQFilteredT=0;
3271 		private long readsEFilteredT=0;
3272 		private long basesEFilteredT=0;
3273 
3274 		private long readsKTrimmedT=0;
3275 		private long basesKTrimmedT=0;
3276 		private long readsKFilteredT=0;
3277 		private long basesKFilteredT=0;
3278 
3279 		private long readsTrimmedByOverlapT=0;
3280 		private long basesTrimmedByOverlapT=0;
3281 
3282 		private long badGcBasesT=0;
3283 		private long badGcReadsT=0;
3284 
3285 		private int xsum=0, rktsum=0;
3286 	}
3287 
3288 	/*--------------------------------------------------------------*/
3289 	/*----------------        Static Methods        ----------------*/
3290 	/*--------------------------------------------------------------*/
3291 
3292 	/** Current available memory */
freeMemory()3293 	private static final long freeMemory(){
3294 		Runtime rt=Runtime.getRuntime();
3295 		return rt.freeMemory();
3296 	}
3297 
3298 	/**
3299 	 * Transforms a kmer into a canonical value stored in the table.  Expected to be inlined.
3300 	 * @param kmer Forward kmer
3301 	 * @param rkmer Reverse kmer
3302 	 * @param lengthMask Bitmask with single '1' set to left of kmer
3303 	 * @return Canonical value
3304 	 */
toValue(long kmer, long rkmer, long lengthMask)3305 	private final long toValue(long kmer, long rkmer, long lengthMask){
3306 		assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer;
3307 		long value=(rcomp ? Tools.max(kmer, rkmer) : kmer);
3308 		return (value&middleMask)|lengthMask;
3309 	}
3310 
3311 	/*--------------------------------------------------------------*/
3312 	/*----------------            Fields            ----------------*/
3313 	/*--------------------------------------------------------------*/
3314 
3315 	/** For calculating kmer cardinality */
3316 	private final CardinalityTracker loglog;
3317 
3318 	/** Has this class encountered errors while processing? */
3319 	public boolean errorState=false;
3320 
3321 	/** Fraction of available memory preallocated to arrays */
3322 	private double preallocFraction=1.0;
3323 	/** Initial size of data structures */
3324 	private int initialSize=-1;
3325 
3326 	/** Hold kmers for filtering.  A kmer X such that X%WAYS=Y will be stored in keySets[Y] */
3327 	private final AbstractKmerTable[] filterMaps;
3328 	/** Hold kmers for masking */
3329 	private final AbstractKmerTable[] maskMaps;
3330 	/** Hold kmers for trimming right (3') */
3331 	private final AbstractKmerTable[] trimRightMaps;
3332 	/** Hold kmers for trimming left (5') */
3333 	private final AbstractKmerTable[] trimLeftMaps;
3334 
3335 	/** A scaffold's name is stored at scaffoldNames.get(id).
3336 	 * scaffoldNames[0] is reserved, so the first id is 1. */
3337 	private final ArrayList<String> scaffoldNames=new ArrayList<String>();
3338 	/** Names of reference files (refNames[0] is valid). */
3339 	private final ArrayList<String> refNames=new ArrayList<String>();
3340 	/** Number of scaffolds per reference. */
3341 	private final int[] refScafCounts;
3342 	/** scaffoldCounts[id] stores the number of reads with kmer matches to that scaffold */
3343 	private AtomicLongArray scaffoldReadCounts;
3344 	/** scaffoldBaseCounts[id] stores the number of bases with kmer matches to that scaffold */
3345 	private AtomicLongArray scaffoldBaseCounts;
3346 	/** Set to false to force threads to share atomic counter arrays. */
3347 	private boolean ALLOW_LOCAL_ARRAYS=true;
3348 	/** scaffoldLengths[id] stores the length of that scaffold */
3349 	private IntList scaffoldLengths=new IntList();
3350 	/** hitCounts[x] stores the number of reads with exactly x kmer matches */
3351 	private long[] hitCounts;
3352 	/** Array of reference files from which to load kmers */
3353 	private String[] refFilter=null;
3354 	/** Array of reference files from which to load kmers */
3355 	private String[] refMask=null;
3356 	/** Array of reference files from which to load kmers */
3357 	private String[] refRight=null;
3358 	/** Array of reference files from which to load kmers */
3359 	private String[] refLeft=null;
3360 	/** Array of literal strings from which to load kmers */
3361 	private String[] literalFilter=null;
3362 	/** Array of literal strings from which to load kmers */
3363 	private String[] literalMask=null;
3364 	/** Array of literal strings from which to load kmers */
3365 	private String[] literalRight=null;
3366 	/** Array of literal strings from which to load kmers */
3367 	private String[] literalLeft=null;
3368 
3369 	/** Input reads */
3370 	private String in1=null, in2=null;
3371 	/** Input qual files */
3372 	private String qfin1=null, qfin2=null;
3373 	/** Output reads (unmatched and at least minlen) */
3374 	private String out1=null, out2=null;
3375 	/** Output reads (matched or shorter than minlen) */
3376 	private String outb1=null, outb2=null;
3377 	/** Output reads whose mate was discarded */
3378 	private String outsingle=null;
3379 	/** Statistics output files */
3380 	private String outstats=null, outduk=null, outrqc=null, outrpkm=null, outrefstats=null;
3381 
3382 	/** Optional file for quality score recalibration */
3383 	private String samFile=null;
3384 
3385 	/** Dump kmers here. */
3386 	private String dump=null;
3387 
3388 	/** Maximum input reads (or pairs) to process.  Does not apply to references.  -1 means unlimited. */
3389 	private long maxReads=-1;
3390 	/** Process this fraction of input reads. */
3391 	private float samplerate=1f;
3392 	/** Set samplerate seed to this value. */
3393 	private long sampleseed=-1;
3394 
3395 	/** Output reads in input order.  May reduce speed. */
3396 	private final boolean ordered;
3397 	/** Attempt to match kmers shorter than normal k on read ends when doing kTrimming. */
3398 	private boolean useShortKmers=false;
3399 	/** Make the middle base in a kmer a wildcard to improve sensitivity */
3400 	private boolean maskMiddle=true;
3401 
3402 	/** Store reference kmers with up to this many substitutions */
3403 	private int hammingDistance=0;
3404 	/** Search for query kmers with up to this many substitutions */
3405 	private int qHammingDistance=0;
3406 	/** Store reference kmers with up to this many edits (including indels) */
3407 	private int editDistance=0;
3408 	/** Store short reference kmers with up to this many substitutions */
3409 	private int hammingDistance2=-1;
3410 	/** Search for short query kmers with up to this many substitutions */
3411 	private int qHammingDistance2=-1;
3412 	/** Store short reference kmers with up to this many edits (including indels) */
3413 	private int editDistance2=-1;
3414 	/** Never skip more than this many consecutive kmers when hashing reference. */
3415 	private int maxSkip=1;
3416 	/** Always skip at least this many consecutive kmers when hashing reference.
3417 	 * 1 means every kmer is used, 2 means every other, etc. */
3418 	private int minSkip=1;
3419 
3420 	/** Trim this much extra around matched kmers */
3421 	private int trimPad;
3422 
3423 	/*--------------------------------------------------------------*/
3424 	/*----------------        Entropy Fields        ----------------*/
3425 	/*--------------------------------------------------------------*/
3426 
3427 	private int entropyK=5;
3428 	private int entropyWindow=50;
3429 	private float entropyCutoff=-1;
3430 	private boolean verifyEntropy=false;
3431 
3432 	private final boolean calcEntropy;
3433 	private final int entropyKmerspace;
3434 	private final double entropyMult;
3435 	private final double[] entropy;
3436 
3437 	/*--------------------------------------------------------------*/
3438 	/*----------------          Statistics          ----------------*/
3439 	/*--------------------------------------------------------------*/
3440 
3441 	long readsIn=0;
3442 	long basesIn=0;
3443 	long readsOut=0;
3444 	long basesOut=0;
3445 
3446 	long readsQTrimmed=0;
3447 	long basesQTrimmed=0;
3448 	long readsFTrimmed=0;
3449 	long basesFTrimmed=0;
3450 	long readsQFiltered=0;
3451 	long basesQFiltered=0;
3452 	long readsEFiltered=0;
3453 	long basesEFiltered=0;
3454 
3455 	long readsKTrimmed=0;
3456 	long basesKTrimmed=0;
3457 	long readsKFiltered=0;
3458 	long basesKFiltered=0;
3459 
3460 	long badGcReads;
3461 	long badGcBases;
3462 
3463 	long readsTrimmedByOverlap;
3464 	long basesTrimmedByOverlap;
3465 
3466 	long refReads=0;
3467 	long refBases=0;
3468 	long refKmers=0;
3469 
3470 	long storedKmersFilter=0;
3471 	long storedKmersMask=0;
3472 	long storedKmersRight=0;
3473 	long storedKmersLeft=0;
3474 
3475 	/*--------------------------------------------------------------*/
3476 	/*----------------       Final Primitives       ----------------*/
3477 	/*--------------------------------------------------------------*/
3478 
3479 	/** Don't look for kmers in read 1 */
3480 	private final boolean skipR1;
3481 	/** Don't look for kmers in read 2 */
3482 	private final boolean skipR2;
3483 	/** Correct errors via read overlap */
3484 	private final boolean ecc;
3485 
3486 	/** Look for reverse-complements as well as forward kmers.  Default: true */
3487 	private final boolean rcomp;
3488 	/** Don't allow a read 'N' to match a reference 'A'.
3489 	 * Reduces sensitivity when hdist>0 or edist>0.  Default: false. */
3490 	private final boolean forbidNs;
3491 	/** AND bitmask with 0's at the middle base */
3492 	private final long middleMask;
3493 	/** Use HashForest data structure */
3494 	private final boolean useForest;
3495 	/** Use KmerTable data structure */
3496 	private final boolean useTable;
3497 	/** Use HashArray data structure (default) */
3498 	private final boolean useArray;
3499 
3500 	/** Normal kmer length */
3501 	final int k;
3502 	/** k-1; used in some expressions */
3503 	final int k2;
3504 	/** Emulated kmer greater than k */
3505 	final int kbig;
3506 	/** Shortest kmer to use for trimming */
3507 	final int mink;
3508 	/** A read may contain up to this many kmers before being considered a match.  Default: 0 */
3509 	final int maxBadKmers;
3510 
3511 	/** Recalibrate quality scores using matrices */
3512 	final boolean recalibrateQuality;
3513 	/** Quality-trim the left side */
3514 	final boolean qtrimLeft;
3515 	/** Quality-trim the right side */
3516 	final boolean qtrimRight;
3517 	/** Trim bases at this quality or below.  Default: 4 */
3518 	final float trimq;
3519 	/** Error rate for trimming (derived from trimq) */
3520 	private final float trimE;
3521 	/** Throw away reads below this average quality after trimming.  Default: 0 */
3522 	final float minAvgQuality;
3523 	/** If positive, calculate average quality from the first X bases only.  Default: 0 */
3524 	final int minAvgQualityBases;
3525 	/** Throw away reads failing chastity filter (:Y: in read header) */
3526 	final boolean chastityFilter;
3527 	/** Crash if a barcode is encountered that contains Ns or is not in the table */
3528 	final boolean failBadBarcodes;
3529 	/** Remove reads with Ns in barcodes or that are not in the table */
3530 	final boolean removeBadBarcodes;
3531 	/** Fail reads missing a barcode */
3532 	final boolean failIfNoBarcode;
3533 	/** A set of valid barcodes; null if unused */
3534 	final HashSet<String> barcodes;
3535 	/** Throw away reads containing more than this many Ns.  Default: -1 (disabled) */
3536 	final int maxNs;
3537 	/** Throw away reads containing without at least this many consecutive called bases. */
3538 	int minConsecutiveBases=0;
3539 	/** Throw away reads containing fewer than this fraction of any particular base. */
3540 	final float minBaseFrequency;
3541 	/** Throw away reads shorter than this after trimming.  Default: 10 */
3542 	final int minReadLength;
3543 	/** Throw away reads longer than this after trimming.  Default: Integer.MAX_VALUE */
3544 	final int maxReadLength;
3545 	/** Toss reads shorter than this fraction of initial length, after trimming */
3546 	final float minLenFraction;
3547 	/** Filter reads by whether or not they have matching kmers */
3548 	final boolean kfilter;
3549 	/** Trim matching kmers and all bases to the left */
3550 	final boolean ktrimLeft;
3551 	/** Trim matching kmers and all bases to the right */
3552 	final boolean ktrimRight;
3553 	/** Don't trim, but replace matching kmers with a symbol (default N) */
3554 	final boolean ktrimN;
3555 	/** Exclude kmer itself when ktrimming */
3556 	final boolean ktrimExclusive;
3557 	/** Replace bases covered by matched kmers with this symbol */
3558 	final byte trimSymbol;
3559 	/** Convert masked bases to lowercase */
3560 	final boolean kmaskLowercase;
3561 	/** Output over-trimmed reads to outbad (outmatch).  If false, they are discarded. */
3562 	final boolean addTrimmedToBad;
3563 	/** Find the sequence that shares the most kmer matches when filtering. */
3564 	final boolean findBestMatch;
3565 	/** Trim pairs to the same length, when adapter-trimming */
3566 	final boolean trimPairsEvenly;
3567 	/** Trim left bases of the read to this position (exclusive, 0-based) */
3568 	final int forceTrimLeft;
3569 	/** Trim right bases of the read after this position (exclusive, 0-based) */
3570 	final int forceTrimRight;
3571 	/** Trim this many rightmost bases of the read */
3572 	final int forceTrimRight2;
3573 	/** Trim right bases of the read modulo this value.
3574 	 * e.g. forceTrimModulo=50 would trim the last 3bp from a 153bp read. */
3575 	final int forceTrimModulo;
3576 
3577 	/** Discard reads with GC below this. */
3578 	final float minGC;
3579 	/** Discard reads with GC above this. */
3580 	final float maxGC;
3581 	/** Discard reads outside of GC bounds. */
3582 	final boolean filterGC;
3583 	/** Average GC for paired reads. */
3584 	final boolean usePairGC;
3585 
3586 	/** If positive, only look for kmer matches in the leftmost X bases */
3587 	int restrictLeft;
3588 	/** If positive, only look for kmer matches the rightmost X bases */
3589 	int restrictRight;
3590 
3591 	/** Trim implied adapters based on overlap, for reads with insert size shorter than read length */
3592 	final boolean trimByOverlap;
3593 	final boolean useQualityForOverlap;
3594 	final boolean strictOverlap;
3595 
3596 //	int minOverlap0=11;
3597 //	int minOverlap=24;
3598 //	final int overlapMargin=2;
3599 //	final int overlapMaxMismatches0=4;
3600 //	final int overlapMaxMismatches=4;
3601 //	final int overlapMinq=13;
3602 
3603 	int minOverlap0=7;
3604 	int minOverlap=14;
3605 	int minInsert0=16;
3606 	int minInsert=50;
3607 
3608 	final float maxRatio;
3609 	final float ratioMargin;
3610 	final float ratioOffset;
3611 	final float efilterRatio;
3612 	final float efilterOffset;
3613 	final float pfilterRatio;
3614 	final float meeFilter;
3615 
3616 	/** Skip this many initial input reads */
3617 	private final long skipreads;
3618 
3619 	/** Pairs go to outbad if either of them is bad, as opposed to requiring both to be bad.
3620 	 * Default: true. */
3621 	private final boolean removePairsIfEitherBad;
3622 
3623 	/** Print only statistics for scaffolds that matched at least one read
3624 	 * Default: true. */
3625 	private final boolean printNonZeroOnly;
3626 
3627 	/** Rename reads to indicate what they matched.
3628 	 * Default: false. */
3629 	private final boolean rename;
3630 	/** Use names of reference files instead of scaffolds.
3631 	 * Default: false. */
3632 	private final boolean useRefNames;
3633 
3634 	/** Fraction of kmers to skip, 0 to 15 out of 16 */
3635 	private final int speed;
3636 
3637 	/** Skip this many kmers when examining the read.  Default 1.
3638 	 * 1 means every kmer is used, 2 means every other, etc. */
3639 	private final int qSkip;
3640 
3641 	/** True if speed and qSkip are disabled. */
3642 	private final boolean noAccel;
3643 
3644 	private final boolean MAKE_QUALITY_ACCURACY;
3645 	private final boolean MAKE_QUALITY_HISTOGRAM;
3646 	private final boolean MAKE_MATCH_HISTOGRAM;
3647 	private final boolean MAKE_BASE_HISTOGRAM;
3648 
3649 	private final boolean MAKE_EHIST;
3650 	private final boolean MAKE_INDELHIST;
3651 	private final boolean MAKE_LHIST;
3652 	private final boolean MAKE_GCHIST;
3653 	private final boolean MAKE_IDHIST;
3654 
3655 	/*--------------------------------------------------------------*/
3656 	/*----------------         Static Fields        ----------------*/
3657 	/*--------------------------------------------------------------*/
3658 
3659 	/** Number of tables (and threads, during loading) */
3660 	private static final int WAYS=7; //123
3661 	/** Default initial size of data structures */
3662 	private static final int initialSizeDefault=128000;
3663 	/** Verbose messages */
3664 	public static final boolean verbose=false; //123
3665 
3666 	/** Print messages to this stream */
3667 	private static PrintStream outstream=System.err;
3668 	/** Permission to overwrite existing files */
3669 	public static boolean overwrite=true;
3670 	/** Permission to append to existing files */
3671 	public static boolean append=false;
3672 	/** Print speed statistics upon completion */
3673 	public static boolean showSpeed=true;
3674 	/** Display progress messages such as memory usage */
3675 	public static boolean DISPLAY_PROGRESS=true;
3676 	/** Number of ProcessThreads */
3677 	public static int THREADS=Shared.threads();
3678 	/** Indicates end of input stream */
3679 	private static final ArrayList<Read> POISON=new ArrayList<Read>(0);
3680 	/** Number of columns for statistics output, 3 or 5 */
3681 	public static int STATS_COLUMNS=3;
3682 	/** Release memory used by kmer storage after processing reads */
3683 	public static boolean RELEASE_TABLES=true;
3684 	/** Max value of hitCount array */
3685 	public static final int HITCOUNT_LEN=1000;
3686 	/** Make unambiguous copies of ref sequences with ambiguous bases */
3687 	public static boolean REPLICATE_AMBIGUOUS=false;
3688 
3689 	/** x&clearMasks[i] will clear base i */
3690 	private static final long[] clearMasks;
3691 	/** x|setMasks[i][j] will set base i to j */
3692 	private static final long[][] setMasks;
3693 	/** x&leftMasks[i] will clear all bases to the right of i (exclusive) */
3694 	private static final long[] leftMasks;
3695 	/** x&rightMasks[i] will clear all bases to the left of i (inclusive) */
3696 	private static final long[] rightMasks;
3697 	/** x|kMasks[i] will set the bit to the left of the leftmost base */
3698 	private static final long[] lengthMasks;
3699 
3700 	public static HashMap<String,String> RQC_MAP=null;
3701 
3702 	public static final int FILTERMODE=0, RIGHTMODE=1, LEFTMODE=2, NMODE=3;
3703 
3704 	/*--------------------------------------------------------------*/
3705 	/*----------------      Static Initializers     ----------------*/
3706 	/*--------------------------------------------------------------*/
3707 
3708 	static{
3709 		clearMasks=new long[32];
3710 		leftMasks=new long[32];
3711 		rightMasks=new long[32];
3712 		lengthMasks=new long[32];
3713 		setMasks=new long[4][32];
3714 		for(int i=0; i<32; i++){
3715 			clearMasks[i]=~(3L<<(2*i));
3716 		}
3717 		for(int i=0; i<32; i++){
3718 			leftMasks[i]=((-1L)<<(2*i));
3719 		}
3720 		for(int i=0; i<32; i++){
3721 			rightMasks[i]=~((-1L)<<(2*i));
3722 		}
3723 		for(int i=0; i<32; i++){
3724 			lengthMasks[i]=((1L)<<(2*i));
3725 		}
3726 		for(int i=0; i<32; i++){
3727 			for(long j=0; j<4; j++){
3728 				setMasks[(int)j][i]=(j<<(2*i));
3729 			}
3730 		}
3731 	}
3732 
3733 }
3734