1 package jgi;
2 
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.BitSet;
8 import java.util.HashMap;
9 import java.util.HashSet;
10 import java.util.Locale;
11 import java.util.concurrent.ArrayBlockingQueue;
12 import java.util.concurrent.atomic.AtomicLongArray;
13 
14 import cardinality.CardinalityTracker;
15 import dna.AminoAcid;
16 import dna.Data;
17 import fileIO.ByteFile;
18 import fileIO.ByteStreamWriter;
19 import fileIO.FileFormat;
20 import fileIO.ReadWrite;
21 import fileIO.TextStreamWriter;
22 import hiseq.FlowcellCoordinate;
23 import json.JsonObject;
24 import kmer.AbstractKmerTable;
25 import kmer.AbstractKmerTableSet;
26 import kmer.ScheduleMaker;
27 import shared.KillSwitch;
28 import shared.Parse;
29 import shared.Parser;
30 import shared.PreParser;
31 import shared.ReadStats;
32 import shared.Shared;
33 import shared.Timer;
34 import shared.Tools;
35 import shared.TrimRead;
36 import stream.ConcurrentReadInputStream;
37 import stream.ConcurrentReadOutputStream;
38 import stream.FASTQ;
39 import stream.FastaReadInputStream;
40 import stream.Read;
41 import stream.SamLine;
42 import structures.EntropyTracker;
43 import structures.IntList;
44 import structures.ListNum;
45 import structures.PolymerTracker;
46 import var2.CallVariants;
47 import var2.ScafMap;
48 import var2.Var;
49 import var2.VarMap;
50 import var2.VcfLoader;
51 
52 /**
53  * Separates, trims, or masks sequences based on matching kmers in a reference.
54  * Supports Hamming and and edit distance.
55  * Supports K 1-31 and emulated K>31.
56  * @author Brian Bushnell
57  * @date Aug 30, 2013
58  *
59  */
60 public class BBDuk {
61 
62 	/*--------------------------------------------------------------*/
63 	/*----------------        Initialization        ----------------*/
64 	/*--------------------------------------------------------------*/
65 
66 	/**
67 	 * Code entrance from the command line.
68 	 * @param args Command line arguments
69 	 */
main(String[] args)70 	public static void main(String[] args){
71 
72 //		if(PreParser.isAmino(args)){
73 //			BBDukAA.main(args);//No longer needed since this supports AAs
74 //			return;
75 //		}
76 
77 		//Create a new BBDuk instance
78 		BBDuk x=new BBDuk(args);
79 
80 		//And run it
81 		x.process();
82 
83 		//Close the print stream if it was redirected
84 		Shared.closeStream(outstream);
85 		assert(WAYS==7 && !verbose) : "Undo WAYS and verbose";
86 	}
87 
88 	/**
89 	 * Constructor.
90 	 * @param args Command line arguments
91 	 */
BBDuk(String[] args)92 	public BBDuk(String[] args){
93 		{//Preparse block for help, config files, and outstream
94 			PreParser pp=new PreParser(args, getClass(), true);
95 			args=pp.args;
96 			outstream=pp.outstream;
97 			jsonStats=pp.jsonObject;
98 			json=pp.json;
99 		}
100 
101 		/* Set global defaults */
102 		ReadWrite.ZIPLEVEL=2;
103 		ReadWrite.USE_UNPIGZ=true;
104 		ReadWrite.USE_PIGZ=true;
105 
106 		if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
107 			ByteFile.FORCE_MODE_BF2=true;
108 		}
109 
110 		/* Initialize local variables with defaults */
111 		boolean setOut=false, setOutb=false;
112 		boolean ktrimRight_=false, ktrimLeft_=false, ktrimN_=false, ktrimExclusive_=false, ksplit_=false;
113 		boolean findBestMatch_=false;
114 		boolean addTrimmedToBad_=true;
115 		boolean rcomp_=true;
116 		boolean forbidNs_=false;
117 		boolean useForest_=false, useTable_=false, useArray_=true, prealloc_=false;
118 		int k_=27, kbig_=-1;
119 		int mink_=-1;
120 		int ways_=-1; //Currently disabled for speed
121 		int maxBadKmers_=0;
122 		long skipreads_=0;
123 		byte TRIM_SYMBOL_='N';
124 		boolean kmaskLowercase_=false;
125 		boolean kmaskFullyCovered_=false;
126 		boolean histogramsBeforeProcessing_=true;
127 		boolean trimFailuresTo1bp_=false;
128 
129 		Parser parser=new Parser();
130 		parser.trimq=6;
131 		parser.minAvgQuality=0;
132 		parser.minReadLength=10;
133 		parser.maxReadLength=Integer.MAX_VALUE;
134 		parser.minLenFraction=0f;
135 		parser.requireBothBad=false;
136 		parser.maxNs=-1;
137 		parser.overwrite=overwrite;
138 		boolean trimByOverlap_=false, useQualityForOverlap_=false, strictOverlap_=true;
139 		boolean trimPairsEvenly_=false;
140 		boolean ordered_=false;
141 		int minoverlap_=-1, mininsert_=-1;
142 		int restrictLeft_=0, restrictRight_=0, speed_=0, qSkip_=1;
143 		boolean printNonZeroOnly_=true;
144 		boolean rename_=false, useRefNames_=false;
145 		boolean skipr1_=false, skipr2_=false;
146 		boolean ecc_=false;
147 		float minBaseFrequency_=0;
148 		float minKmerFraction_=0;
149 		float minCoveredFraction_=0;
150 		boolean setk=false;
151 
152 		scaffoldNames.add(""); //Necessary so that the first real scaffold gets an id of 1, not zero
153 		scaffoldLengths.add(0);
154 
155 		/* Parse arguments */
156 		for(int i=0; i<args.length; i++){
157 
158 			final String arg=args[i];
159 			String[] split=arg.split("=");
160 			String a=split[0].toLowerCase();
161 			String b=split.length>1 ? split[1] : null;
162 
163 			if(Parser.parseZip(arg, a, b)){
164 				//do nothing
165 			}else if(Parser.parseHist(arg, a, b)){
166 				//do nothing
167 			}else if(Parser.parseCommonStatic(arg, a, b)){
168 				//do nothing
169 			}else if(Parser.parseQualityAdjust(arg, a, b)){
170 				//do nothing
171 			}else if(Parser.parseQuality(arg, a, b)){
172 				//do nothing
173 			}else if(Parser.parseFasta(arg, a, b)){
174 				//do nothing
175 			}else if(parser.parseInterleaved(arg, a, b)){
176 				//do nothing
177 			}else if(parser.parseTrim(arg, a, b)){
178 				//do nothing
179 			}else if(parser.parseCommon(arg, a, b)){
180 				//do nothing
181 			}else if(parser.parseCardinality(arg, a, b)){
182 				//do nothing
183 			}else if(a.equals("in") || a.equals("in1")){
184 				in1=b;
185 			}else if(a.equals("in2")){
186 				in2=b;
187 			}else if(a.equals("qfin") || a.equals("qfin1")){
188 				qfin1=b;
189 			}else if(a.equals("qfin2")){
190 				qfin2=b;
191 			}else if(a.equals("qfout") || a.equals("qfout1")){
192 				qfout1=b;
193 			}else if(a.equals("qfin2")){
194 				qfout2=b;
195 			}else if(a.equals("out") || a.equals("out1") || a.equals("outu") || a.equals("outu1") || a.equals("outnonmatch") ||
196 					a.equals("outnonmatch1") || a.equals("outunnmatch") || a.equals("outunmatch1") || a.equals("outunnmatched") || a.equals("outunmatched1")){
197 				out1=b;
198 				setOut=true;
199 			}else if(a.equals("out2") || a.equals("outu2") || a.equals("outnonmatch2") || a.equals("outunmatch2") ||
200 					a.equals("outnonmatched2") || a.equals("outunmatched2")){
201 				out2=b;
202 			}else if(a.equals("outb") || a.equals("outm") || a.equals("outb1") || a.equals("outm1") || a.equals("outbad") ||
203 					a.equals("outbad1") || a.equals("outmatch") || a.equals("outmatch1")){
204 				outb1=b;
205 				setOut=true;
206 			}else if(a.equals("outb2") || a.equals("outm2") || a.equals("outbad2") || a.equals("outmatch2")){
207 				outb2=b;
208 			}else if(a.equals("outs") || a.equals("outsingle")){
209 				outsingle=b;
210 			}else if(a.equals("stats") || a.equals("scafstats")){
211 				outstats=b;
212 			}else if(a.equals("polymerstats") || a.equals("polymerstatsfile") || a.equals("pstats") || a.equals("phist")){
213 				polymerStatsFile=b;
214 			}else if(a.equals("refstats")){
215 				outrefstats=b;
216 			}else if(a.equals("rpkm") || a.equals("fpkm") || a.equals("cov") || a.equals("coverage")){
217 				outrpkm=b;
218 			}else if(a.equals("sam") || a.equals("bam")){
219 				samFile=b;
220 			}else if(a.equals("duk") || a.equals("outduk")){
221 				outduk=b;
222 			}else if(a.equals("rqc")){
223 				outrqc=b;
224 			}else if(a.equals("ref") || a.equals("adapters")){
225 				ref=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
226 			}else if(a.equals("altref")){
227 				altref=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
228 			}else if(a.equals("samref") || a.equals("bamref")){
229 				samref=b;
230 			}else if(a.equals("literal")){
231 				literal=(b==null) ? null : b.split(",");
232 //				assert(false) : b+", "+Arrays.toString(literal);
233 			}else if(a.equals("forest")){
234 				useForest_=Parse.parseBoolean(b);
235 				if(useForest_){useTable_=useArray_=false;}
236 			}else if(a.equals("table")){
237 				useTable_=Parse.parseBoolean(b);
238 				if(useTable_){useForest_=useArray_=false;}
239 			}else if(a.equals("array")){
240 				useArray_=Parse.parseBoolean(b);
241 				if(useArray_){useTable_=useForest_=false;}
242 			}else if(a.equals("ways")){
243 				ways_=Integer.parseInt(b);
244 			}else if(a.equals("ordered") || a.equals("ord")){
245 				ordered_=Parse.parseBoolean(b);
246 			}else if(a.equals("skipr1")){
247 				skipr1_=Parse.parseBoolean(b);
248 			}else if(a.equals("skipr2")){
249 				skipr2_=Parse.parseBoolean(b);
250 			}else if(a.equals("k")){
251 				assert(b!=null) : "\nThe k key needs an integer value greater than 0, such as k=27\n";
252 				k_=Integer.parseInt(b);
253 				setk=true;
254 			}else if(a.equals("mink") || a.equals("kmin")){
255 				mink_=Integer.parseInt(b);
256 				assert(mink_<0 || (mink_>0 && mink_<32)) : "kmin must be between 1 and 31; default is 4, negative numbers disable it.";
257 			}else if(a.equals("useshortkmers") || a.equals("shortkmers") || a.equals("usk")){
258 				useShortKmers=Parse.parseBoolean(b);
259 			}else if(a.equals("trimextra") || a.equals("trimpad") || a.equals("tp")){
260 				trimPad=Integer.parseInt(b);
261 			}else if(a.equals("hdist") || a.equals("hammingdistance")){
262 				hammingDistance=Integer.parseInt(b);
263 				assert(hammingDistance>=0 && hammingDistance<4) : "hamming distance must be between 0 and 3; default is 0.";
264 			}else if(a.equals("qhdist") || a.equals("queryhammingdistance")){
265 				qHammingDistance=Integer.parseInt(b);
266 				assert(qHammingDistance>=0 && qHammingDistance<4) : "hamming distance must be between 0 and 3; default is 0.";
267 			}else if(a.equals("edits") || a.equals("edist") || a.equals("editdistance")){
268 				editDistance=Integer.parseInt(b);
269 				assert(editDistance>=0 && editDistance<3) : "edit distance must be between 0 and 2; default is 0.\n" +
270 						"You can bypass this error message with the -da flag, but edist=3 at K=31 " +
271 						"requires 15,000,000x the time and memory for indexing compared to edist=0.";
272 			}else if(a.equals("hdist2") || a.equals("hammingdistance2")){
273 				hammingDistance2=Integer.parseInt(b);
274 				assert(hammingDistance2>=0 && hammingDistance2<4) : "hamming distance must be between 0 and 3; default is 0.";
275 			}else if(a.equals("qhdist2") || a.equals("queryhammingdistance2")){
276 				qHammingDistance2=Integer.parseInt(b);
277 				assert(qHammingDistance2>=0 && qHammingDistance2<4) : "hamming distance must be between 0 and 3; default is 0.";
278 			}else if(a.equals("edits2") || a.equals("edist2") || a.equals("editdistance2")){
279 				editDistance2=Integer.parseInt(b);
280 				assert(editDistance2>=0 && editDistance2<3) : "edit distance must be between 0 and 2; default is 0.";
281 			}else if(a.equals("maxskip") || a.equals("maxrskip") || a.equals("mxs")){
282 				maxSkip=Integer.parseInt(b);
283 			}else if(a.equals("minskip") || a.equals("minrskip") || a.equals("mns")){
284 				minSkip=Integer.parseInt(b);
285 			}else if(a.equals("skip") || a.equals("refskip") || a.equals("rskip")){
286 				minSkip=maxSkip=Integer.parseInt(b);
287 			}else if(a.equals("qskip")){
288 				qSkip_=Integer.parseInt(b);
289 			}else if(a.equals("speed")){
290 				speed_=Integer.parseInt(b);
291 				assert(speed_>=0 && speed_<=16) : "Speed range is 0 to 16.  Value: "+speed_;
292 			}else if(a.equals("skipreads")){
293 				skipreads_=Parse.parseKMG(b);
294 			}else if(a.equals("maxbadkmers") || a.equals("mbk")){
295 				maxBadKmers_=Integer.parseInt(b);
296 			}else if(a.equals("minhits") || a.equals("minkmerhits") || a.equals("mkh")){
297 				maxBadKmers_=Integer.parseInt(b)-1;
298 			}else if(a.equals("minkmerfraction") || a.equals("minfraction") || a.equals("mkf")){
299 				minKmerFraction_=Float.parseFloat(b);
300 			}else if(a.equals("mincoveredfraction") || a.equals("mincovfraction") || a.equals("mcf")){
301 				minCoveredFraction_=Float.parseFloat(b);
302 			}else if(a.equals("showspeed") || a.equals("ss")){
303 				showSpeed=Parse.parseBoolean(b);
304 			}else if(a.equals("verbose")){
305 				assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
306 				assert(WAYS>1) : "WAYS=1 is for debug mode.";
307 			}else if(a.equals("mm") || a.equals("maskmiddle")){
308 				maskMiddle=Parse.parseBoolean(b);
309 			}else if(a.equals("rcomp")){
310 				rcomp_=Parse.parseBoolean(b);
311 			}else if(a.equals("forbidns") || a.equals("forbidn") || a.equals("fn")){
312 				forbidNs_=Parse.parseBoolean(b);
313 			}else if(a.equals("findbestmatch") || a.equals("fbm")){
314 				findBestMatch_=Parse.parseBoolean(b);
315 			}else if(a.equals("kfilter")){
316 				boolean x=Parse.parseBoolean(b);
317 				if(x){ktrimLeft_=ktrimRight_=ktrimN_=ksplit_=false;}
318 			}else if(a.equals("ksplit")){
319 				boolean x=Parse.parseBoolean(b);
320 				if(x){ksplit_=true; ktrimLeft_=ktrimRight_=ktrimN_=false;}
321 				else{ksplit_=false;}
322 			}else if(a.equals("ktrim")){
323 				if(b==null){b="";}
324 				if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){ktrimLeft_=true;ktrimRight_=false;ktrimN_=false;}
325 				else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){ktrimLeft_=false;ktrimRight_=true;ktrimN_=false;}
326 				else if(b.equalsIgnoreCase("n")){
327 					ktrimLeft_=ktrimRight_=ksplit_=false;
328 					ktrimN_=true;
329 				}else if(b.length()==1 && !b.equalsIgnoreCase("t") && !b.equalsIgnoreCase("f")){
330 					ktrimLeft_=ktrimRight_=ksplit_=false;
331 					ktrimN_=true;
332 					TRIM_SYMBOL_=(byte)b.charAt(0);
333 				}else{
334 					assert(b!=null && (b.equalsIgnoreCase("f") || b.equalsIgnoreCase("false"))) :
335 						"\nInvalid setting for ktrim - values must be f (false), l (left), r (right), or n.";
336 					ktrimRight_=ktrimLeft_=false;
337 				}
338 			}else if(a.equals("kmask") || a.equals("mask")){
339 				if("lc".equalsIgnoreCase(b) || "lowercase".equalsIgnoreCase(b)){
340 					kmaskLowercase_=true;
341 					ktrimN_=true;
342 					ktrimLeft_=ktrimRight_=ksplit_=false;
343 				}else{
344 					if(Parse.parseBoolean(b)){b="N";}
345 					if(b!=null && b.length()==1){
346 						ktrimLeft_=false;ktrimRight_=false;ktrimN_=true;
347 						TRIM_SYMBOL_=(byte)b.charAt(0);
348 					}else{
349 						boolean x=Parse.parseBoolean(b);
350 //						assert(!x) : "\nInvalid setting for kmask - values must be f (false), t (true), or a single character for replacement.";
351 						ktrimN_=x;
352 					}
353 				}
354 			}else if(a.equals("kmaskfullycovered") || a.equals("maskfullycovered") || a.equals("mfc")){
355 				kmaskFullyCovered_=Parse.parseBoolean(b);
356 			}else if(a.equals("ktrimright")){
357 				ktrimRight_=Parse.parseBoolean(b);
358 				ktrimLeft_=ktrimN_=!(ktrimRight_);
359 			}else if(a.equals("ktrimleft")){
360 				ktrimLeft_=Parse.parseBoolean(b);
361 				ktrimRight_=ktrimN_=!(ktrimLeft_);
362 			}else if(a.equals("ktrimn")){
363 				ktrimN_=Parse.parseBoolean(b);
364 				ktrimLeft_=ktrimRight_=!(ktrimN_);
365 			}else if(a.equals("ktrimexclusive")){
366 				ktrimExclusive_=Parse.parseBoolean(b);
367 			}else if(a.equals("tbo") || a.equals("trimbyoverlap")){
368 				trimByOverlap_=Parse.parseBoolean(b);
369 			}else if(a.equals("strictoverlap")){
370 				strictOverlap_=Parse.parseBoolean(b);
371 			}else if(a.equals("usequality")){
372 				useQualityForOverlap_=Parse.parseBoolean(b);
373 			}else if(a.equals("tpe") || a.equals("tbe") || a.equals("trimpairsevenly")){
374 				trimPairsEvenly_=Parse.parseBoolean(b);
375 			}else if(a.equals("ottm") || a.equals("outputtrimmedtomatch")){
376 				addTrimmedToBad_=Parse.parseBoolean(b);
377 			}else if(a.equals("minoverlap")){
378 				minoverlap_=Integer.parseInt(b);
379 			}else if(a.equals("mininsert")){
380 				mininsert_=Integer.parseInt(b);
381 			}else if(a.equals("prealloc") || a.equals("preallocate")){
382 				if(b==null || b.length()<1 || Character.isLetter(b.charAt(0))){
383 					prealloc_=Parse.parseBoolean(b);
384 				}else{
385 					preallocFraction=Tools.max(0, Double.parseDouble(b));
386 					prealloc_=(preallocFraction>0);
387 				}
388 			}else if(a.equals("restrictleft")){
389 				restrictLeft_=Integer.parseInt(b);
390 			}else if(a.equals("restrictright")){
391 				restrictRight_=Integer.parseInt(b);
392 			}else if(a.equals("statscolumns") || a.equals("columns") || a.equals("cols")){
393 				STATS_COLUMNS=Integer.parseInt(b);
394 				assert(STATS_COLUMNS==3 || STATS_COLUMNS==5) : "statscolumns bust be either 3 or 5. Invalid value: "+STATS_COLUMNS;
395 			}else if(a.equals("nzo") || a.equals("nonzeroonly")){
396 				printNonZeroOnly_=Parse.parseBoolean(b);
397 			}else if(a.equals("rename")){
398 				rename_=Parse.parseBoolean(b);
399 			}else if(a.equals("refnames") || a.equals("userefnames")){
400 				useRefNames_=Parse.parseBoolean(b);
401 			}else if(a.equals("initialsize")){
402 				initialSize=Parse.parseIntKMG(b);
403 			}else if(a.equals("dump")){
404 				dump=b;
405 			}else if(a.equals("minentropy") || a.equals("entropy") || a.equals("entropyfilter")){
406 				entropyCutoff=Float.parseFloat(b);
407 			}else if(a.equals("verifyentropy")){
408 				EntropyTracker.verify=Parse.parseBoolean(b);
409 //			}else if(a.equals("entropytracker") || a.equals("usetracker")){
410 //				useEntropyTracker=Parse.parseBoolean(b);
411 			}else if(a.equals("entropymask") || a.equals("maskentropy")){
412 				if(b==null){
413 					entropyMask=true;
414 				}else if(b.equalsIgnoreCase("lc") || b.equalsIgnoreCase("lowercase")){
415 					entropyMask=true;
416 					entropyMaskLowercase=true;
417 				}else if(b.equalsIgnoreCase("filter")){
418 					entropyMask=false;
419 				}else{
420 					entropyMask=Parse.parseBoolean(b);
421 				}
422 			}else if(a.equals("entropytrim") || a.equals("trimentropy")){
423 				entropyTrim=parseEnd(b);
424 			}else if(a.equals("entropymark") || a.equals("markentropy")){
425 				entropyMark=Parse.parseBoolean(b);
426 			}
427 
428 			else if(a.equals("countpolymers")){
429 				countPolymers=Parse.parseBoolean(b);
430 			}else if(a.equals("polybase1")){
431 				polymerChar1=(byte)b.charAt(0);
432 			}else if(a.equals("polybase2")){
433 				polymerChar2=(byte)b.charAt(0);
434 			}else if(a.equals("polymerratio") || a.equals("pratio")){
435 				assert(b!=null);
436 				b=b.toUpperCase();
437 				if(b.length()==2){
438 					polymerChar1=(byte)b.charAt(0);
439 					polymerChar2=(byte)b.charAt(1);
440 				}else if(b.length()==3){
441 					assert(b.charAt(1)==',');
442 					polymerChar1=(byte)b.charAt(0);
443 					polymerChar2=(byte)b.charAt(2);
444 				}else{
445 					assert(false) : "Format should be pratio=G,C";
446 				}
447 				assert(polymerChar1>=0 && polymerChar2>=0);
448 				assert(AminoAcid.baseToNumberACGTN[polymerChar1]>=0 && AminoAcid.baseToNumberACGTN[polymerChar2]>=0) : "Only ACGTN polymer tracking is possible: "+arg;
449 			}else if(a.equals("polymerlength") || a.equals("plen")){
450 				polymerLength=Integer.parseInt(b);
451 				assert(polymerLength>=1);
452 			}
453 
454 			else if(a.equals("minbasefrequency")){
455 				minBaseFrequency_=Float.parseFloat(b);
456 			}else if(a.equals("ecco") || a.equals("ecc")){
457 				ecc_=Parse.parseBoolean(b);
458 			}else if(a.equals("copyundefined") || a.equals("cu")){
459 				REPLICATE_AMBIGUOUS=Parse.parseBoolean(b);
460 			}else if(a.equals("path")){
461 				Data.setPath(b);
462 			}else if(a.equals("maxbasesoutm")){
463 				maxBasesOutm=Parse.parseKMG(b);
464 			}else if(a.equals("maxbasesoutu") || a.equals("maxbasesout")){
465 				maxBasesOutu=Parse.parseKMG(b);
466 			}else if(a.equals("vars") || a.equals("variants") || a.equals("varfile") || a.equals("inv")){
467 				varFile=b;
468 			}else if(a.equals("vcf") || a.equals("vcffile")){
469 				vcfFile=b;
470 			}else if(a.equals("unfixvars") || a.equals("unfixvariants")){
471 				unfixVariants=Parse.parseBoolean(b);
472 			}else if(a.equals("histogramsbefore") || a.equals("histbefore")){
473 				histogramsBeforeProcessing_=Parse.parseBoolean(b);
474 			}else if(a.equals("histogramsafter") || a.equals("histafter")){
475 				histogramsBeforeProcessing_=!Parse.parseBoolean(b);
476 			}
477 
478 			else if(a.equals("trimfailures") || a.equals("trimfailuresto1bp")){
479 				trimFailuresTo1bp_=Parse.parseBoolean(b);
480 			}
481 
482 			else if(a.equals("minx") || a.equals("xmin")){
483 				xMinLoc=Parse.parseIntKMG(b);
484 			}else if(a.equals("miny") || a.equals("ymin")){
485 				yMinLoc=Parse.parseIntKMG(b);
486 			}else if(a.equals("maxx") || a.equals("xmax")){
487 				xMaxLoc=Parse.parseIntKMG(b);
488 			}else if(a.equals("maxy") || a.equals("ymax")){
489 				yMaxLoc=Parse.parseIntKMG(b);
490 			}
491 
492 			else if(a.equalsIgnoreCase("pairedToSingle")){
493 				pairedToSingle=Parse.parseBoolean(b);
494 			}
495 
496 			else if(a.equals("filtersubs") || a.equals("filtervars")){
497 				filterVars=Parse.parseBoolean(b);
498 			}else if(a.equals("maxbadsubs") || a.equals("maxbadbars")){
499 				maxBadSubs=Integer.parseInt(b);
500 			}else if(a.equals("maxbadsubdepth") || a.equals("maxbadvardepth") || a.equals("maxbadsuballeledepth") || a.equals("maxbadvaralleledepth") || a.equals("mbsad")){
501 				maxBadSubAlleleDepth=Integer.parseInt(b);
502 			}else if(a.equals("minbadsubreaddepth") || a.equals("minbadvarreaddepth") || a.equals("mbsrd")){
503 				minBadSubReadDepth=Integer.parseInt(b);
504 			}
505 
506 			else if(a.equals("khist") || a.equals("khistin")){
507 				khistIn=b;
508 			}else if(a.equals("khistout")){
509 				khistOut=b;
510 			}
511 
512 			else if(a.equals("json")){
513 				json=Parse.parseBoolean(b);
514 			}
515 
516 			else if(a.equals("swift")){
517 				swift=Parse.parseBoolean(b);
518 			}
519 
520 			else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
521 				in1=args[i];
522 			}else if(i==1 && out1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
523 				out1=args[i];
524 				setOut=true;
525 			}else if(i==2 && ref==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
526 				ref=(new File(args[i]).exists() ? new String[] {args[i]} : args[i].split(","));
527 			}else{
528 				throw new RuntimeException("Unknown parameter "+args[i]);
529 			}
530 		}
531 		if(WAYS!=7){
532 			System.err.println("WARNING! WAYS="+WAYS+".  Probably debug mode.");
533 		}
534 
535 		if(khistIn!=null || khistOut!=null){
536 			if(khistIn!=null){parser.loglog=true;}
537 			if(khistOut!=null){parser.loglogOut=true;}
538 			parser.loglogbuckets=Tools.max(parser.loglogbuckets, 128000);
539 			CardinalityTracker.trackCounts=true;
540 		}
541 
542 		if(ordered_ && !silent){
543 			outstream.println("Set ORDERED to "+ordered_);
544 			if(jsonStats!=null){jsonStats.add("ordered", ordered_);}
545 		}
546 		if(silent || json){
547 			AbstractKmerTableSet.DISPLAY_PROGRESS=false; //TODO: Test to make sure this occurs for silent mode
548 		}
549 
550 		if(hammingDistance2==-1){hammingDistance2=hammingDistance;}
551 		if(qHammingDistance2==-1){qHammingDistance2=qHammingDistance;}
552 		if(editDistance2==-1){editDistance2=editDistance;}
553 		minBaseFrequency=minBaseFrequency_;
554 
555 		{//Process parser fields
556 			Parser.processQuality();
557 
558 			maxReads=parser.maxReads;
559 			samplerate=parser.samplerate;
560 			sampleseed=parser.sampleseed;
561 			recalibrateQuality=parser.recalibrateQuality;
562 
563 			overwrite=ReadStats.overwrite=parser.overwrite;
564 			append=ReadStats.append=parser.append;
565 //			testsize=parser.testsize;
566 //			trimBadSequence=parser.trimBadSequence;
567 //			breakLength=parser.breakLength;
568 
569 			forceTrimModulo=parser.forceTrimModulo;
570 			forceTrimLeft=parser.forceTrimLeft;
571 			forceTrimRight=parser.forceTrimRight;
572 			forceTrimRight2=parser.forceTrimRight2;
573 			trimq=parser.trimq;
574 			trimE=parser.trimE();
575 			qtrimLeft=parser.qtrimLeft && trimE<1;
576 			qtrimRight=parser.qtrimRight && trimE<1;
577 			trimClip=parser.trimClip;
578 			trimPolyA=parser.trimPolyA;
579 			trimPolyGLeft=parser.trimPolyGLeft;
580 			trimPolyGRight=parser.trimPolyGRight;
581 			filterPolyG=parser.filterPolyG;
582 			trimPolyCLeft=parser.trimPolyCLeft;
583 			trimPolyCRight=parser.trimPolyCRight;
584 			filterPolyC=parser.filterPolyC;
585 			minLenFraction=parser.minLenFraction;
586 			minAvgQuality=parser.minAvgQuality;
587 			minAvgQualityBases=parser.minAvgQualityBases;
588 			minBaseQuality=parser.minBaseQuality;
589 			chastityFilter=parser.chastityFilter;
590 			failBadBarcodes=parser.failBadBarcodes;
591 			removeBadBarcodes=parser.removeBadBarcodes;
592 			failIfNoBarcode=parser.failIfNoBarcode;
593 			barcodes=parser.barcodes;
594 			minReadLength=parser.minReadLength;
595 			maxReadLength=parser.maxReadLength;
596 			maxNs=parser.maxNs;
597 			minConsecutiveBases=parser.minConsecutiveBases;
598 //			untrim=parser.untrim;
599 //			minTrimLength=(parser.minTrimLength>=0 ? parser.minTrimLength : minTrimLength);
600 //			requireBothBad=parser.requireBothBad;
601 			removePairsIfEitherBad=(!parser.requireBothBad) && (!trimFailuresTo1bp_);
602 			tossJunk=parser.tossJunk;
603 
604 			minGC=parser.minGC;
605 			maxGC=parser.maxGC;
606 			filterGC=(minGC>0 || maxGC<1);
607 			usePairGC=parser.usePairGC;
608 
609 			loglogIn=(parser.loglog ? CardinalityTracker.makeTracker(parser) : null);
610 			loglogOut=(parser.loglogOut ? CardinalityTracker.makeTracker(parser) : null);
611 
612 			THREADS=Shared.threads();
613 			silent=Parser.silent;
614 			if(silent){
615 				DISPLAY_PROGRESS=false;
616 				showSpeed=false;
617 			}
618 		}
619 
620 		ref=modifyRefPath(ref, refNames);
621 		altref=modifyRefPath(altref, altRefNames);
622 		if(literal!=null){
623 			refNames.add("literal");
624 			if(!altRefNames.isEmpty()){altRefNames.add("literal");}
625 		}
626 		refScafCounts=new int[refNames.size()];
627 
628 		if(minoverlap_>=0){
629 			minOverlap=Tools.max(minoverlap_, 1);
630 			minOverlap0=Tools.min(minOverlap0, minOverlap);
631 		}
632 
633 		if(mininsert_>=0){
634 			minInsert=Tools.max(mininsert_, 1);
635 			minInsert0=Tools.min(minInsert0, minInsert);
636 		}
637 
638 		/* Set final variables; post-process and validate argument combinations */
639 
640 		useForest=useForest_;
641 		useTable=useTable_;
642 		useArray=useArray_;
643 		hammingDistance=Tools.max(editDistance, hammingDistance);
644 		hammingDistance2=Tools.max(editDistance2, hammingDistance2);
645 		minSkip=Tools.max(1, Tools.min(minSkip, maxSkip));
646 		maxSkip=Tools.max(minSkip, maxSkip);
647 		addTrimmedToBad=addTrimmedToBad_;
648 		forbidNs=(forbidNs_ || hammingDistance<1);
649 		trimSymbol=TRIM_SYMBOL_;
650 		kmaskLowercase=kmaskLowercase_;
651 		kmaskFullyCovered=kmaskFullyCovered_;
652 		skipreads=skipreads_;
653 		trimByOverlap=trimByOverlap_;
654 		useQualityForOverlap=useQualityForOverlap_;
655 		strictOverlap=strictOverlap_;
656 		trimPairsEvenly=trimPairsEvenly_;
657 		ordered=ordered_;
658 		restrictLeft=Tools.max(restrictLeft_, 0);
659 		restrictRight=Tools.max(restrictRight_, 0);
660 		printNonZeroOnly=printNonZeroOnly_;
661 		rename=rename_;
662 		findBestMatch=(rename || findBestMatch_);
663 		useRefNames=useRefNames_;
664 		speed=speed_;
665 		qSkip=qSkip_;
666 		noAccel=(speed<1 && qSkip<2);
667 		accel=!noAccel;
668 		skipR1=skipr1_;
669 		skipR2=skipr2_;
670 		ecc=ecc_;
671 		locationFilter=(xMinLoc>0 || yMinLoc>0 || xMaxLoc>-1 || yMaxLoc>-1);
672 		trimFailuresTo1bp=trimFailuresTo1bp_;
673 
674 		amino=Shared.AMINO_IN;
675 		rcomp=rcomp_ && !amino;
676 
677 		//Set K
678 		maxSupportedK=(amino ? 12 : 31);
679 		if(!setk){k_=(amino ? 11 : 27);}
680 		kbig_=(k_>maxSupportedK ? k_ : -1);
681 		k_=Tools.min(k_, maxSupportedK);
682 
683 		if(strictOverlap){
684 			maxRatio=0.05f;
685 			ratioMargin=9f;
686 			ratioOffset=0.5f;
687 			efilterRatio=3.5f;
688 			efilterOffset=0.05f;
689 			pfilterRatio=0.001f;
690 			meeFilter=15f;
691 		}else{
692 			maxRatio=0.10f;
693 			ratioMargin=5f;
694 			ratioOffset=0.4f;
695 			efilterRatio=6f;
696 			efilterOffset=0.05f;
697 			pfilterRatio=0.00005f;
698 			meeFilter=999999999;
699 		}
700 
701 		histogramsBeforeProcessing=histogramsBeforeProcessing_;
702 		MAKE_QUALITY_HISTOGRAM=ReadStats.COLLECT_QUALITY_STATS;
703 		MAKE_QUALITY_ACCURACY=ReadStats.COLLECT_QUALITY_ACCURACY;
704 		MAKE_MATCH_HISTOGRAM=ReadStats.COLLECT_MATCH_STATS;
705 		MAKE_BASE_HISTOGRAM=ReadStats.COLLECT_BASE_STATS;
706 		MAKE_EHIST=ReadStats.COLLECT_ERROR_STATS;
707 		MAKE_INDELHIST=ReadStats.COLLECT_INDEL_STATS;
708 		MAKE_LHIST=ReadStats.COLLECT_LENGTH_STATS;
709 		MAKE_GCHIST=ReadStats.COLLECT_GC_STATS;
710 		MAKE_ENTROPYHIST=ReadStats.COLLECT_ENTROPY_STATS;
711 		MAKE_IDHIST=ReadStats.COLLECT_IDENTITY_STATS;
712 		MAKE_IHIST=ReadStats.COLLECT_INSERT_STATS;
713 
714 		{
715 			long usableMemory;
716 			long tableMemory;
717 
718 			{
719 				long memory=Runtime.getRuntime().maxMemory();
720 				double xmsRatio=Shared.xmsRatio();
721 				usableMemory=(long)Tools.max(((memory-96000000-(20*400000 /* for atomic arrays */))*(xmsRatio>0.97 ? 0.82 : 0.72)), memory*0.45);
722 				tableMemory=(long)(usableMemory*.95);
723 			}
724 
725 			if(initialSize<1){
726 				final long memOverWays=tableMemory/(12*WAYS);
727 				final double mem2=(prealloc_ ? preallocFraction : 1)*tableMemory;
728 				initialSize=(prealloc_ || memOverWays<initialSizeDefault ? (int)Tools.min(2142000000, (long)(mem2/(12*WAYS))) : initialSizeDefault);
729 				if(initialSize!=initialSizeDefault && !silent){
730 					outstream.println("Initial size set to "+initialSize);
731 				}
732 			}
733 		}
734 
735 		if(ktrimLeft_ || ktrimRight_ || ktrimN_ || ksplit_){
736 			if(kbig_>k_){
737 				outstream.println("***********************   WARNING   ***********************");
738 				outstream.println("WARNING: When kmer-trimming or masking, the maximum value of K is "+k_+".");
739 				outstream.println("K has been reduced from "+kbig_+" to "+k_+".");
740 				outstream.println("***********************************************************");
741 				kbig_=k_;
742 			}
743 		}
744 
745 		if((speed>0 || qSkip>1) && kbig_>k_){
746 			outstream.println("***********************   WARNING   ***********************");
747 			outstream.println("WARNING: When speed>0 or qskip>1, the maximum value of K is "+k_+".");
748 			outstream.println("K has been reduced from "+kbig_+" to "+k_+".");
749 			outstream.println("***********************************************************");
750 			kbig_=k_;
751 		}
752 
753 		if((speed>0 && qSkip>1) || (qSkip>1 && maxSkip>1) || (speed>0 && maxSkip>1)){
754 			outstream.println("WARNING: It is not recommended to use more than one of 'qskip', 'speed', and 'rskip/maxskip' together.");
755 			outstream.println("qskip="+qSkip+", speed="+speed+", maxskip="+maxSkip);
756 		}
757 
758 		k=k_;
759 		k2=k-1;
760 		kbig=kbig_;
761 		keff=Tools.max(k, kbig);
762 		if(kbig>k){
763 			minSkip=maxSkip=0;
764 			if(maskMiddle){
765 				outstream.println("maskMiddle was disabled because kbig>k");
766 				maskMiddle=false;
767 			}
768 		}
769 		mink=Tools.min((mink_<1 ? 6 : mink_), k);
770 		maxBadKmers0=maxBadKmers_;
771 
772 		{//set some constants
773 			bitsPerBase=(amino ? 5 : 2);
774 			maxSymbol=(amino ? 20 : 3);
775 			symbols=maxSymbol+1;
776 			symbolArrayLen=(64+bitsPerBase-1)/bitsPerBase;
777 			symbolSpace=(1<<bitsPerBase);
778 			symbolMask=symbolSpace-1;
779 
780 			symbolToNumber=AminoAcid.symbolToNumber(amino);
781 			symbolToNumber0=AminoAcid.symbolToNumber0(amino);
782 			symbolToComplementNumber0=AminoAcid.symbolToComplementNumber0(amino);
783 
784 			clearMasks=new long[symbolArrayLen];
785 			leftMasks=new long[symbolArrayLen];
786 			rightMasks=new long[symbolArrayLen];
787 			lengthMasks=new long[symbolArrayLen];
788 			setMasks=new long[symbols][symbolArrayLen];
789 			for(int i=0; i<symbolArrayLen; i++){
790 				clearMasks[i]=~(symbolMask<<(bitsPerBase*i));
791 				leftMasks[i]=((-1L)<<(bitsPerBase*i));
792 				rightMasks[i]=~((-1L)<<(bitsPerBase*i));
793 				lengthMasks[i]=((1L)<<(bitsPerBase*i));
794 				for(long j=0; j<symbols; j++){
795 					setMasks[(int)j][i]=(j<<(bitsPerBase*i));
796 				}
797 			}
798 
799 			minlen=k-1;
800 			minminlen=mink-1;
801 			minlen2=(maskMiddle ? k/2 : k);
802 			shift=bitsPerBase*k;
803 			shift2=shift-bitsPerBase;
804 			mask=(shift>63 ? -1L : ~((-1L)<<shift));
805 			kmask=lengthMasks[k];
806 		}
807 
808 		minKmerFraction=Tools.max(minKmerFraction_, 0);
809 		assert(minKmerFraction<=1) : "minKmerFraction must range from 0 to 1; value="+minKmerFraction;
810 
811 		minCoveredFraction=Tools.max(minCoveredFraction_, 0);
812 		assert(minCoveredFraction<=1) : "minCoveredFraction must range from 0 to 1; value="+minCoveredFraction;
813 
814 		if(mink_>0 && mink_<k){useShortKmers=true;}
815 		if(useShortKmers){
816 			if(maskMiddle){
817 				outstream.println("maskMiddle was disabled because useShortKmers=true");
818 				maskMiddle=false;
819 			}
820 		}
821 
822 		ktrimRight=ktrimRight_;
823 		ktrimLeft=ktrimLeft_;
824 		ktrimN=ktrimN_;
825 		ksplit=ksplit_;
826 		ktrimExclusive=ktrimExclusive_;
827 		kfilter=(ref!=null || literal!=null) && !(ktrimRight || ktrimLeft || ktrimN || ksplit);
828 		assert(findBestMatch==false || kfilter==false || kbig<=k) : "K must be less than 32 in 'findBestMatch' mode";
829 
830 		assert(!useShortKmers || ktrimRight || ktrimLeft || ktrimN || ksplit) : "\nSetting mink or useShortKmers also requires setting a ktrim mode, such as 'r', 'l', or 'n'\n";
831 
832 		middleMask=maskMiddle ? ~(symbolMask<<(bitsPerBase*(k/2))) : -1L;
833 
834 		hitCounts=(outduk==null ? null : new long[HITCOUNT_LEN+1]);
835 
836 
837 		/* Adjust I/O settings and filenames */
838 
839 		assert(FastaReadInputStream.settingsOK());
840 
841 		if(in1==null){throw new RuntimeException("Error - at least one input file is required.");}
842 
843 		if(in1!=null && in1.contains("#") && !new File(in1).exists()){
844 			int pound=in1.lastIndexOf('#');
845 			String a=in1.substring(0, pound);
846 			String b=in1.substring(pound+1);
847 			in1=a+1+b;
848 			in2=a+2+b;
849 		}
850 		if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;}
851 		if(in2!=null){
852 			if(FASTQ.FORCE_INTERLEAVED && !silent){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
853 			FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
854 		}
855 
856 		if(qfin1!=null && qfin1.contains("#") && in2!=null && !new File(qfin1).exists()){
857 			int pound=qfin1.lastIndexOf('#');
858 			String a=qfin1.substring(0, pound);
859 			String b=qfin1.substring(pound+1);
860 			qfin1=a+1+b;
861 			qfin2=a+2+b;
862 		}
863 
864 		if(out1!=null && out1.contains("#")){
865 			int pound=out1.lastIndexOf('#');
866 			String a=out1.substring(0, pound);
867 			String b=out1.substring(pound+1);
868 			out1=a+1+b;
869 			out2=a+2+b;
870 		}
871 
872 		if(qfout1!=null && qfout1.contains("#") && in2!=null && !new File(qfout1).exists()){
873 			int pound=qfout1.lastIndexOf('#');
874 			String a=qfout1.substring(0, pound);
875 			String b=qfout1.substring(pound+1);
876 			qfout1=a+1+b;
877 			qfout2=a+2+b;
878 		}
879 
880 		if(outb1!=null && outb1.contains("#")){
881 			int pound=outb1.lastIndexOf('#');
882 			String a=outb1.substring(0, pound);
883 			String b=outb1.substring(pound+1);
884 			outb1=a+1+b;
885 			outb2=a+2+b;
886 		}
887 
888 		if((out2!=null || outb2!=null) && (in1!=null && in2==null)){
889 			if(!FASTQ.FORCE_INTERLEAVED){outstream.println("Forcing interleaved input because paired output was specified for a single input file.");}
890 			FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=true;
891 		}
892 
893 		if(!setOut){
894 			if(!silent && !json){
895 				outstream.println("No output stream specified.  To write to stdout, please specify 'out=stdout.fq' or similar.");
896 			}
897 			out1=out2=null;
898 		}else if("stdout".equalsIgnoreCase(out1) || "standarddout".equalsIgnoreCase(out1)){
899 			out1="stdout.fq";
900 			out2=null;
901 		}
902 
903 		in1=Tools.fixExtension(in1);
904 		in2=Tools.fixExtension(in2);
905 		qfin1=Tools.fixExtension(qfin1);
906 		qfin2=Tools.fixExtension(qfin2);
907 		ref=Tools.fixExtension(ref);
908 
909 		if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, qfout1, qfout2, outb1, outb2, outsingle, outstats, outrpkm, outduk, outrqc, outrefstats, polymerStatsFile, khistIn, khistOut)){
910 			throw new RuntimeException("\nCan't write to some output files; overwrite="+overwrite+"\n");
911 		}
912 		if(!Tools.testInputFiles(false, true, in1, in2, qfin1, qfin2)){
913 			throw new RuntimeException("\nCan't read some input files.\n");
914 		}
915 		if(!Tools.testInputFiles(true, true, ref)){
916 			throw new RuntimeException("\nCan't read to some reference files.\n");
917 		}
918 		if(!Tools.testForDuplicateFiles(true, in1, in2, qfin1, qfin2, qfout1, qfout2,
919 				out1, out2, outb1, outb2, outsingle, outstats, outrpkm, outduk, outrqc, outrefstats, polymerStatsFile, khistIn, khistOut)){
920 			throw new RuntimeException("\nSome file names were specified multiple times.\n");
921 		}
922 
923 		assert(THREADS>0) : "THREADS must be greater than 0.";
924 
925 		assert(in1==null || in1.toLowerCase().startsWith("stdin") || in1.toLowerCase().startsWith("standardin") || new File(in1).exists()) : "Can't find "+in1;
926 		assert(in2==null || in2.toLowerCase().startsWith("stdin") || in2.toLowerCase().startsWith("standardin") || new File(in2).exists()) : "Can't find "+in2;
927 
928 		ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
929 		ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
930 
931 		final int defaultFormat=(ffin1==null ? FileFormat.FASTQ : ffin1.format());
932 		ffout1=FileFormat.testOutput(out1, defaultFormat, null, true, overwrite, append, ordered);
933 		ffout2=FileFormat.testOutput(out2, defaultFormat, null, true, overwrite, append, ordered);
934 		ffoutb1=FileFormat.testOutput(outb1, defaultFormat, null, true, overwrite, append, ordered);
935 		ffoutb2=FileFormat.testOutput(outb2, defaultFormat, null, true, overwrite, append, ordered);
936 		ffouts=FileFormat.testOutput(outsingle, defaultFormat, null, true, overwrite, append, ordered);
937 
938 		parser.validateStdio(ffin1, ffout1, ffoutb1, ffouts);
939 
940 		makeReadStats=ReadStats.collectingStats();
941 
942 		//This block just causes problems when new features are added, so it's disabled.
943 		if(!((ref!=null || literal!=null) || qtrimLeft || qtrimRight || minAvgQuality>0 || minBaseQuality>0 || maxNs>=0 || trimByOverlap ||
944 				makeReadStats || entropyMask || entropyTrim>0 || entropyMark || entropyCutoff>=0 || filterVars ||
945 				forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0 || minBaseFrequency>0 || recalibrateQuality ||
946 				trimPolyA>0 || trimPolyGLeft>0 || trimPolyGRight>0 || filterPolyG>0 || trimPolyCLeft>0 || trimPolyCRight>0 || filterPolyC>0)){
947 //			outstream.println("NOTE: No reference files specified, no trimming mode, no min avg quality, no histograms - read sequences will not be changed.");
948 		}
949 
950 		if(recalibrateQuality || true){
951 			SamLine.SET_FROM_OK=true;//TODO:  Should ban other operations
952 		}
953 
954 		if(ref!=null){
955 			for(String s0 : ref){
956 				assert(s0!=null) : "Specified a null reference.";
957 				String s=s0.toLowerCase();
958 				assert(s==null || s.startsWith("stdin") || s.startsWith("standardin") || new File(s0).exists()) : "Can't find "+s0;
959 			}
960 		}
961 
962 		//Initialize tables
963 		final int tableType=(useForest ? AbstractKmerTable.FOREST1D : useTable ? AbstractKmerTable.TABLE : useArray ? AbstractKmerTable.ARRAY1D : 0);
964 		ScheduleMaker scheduleMaker=new ScheduleMaker(WAYS, 12, prealloc_, (prealloc_ ? preallocFraction : 0.9));
965 		int[] schedule=scheduleMaker.makeSchedule();
966 		keySets=AbstractKmerTable.preallocate(WAYS, tableType, schedule, -1L);
967 
968 		//Initialize entropy
969 		calcEntropy=(entropyCutoff>=0 || entropyMark);
970 		if(calcEntropy){
971 			assert(EntropyTracker.defaultWindowBases>0 && (entropyMark || (entropyCutoff>=0 && entropyCutoff<=1)));
972 		}
973 		assert(calcEntropy || (!entropyMask && !entropyMark && entropyTrim==0)) : "Entropy masking/trimming operations require the entropy flag to be set.";
974 
975 		if(polymerStatsFile!=null || (polymerChar1>=0 && polymerChar2>=0)){
976 			countPolymers=true;
977 		}
978 
979 		//Initialize polymer-tracking
980 		if(countPolymers){
981 //			assert(polymerChar1>=0 && AminoAcid.baseToNumberACGTN[polymerChar1]>=0);
982 //			assert(polymerChar2>=0 && AminoAcid.baseToNumberACGTN[polymerChar2]>=0);
983 			pTracker=new PolymerTracker();
984 		}else{
985 			pTracker=null;
986 		}
987 	}
988 
modifyRefPath(String[] array, ArrayList<String> list)989 	String[] modifyRefPath(String[] array, ArrayList<String> list){
990 		if(array==null){return array;}
991 		for(String s : array){
992 			String s2=modifyRefPath(s);
993 			list.add(s2);
994 		}
995 		return list.toArray(new String[0]);
996 	}
997 
modifyRefPath(String s)998 	String modifyRefPath(String s){
999 		if(s==null || new File(s).exists()){
1000 			//do nothing
1001 		}else{
1002 			if("phix".equalsIgnoreCase(s)){
1003 				s=Data.findPath("?phix174_ill.ref.fa.gz");
1004 			}else if("lambda".equalsIgnoreCase(s)){
1005 				s=Data.findPath("?lambda.fa.gz");
1006 			}else if("kapa".equalsIgnoreCase(s)){
1007 				s=Data.findPath("?kapatags.L40.fa");
1008 			}else if("pjet".equalsIgnoreCase(s)){
1009 				s=Data.findPath("?pJET1.2.fa");
1010 			}else if("mtst".equalsIgnoreCase(s)){
1011 				s=Data.findPath("?mtst.fa");
1012 			}else if("adapters".equalsIgnoreCase(s)){
1013 				s=Data.findPath("?adapters.fa");
1014 			}else if("truseq".equalsIgnoreCase(s)){
1015 				s=Data.findPath("?truseq.fa.gz");
1016 			}else if("nextera".equalsIgnoreCase(s)){
1017 				s=Data.findPath("?nextera.fa.gz");
1018 			}else if("artifacts".equalsIgnoreCase(s)){
1019 				s=Data.findPath("?sequencing_artifacts.fa.gz");
1020 			}else{
1021 				assert(false) : "Can't find reference file "+s;
1022 			}
1023 		}
1024 		return s;
1025 	}
1026 
1027 
1028 	/*--------------------------------------------------------------*/
1029 	/*----------------         Outer Methods        ----------------*/
1030 	/*--------------------------------------------------------------*/
1031 
1032 
process()1033 	public void process(){
1034 
1035 		if(samref!=null){
1036 			scafMap=ScafMap.loadReference(samref, true);
1037 		}
1038 
1039 		if(varFile!=null || vcfFile!=null || filterVars){
1040 			if(scafMap==null){scafMap=ScafMap.loadSamHeader(in1);}
1041 			assert(scafMap!=null && scafMap.size()>0) : "No scaffold names were loaded.";
1042 			if(varFile!=null){
1043 				outstream.println("Loading variants.");
1044 				varMap=VcfLoader.loadVarFile(varFile, scafMap);
1045 			}else if(vcfFile!=null){
1046 				outstream.println("Loading variants.");
1047 				varMap=VcfLoader.loadVcfFile(vcfFile, scafMap, false, false);
1048 			}
1049 //			if(varMap!=null && varMap.size()>0){
1050 //				varKeySet=new HashSet<VarKey>();
1051 //				for(Var v : varMap.toArray(false)){
1052 //					if(v.type==Var.INS || v.type==Var.DEL){
1053 //						varKeySet.add(VarKey.toVarKey(v));
1054 //					}
1055 //				}
1056 //			}
1057 			fixVariants=(makeReadStats && varMap!=null && varMap.size()>0 && scafMap!=null && scafMap.size()>0);
1058 		}
1059 
1060 		if(recalibrateQuality){
1061 			if(samFile!=null){
1062 				CalcTrueQuality.main2(new String[] {"in="+samFile, "showstats=f"});
1063 			}
1064 			CalcTrueQuality.initializeMatrices();
1065 		}
1066 
1067 		/* Check for output file collisions */
1068 		if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, outb1, outb2, outstats, outrpkm, outduk, outrqc, outrefstats)){
1069 			throw new RuntimeException("One or more output files were duplicate or could not be written to.  Check the names or set the 'overwrite=true' flag.");
1070 		}
1071 
1072 		/* Start overall timer */
1073 		Timer t=new Timer();
1074 
1075 //		boolean dq0=FASTQ.DETECT_QUALITY;
1076 //		boolean ti0=FASTQ.TEST_INTERLEAVED;
1077 //		int rbl0=Shared.bufferLen();;
1078 //		FASTQ.DETECT_QUALITY=false;
1079 //		FASTQ.TEST_INTERLEAVED=false;
1080 //		Shared.setBufferLen(16;
1081 
1082 		process2(t.time1);
1083 
1084 //		FASTQ.DETECT_QUALITY=dq0;
1085 //		FASTQ.TEST_INTERLEAVED=ti0;
1086 //		Shared.setBufferLen(rbl0;
1087 
1088 		/* Stop timer and calculate speed statistics */
1089 		t.stop();
1090 		lastReadsOut=readsOut;
1091 
1092 		if(showSpeed && !json){
1093 			outstream.println();
1094 			outstream.println(Tools.timeReadsBasesProcessed(t, readsIn, basesIn, 8));
1095 		}
1096 
1097 		if(outstream!=System.err && outstream!=System.out){outstream.close();}
1098 
1099 		/* Throw an exception if errors were detected */
1100 		if(errorState){
1101 			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
1102 		}
1103 	}
1104 
1105 
process2(long startTime)1106 	public void process2(long startTime){
1107 
1108 		/* Start phase timer */
1109 		Timer t=new Timer();
1110 
1111 		if(DISPLAY_PROGRESS && !json){
1112 			outstream.println("Initial:");
1113 			Shared.printMemory();
1114 			outstream.println();
1115 		}
1116 
1117 		/* Fill tables with reference kmers */
1118 		if((ref!=null && ref.length>0) || (literal!=null && literal.length>0)){
1119 			final boolean oldTI=FASTQ.TEST_INTERLEAVED; //TODO: This needs to be changed to a non-static field, or somehow 'read mode' and 'ref mode' need to be distinguished.
1120 			final boolean oldFI=FASTQ.FORCE_INTERLEAVED;
1121 			final boolean oldPC=FASTQ.PARSE_CUSTOM;
1122 			final boolean oldSplit=FastaReadInputStream.SPLIT_READS;
1123 			final int oldML=FastaReadInputStream.MIN_READ_LEN;
1124 
1125 			FASTQ.TEST_INTERLEAVED=false;
1126 			FASTQ.FORCE_INTERLEAVED=false;
1127 			FASTQ.PARSE_CUSTOM=false;
1128 			FastaReadInputStream.SPLIT_READS=false;
1129 			FastaReadInputStream.MIN_READ_LEN=1;
1130 
1131 
1132 			storedKmers=spawnLoadThreads();
1133 			if(storedKmers<1 && altRefNames.size()>0){
1134 				outstream.println("Ref had no kmers; using alt ref.");
1135 				ref=altref;
1136 				refNames=altRefNames;
1137 				refScafCounts=new int[refNames.size()];
1138 				scaffoldNames.clear();
1139 				scaffoldLengths.clear();
1140 				storedKmers=spawnLoadThreads();
1141 			}
1142 
1143 			FASTQ.TEST_INTERLEAVED=oldTI;
1144 			FASTQ.FORCE_INTERLEAVED=oldFI;
1145 			FASTQ.PARSE_CUSTOM=oldPC;
1146 			FastaReadInputStream.SPLIT_READS=oldSplit;
1147 			FastaReadInputStream.MIN_READ_LEN=oldML;
1148 
1149 			if(useRefNames){toRefNames();}
1150 			t.stop();
1151 		}
1152 
1153 		{
1154 			long ram=freeMemory();
1155 			ALLOW_LOCAL_ARRAYS=(scaffoldNames!=null && Tools.max(THREADS, 1)*3*8*scaffoldNames.size()<ram*5);
1156 		}
1157 
1158 		/* Dump kmers to text */
1159 		if(dump!=null){
1160 			ByteStreamWriter bsw=new ByteStreamWriter(dump, overwrite, false, true);
1161 			bsw.start();
1162 			for(AbstractKmerTable set : keySets){
1163 				set.dumpKmersAsBytes(bsw, k, 0, Integer.MAX_VALUE, null);
1164 			}
1165 			bsw.poisonAndWait();
1166 		}
1167 
1168 		if(storedKmers<1 && (ktrimRight || ktrimLeft || ktrimN || ksplit)){
1169 			outstream.println("******  WARNING! A KMER OPERATION WAS CHOSEN BUT NO KMERS WERE LOADED.  ******");
1170 			if(ref==null && literal==null){
1171 				outstream.println("******  YOU NEED TO SPECIFY A REFERENCE FILE OR LITERAL SEQUENCE.       ******\n");
1172 			}else{
1173 				outstream.println("******  PLEASE ENSURE K IS LESS THAN OR EQUAL TO REF SEQUENCE LENGTHS.  ******\n");
1174 			}
1175 			if(ktrimRight && trimByOverlap){
1176 				ktrimRight=false;
1177 			}else{
1178 				assert(false) : "You can bypass this assertion with the -da flag.";
1179 			}
1180 		}
1181 
1182 		final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
1183 		Read.VALIDATE_IN_CONSTRUCTOR=THREADS<4;
1184 
1185 		/* Do kmer matching of input reads */
1186 		spawnProcessThreads(t);
1187 
1188 		Read.VALIDATE_IN_CONSTRUCTOR=vic;
1189 
1190 		/* Write legacy duk statistics (which requires tables) */
1191 		writeDuk(System.nanoTime()-startTime);
1192 
1193 		/* Unload kmers to save memory */
1194 		if(RELEASE_TABLES){unloadKmers();}
1195 
1196 		/* Write statistics to files */
1197 		writeStats();
1198 		writeRPKM();
1199 		writeRefStats();
1200 		writeRqc();
1201 		if(pTracker!=null && polymerStatsFile!=null){
1202 			ReadWrite.writeString(pTracker.toHistogramCumulative(), polymerStatsFile);
1203 		}
1204 
1205 		/* Unload sequence data to save memory */
1206 		if(RELEASE_TABLES){unloadScaffolds();}
1207 
1208 		if(silent){return;}
1209 		if(json){
1210 			outstream.println(toJson(startTime));
1211 			return;
1212 		}
1213 
1214 		outstream.println("\nInput:                  \t"+readsIn+" reads \t\t"+basesIn+" bases.");
1215 
1216 		if((ref!=null || literal!=null) && !(ktrimLeft || ktrimRight || ktrimN)){
1217 			outstream.println("Contaminants:           \t"+readsKFiltered+" reads ("+toPercent(readsKFiltered, readsIn)+") \t"+
1218 					basesKFiltered+" bases ("+toPercent(basesKFiltered, basesIn)+")");
1219 			outstream.flush();
1220 		}
1221 		if(qtrimLeft || qtrimRight){
1222 			outstream.println("QTrimmed:               \t"+readsQTrimmed+" reads ("+toPercent(readsQTrimmed, readsIn)+") \t"+
1223 					basesQTrimmed+" bases ("+toPercent(basesQTrimmed, basesIn)+")");
1224 		}
1225 		if(trimPolyA>0 || trimPolyGLeft>0 || trimPolyGRight>0 || filterPolyG>0 || trimPolyCLeft>0 || trimPolyCRight>0 || filterPolyC>0){
1226 			outstream.println("Polymer-trimmed:        \t"+readsPolyTrimmed+" reads ("+toPercent(readsPolyTrimmed, readsIn)+") \t"+
1227 					basesPolyTrimmed+" bases ("+toPercent(basesPolyTrimmed, basesIn)+")");
1228 		}
1229 		if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){
1230 			outstream.println("FTrimmed:               \t"+readsFTrimmed+" reads ("+toPercent(readsFTrimmed, readsIn)+") \t"+
1231 					basesFTrimmed+" bases ("+toPercent(basesFTrimmed, basesIn)+")");
1232 		}
1233 		if(ktrimLeft || ktrimRight || ktrimN){
1234 			String x=(ktrimN ? "KMasked: " : "KTrimmed:");
1235 			outstream.println(x+"               \t"+readsKTrimmed+" reads ("+toPercent(readsKTrimmed, readsIn)+") \t"+
1236 					basesKTrimmed+" bases ("+toPercent(basesKTrimmed, basesIn)+")");
1237 		}
1238 		if(swift){
1239 			outstream.println("Trimmed by Swift:       \t"+readsTrimmedBySwift+" reads ("+toPercent(readsTrimmedBySwift, readsIn)+") \t"+
1240 					basesTrimmedBySwift+" bases ("+toPercent(basesTrimmedBySwift, basesIn)+")");
1241 		}
1242 		if(trimByOverlap){
1243 			outstream.println("Trimmed by overlap:     \t"+readsTrimmedByOverlap+" reads ("+toPercent(readsTrimmedByOverlap, readsIn)+") \t"+
1244 					basesTrimmedByOverlap+" bases ("+toPercent(basesTrimmedByOverlap, basesIn)+")");
1245 		}
1246 		if(filterGC){
1247 			outstream.println("Filtered by GC:         \t"+badGcReads+" reads ("+toPercent(badGcReads, readsIn)+") \t"+
1248 					badGcBases+" bases ("+toPercent(badGcBases, basesIn)+")");
1249 		}
1250 		if(locationFilter || chastityFilter || removeBadBarcodes){
1251 			outstream.println("Filtered by header:     \t"+badHeaderReads+" reads ("+toPercent(badHeaderReads, readsIn)+") \t"+
1252 					badHeaderBases+" bases ("+toPercent(badHeaderBases, basesIn)+")");
1253 		}
1254 		if(minAvgQuality>0 || minBaseQuality>0 || maxNs>=0 || minBaseFrequency>0 || chastityFilter || removeBadBarcodes){
1255 			outstream.println("Low quality discards:   \t"+readsQFiltered+" reads ("+toPercent(readsQFiltered, readsIn)+") \t"+
1256 					basesQFiltered+" bases ("+toPercent(basesQFiltered, basesIn)+")");
1257 		}
1258 		if(polymerChar1>=0 && polymerChar2>=0){
1259 			outstream.println("Polymer Counts:         \t"+padRight(pTracker.getCountCumulative(polymerChar1, polymerLength)+" "+Character.toString((char)polymerChar1), 18)+"\t"+
1260 					padRight(pTracker.getCountCumulative(polymerChar2, polymerLength)+" "+Character.toString((char)polymerChar2), 18)+"\t"+
1261 					"("+String.format(Locale.ROOT, "%.4f", pTracker.calcRatioCumulative(polymerChar1, polymerChar2, polymerLength))+" ratio)");
1262 		}
1263 		if(calcEntropy){
1264 			String prefix;
1265 			if(entropyTrim>0){
1266 				prefix=("Entropy-trimmed:        \t");
1267 			}else if(entropyMask){
1268 				prefix=("Entropy-masked:         \t");
1269 			}else{
1270 				prefix=("Low entropy discards:   \t");
1271 			}
1272 			outstream.println(prefix+readsEFiltered+" reads ("+toPercent(readsEFiltered, readsIn)+") \t"+
1273 					basesEFiltered+" bases ("+toPercent(basesEFiltered, basesIn)+")");
1274 		}
1275 
1276 		final long readsRemoved=readsIn-readsOut;
1277 		final long basesRemoved=basesIn-basesOut;
1278 
1279 		outstream.println("Total Removed:          \t"+readsRemoved+" reads ("+toPercent(readsRemoved, readsIn)+") \t"+
1280 				basesRemoved+" bases ("+toPercent(basesRemoved, basesIn)+")");
1281 
1282 		outstream.println("Result:                 \t"+readsOut+" reads ("+toPercent(readsOut, readsIn)+") \t"+
1283 				basesOut+" bases ("+toPercent(basesOut, basesIn)+")");
1284 
1285 		if(loglogIn!=null){
1286 			outstream.println("Unique "+loglogIn.k+"-mers:         \t"+loglogIn.cardinality());
1287 			if(khistIn!=null){
1288 				loglogIn.printKhist(khistIn, overwrite, append, true, 2);
1289 			}
1290 		}
1291 		if(loglogOut!=null){
1292 			outstream.println("Unique "+loglogOut.k+"-mers out:     \t"+loglogOut.cardinality());
1293 			if(khistOut!=null){
1294 				loglogOut.printKhist(khistOut, overwrite, append, true, 2);
1295 			}
1296 		}
1297 	}
1298 
toJson(long startTime)1299 	private String toJson(long startTime){
1300 
1301 		jsonStats.add("k", k);
1302 		jsonStats.add("mode", ktrimLeft ? "ktrimLeft" : ktrimRight ? "ktrimRight" : ktrimN ? "ktrimN" : "kFilter");
1303 		jsonStats.add("readsIn", readsIn);
1304 		jsonStats.add("basesIn", basesIn);
1305 
1306 		if((ref!=null || literal!=null) && !(ktrimLeft || ktrimRight || ktrimN)){
1307 			jsonStats.add("readsKFiltered", readsKFiltered);
1308 			jsonStats.add("basesKFiltered", basesKFiltered);
1309 		}
1310 		if(qtrimLeft || qtrimRight){
1311 			jsonStats.add("readsQTrimmed", readsQTrimmed);
1312 			jsonStats.add("basesQTrimmed", basesQTrimmed);
1313 		}
1314 		if(trimPolyA>0 || trimPolyGLeft>0 || trimPolyGRight>0 || filterPolyG>0 || trimPolyCLeft>0 || trimPolyCRight>0 || filterPolyC>0){
1315 			jsonStats.add("readsPolyTrimmed", readsPolyTrimmed);
1316 			jsonStats.add("basesPolyTrimmed", basesPolyTrimmed);
1317 		}
1318 		if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){
1319 			jsonStats.add("readsFTrimmed", readsFTrimmed);
1320 			jsonStats.add("basesFTrimmed", basesFTrimmed);
1321 		}
1322 		if(ktrimLeft || ktrimRight || ktrimN){
1323 			String x=(ktrimN ? "KMasked: " : "KTrimmed:");
1324 			jsonStats.add("reads"+x, readsKTrimmed);
1325 			jsonStats.add("bases+x", basesKTrimmed);
1326 		}
1327 		if(swift){
1328 			jsonStats.add("readsTrimmedBySwift", readsTrimmedBySwift);
1329 			jsonStats.add("basesTrimmedBySwift", basesTrimmedBySwift);
1330 		}
1331 		if(trimByOverlap){
1332 			jsonStats.add("readsTrimmedByOverlap", readsTrimmedByOverlap);
1333 			jsonStats.add("basesTrimmedByOverlap", basesTrimmedByOverlap);
1334 		}
1335 		if(filterGC){
1336 			jsonStats.add("badGcReads", badGcReads);
1337 			jsonStats.add("badGcBases", badGcBases);
1338 		}
1339 		if(locationFilter || chastityFilter || removeBadBarcodes){
1340 			jsonStats.add("badHeaderReads", badHeaderReads);
1341 			jsonStats.add("badHeaderBases", badHeaderBases);
1342 		}
1343 		if(minAvgQuality>0 || minBaseQuality>0 || maxNs>=0 || minBaseFrequency>0 || chastityFilter || removeBadBarcodes){
1344 			jsonStats.add("readsQFiltered", readsQFiltered);
1345 			jsonStats.add("basesQFiltered", basesQFiltered);
1346 		}
1347 		if(polymerChar1>=0 && polymerChar2>=0){
1348 			jsonStats.add("poly"+Character.toString((char)polymerChar1), pTracker.getCountCumulative(polymerChar1, polymerLength));
1349 			jsonStats.add("poly"+Character.toString((char)polymerChar2), pTracker.getCountCumulative(polymerChar2, polymerLength));
1350 			jsonStats.add("polyRatio", pTracker.calcRatioCumulative(polymerChar1, polymerChar2, polymerLength));
1351 		}
1352 		if(calcEntropy){
1353 			String suffix;
1354 			if(entropyTrim>0){
1355 				suffix=("EntropyTrimmed");
1356 			}else if(entropyMask){
1357 				suffix=("EntropyMasked");
1358 			}else{
1359 				suffix=("EntropyFiltered");
1360 			}
1361 			jsonStats.add("reads"+suffix, readsEFiltered);
1362 			jsonStats.add("bases"+suffix, basesEFiltered);
1363 		}
1364 
1365 		final long readsRemoved=readsIn-readsOut;
1366 		final long basesRemoved=basesIn-basesOut;
1367 
1368 		jsonStats.add("readsRemoved", readsRemoved);
1369 		jsonStats.add("basesRemoved", basesRemoved);
1370 		jsonStats.add("readsOut", readsOut);
1371 		jsonStats.add("basesOut", basesOut);
1372 
1373 		if(loglogIn!=null){
1374 			jsonStats.add("uniqueKmersIn", loglogIn.cardinality());
1375 		}
1376 		if(loglogOut!=null){
1377 			jsonStats.add("uniqueKmersOut", loglogOut.cardinality());
1378 		}
1379 		jsonStats.add("time", (System.nanoTime()-startTime)/1000000000.0);
1380 		return jsonStats.toString();
1381 	}
1382 
toPercent(long numerator, long denominator)1383 	private static String toPercent(long numerator, long denominator){
1384 		if(denominator<1){return "0.00%";}
1385 		return String.format(Locale.ROOT, "%.2f%%",numerator*100.0/denominator);
1386 	}
1387 
padRight(String s, int minLen)1388 	private static String padRight(String s, int minLen){
1389 		while(s.length()<minLen){s=s+" ";}
1390 		return s;
1391 	}
1392 
1393 	/**
1394 	 * Clear stored kmers.
1395 	 */
unloadKmers()1396 	public void unloadKmers(){
1397 		if(keySets!=null){
1398 			for(int i=0; i<keySets.length; i++){keySets[i]=null;}
1399 		}
1400 	}
1401 
1402 	/**
1403 	 * Clear stored sequence data.
1404 	 */
unloadScaffolds()1405 	public void unloadScaffolds(){
1406 		if(scaffoldNames!=null && !scaffoldNames.isEmpty()){
1407 			scaffoldNames.clear();
1408 			scaffoldNames.trimToSize();
1409 		}
1410 		scaffoldReadCounts=null;
1411 		scaffoldBaseCounts=null;
1412 		hitCounts=null;
1413 		scaffoldLengths=null;
1414 	}
1415 
1416 	/**
1417 	 * Write statistics about how many reads matched each reference scaffold.
1418 	 */
writeStats()1419 	private void writeStats(){
1420 		if(outstats==null){return;}
1421 		final TextStreamWriter tsw=new TextStreamWriter(outstats, overwrite, false, false);
1422 		tsw.start();
1423 
1424 		long rsum=0, bsum=0;
1425 
1426 		/* Create StringCount list of scaffold names and hitcounts */
1427 		ArrayList<StringCount> list=new ArrayList<StringCount>();
1428 		for(int i=1; i<scaffoldNames.size(); i++){
1429 			final long num1=scaffoldReadCounts.get(i), num2=scaffoldBaseCounts.get(i);
1430 			if(num1>0 || !printNonZeroOnly){
1431 				rsum+=num1;
1432 				bsum+=num2;
1433 				final String s=scaffoldNames.get(i);
1434 				final int len=scaffoldLengths.get(i);
1435 				final StringCount sn=new StringCount(s, len, num1, num2);
1436 				list.add(sn);
1437 			}
1438 		}
1439 		Shared.sort(list);
1440 		final double rmult=100.0/(readsIn>0 ? readsIn : 1);
1441 		final double bmult=100.0/(basesIn>0 ? basesIn : 1);
1442 
1443 		tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n");
1444 
1445 		if(STATS_COLUMNS==3){
1446 			tsw.print(String.format(Locale.ROOT, "#Total\t%d\n",readsIn));
1447 			tsw.print(String.format(Locale.ROOT, "#Matched\t%d\t%.5f%%\n",rsum,rmult*rsum));
1448 			tsw.print("#Name\tReads\tReadsPct\n");
1449 			for(int i=0; i<list.size(); i++){
1450 				StringCount sn=list.get(i);
1451 				tsw.print(String.format(Locale.ROOT, "%s\t%d\t%.5f%%\n",sn.name,sn.reads,(sn.reads*rmult)));
1452 			}
1453 		}else{
1454 			tsw.print(String.format(Locale.ROOT, "#Total\t%d\t%d\n",readsIn,basesIn));
1455 			tsw.print(String.format(Locale.ROOT, "#Matched\t%d\t%.5f%%\n",rsum,rmult*rsum,bsum,bsum*bmult));
1456 			tsw.print("#Name\tReads\tReadsPct\tBases\tBasesPct\n");
1457 			for(int i=0; i<list.size(); i++){
1458 				StringCount sn=list.get(i);
1459 				tsw.print(String.format(Locale.ROOT, "%s\t%d\t%.5f%%\t%d\t%.5f%%\n",sn.name,sn.reads,(sn.reads*rmult),sn.bases,(sn.bases*bmult)));
1460 			}
1461 		}
1462 
1463 		tsw.poisonAndWait();
1464 	}
1465 
1466 	/**
1467 	 * Write RPKM statistics.
1468 	 */
writeRPKM()1469 	private void writeRPKM(){
1470 		if(outrpkm==null){return;}
1471 		final TextStreamWriter tsw=new TextStreamWriter(outrpkm, overwrite, false, false);
1472 		tsw.start();
1473 
1474 		/* Count mapped reads */
1475 		long mapped=0;
1476 		for(int i=0; i<scaffoldReadCounts.length(); i++){
1477 			mapped+=scaffoldReadCounts.get(i);
1478 		}
1479 
1480 		/* Print header */
1481 		tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n");
1482 		tsw.print(String.format(Locale.ROOT, "#Reads\t%d\n",readsIn));
1483 		tsw.print(String.format(Locale.ROOT, "#Mapped\t%d\n",mapped));
1484 		tsw.print(String.format(Locale.ROOT, "#RefSequences\t%d\n",Tools.max(0, scaffoldNames.size()-1)));
1485 		tsw.print("#Name\tLength\tBases\tCoverage\tReads\tRPKM\n");
1486 
1487 		final float mult=1000000000f/Tools.max(1, mapped);
1488 
1489 		/* Print data */
1490 		for(int i=1; i<scaffoldNames.size(); i++){
1491 			final long reads=scaffoldReadCounts.get(i);
1492 			final long bases=scaffoldBaseCounts.get(i);
1493 			final String s=scaffoldNames.get(i);
1494 			final int len=scaffoldLengths.get(i);
1495 			final double invlen=1.0/Tools.max(1, len);
1496 			final double mult2=mult*invlen;
1497 			if(reads>0 || !printNonZeroOnly){
1498 				tsw.print(String.format(Locale.ROOT, "%s\t%d\t%d\t%.4f\t%d\t%.4f\n",s,len,bases,bases*invlen,reads,reads*mult2));
1499 			}
1500 		}
1501 		tsw.poisonAndWait();
1502 	}
1503 
1504 	/**
1505 	 * Write statistics on a per-reference basis.
1506 	 */
writeRefStats()1507 	private void writeRefStats(){
1508 		if(outrefstats==null){return;}
1509 		final TextStreamWriter tsw=new TextStreamWriter(outrefstats, overwrite, false, false);
1510 		tsw.start();
1511 
1512 		/* Count mapped reads */
1513 		long mapped=0;
1514 		for(int i=0; i<scaffoldReadCounts.length(); i++){
1515 			mapped+=scaffoldReadCounts.get(i);
1516 		}
1517 
1518 		final int numRefs=refNames.size();
1519 		long[] refReadCounts=new long[numRefs];
1520 		long[] refBaseCounts=new long[numRefs];
1521 		long[] refLengths=new long[numRefs];
1522 
1523 		for(int r=0, s=1; r<numRefs; r++){
1524 			final int lim=s+refScafCounts[r];
1525 			while(s<lim){
1526 				refReadCounts[r]+=scaffoldReadCounts.get(s);
1527 				refBaseCounts[r]+=scaffoldBaseCounts.get(s);
1528 				refLengths[r]+=scaffoldLengths.get(s);
1529 				s++;
1530 			}
1531 		}
1532 
1533 		/* Print header */
1534 		tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n");
1535 		tsw.print(String.format(Locale.ROOT, "#Reads\t%d\n",readsIn));
1536 		tsw.print(String.format(Locale.ROOT, "#Mapped\t%d\n",mapped));
1537 		tsw.print(String.format(Locale.ROOT, "#References\t%d\n",Tools.max(0, refNames.size())));
1538 		tsw.print("#Name\tLength\tScaffolds\tBases\tCoverage\tReads\tRPKM\n");
1539 
1540 		final float mult=1000000000f/Tools.max(1, mapped);
1541 
1542 		/* Print data */
1543 		for(int i=0; i<refNames.size(); i++){
1544 			final long reads=refReadCounts[i];
1545 			final long bases=refBaseCounts[i];
1546 			final long len=refLengths[i];
1547 			final int scafs=refScafCounts[i];
1548 			final String name=ReadWrite.stripToCore(refNames.get(i));
1549 			final double invlen=1.0/Tools.max(1, len);
1550 			final double mult2=mult*invlen;
1551 			if(reads>0 || !printNonZeroOnly){
1552 				tsw.print(String.format(Locale.ROOT, "%s\t%d\t%d\t%d\t%.4f\t%d\t%.4f\n",name,len,scafs,bases,bases*invlen,reads,reads*mult2));
1553 			}
1554 		}
1555 		tsw.poisonAndWait();
1556 	}
1557 
1558 	/**
1559 	 * Write processing statistics in DUK's format.
1560 	 * @param time Elapsed time, nanoseconds
1561 	 */
writeDuk(long time)1562 	private void writeDuk(long time){
1563 		if(outduk==null){return;}
1564 		final TextStreamWriter tsw=new TextStreamWriter(outduk, overwrite, false, false);
1565 		tsw.start();
1566 		tsw.println(dukString(time));
1567 		tsw.poisonAndWait();
1568 	}
1569 
1570 	/**
1571 	 * Write RQCFilter stats.
1572 	 * @param time Elapsed time, nanoseconds
1573 	 */
writeRqc()1574 	private void writeRqc(){
1575 		if(outrqc==null){return;}
1576 		addToRqcMap();
1577 		if(outrqc.endsWith("hashmap")){return;}
1578 		final TextStreamWriter tsw=new TextStreamWriter(outrqc, overwrite, false, false);
1579 		tsw.start();
1580 		tsw.println(rqcString());
1581 		tsw.poisonAndWait();
1582 	}
1583 
rqcString()1584 	public static String rqcString(){
1585 		if(RQC_MAP==null){return null;}
1586 		StringBuilder sb=new StringBuilder();
1587 
1588 		String[] keys=new String[] {"inputReads", "inputBases", "qtrimmedReads", "qtrimmedBases", "qfilteredReads", "qfilteredBases",
1589 				"ktrimmedReads", "ktrimmedBases", "kfilteredReads", "kfilteredBases", "outputReads", "outputBases"};
1590 
1591 		for(String key : keys){
1592 			Object value=RQC_MAP.get(key);
1593 			if(value!=null){
1594 				sb.append(key+"="+value+"\n");
1595 			}
1596 		}
1597 
1598 		return sb.toString();
1599 	}
1600 
addToRqcMap()1601 	private void addToRqcMap(){
1602 		putRqc("inputReads", readsIn, false, false);
1603 		putRqc("inputBases", basesIn, false, false);
1604 		if(qtrimLeft || qtrimRight){
1605 			putRqc("qtrimmedReads", readsQTrimmed, false, true);
1606 			putRqc("qtrimmedBases", basesQTrimmed, false, true);
1607 		}
1608 		putRqc("qfilteredReads", readsQFiltered, false, true);
1609 		putRqc("qfilteredBases", basesQFiltered, false, true);
1610 
1611 		if(ktrimLeft || ktrimRight || ktrimN){
1612 			putRqc("ktrimmedReads", readsKTrimmed, true, true);
1613 			putRqc("ktrimmedBases", basesKTrimmed, true, true);
1614 		}else{
1615 			putRqc("kfilteredReads", readsKFiltered, false, true);
1616 			putRqc("kfilteredBases", basesKFiltered, false, true);
1617 		}
1618 		putRqc("outputReads", readsOut, true, false);
1619 		putRqc("outputBases", basesOut, true, false);
1620 	}
1621 
putRqc(String key, Long value, boolean evict, boolean add)1622 	public static void putRqc(String key, Long value, boolean evict, boolean add){
1623 		if(RQC_MAP==null){RQC_MAP=new HashMap<String,Long>();}
1624 		Long old=RQC_MAP.get(key);
1625 		if(evict || old==null){RQC_MAP.put(key, value);}
1626 		else if(add){RQC_MAP.put(key, value+old);}
1627 	}
1628 
1629 	/**
1630 	 * Helper method; formats statistics to be duk-compatible
1631 	 * @param time Elapsed time, nanoseconds
1632 	 * @return duk output string
1633 	 */
dukString(long time)1634 	private String dukString(long time){
1635 		StringBuilder sb=new StringBuilder();
1636 		sb.append("##INPUT PARAMETERS##\n");
1637 		sb.append("#Reference file:	"+(ref==null || ref.length<1 ? null : ref.length==1 ? ref[0] : Arrays.toString(ref))+"\n");
1638 		sb.append("#Query file:	"+in1+(in2==null ? "" : ","+in2)+"\n");
1639 		sb.append("#Not matched reads file:	"+out1+(out2==null ? "" : ","+out2)+"\n");
1640 		sb.append("#Matched reads file:	"+outb1+(outb2==null ? "" : ","+outb2)+"\n");
1641 		sb.append("#Output file (duk):	"+outduk+"\n");
1642 		sb.append("#Output file (stats):	"+outstats+"\n");
1643 		sb.append("#Mer size:	"+k+"\n");
1644 		long size=0;
1645 		for(AbstractKmerTable x : keySets){size+=x.size();}
1646 		sb.append("#Avg step size:	"+String.format(Locale.ROOT, "%.1f", refKmers/(double)(Tools.max(1, size)))+"\n");
1647 		sb.append("#Cut off:	"+maxBadKmers0+"\n");
1648 		sb.append("#Mask middle:	"+maskMiddle+"\n");
1649 		sb.append("#Quality trim:	"+((qtrimLeft || qtrimRight) ? trimq : "false")+"\n");
1650 		sb.append("\n");
1651 
1652 		sb.append("##REFERENCE STAT##\n");
1653 		sb.append("#Total Reads:	"+refReads+"\n");
1654 		sb.append("#Total Bases:	"+refBases+"\n");
1655 		sb.append("#Total kmers:	"+refKmers+"\n");
1656 		sb.append("#Total stored kmers:	"+size+"\n");
1657 		sb.append("\n");
1658 
1659 		sb.append("## ELAPSED TIME##\n");
1660 		sb.append("# Time:	"+String.format(Locale.ROOT, "%.2f", time/1000000000.0)+" seconds\n");
1661 		sb.append("\n");
1662 
1663 		sb.append("##QUERY FILE STAT##\n");
1664 		sb.append("# Total number of reads:	"+readsIn+"\n");
1665 		sb.append("# Total number of matched reads:	"+readsKFiltered+"\n");
1666 		sb.append("# Match ratio:	"+String.format(Locale.ROOT, "%.6f", readsKFiltered*1.0/readsIn)+"\n");
1667 		sb.append("\n");
1668 
1669 		sb.append("##P-VALUE##\n");
1670 		sb.append("#Avg number of Kmer for each read:	"+((basesIn/(Tools.max(readsIn, 1)))-k)+"\n");
1671 //		sb.append("# P value for the given threshold 1 is 4.05231e-14\n"); //duk prints a P value; not sure what it means
1672 		sb.append("\n");
1673 
1674 		sb.append("## Histogram of kmer occurance for reads with at least one occurance ##\n");
1675 		sb.append("#NumOcc\tNumReads\tPercentage\n");
1676 
1677 		long sum=Tools.sum(hitCounts);
1678 		double mult=100.0/(sum<1 ? 1 : sum);
1679 		for(int i=0; i<hitCounts.length; i++){
1680 			long x=hitCounts[i];
1681 			if(x>0){
1682 				sb.append(i).append('\t').append(x).append('\t').append(String.format(Locale.ROOT, "%.4f",(x*mult))).append('\n');
1683 			}
1684 		}
1685 
1686 		return sb.toString();
1687 	}
1688 
1689 	/**
1690 	 * Fills the scaffold names array with reference names.
1691 	 */
toRefNames()1692 	private void toRefNames(){
1693 		final int numRefs=refNames.size();
1694 		for(int r=0, s=1; r<numRefs; r++){
1695 			final int scafs=refScafCounts[r];
1696 			final int lim=s+scafs;
1697 			final String name=ReadWrite.stripToCore(refNames.get(r));
1698 //			outstream.println("r="+r+", s="+s+", scafs="+scafs+", lim="+lim+", name="+name);
1699 			while(s<lim){
1700 //				outstream.println(r+", "+s+". Setting "+scaffoldNames.get(s)+" -> "+name);
1701 				scaffoldNames.set(s, name);
1702 				s++;
1703 			}
1704 		}
1705 	}
1706 
getPolymerRatio()1707 	public double getPolymerRatio(){
1708 		return pTracker.calcRatioCumulative(polymerChar1, polymerChar2, polymerLength);
1709 	}
1710 
1711 
1712 	/*--------------------------------------------------------------*/
1713 	/*----------------         Inner Methods        ----------------*/
1714 	/*--------------------------------------------------------------*/
1715 
1716 
1717 	/**
1718 	 * Fills tables with kmers from references, using multiple LoadThread.
1719 	 * @return Number of kmers stored.
1720 	 */
spawnLoadThreads()1721 	private long spawnLoadThreads(){
1722 		Timer t=new Timer();
1723 		if((ref==null || ref.length<1) && (literal==null || literal.length<1)){return 0;}
1724 		long added=0;
1725 
1726 		/* Create load threads */
1727 		LoadThread[] loaders=new LoadThread[WAYS];
1728 		for(int i=0; i<loaders.length; i++){
1729 			loaders[i]=new LoadThread(i);
1730 			loaders[i].start();
1731 		}
1732 
1733 		/* For each reference file... */
1734 		int refNum=0;
1735 		if(ref!=null){
1736 			for(String refname : ref){
1737 
1738 				/* Start an input stream */
1739 				FileFormat ff=FileFormat.testInput(refname, FileFormat.FASTA, null, false, true);
1740 				ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1L, false, ff, null, null, null, Shared.USE_MPI, true);
1741 				cris.start(); //4567
1742 				ListNum<Read> ln=cris.nextList();
1743 				ArrayList<Read> reads=(ln!=null ? ln.list : null);
1744 
1745 				/* Iterate through read lists from the input stream */
1746 				while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
1747 					{
1748 						/* Assign a unique ID number to each scaffold */
1749 						ArrayList<Read> reads2=new ArrayList<Read>(reads);
1750 						for(Read r1 : reads2){
1751 							final Read r2=r1.mate;
1752 							final Integer id=scaffoldNames.size();
1753 							refScafCounts[refNum]++;
1754 							scaffoldNames.add(r1.id==null ? id.toString() : r1.id);
1755 							int len=r1.length();
1756 							r1.obj=id;
1757 							if(r2!=null){
1758 								r2.obj=id;
1759 								len+=r2.length();
1760 							}
1761 							scaffoldLengths.add(len);
1762 						}
1763 
1764 						if(REPLICATE_AMBIGUOUS){
1765 							reads2=Tools.replicateAmbiguous(reads2, Tools.min(k, mink));
1766 						}
1767 
1768 						/* Send a pointer to the read list to each LoadThread */
1769 						for(LoadThread lt : loaders){
1770 							boolean b=true;
1771 							while(b){
1772 								try {
1773 									lt.queue.put(reads2);
1774 									b=false;
1775 								} catch (InterruptedException e) {
1776 									//TODO:  This will hang due to still-running threads.
1777 									throw new RuntimeException(e);
1778 								}
1779 							}
1780 						}
1781 					}
1782 
1783 					/* Dispose of the old list and fetch a new one */
1784 					cris.returnList(ln);
1785 					ln=cris.nextList();
1786 					reads=(ln!=null ? ln.list : null);
1787 				}
1788 				/* Cleanup */
1789 				cris.returnList(ln);
1790 				errorState|=ReadWrite.closeStream(cris);
1791 				refNum++;
1792 			}
1793 		}
1794 
1795 		/* If there are literal sequences to use as references */
1796 		if(literal!=null){
1797 			ArrayList<Read> list=new ArrayList<Read>(literal.length);
1798 			if(verbose){outstream.println("Adding literals "+Arrays.toString(literal));}
1799 
1800 			/* Assign a unique ID number to each literal sequence */
1801 			for(int i=0; i<literal.length; i++){
1802 				final Integer id=scaffoldNames.size();
1803 				final Read r=new Read(literal[i].getBytes(), null, id);
1804 				refScafCounts[refNum]++;
1805 				scaffoldNames.add(id.toString());
1806 				scaffoldLengths.add(r.length());
1807 				r.obj=id;
1808 				list.add(r);
1809 			}
1810 
1811 			if(REPLICATE_AMBIGUOUS){
1812 				list=Tools.replicateAmbiguous(list, Tools.min(k, mink));
1813 			}
1814 
1815 			/* Send a pointer to the read list to each LoadThread */
1816 			for(LoadThread lt : loaders){
1817 				boolean b=true;
1818 				while(b){
1819 					try {
1820 						lt.queue.put(list);
1821 						b=false;
1822 					} catch (InterruptedException e) {
1823 						//TODO:  This will hang due to still-running threads.
1824 						throw new RuntimeException(e);
1825 					}
1826 				}
1827 			}
1828 		}
1829 
1830 		/* Signal loaders to terminate */
1831 		for(LoadThread lt : loaders){
1832 			boolean b=true;
1833 			while(b){
1834 				try {
1835 					lt.queue.put(POISON);
1836 					b=false;
1837 				} catch (InterruptedException e) {
1838 					//TODO:  This will hang due to still-running threads.
1839 					throw new RuntimeException(e);
1840 				}
1841 			}
1842 		}
1843 
1844 		/* Wait for loaders to die, and gather statistics */
1845 		boolean success=true;
1846 		for(LoadThread lt : loaders){
1847 			while(lt.getState()!=Thread.State.TERMINATED){
1848 				try {
1849 					lt.join();
1850 				} catch (InterruptedException e) {
1851 					// TODO Auto-generated catch block
1852 					e.printStackTrace();
1853 				}
1854 			}
1855 			added+=lt.addedT;
1856 			refKmers+=lt.refKmersT;
1857 			refBases+=lt.refBasesT;
1858 			refReads+=lt.refReadsT;
1859 //			modsum+=lt.modsumT;
1860 			success&=lt.success;
1861 		}
1862 		if(!success){KillSwitch.kill("Failed loading ref kmers; aborting.");}
1863 
1864 		//Correct statistics for number of threads, since each thread processes all reference data
1865 		refKmers/=WAYS;
1866 		refBases/=WAYS;
1867 		refReads/=WAYS;
1868 
1869 		scaffoldReadCounts=new AtomicLongArray(scaffoldNames.size());
1870 		scaffoldBaseCounts=new AtomicLongArray(scaffoldNames.size());
1871 
1872 		t.stop();
1873 		if(DISPLAY_PROGRESS && !json){
1874 			outstream.println("Added "+added+" kmers; time: \t"+t);
1875 			Shared.printMemory();
1876 			outstream.println();
1877 		}
1878 
1879 		if(verbose){
1880 			TextStreamWriter tsw=new TextStreamWriter("stdout", false, false, false, FileFormat.TEXT);
1881 			tsw.start();
1882 			for(AbstractKmerTable table : keySets){
1883 				table.dumpKmersAsText(tsw, k, 1, Integer.MAX_VALUE);
1884 			}
1885 			tsw.poisonAndWait();
1886 		}
1887 
1888 		return added;
1889 	}
1890 
1891 	/**
1892 	 * Match reads against reference kmers, using multiple ProcessThread.
1893 	 * @param t
1894 	 */
spawnProcessThreads(Timer t)1895 	private void spawnProcessThreads(Timer t){
1896 		t.start();
1897 
1898 		/* Create read input stream */
1899 		final ConcurrentReadInputStream cris;
1900 		final boolean paired;
1901 		{
1902 			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, ffin1.samOrBam(), ffin1, ffin2, qfin1, qfin2);
1903 			cris.setSampleRate(samplerate, sampleseed);
1904 			cris.start(); //4567
1905 			paired=cris.paired();
1906 			if(!ffin1.samOrBam() && !silent){
1907 				if(json){
1908 					jsonStats.add("paired", paired);
1909 				}else{
1910 					outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));
1911 				}
1912 			}
1913 		}
1914 
1915 		/* Create read output streams */
1916 		final ConcurrentReadOutputStream ros, rosb, ross;
1917 		{
1918 			final int buff=(!ordered ? 12 : Tools.max(32, 2*Shared.threads()));
1919 			if(out1!=null){
1920 				ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, true);
1921 				ros.start();
1922 			}else{ros=null;}
1923 			if(outb1!=null){
1924 				rosb=ConcurrentReadOutputStream.getStream(ffoutb1, ffoutb2, null, null, buff, null, true);
1925 				rosb.start();
1926 			}else{rosb=null;}
1927 			if(outsingle!=null){
1928 				ross=ConcurrentReadOutputStream.getStream(ffouts, null, null, null, buff, null, true);
1929 				ross.start();
1930 			}else{ross=null;}
1931 			if(ros!=null || rosb!=null || ross!=null){
1932 				t.stop();
1933 				if(!silent && !json){outstream.println("Started output streams:\t"+t);}
1934 				t.start();
1935 			}
1936 		}
1937 
1938 		/* Optionally skip the first reads, since initial reads may have lower quality */
1939 		if(skipreads>0){
1940 			long skipped=0;
1941 
1942 			ListNum<Read> ln=cris.nextList();
1943 			ArrayList<Read> reads=(ln!=null ? ln.list : null);
1944 
1945 			while(skipped<skipreads && reads!=null && reads.size()>0){
1946 				skipped+=reads.size();
1947 
1948 				if(rosb!=null){rosb.add(new ArrayList<Read>(1), ln.id);}
1949 				if(ros!=null){ros.add(new ArrayList<Read>(1), ln.id);}
1950 				if(ross!=null){ross.add(new ArrayList<Read>(1), ln.id);}
1951 
1952 				cris.returnList(ln);
1953 				ln=cris.nextList();
1954 				reads=(ln!=null ? ln.list : null);
1955 			}
1956 			cris.returnList(ln);//if added for compiler benefit
1957 			if(reads==null || reads.isEmpty()){
1958 				ReadWrite.closeStreams(cris, ros, rosb, ross);
1959 				outstream.println("Skipped all of the reads.");
1960 				System.exit(0);
1961 			}
1962 		}
1963 
1964 		/* Create ProcessThreads */
1965 		ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(THREADS);
1966 		for(int i=0; i<THREADS; i++){alpt.add(new ProcessThread(cris, ros, rosb, ross, ALLOW_LOCAL_ARRAYS));}
1967 		for(ProcessThread pt : alpt){pt.start();}
1968 
1969 		/* Wait for threads to die, and gather statistics */
1970 		for(ProcessThread pt : alpt){
1971 
1972 			/* Wait for a thread to die */
1973 			while(pt.getState()!=Thread.State.TERMINATED){
1974 				try {
1975 					pt.join();
1976 				} catch (InterruptedException e) {
1977 					e.printStackTrace();
1978 				}
1979 			}
1980 
1981 			/* Accumulate data from per-thread counters */
1982 			readsIn+=pt.readsInT;
1983 			basesIn+=pt.basesInT;
1984 			readsOut+=pt.readsOutuT;
1985 			basesOut+=pt.basesOutuT;
1986 			readsKFiltered+=pt.readsKFilteredT;
1987 			basesKFiltered+=pt.basesKFilteredT;
1988 			readsQTrimmed+=pt.readsQTrimmedT;
1989 			basesQTrimmed+=pt.basesQTrimmedT;
1990 			readsFTrimmed+=pt.readsFTrimmedT;
1991 			basesFTrimmed+=pt.basesFTrimmedT;
1992 			readsKTrimmed+=pt.readsKTrimmedT;
1993 			basesKTrimmed+=pt.basesKTrimmedT;
1994 			readsTrimmedBySwift+=pt.readsTrimmedBySwiftT;
1995 			basesTrimmedBySwift+=pt.basesTrimmedBySwiftT;
1996 			readsTrimmedByOverlap+=pt.readsTrimmedByOverlapT;
1997 			basesTrimmedByOverlap+=pt.basesTrimmedByOverlapT;
1998 			badGcReads+=pt.badGcReadsT;
1999 			badGcBases+=pt.badGcBasesT;
2000 			badHeaderReads+=pt.badHeaderReadsT;
2001 			badHeaderBases+=pt.badHeaderBasesT;
2002 			readsQFiltered+=pt.readsQFilteredT;
2003 			basesQFiltered+=pt.basesQFilteredT;
2004 			readsNFiltered+=pt.readsNFilteredT;
2005 			basesNFiltered+=pt.basesNFilteredT;
2006 			readsEFiltered+=pt.readsEFilteredT;
2007 			basesEFiltered+=pt.basesEFilteredT;
2008 			readsPolyTrimmed+=pt.readsPolyTrimmedT;
2009 			basesPolyTrimmed+=pt.basesPolyTrimmedT;
2010 
2011 			if(pTracker!=null){
2012 				pTracker.add(pt.pTrackerT);
2013 			}
2014 
2015 			if(hitCounts!=null){
2016 				for(int i=0; i<hitCounts.length; i++){hitCounts[i]+=pt.hitCountsT[i];}
2017 				pt.hitCountsT=null;
2018 			}
2019 			if(pt.scaffoldReadCountsT!=null && scaffoldReadCounts!=null){
2020 				for(int i=0; i<pt.scaffoldReadCountsT.length; i++){scaffoldReadCounts.addAndGet(i, pt.scaffoldReadCountsT[i]);}
2021 				pt.scaffoldReadCountsT=null;
2022 			}
2023 			if(pt.scaffoldBaseCountsT!=null && scaffoldBaseCounts!=null){
2024 				for(int i=0; i<pt.scaffoldBaseCountsT.length; i++){scaffoldBaseCounts.addAndGet(i, pt.scaffoldBaseCountsT[i]);}
2025 				pt.scaffoldBaseCountsT=null;
2026 			}
2027 			errorState|=(!pt.finishedSuccessfully);
2028 		}
2029 
2030 		/* Shut down I/O streams; capture error status */
2031 		{
2032 			//Prevent a spurious error message in the event of a race condition when maxReads is set.
2033 			boolean b=ReadWrite.closeStream(cris);
2034 			if(maxReads<1 || maxReads==Long.MAX_VALUE || (maxReads!=readsIn && maxReads*2!=readsIn && samplerate<1)){errorState|=b;}
2035 		}
2036 		errorState|=ReadWrite.closeOutputStreams(ros, rosb, ross);
2037 		errorState|=ReadStats.writeAll();
2038 
2039 		t.stop();
2040 		if(showSpeed && !json){
2041 			outstream.println("Processing time:   \t\t"+t);
2042 		}
2043 	}
2044 
2045 	/*--------------------------------------------------------------*/
2046 	/*----------------         Inner Classes        ----------------*/
2047 	/*--------------------------------------------------------------*/
2048 
2049 	/**
2050 	 * Loads kmers into a table.  Each thread handles all kmers X such that X%WAYS==tnum.
2051 	 */
2052 	private class LoadThread extends Thread{
2053 
LoadThread(final int tnum_)2054 		public LoadThread(final int tnum_){
2055 			tnum=tnum_;
2056 			map=keySets[tnum];
2057 		}
2058 
2059 		/**
2060 		 * Get the next list of reads (or scaffolds) from the queue.
2061 		 * @return List of reads
2062 		 */
fetch()2063 		private ArrayList<Read> fetch(){
2064 			ArrayList<Read> list=null;
2065 			while(list==null){
2066 				try {
2067 					list=queue.take();
2068 				} catch (InterruptedException e) {
2069 					// TODO Auto-generated catch block
2070 					e.printStackTrace();
2071 				}
2072 			}
2073 			return list;
2074 		}
2075 
2076 		@Override
run()2077 		public void run(){
2078 			ArrayList<Read> reads=fetch();
2079 			while(reads!=POISON){
2080 				for(Read r1 : reads){
2081 					assert(r1.pairnum()==0);
2082 					final Read r2=r1.mate;
2083 
2084 					final int rblen=(r1==null ? 0 : r1.length());
2085 					final int rblen2=r1.mateLength();
2086 
2087 					addedT+=addToMap(r1, rblen>20000000 ? k : rblen>5000000 ? 11 : rblen>500000 ? 2 : 0);
2088 					if(r2!=null){
2089 						addedT+=addToMap(r2, rblen2>20000000 ? k : rblen2>5000000 ? 11 : rblen2>500000 ? 2 : 0);
2090 					}
2091 				}
2092 				reads=fetch();
2093 			}
2094 
2095 			if(map.canRebalance() && map.size()>2L*map.arrayLength()){
2096 				map.rebalance();
2097 			}
2098 			success=true;
2099 		}
2100 
2101 		/**
2102 		 * Store the read's kmers in a table.
2103 		 * @param r The current read to process
2104 		 * @param skip Number of bases to skip between kmers
2105 		 * @return Number of kmers stored
2106 		 */
addToMap(Read r, int skip)2107 		private long addToMap(Read r, int skip){
2108 			skip=Tools.max(minSkip, Tools.min(maxSkip, skip));
2109 			final byte[] bases=r.bases;
2110 			long kmer=0;
2111 			long rkmer=0;
2112 			long added=0;
2113 			int len=0;
2114 
2115 			if(bases!=null){
2116 				refReadsT++;
2117 				refBasesT+=bases.length;
2118 			}
2119 			if(bases==null || bases.length<k){return 0;}
2120 
2121 			final int id=(Integer)r.obj;
2122 
2123 			if(skip>1){ //Process while skipping some kmers
2124 				for(int i=0; i<bases.length; i++){
2125 					byte b=bases[i];
2126 					long x=symbolToNumber0[b];
2127 					long x2=symbolToComplementNumber0[b];
2128 					kmer=((kmer<<bitsPerBase)|x)&mask;
2129 					rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask;
2130 					if(isFullyDefined(b)){len++;}else{len=0; rkmer=0;}
2131 					if(verbose){outstream.println("Scanning1 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
2132 					if(len>=k){
2133 						refKmersT++;
2134 						if(len%skip==0){
2135 							final long extraBase=(i>=bases.length-1 ? -1 : symbolToNumber[bases[i+1]]);
2136 							added+=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance);
2137 							if(useShortKmers){
2138 								if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);}
2139 								if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);}
2140 							}
2141 						}
2142 					}
2143 				}
2144 			}else{ //Process all kmers
2145 				for(int i=0; i<bases.length; i++){
2146 					final byte b=bases[i];
2147 					final long x=symbolToNumber0[b];
2148 					final long x2=symbolToComplementNumber0[b];
2149 //					assert(x!=x2) : x+", "+x2+", "+Character.toString((char)b)+"\n"+Arrays.toString(symbolToNumber0)+"\n"+Arrays.toString(symbolToComplementNumber);
2150 					kmer=((kmer<<bitsPerBase)|x)&mask;
2151 					//10000, 1111111111, 16, 16, 2, 10, 8
2152 					rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask;
2153 					if(isFullyDefined(b)){len++;}else{len=0; rkmer=0;}
2154 					if(verbose){
2155 						if(verbose){
2156 							String fwd=new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k));
2157 							String rev=AminoAcid.reverseComplementBases(fwd);
2158 							String fwd2=kmerToString(kmer, Tools.min(len, k));
2159 							outstream.println("fwd="+fwd+", fwd2="+fwd2+", rev="+rev+", kmer="+kmer+", rkmer="+rkmer);
2160 							outstream.println("b="+(char)b+", x="+x+", x2="+x2+", bitsPerBase="+bitsPerBase+", shift2="+shift2);
2161 							if(!amino){
2162 								assert(AminoAcid.stringToKmer(fwd)==kmer) : fwd+", "+AminoAcid.stringToKmer(fwd)+", "+kmer+", "+len;
2163 								if(len>=k){
2164 									assert(rcomp(kmer, Tools.min(len, k))==rkmer);
2165 									assert(rcomp(rkmer, Tools.min(len, k))==kmer);
2166 									assert(AminoAcid.kmerToString(kmer, Tools.min(len, k)).equals(fwd));
2167 									assert(AminoAcid.kmerToString(rkmer, Tools.min(len, k)).equals(rev)) : AminoAcid.kmerToString(rkmer, Tools.min(len, k))+" != "+rev+" (rkmer)";
2168 								}
2169 								assert(fwd.equalsIgnoreCase(fwd2)) : fwd+", "+fwd2; //may be unsafe
2170 							}
2171 							outstream.println("Scanning6 i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+", bases="+fwd+", rbases="+rev);
2172 						}
2173 					}
2174 					if(len>=k){
2175 //						assert(kmer==rcomp(rkmer, k)) : Long.toBinaryString(kmer)+", "+Long.toBinaryString(rkmer)+", "+Long.toBinaryString(mask)+", x="+x+", x2="+x2+", bits="+bitsPerBase+", s="+shift+", s2="+shift2+", b="+Character.toString((char)b);
2176 						refKmersT++;
2177 						final long extraBase=(i>=bases.length-1 ? -1 : symbolToNumber[bases[i+1]]);
2178 						final long atm=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance);
2179 						added+=atm;
2180 //						assert(false) : atm+", "+map.contains(toValue(kmer, rkmer, kmask));
2181 						if(useShortKmers){
2182 							if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);}
2183 							if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);}
2184 						}
2185 					}
2186 				}
2187 			}
2188 			return added;
2189 		}
2190 
2191 
2192 		/**
2193 		 * Adds short kmers on the left end of the read.
2194 		 * @param kmer Forward kmer
2195 		 * @param rkmer Reverse kmer
2196 		 * @param extraBase Base added to end in case of deletions
2197 		 * @param id Scaffold number
2198 		 * @return Number of kmers stored
2199 		 */
addToMapLeftShift(long kmer, long rkmer, final long extraBase, final int id)2200 		private long addToMapLeftShift(long kmer, long rkmer, final long extraBase, final int id){
2201 			if(verbose){outstream.println("addToMapLeftShift");}
2202 			long added=0;
2203 			for(int i=k-1; i>=mink; i--){
2204 				kmer=kmer&rightMasks[i];
2205 				rkmer=rkmer>>>bitsPerBase;
2206 				long x=addToMap(kmer, rkmer, i, extraBase, id, lengthMasks[i], hammingDistance2, editDistance2);
2207 				added+=x;
2208 				if(verbose){
2209 					if((toValue(kmer, rkmer, lengthMasks[i]))%WAYS==tnum){
2210 						outstream.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added left-shift kmer "+kmerToString(kmer&~lengthMasks[i], i)+"; value="+(toValue(kmer, rkmer, lengthMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+lengthMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]);
2211 						outstream.println("i="+i+"; tnum="+tnum+"; Looking for left-shift kmer "+kmerToString(kmer&~lengthMasks[i], i));
2212 						final long value=toValue(kmer, rkmer, lengthMasks[i]);
2213 						if(map.contains(value)){outstream.println("Found "+value);}
2214 					}
2215 				}
2216 			}
2217 			return added;
2218 		}
2219 
2220 
2221 		/**
2222 		 * Adds short kmers on the right end of the read.
2223 		 * @param kmer Forward kmer
2224 		 * @param rkmer Reverse kmer
2225 		 * @param id Scaffold number
2226 		 * @return Number of kmers stored
2227 		 */
addToMapRightShift(long kmer, long rkmer, final int id)2228 		private long addToMapRightShift(long kmer, long rkmer, final int id){
2229 			if(verbose){outstream.println("addToMapRightShift");}
2230 			long added=0;
2231 			for(int i=k-1; i>=mink; i--){
2232 				long extraBase=kmer&symbolMask;
2233 				kmer=kmer>>>bitsPerBase;
2234 				rkmer=rkmer&rightMasks[i];
2235 //				assert(Long.numberOfLeadingZeros(kmer)>=2*(32-i)) : Long.numberOfLeadingZeros(kmer)+", "+i+", "+kmer+", "+kMasks[i];
2236 //				assert(Long.numberOfLeadingZeros(rkmer)>=2*(32-i)) : Long.numberOfLeadingZeros(rkmer)+", "+i+", "+rkmer+", "+kMasks[i];
2237 				long x=addToMap(kmer, rkmer, i, extraBase, id, lengthMasks[i], hammingDistance2, editDistance2);
2238 				added+=x;
2239 				if(verbose){
2240 					if((toValue(kmer, rkmer, lengthMasks[i]))%WAYS==tnum){
2241 						outstream.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added right-shift kmer "+kmerToString(kmer&~lengthMasks[i], i)+"; value="+(toValue(kmer, rkmer, lengthMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+lengthMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]);
2242 						outstream.println("i="+i+"; tnum="+tnum+"; Looking for right-shift kmer "+kmerToString(kmer&~lengthMasks[i], i));
2243 						final long value=toValue(kmer, rkmer, lengthMasks[i]);
2244 						if(map.contains(value)){outstream.println("Found "+value);}
2245 					}
2246 				}
2247 			}
2248 			return added;
2249 		}
2250 
2251 
2252 		/**
2253 		 * Adds this kmer to the table, including any mutations implied by editDistance or hammingDistance.
2254 		 * @param kmer Forward kmer
2255 		 * @param rkmer Reverse kmer
2256 		 * @param len Kmer length
2257 		 * @param extraBase Base added to end in case of deletions
2258 		 * @param id Scaffold number
2259 		 * @param kmask0
2260 		 * @return Number of kmers stored
2261 		 */
addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0, final int hdist, final int edist)2262 		private long addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0, final int hdist, final int edist){
2263 
2264 			assert(kmask0==lengthMasks[len]) : kmask0+", "+len+", "+lengthMasks[len]+", "+Long.numberOfTrailingZeros(kmask0)+", "+Long.numberOfTrailingZeros(lengthMasks[len]);
2265 
2266 			if(verbose){outstream.println("addToMap_A; len="+len+"; kMasks[len]="+lengthMasks[len]);}
2267 			assert((kmer&kmask0)==0);
2268 			final long added;
2269 			if(hdist==0){
2270 				final long key=toValue(kmer, rkmer, kmask0);
2271 				if(verbose){outstream.println("toValue ("+kmerToString(kmer, len)+", "+kmerToString(rkmer, len)+") = "+kmerToString(key, len)+" = "+key);}
2272 				if(failsSpeed(key)){return 0;}
2273 				if(key%WAYS!=tnum){return 0;}
2274 				if(verbose){outstream.println("addToMap_B: "+kmerToString(key, len)+" ("+key+")");}
2275 				added=map.setIfNotPresent(key, id);
2276 			}else if(edist>0){
2277 //				long extraBase=(i>=bases.length-1 ? -1 : symbolToNumber2bases[i+1]]);
2278 				added=mutate(kmer, rkmer, len, id, edist, extraBase);
2279 			}else{
2280 				added=mutate(kmer, rkmer, len, id, hdist, -1);
2281 			}
2282 			if(verbose){outstream.println("addToMap added "+added+" keys.");}
2283 			return added;
2284 		}
2285 
2286 		/**
2287 		 * Mutate and store this kmer through 'dist' recursions.
2288 		 * @param kmer Forward kmer
2289 		 * @param rkmer Reverse kmer
2290 		 * @param id Scaffold number
2291 		 * @param dist Number of mutations
2292 		 * @param extraBase Base added to end in case of deletions
2293 		 * @return Number of kmers stored
2294 		 */
mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase)2295 		private long mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase){
2296 			long added=0;
2297 
2298 			final long key=toValue(kmer, rkmer, lengthMasks[len]);
2299 
2300 //			if(dist==1){System.err.println(".\t.\t"+kmerToString(kmer, k)+" initial.");}//123
2301 
2302 			if(verbose){outstream.println("mutate_A; len="+len+"; kmer="+kmer+"; rkmer="+rkmer+"; kMasks[len]="+lengthMasks[len]);}
2303 			if(key%WAYS==tnum){
2304 				if(verbose){outstream.println("mutate_B: "+kmerToString(kmer&~lengthMasks[len], len)+" = "+key);}
2305 				int x=map.setIfNotPresent(key, id);
2306 //				if(x>0){System.err.println(".\t.\t"+kmerToString(kmer, k)+" Added!");}//123
2307 				if(verbose){outstream.println("mutate_B added "+x+" keys.");}
2308 				added+=x;
2309 				assert(map.contains(key));
2310 			}
2311 
2312 			if(dist>0){
2313 				final int dist2=dist-1;
2314 
2315 				//Sub
2316 				for(int j=0; j<symbols; j++){
2317 					for(int i=0; i<len; i++){
2318 						final long temp=(kmer&clearMasks[i])|setMasks[j][i];
2319 //						System.err.println("cm:"+kmerToString(clearMasks[i], k));//123
2320 //						System.err.println("sm:"+kmerToString(setMasks[j][i], k));//123
2321 //						assert(Long.bitCount((temp^kmer))<=bitsPerBase) : //Warning: Slow assertion //123
2322 //							"\n"+kmerToString(kmer, k)+"\n"+kmerToString(temp, k)+"\n"+i+","+j+","+k;
2323 //						System.err.println(j+"\t"+i+"\t"+kmerToString(temp, k));//123
2324 						if(temp!=kmer){
2325 							long rtemp=rcomp(temp, len);
2326 							added+=mutate(temp, rtemp, len, id, dist2, extraBase);
2327 						}
2328 					}
2329 				}
2330 
2331 				if(editDistance>0){
2332 					//Del
2333 					if(extraBase>=0 && extraBase<=maxSymbol){
2334 						for(int i=1; i<len; i++){
2335 							final long temp=(kmer&leftMasks[i])|((kmer<<bitsPerBase)&rightMasks[i])|extraBase;
2336 							if(temp!=kmer){
2337 								long rtemp=rcomp(temp, len);
2338 								added+=mutate(temp, rtemp, len, id, dist2, -1);
2339 							}
2340 						}
2341 					}
2342 
2343 					//Ins
2344 					final long eb2=kmer&symbolMask;
2345 					for(int i=1; i<len; i++){
2346 						final long temp0=(kmer&leftMasks[i])|((kmer&rightMasks[i])>>bitsPerBase);
2347 						for(int j=0; j<symbols; j++){
2348 							final long temp=temp0|setMasks[j][i-1];
2349 							if(temp!=kmer){
2350 								long rtemp=rcomp(temp, len);
2351 								added+=mutate(temp, rtemp, len, id, dist2, eb2);
2352 							}
2353 						}
2354 					}
2355 				}
2356 
2357 			}
2358 
2359 //			if(dist==1){//123
2360 //				System.err.println("Added "+added);
2361 //				assert(false);
2362 //			}
2363 
2364 			return added;
2365 		}
2366 
2367 		/*--------------------------------------------------------------*/
2368 
2369 		/** Number of kmers stored by this thread */
2370 		public long addedT=0;
2371 		/** Number of items encountered by this thread */
2372 		public long refKmersT=0, refReadsT=0, refBasesT=0;
2373 		/** Thread number; used to determine which kmers to store */
2374 		public final int tnum;
2375 		/** Buffer of input read lists */
2376 		public final ArrayBlockingQueue<ArrayList<Read>> queue=new ArrayBlockingQueue<ArrayList<Read>>(32);
2377 //		/** Used to trick compiler */
2378 //		public long modsumT=0; //123
2379 
2380 		/** Destination for storing kmers */
2381 		private final AbstractKmerTable map;
2382 
2383 		/** Completed successfully */
2384 		boolean success=false;
2385 
2386 	}
2387 
2388 	/*--------------------------------------------------------------*/
2389 
2390 	/**
2391 	 * Matches read kmers against reference kmers, performs binning and/or trimming, and writes output.
2392 	 */
2393 	private class ProcessThread extends Thread{
2394 
2395 		/**
2396 		 * Constructor
2397 		 * @param cris_ Read input stream
2398 		 * @param ros_ Unmatched read output stream (optional)
2399 		 * @param rosb_ Matched read output stream (optional)
2400 		 * @param ross_ Singleton read output stream (optional)
2401 		 */
ProcessThread(ConcurrentReadInputStream cris_, ConcurrentReadOutputStream ros_, ConcurrentReadOutputStream rosb_, ConcurrentReadOutputStream ross_, boolean localArrays)2402 		public ProcessThread(ConcurrentReadInputStream cris_, ConcurrentReadOutputStream ros_, ConcurrentReadOutputStream rosb_, ConcurrentReadOutputStream ross_, boolean localArrays){
2403 			cris=cris_;
2404 			ros=ros_;
2405 			rosb=rosb_;
2406 			ross=ross_;
2407 
2408 			readstats=makeReadStats ? new ReadStats() : null;
2409 
2410 			final int alen=(scaffoldNames==null ? 0 : scaffoldNames.size());
2411 
2412 			if(findBestMatch){
2413 				countArray=new int[alen];
2414 				idList=new IntList();
2415 				countList=new IntList();
2416 			}else{
2417 				countArray=null;
2418 				idList=countList=null;
2419 			}
2420 
2421 			overlapVector=(trimByOverlap ? new int[5] : null);
2422 
2423 			hitCountsT=(hitCounts==null ? null : new long[hitCounts.length]);
2424 
2425 			if(localArrays && alen>0 && alen<10000 && scaffoldReadCounts!=null && scaffoldBaseCounts!=null){
2426 				scaffoldReadCountsT=new long[alen];
2427 				scaffoldBaseCountsT=new long[alen];
2428 			}else{
2429 				scaffoldReadCountsT=scaffoldBaseCountsT=null;
2430 			}
2431 
2432 			if(calcEntropy){
2433 				eTrackerT=new EntropyTracker(amino, Tools.max(0, entropyCutoff), entropyHighpass);
2434 			}else{
2435 				eTrackerT=null;
2436 			}
2437 
2438 			if(countPolymers){
2439 				pTrackerT=new PolymerTracker();
2440 			}else{
2441 				pTrackerT=null;
2442 			}
2443 
2444 			maxBasesOutmT=(maxBasesOutm>0 ? Tools.max(1, maxBasesOutm/THREADS) : -1);
2445 			maxBasesOutuT=(maxBasesOutu>0 ? Tools.max(1, maxBasesOutu/THREADS) : -1);
2446 
2447 			flowCoords=(locationFilter ? new FlowcellCoordinate() : null);
2448 		}
2449 
2450 		@Override
run()2451 		public void run(){
2452 			ListNum<Read> ln=cris.nextList();
2453 			ArrayList<Read> reads=(ln!=null ? ln.list : null);
2454 			ArrayList<Read> bad=(rosb==null ? null : new ArrayList<Read>(Shared.bufferLen()));
2455 			ArrayList<Read> single=new ArrayList<Read>(Shared.bufferLen());
2456 
2457 			final boolean ktrimrl=ktrimLeft || ktrimRight;
2458 			final boolean doKmerTrimming=storedKmers>0 && (ktrimLeft || ktrimRight || ktrimN || ksplit);
2459 			final boolean doKmerFiltering=storedKmers>0 && !doKmerTrimming;
2460 
2461 			//While there are more reads lists...
2462 			while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
2463 
2464 				int removed=0;
2465 
2466 				//For each read (or pair) in the list...
2467 				for(int i=0; i<reads.size(); i++){
2468 					final Read r1=reads.get(i);
2469 					if(pairedToSingle && r1.mate!=null){
2470 						r1.mate=null;
2471 					}
2472 					final Read r2=r1.mate;
2473 
2474 					if(!r1.validated()){r1.validate(true);}
2475 					if(r2!=null && !r2.validated()){r2.validate(true);}
2476 
2477 					boolean remove=false;
2478 					if(tossJunk){
2479 						if(r1!=null && r1.junk()){
2480 							setDiscarded(r1);
2481 							remove=true;
2482 						}
2483 						if(r2!=null && r2.junk()){
2484 							setDiscarded(r2);
2485 							remove=true;
2486 						}
2487 					}
2488 
2489 					if(isNotDiscarded(r1)){
2490 
2491 						if(histogramsBeforeProcessing){addToHistograms(r1, r2);}
2492 
2493 						if(loglogIn!=null){loglogIn.hash(r1);}
2494 
2495 					}
2496 
2497 					final int initialLength1=r1.length();
2498 					final int initialLength2=r1.mateLength();
2499 					final int initialPairLength=initialLength1+initialLength2;
2500 					final int pairCount=r1.pairCount();
2501 
2502 					final int minlen1=(int)Tools.max(initialLength1*minLenFraction, minReadLength);
2503 					final int minlen2=(int)Tools.max(initialLength2*minLenFraction, minReadLength);
2504 
2505 					if(verbose){outstream.println("Considering read "+r1.id+" "+new String(r1.bases));}
2506 
2507 					readsInT+=pairCount;
2508 					basesInT+=initialPairLength;
2509 
2510 					if(!remove){//due to being junk
2511 						if(chastityFilter){
2512 							if(r1!=null && r1.failsChastity()){
2513 								setDiscarded(r1);
2514 								if(r2!=null){setDiscarded(r2);}
2515 							}
2516 						}
2517 
2518 						if(locationFilter && isNotDiscarded(r1)){
2519 							flowCoords.setFrom(r1.id);
2520 							boolean discard=false;
2521 							if(xMinLoc>-1 && flowCoords.x<xMinLoc){discard=true;}
2522 							if(xMaxLoc>-1 && flowCoords.x>xMaxLoc){discard=true;}
2523 							if(yMinLoc>-1 && flowCoords.y<yMinLoc){discard=true;}
2524 							if(yMaxLoc>-1 && flowCoords.y>yMaxLoc){discard=true;}
2525 							if(discard){
2526 								setDiscarded(r1);
2527 								if(r2!=null){setDiscarded(r2);}
2528 							}
2529 						}
2530 
2531 						if(removeBadBarcodes){
2532 							if(isNotDiscarded(r1) && r1.failsBarcode(barcodes, failIfNoBarcode)){
2533 								if(failBadBarcodes){KillSwitch.kill("Invalid barcode detected: "+r1.id+"\nThis can be disabled with the flag barcodefilter=f");}
2534 								setDiscarded(r1);
2535 								if(r2!=null){setDiscarded(r2);}
2536 							}
2537 						}
2538 
2539 						if(isDiscarded(r1)){
2540 							badHeaderBasesT+=initialPairLength;
2541 							badHeaderReadsT+=pairCount;
2542 						}
2543 
2544 						if(recalibrateQuality){
2545 							if(isNotDiscarded(r1)){
2546 								CalcTrueQuality.recalibrate(r1);
2547 							}
2548 							if(isNotDiscarded(r2)){
2549 								CalcTrueQuality.recalibrate(r2);
2550 							}
2551 						}
2552 
2553 						if(filterGC && (initialLength1>0 || initialLength2>0)){
2554 							float gc1=(initialLength1>0 ? r1.gc() : -1);
2555 							float gc2=(initialLength2>0 ? r2.gc() : gc1);
2556 							if(gc1==-1){gc1=gc2;}
2557 							if(usePairGC){
2558 								final float gc;
2559 								if(r2==null){
2560 									gc=gc1;
2561 								}else{
2562 									gc=(gc1*initialLength1+gc2*initialLength2)/(initialPairLength);
2563 								}
2564 								gc1=gc2=gc;
2565 							}
2566 							if(isNotDiscarded(r1) && (gc1<minGC || gc1>maxGC)){
2567 								setDiscarded(r1);
2568 								badGcBasesT+=initialLength1;
2569 								badGcReadsT++;
2570 							}
2571 							if(isNotDiscarded(r2) && (gc2<minGC || gc2>maxGC)){
2572 								setDiscarded(r2);
2573 								badGcBasesT+=initialLength2;
2574 								badGcReadsT++;
2575 							}
2576 						}
2577 
2578 						if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){
2579 							if(isNotDiscarded(r1)){
2580 								final int len=r1.length();
2581 								final int a=forceTrimLeft>0 ? forceTrimLeft : 0;
2582 								final int b0=forceTrimModulo>0 ? len-1-len%forceTrimModulo : len;
2583 								final int b1=forceTrimRight>0 ? forceTrimRight : len;
2584 								final int b2=forceTrimRight2>0 ? len-1-forceTrimRight2 : len;
2585 								final int b=Tools.min(b0, b1, b2);
2586 								final int x=TrimRead.trimToPosition(r1, a, b, 1);
2587 								basesFTrimmedT+=x;
2588 								readsFTrimmedT+=(x>0 ? 1 : 0);
2589 								if(r1.length()<minlen1){setDiscarded(r1);}
2590 							}
2591 							if(isNotDiscarded(r2)){
2592 								final int len=r2.length();
2593 								final int a=forceTrimLeft>0 ? forceTrimLeft : 0;
2594 								final int b0=forceTrimModulo>0 ? len-1-len%forceTrimModulo : len;
2595 								final int b1=forceTrimRight>0 ? forceTrimRight : len;
2596 								final int b2=forceTrimRight2>0 ? len-1-forceTrimRight2 : len;
2597 								final int b=Tools.min(b0, b1, b2);
2598 								final int x=TrimRead.trimToPosition(r2, a, b, 1);
2599 								basesFTrimmedT+=x;
2600 								readsFTrimmedT+=(x>0 ? 1 : 0);
2601 								if(r2.length()<minlen2){setDiscarded(r2);}
2602 							}
2603 						}
2604 
2605 						if(filterVars){
2606 							if(isNotDiscarded(r1)){
2607 								boolean b=passesVariantFilter(r1);
2608 								if(!b){setDiscarded(r1);}
2609 							}
2610 							if(isNotDiscarded(r2)){
2611 								boolean b=passesVariantFilter(r2);
2612 								if(!b){setDiscarded(r2);}
2613 							}
2614 						}
2615 
2616 						if(isNotDiscarded(r1) && r1.length()<minlen1){setDiscarded(r1);}
2617 						if(isNotDiscarded(r2) && r2.length()<minlen2){setDiscarded(r2);}
2618 
2619 						if(removePairsIfEitherBad){remove=isDiscarded(r1) || isDiscarded(r2);}
2620 						else{remove=isDiscarded(r1) && isNullOrDiscarded(r2);}
2621 					}
2622 
2623 					if(remove){
2624 						if(r1!=null){
2625 							basesQFilteredT+=initialLength1;
2626 							readsQFilteredT++;
2627 						}
2628 						if(r2!=null){
2629 							basesQFilteredT+=initialLength2;
2630 							readsQFilteredT++;
2631 						}
2632 						if(bad!=null){bad.add(r1);}
2633 					}else{
2634 
2635 						if(ecc && r1!=null && r2!=null){BBMerge.findOverlapStrict(r1, r2, true);}
2636 
2637 						//Process kmers
2638 						if(doKmerTrimming){
2639 
2640 							int rlen1=0, rlen2=0;
2641 							int xsum=0;
2642 							int rktsum=0;
2643 
2644 							if(ktrimrl){
2645 								if(r1!=null){
2646 									int x=ktrim(r1, keySets);
2647 									xsum+=x;
2648 									rktsum+=(x>0 ? 1 : 0);
2649 									rlen1=r1.length();
2650 									if(rlen1<minlen1){setDiscarded(r1);}
2651 								}
2652 								if(r2!=null){
2653 									int x=ktrim(r2, keySets);
2654 									xsum+=x;
2655 									rktsum+=(x>0 ? 1 : 0);
2656 									rlen2=r2.length();
2657 									if(rlen2<minlen2){setDiscarded(r2);}
2658 								}
2659 							}else if(ktrimN){
2660 								if(r1!=null){
2661 									int x=kmask(r1, keySets);
2662 									xsum+=x;
2663 									rktsum+=(x>0 ? 1 : 0);
2664 									rlen1=r1.length();
2665 									if(rlen1<minlen1){setDiscarded(r1);}
2666 								}
2667 								if(r2!=null){
2668 									int x=kmask(r2, keySets);
2669 									xsum+=x;
2670 									rktsum+=(x>0 ? 1 : 0);
2671 									rlen2=r2.length();
2672 									if(rlen2<minlen2){setDiscarded(r2);}
2673 								}
2674 							}else if(ksplit){
2675 								assert(r2==null);
2676 								if(r1!=null){
2677 									int oldLen=r1.pairLength();
2678 									boolean b=ksplit(r1, keySets);
2679 									int trimmed=oldLen-r1.pairLength();
2680 									xsum+=trimmed;
2681 									rktsum+=(trimmed>0 ? 1 : 0);
2682 									rlen1=r1.length();
2683 //									if(rlen1<minlen1){setDiscarded(r1);}
2684 //									if(b && r1.mateLength()<minlen1){setDiscarded(r1);}
2685 								}
2686 							}
2687 
2688 							if(ksplit){
2689 								remove=(r1.mate!=null);
2690 								if(remove && addTrimmedToBad && bad!=null){bad.add(r1);}
2691 							}else if(shouldRemove(r1, r2)){
2692 								if(!ktrimN){
2693 									xsum+=(rlen1+rlen2);
2694 									rktsum=pairCount;
2695 								}
2696 								remove=true;
2697 								if(addTrimmedToBad && bad!=null){bad.add(r1);}
2698 							}else if(ktrimRight && trimPairsEvenly && xsum>0 && r2!=null && r1.length()!=r2.length()){
2699 								int x;
2700 								if(r1.length()>r2.length()){
2701 									x=TrimRead.trimToPosition(r1, 0, r2.length()-1, 1);
2702 								}else{
2703 									x=TrimRead.trimToPosition(r2, 0, r1.length()-1, 1);
2704 								}
2705 								if(rktsum<2){rktsum++;}
2706 								xsum+=x;
2707 								assert(r1.length()==r2.length()) : r1.length()+", "+r2.length();
2708 							}
2709 							basesKTrimmedT+=xsum;
2710 							readsKTrimmedT+=rktsum;
2711 
2712 						}else if(doKmerFiltering){
2713 							//Do kmer matching
2714 
2715 							if(minCoveredFraction>0){
2716 								if(isNotDiscarded(r1)){
2717 									final int minCoveredBases=(int)Math.ceil(minCoveredFraction*r1.length());
2718 									final int covered=countCoveredBases(r1, keySets, minCoveredBases);
2719 									if(covered>=minCoveredBases){setDiscarded(r1);}
2720 								}
2721 								if(isNotDiscarded(r2)){
2722 									final int minCoveredBases=(int)Math.ceil(minCoveredFraction*r2.length());
2723 									final int covered=countCoveredBases(r2, keySets, minCoveredBases);
2724 									if(covered>=minCoveredBases){setDiscarded(r2);}
2725 								}
2726 							}else{
2727 
2728 								final int maxBadKmersR1, maxBadKmersR2;
2729 								if(minKmerFraction==0){
2730 									maxBadKmersR1=maxBadKmersR2=maxBadKmers0;
2731 								}else{
2732 									final int vk1=r1.numValidKmers(keff), vk2=(r2==null ? 0 : r2.numValidKmers(keff));
2733 									maxBadKmersR1=Tools.max(maxBadKmers0, (int)((vk1-1)*minKmerFraction));
2734 									maxBadKmersR2=Tools.max(maxBadKmers0, (int)((vk2-1)*minKmerFraction));
2735 								}
2736 
2737 								if(!findBestMatch){
2738 									final int a=(kbig<=k ? countSetKmers(r1, keySets, maxBadKmersR1) : countSetKmersBig(r1, keySets, maxBadKmersR1));
2739 									final int b=(kbig<=k ? countSetKmers(r2, keySets, maxBadKmersR2) : countSetKmersBig(r2, keySets, maxBadKmersR2));
2740 
2741 									if(r1!=null && a>maxBadKmersR1){setDiscarded(r1);}
2742 									if(r2!=null && b>maxBadKmersR2){setDiscarded(r2);}
2743 
2744 								}else{
2745 									final int a=findBestMatch(r1, keySets, maxBadKmersR1);
2746 									final int b=findBestMatch(r2, keySets, maxBadKmersR2);
2747 
2748 									if(r1!=null && a>0){setDiscarded(r1);}
2749 									if(r2!=null && b>0){setDiscarded(r2);}
2750 								}
2751 							}
2752 
2753 							if(shouldRemove(r1, r2)){
2754 								remove=true;
2755 								if(r1!=null){
2756 									readsKFilteredT++;
2757 									basesKFilteredT+=initialLength1;
2758 								}
2759 								if(r2!=null){
2760 									readsKFilteredT++;
2761 									basesKFilteredT+=initialLength2;
2762 								}
2763 								if(bad!=null){bad.add(r1);}
2764 							}
2765 
2766 						}
2767 					}
2768 
2769 //					assert(false) : remove+", "+trimByOverlap+", "+(r2!=null);
2770 
2771 					if(!remove && trimByOverlap && r2!=null && expectedErrors(r1, r2)<meeFilter){
2772 
2773 						if(aprob==null || aprob.length<r1.length()){aprob=new float[r1.length()];}
2774 						if(bprob==null || bprob.length<r2.length()){bprob=new float[r2.length()];}
2775 
2776 						//Do overlap trimming
2777 						r2.reverseComplement();
2778 //						int bestInsert=BBMergeOverlapper.mateByOverlap(r1, r2, aprob, bprob, overlapVector, minOverlap0, minOverlap,
2779 //								overlapMargin, overlapMaxMismatches0, overlapMaxMismatches, overlapMinq);
2780 						int bestInsert=BBMergeOverlapper.mateByOverlapRatio(r1, r2, aprob, bprob, overlapVector, minOverlap0, minOverlap,
2781 								minInsert0, minInsert, maxRatio, 0.12f, ratioMargin, ratioOffset, 0.95f, 0.95f, useQualityForOverlap);
2782 
2783 						if(bestInsert<minInsert){bestInsert=-1;}
2784 						boolean ambig=(overlapVector[4]==1);
2785 						final int bestBad=overlapVector[2];
2786 
2787 						if(bestInsert>0 && !ambig && r1.quality!=null && r2.quality!=null && useQualityForOverlap){
2788 							if(efilterRatio>0 && bestInsert>0 && !ambig){
2789 								float bestExpected=BBMergeOverlapper.expectedMismatches(r1, r2, bestInsert);
2790 								if((bestExpected+efilterOffset)*efilterRatio<bestBad){ambig=true;}
2791 							}
2792 							if(pfilterRatio>0 && bestInsert>0 && !ambig){
2793 								float probability=BBMergeOverlapper.probability(r1, r2, bestInsert);
2794 								if(probability<pfilterRatio){bestInsert=-1;}
2795 							}
2796 							if(meeFilter>=0 && bestInsert>0 && !ambig){
2797 								float expected=BBMergeOverlapper.expectedMismatches(r1, r2, bestInsert);
2798 								if(expected>meeFilter){bestInsert=-1;}
2799 							}
2800 						}
2801 
2802 						r2.reverseComplement();
2803 
2804 						if(bestInsert>0 && !ambig){
2805 							if(bestInsert<r1.length()){
2806 								if(verbose){outstream.println("Overlap right trimming r1 to "+0+", "+(bestInsert-1));}
2807 								int x=TrimRead.trimToPosition(r1, 0, bestInsert-1, 1);
2808 								if(verbose){outstream.println("Trimmed "+x+" bases: "+new String(r1.bases));}
2809 								readsTrimmedByOverlapT++;
2810 								basesTrimmedByOverlapT+=x;
2811 							}
2812 							if(bestInsert<r2.length()){
2813 								if(verbose){outstream.println("Overlap right trimming r2 to "+0+", "+(bestInsert-1));}
2814 								int x=TrimRead.trimToPosition(r2, 0, bestInsert-1, 1);
2815 								if(verbose){outstream.println("Trimmed "+x+" bases: "+new String(r2.bases));}
2816 								readsTrimmedByOverlapT++;
2817 								basesTrimmedByOverlapT+=x;
2818 							}
2819 						}
2820 					}
2821 
2822 					if(!remove && swift){
2823 						//Do Swift trimming
2824 
2825 						int rlen1=0, rlen2=0;
2826 						if(r1!=null){
2827 							int x=trimSwift(r1);
2828 							basesTrimmedBySwiftT+=x;
2829 							readsTrimmedBySwiftT+=(x>0 ? 1 : 0);
2830 							rlen1=r1.length();
2831 							if(rlen1<minlen1){setDiscarded(r1);}
2832 						}
2833 						if(r2!=null){
2834 							int x=trimSwift(r2);
2835 							basesTrimmedBySwiftT+=x;
2836 							readsTrimmedBySwiftT+=(x>0 ? 1 : 0);
2837 							rlen2=r2.length();
2838 							if(rlen2<minlen2){setDiscarded(r2);}
2839 						}
2840 
2841 						//Discard reads if too short
2842 						if(shouldRemove(r1, r2)){
2843 							basesTrimmedBySwiftT+=r1.pairLength();
2844 							remove=true;
2845 							if(addTrimmedToBad && bad!=null){bad.add(r1);}
2846 						}
2847 					}
2848 
2849 					if(!remove && trimPolyA>0){
2850 						//Do poly-A trimming
2851 
2852 						int rlen1=0, rlen2=0;
2853 						if(r1!=null){
2854 							int x=trimPolyA(r1, trimPolyA);
2855 							basesPolyTrimmedT+=x;
2856 							readsPolyTrimmedT+=(x>0 ? 1 : 0);
2857 							rlen1=r1.length();
2858 							if(rlen1<minlen1){setDiscarded(r1);}
2859 						}
2860 						if(r2!=null){
2861 							int x=trimPolyA(r2, trimPolyA);
2862 							basesPolyTrimmedT+=x;
2863 							readsPolyTrimmedT+=(x>0 ? 1 : 0);
2864 							rlen2=r2.length();
2865 							if(rlen2<minlen2){setDiscarded(r2);}
2866 						}
2867 
2868 						//Discard reads if too short
2869 						if(shouldRemove(r1, r2)){
2870 							basesPolyTrimmedT+=r1.pairLength();
2871 							remove=true;
2872 							if(addTrimmedToBad && bad!=null){bad.add(r1);}
2873 						}
2874 					}
2875 
2876 					if(!remove && (trimPolyGLeft>0 || trimPolyGRight>0 || filterPolyG>0)){
2877 						//Do poly-G trimming
2878 
2879 						int rlen1=0, rlen2=0;
2880 						if(r1!=null){
2881 							if(filterPolyG>0 && r1.countLeft('G')>=filterPolyG){
2882 								setDiscarded(r1);
2883 								readsPolyTrimmedT++;
2884 							}else if(trimPolyGLeft>0 || trimPolyGRight>0){
2885 								int x=trimPoly(r1, trimPolyGLeft, trimPolyGRight, (byte)'G');
2886 								basesPolyTrimmedT+=x;
2887 								readsPolyTrimmedT+=(x>0 ? 1 : 0);
2888 								rlen1=r1.length();
2889 								if(rlen1<minlen1){setDiscarded(r1);}
2890 							}
2891 						}
2892 						if(r2!=null){
2893 							if(filterPolyG>0 && r2.countLeft('G')>=filterPolyG){
2894 								setDiscarded(r1);
2895 								readsPolyTrimmedT++;
2896 							}else if(trimPolyGLeft>0 || trimPolyGRight>0){
2897 								int x=trimPoly(r2, trimPolyGLeft, trimPolyGRight, (byte)'G');
2898 								basesPolyTrimmedT+=x;
2899 								readsPolyTrimmedT+=(x>0 ? 1 : 0);
2900 								rlen2=r2.length();
2901 								if(rlen2<minlen2){setDiscarded(r2);}
2902 							}
2903 						}
2904 
2905 						//Discard reads if too short
2906 						if(shouldRemove(r1, r2)){
2907 							basesPolyTrimmedT+=r1.pairLength();
2908 							remove=true;
2909 							if(addTrimmedToBad && bad!=null){bad.add(r1);}
2910 						}
2911 					}
2912 
2913 					if(!remove && (trimPolyCLeft>0 || trimPolyCRight>0 || filterPolyC>0)){
2914 						//Do poly-C trimming
2915 
2916 						int rlen1=0, rlen2=0;
2917 						if(r1!=null){
2918 							if(filterPolyC>0 && r1.countLeft('G')>=filterPolyC){
2919 								setDiscarded(r1);
2920 								readsPolyTrimmedT++;
2921 							}else if(trimPolyCLeft>0 || trimPolyCRight>0){
2922 								int x=trimPoly(r1, trimPolyCLeft, trimPolyCRight, (byte)'C');
2923 								basesPolyTrimmedT+=x;
2924 								readsPolyTrimmedT+=(x>0 ? 1 : 0);
2925 								rlen1=r1.length();
2926 								if(rlen1<minlen1){setDiscarded(r1);}
2927 							}
2928 						}
2929 						if(r2!=null){
2930 							if(filterPolyC>0 && r2.countLeft('G')>=filterPolyC){
2931 								setDiscarded(r1);
2932 								readsPolyTrimmedT++;
2933 							}else if(trimPolyCLeft>0 || trimPolyCRight>0){
2934 								int x=trimPoly(r2, trimPolyCLeft, trimPolyCRight, (byte)'C');
2935 								basesPolyTrimmedT+=x;
2936 								readsPolyTrimmedT+=(x>0 ? 1 : 0);
2937 								rlen2=r2.length();
2938 								if(rlen2<minlen2){setDiscarded(r2);}
2939 							}
2940 						}
2941 
2942 						//Discard reads if too short
2943 						if(shouldRemove(r1, r2)){
2944 							basesPolyTrimmedT+=r1.pairLength();
2945 							remove=true;
2946 							if(addTrimmedToBad && bad!=null){bad.add(r1);}
2947 						}
2948 					}
2949 
2950 					if(!remove && (entropyMask || entropyTrim>0)){
2951 						//Mask/trim entropy
2952 						if(isNotDiscarded(r1)){
2953 							int masked=(entropyTrim>0 ? trimLowEntropy(r1, null, eTrackerT) : maskLowEntropy(r1, null, eTrackerT));
2954 							basesEFilteredT+=masked;
2955 							readsEFilteredT+=(masked>0 ? 1 : 0);
2956 						}
2957 						if(isNotDiscarded(r2)){
2958 							int masked=(entropyTrim>0 ? trimLowEntropy(r2, null, eTrackerT) : maskLowEntropy(r2, null, eTrackerT));
2959 							basesEFilteredT+=masked;
2960 							readsEFilteredT+=(masked>0 ? 1 : 0);
2961 						}
2962 					}
2963 
2964 					if(entropyMark){
2965 						markLowEntropy(r1, eTrackerT);
2966 						markLowEntropy(r2, eTrackerT);
2967 					}
2968 
2969 					if(!remove){
2970 						//Do quality trimming
2971 
2972 						if(qtrimLeft || qtrimRight || trimClip){
2973 							if(r1!=null){
2974 								int x=TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq, trimE, 1, trimClip);
2975 								basesQTrimmedT+=x;
2976 								readsQTrimmedT+=(x>0 ? 1 : 0);
2977 
2978 //								assert(false) : trimClip+", "+x;
2979 							}
2980 							if(r2!=null){
2981 								int x=TrimRead.trimFast(r2, qtrimLeft, qtrimRight, trimq, trimE, 1, trimClip);
2982 								basesQTrimmedT+=x;
2983 								readsQTrimmedT+=(x>0 ? 1 : 0);
2984 							}
2985 						}
2986 
2987 						if(isNotDiscarded(r1)){
2988 							int len=r1.length();
2989 							if(len<minlen1 || len>maxReadLength){setDiscarded(r1);}
2990 						}
2991 						if(isNotDiscarded(r2)){
2992 							int len=r2.length();
2993 							if(len<minlen2 || len>maxReadLength){setDiscarded(r2);}
2994 						}
2995 
2996 						//Discard reads if too short
2997 						if(shouldRemove(r1, r2)){
2998 							basesQTrimmedT+=r1.pairLength();
2999 							remove=true;
3000 							if(addTrimmedToBad && bad!=null){bad.add(r1);}
3001 						}
3002 
3003 					}
3004 
3005 					if(!remove){
3006 						//Do quality filtering
3007 
3008 						//Determine whether to discard the reads based on average quality
3009 						if(minAvgQuality>0){
3010 							if(r1!=null && r1.quality!=null && r1.avgQuality(false, minAvgQualityBases)<minAvgQuality){setDiscarded(r1);}
3011 							if(r2!=null && r2.quality!=null && r2.avgQuality(false, minAvgQualityBases)<minAvgQuality){setDiscarded(r2);}
3012 						}
3013 						//Determine whether to discard the reads based on lowest quality base
3014 						if(minBaseQuality>0){
3015 							if(r1!=null && r1.quality!=null && r1.minQuality()<minBaseQuality){setDiscarded(r1);}
3016 							if(r2!=null && r2.quality!=null && r2.minQuality()<minBaseQuality){setDiscarded(r2);}
3017 						}
3018 						//Determine whether to discard the reads based on the presence of Ns
3019 						if(maxNs>=0){
3020 							if(r1!=null && r1.countUndefined()>maxNs){
3021 								readsNFilteredT++;
3022 								basesNFilteredT+=r1.length();
3023 								setDiscarded(r1);
3024 							}
3025 							if(r2!=null && r2.countUndefined()>maxNs){
3026 								readsNFilteredT++;
3027 								basesNFilteredT+=r2.length();
3028 								setDiscarded(r2);
3029 							}
3030 						}
3031 						//Determine whether to discard the reads based on a lack of useful kmers
3032 						if(minConsecutiveBases>0){
3033 							if(isNotDiscarded(r1) && !r1.hasMinConsecutiveBases(minConsecutiveBases)){setDiscarded(r1);}
3034 							if(isNotDiscarded(r2) && !r2.hasMinConsecutiveBases(minConsecutiveBases)){setDiscarded(r2);}
3035 						}
3036 						//Determine whether to discard the reads based on minimum base frequency
3037 						if(minBaseFrequency>0){
3038 							if(r1!=null && r1.minBaseCount()<minBaseFrequency*r1.length()){setDiscarded(r1);}
3039 							if(r2!=null && r2.minBaseCount()<minBaseFrequency*r2.length()){setDiscarded(r2);}
3040 						}
3041 
3042 						//Discard reads if too short
3043 						if(shouldRemove(r1, r2)){
3044 							basesQFilteredT+=r1.pairLength();
3045 							readsQFilteredT+=pairCount;
3046 							remove=true;
3047 							if(addTrimmedToBad && bad!=null){bad.add(r1);}
3048 						}
3049 					}
3050 
3051 					if(!remove && calcEntropy && entropyCutoff>=0 && !entropyMask && entropyTrim<1){
3052 						//Test entropy
3053 						if(isNotDiscarded(r1) && !eTrackerT.passes(r1.bases, true)){setDiscarded(r1);}
3054 						if(isNotDiscarded(r2) && !eTrackerT.passes(r2.bases, true)){setDiscarded(r2);}
3055 
3056 						if(shouldRemove(r1, r2)){
3057 							basesEFilteredT+=r1.pairLength();
3058 							readsEFilteredT+=pairCount;
3059 							remove=true;
3060 							if(bad!=null){bad.add(r1);}
3061 						}
3062 					}
3063 
3064 					if(!remove && !histogramsBeforeProcessing){
3065 						addToHistograms(r1, r2);
3066 					}
3067 
3068 					if(ross!=null){
3069 						if(isNotDiscarded(r1) && isNullOrDiscarded(r2)){
3070 							Read clone=r1.clone();
3071 							clone.mate=null;
3072 							single.add(clone);
3073 						}else if(r2!=null && isDiscarded(r1) && isNotDiscarded(r2)){
3074 							Read clone=r2.clone();
3075 							clone.mate=null;
3076 							single.add(clone);
3077 						}
3078 					}
3079 
3080 					if(remove && !trimFailuresTo1bp){
3081 						//Evict read
3082 						removed++;
3083 						if(r2!=null){removed++;}
3084 						reads.set(i, null);
3085 
3086 						readsOutmT+=pairCount;
3087 						basesOutmT+=r1.pairLength();
3088 					}else{
3089 						if(loglogOut!=null){loglogOut.hash(r1);}
3090 						readsOutuT+=pairCount;
3091 						basesOutuT+=r1.pairLength();
3092 					}
3093 				}
3094 
3095 				//Send matched list to matched output stream
3096 				if(rosb!=null){
3097 					rosb.add(bad, ln.id);
3098 					bad.clear();
3099 				}
3100 
3101 				//Send unmatched list to unmatched output stream
3102 				if(ros!=null){
3103 					ros.add((removed>0 ? Tools.condenseNew(reads) : reads), ln.id); //Creates a new list if old one became empty, to prevent shutting down the cris.
3104 				}
3105 
3106 				if(ross!=null){
3107 					ross.add(single, ln.id);
3108 					single.clear();
3109 				}
3110 
3111 				if(maxBasesOutmT>=0 && basesOutmT>=maxBasesOutmT){break;}
3112 				if(maxBasesOutuT>=0 && basesOutuT>=maxBasesOutuT){break;}
3113 
3114 				//Fetch a new read list
3115 				cris.returnList(ln);
3116 				ln=cris.nextList();
3117 				reads=(ln!=null ? ln.list : null);
3118 			}
3119 			cris.returnList(ln);
3120 			finishedSuccessfully=true;
3121 		}
3122 
setDiscarded(Read r)3123 		private void setDiscarded(Read r){
3124 			if(trimFailuresTo1bp){
3125 				if(r.length()>1){TrimRead.trimByAmount(r, 0, r.length()-1, 1, false);}
3126 			}else{
3127 				r.setDiscarded(true);
3128 			}
3129 		}
3130 
isDiscarded(Read r)3131 		private boolean isDiscarded(Read r){
3132 			if(r==null){return false;}
3133 			if(r.discarded()){return true;}
3134 			return trimFailuresTo1bp && r.length()==1;
3135 		}
3136 
isNullOrDiscarded(Read r)3137 		private boolean isNullOrDiscarded(Read r){
3138 			if(r==null){return true;}
3139 			if(r.discarded()){return true;}
3140 			return trimFailuresTo1bp && r.length()==1;
3141 		}
3142 
isNotDiscarded(Read r)3143 		private boolean isNotDiscarded(Read r){
3144 			if(r==null){return false;}
3145 			if(r.discarded()){return false;}
3146 			return !(trimFailuresTo1bp && r.length()==1);
3147 		}
3148 
shouldRemove(Read r1, Read r2)3149 		private boolean shouldRemove(Read r1, Read r2){
3150 			return (removePairsIfEitherBad && (isDiscarded(r1) || isDiscarded(r2))) ||
3151 					(isDiscarded(r1) && isNullOrDiscarded(r2));
3152 		}
3153 
3154 		/*--------------------------------------------------------------*/
3155 		/*----------------        Helper Methods        ----------------*/
3156 		/*--------------------------------------------------------------*/
3157 
addToHistograms(Read r1, Read r2)3158 		private void addToHistograms(Read r1, Read r2) {
3159 
3160 			if(pTrackerT!=null){
3161 				pTrackerT.addPair(r1);
3162 			}
3163 
3164 			if(fixVariants){
3165 				CallVariants.fixVars(r1, varMap, scafMap);
3166 				CallVariants.fixVars(r2, varMap, scafMap);
3167 			}
3168 
3169 			if(readstats!=null){
3170 				if(MAKE_QUALITY_HISTOGRAM){readstats.addToQualityHistogram(r1);}
3171 				if(MAKE_BASE_HISTOGRAM){readstats.addToBaseHistogram(r1);}
3172 				if(MAKE_MATCH_HISTOGRAM){readstats.addToMatchHistogram(r1);}
3173 				if(MAKE_QUALITY_ACCURACY){readstats.addToQualityAccuracy(r1);}
3174 
3175 				if(MAKE_EHIST){readstats.addToErrorHistogram(r1);}
3176 				if(MAKE_INDELHIST){readstats.addToIndelHistogram(r1);}
3177 				if(MAKE_LHIST){readstats.addToLengthHistogram(r1);}
3178 				if(MAKE_GCHIST){readstats.addToGCHistogram(r1);}
3179 				if(MAKE_ENTROPYHIST){readstats.addToEntropyHistogram(r1);}
3180 				if(MAKE_IDHIST){readstats.addToIdentityHistogram(r1);}
3181 
3182 				if(MAKE_IHIST){
3183 					SamLine sl1=r1.samline;
3184 					if(sl1!=null && !r1.secondary() && sl1.pairnum()==0){
3185 						readstats.addToInsertHistogram(sl1);
3186 					}
3187 				}
3188 			}
3189 
3190 			if(fixVariants && unfixVariants){
3191 				CallVariants.unfixVars(r1);
3192 				CallVariants.unfixVars(r2);
3193 			}
3194 		}
3195 
3196 		/**
3197 		 * Transforms a kmer into all canonical values for a given Hamming distance.
3198 		 * Returns the related id stored in the tables.
3199 		 * @param kmer Forward kmer
3200 		 * @param rkmer Reverse kmer
3201 		 * @param lengthMask Bitmask with single '1' set to left of kmer
3202 		 * @param qPos Position of kmer in query
3203 		 * @param len kmer length
3204 		 * @param qHDist Hamming distance
3205 		 * @param sets Kmer hash tables
3206 		 * @return Value stored in table, or -1
3207 		 */
getValue(final long kmer, final long rkmer, final long lengthMask, final int qPos, final int len, final int qHDist, final AbstractKmerTable[] sets)3208 		private final int getValue(final long kmer, final long rkmer, final long lengthMask, final int qPos, final int len, final int qHDist, final AbstractKmerTable[] sets){
3209 			assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer;
3210 			if(verbose){outstream.println("getValue()");}
3211 			int id=getValueInner(kmer, rkmer, lengthMask, len, qPos, sets);
3212 			if(id<1 && qHDist>0){
3213 				final int qHDist2=qHDist-1;
3214 
3215 				//Sub
3216 				for(int j=0; j<symbols && id<1; j++){
3217 					for(int i=0; i<len && id<1; i++){
3218 						final long temp=(kmer&clearMasks[i])|setMasks[j][i];
3219 //						outstream.println(i+", "+j+", "+setMasks[j][i]+", "+qHDist);
3220 						if(temp!=kmer){
3221 							long rtemp=rcomp(temp, len);
3222 //							assert(lengthMask==0 || (temp<lengthMask && rtemp<lengthMask)) : lengthMask+", "+temp+", "+rtemp+", "+kmer+", "+rkmer+
3223 //							"\n"+len+", "+Long.numberOfTrailingZeros(lengthMask)+"\n"+
3224 //									Long.toBinaryString(lengthMask|0x8000000000000000L)+"\n"+
3225 //											Long.toBinaryString(temp|0x8000000000000000L)+"\n"+
3226 //													Long.toBinaryString(rtemp|0x8000000000000000L);
3227 							id=getValue(temp, rtemp, lengthMask, qPos, len, qHDist2, sets);
3228 						}
3229 					}
3230 				}
3231 			}
3232 			return id;
3233 		}
3234 
3235 		/**
3236 		 * Transforms a kmer into a canonical value stored in the table and search.
3237 		 * @param kmer Forward kmer
3238 		 * @param rkmer Reverse kmer
3239 		 * @param lengthMask Bitmask with single '1' set to left of kmer
3240 		 * @param qPos Position of kmer in query
3241 		 * @param sets Kmer hash tables
3242 		 * @return Value stored in table
3243 		 */
getValueInner(final long kmer, final long rkmer, final long lengthMask, final int len, final int qPos, final AbstractKmerTable[] sets)3244 		private final int getValueInner(final long kmer, final long rkmer, final long lengthMask, final int len, final int qPos, final AbstractKmerTable[] sets){
3245 			assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer;
3246 			if(qSkip>1 && (qPos%qSkip!=0)){return -1;}
3247 
3248 			if(verbose){
3249 				outstream.println("getValueInner(kmer="+AminoAcid.kmerToString(kmer, len)+", rkmer="+AminoAcid.kmerToString(rkmer, len)+", len="+len+", mask="+lengthMask+")");
3250 				outstream.println("getValueInner(kmer="+kmer+", rkmer="+rkmer+", len="+len+", mask="+lengthMask+")");
3251 			}
3252 			final long max=(rcomp ? Tools.max(kmer, rkmer) : kmer);
3253 			if(verbose){outstream.println("max="+AminoAcid.kmerToString(max, len)+" ("+max+")");}
3254 			final long key=(max&middleMask)|lengthMask;
3255 			if(verbose){outstream.println("key="+AminoAcid.kmerToString(key, len)+" ("+key+")");}
3256 			if(passesSpeed(key)){
3257 				if(verbose){outstream.println("Testing key "+kmerToString(key, len)+" ("+key+")");}
3258 				AbstractKmerTable set=sets[(int)(key%WAYS)];
3259 				final int id=set.getValue(key);
3260 				if(verbose){outstream.println("getValueInner("+kmerToString(kmer, len)+", "+kmerToString(rkmer, len)+") > "+kmerToString(key, len)+" ("+key+") = "+id);}
3261 				return id;
3262 			}
3263 			if(verbose){outstream.println("Invalid key.");}
3264 			return -1;
3265 		}
3266 
3267 
3268 		/**
3269 		 * Counts the number of kmer hits for a read.
3270 		 * @param r Read to process
3271 		 * @param sets Kmer tables
3272 		 * @return Number of hits
3273 		 */
countSetKmers(final Read r, final AbstractKmerTable[] sets, final int maxBadKmers)3274 		private final int countSetKmers(final Read r, final AbstractKmerTable[] sets, final int maxBadKmers){
3275 			if(r==null || r.length()<k || storedKmers<1){return 0;}
3276 			if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
3277 			final byte[] bases=r.bases;
3278 			long kmer=0;
3279 			long rkmer=0;
3280 			int found=0;
3281 			int len=0;
3282 
3283 			final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
3284 			final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
3285 
3286 			/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
3287 			for(int i=start; i<stop; i++){
3288 				byte b=bases[i];
3289 				long x=symbolToNumber0[b];
3290 				long x2=symbolToComplementNumber0[b];
3291 				kmer=((kmer<<bitsPerBase)|x)&mask;
3292 				rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask;
3293 				if(forbidNs && !isFullyDefined(b)){len=0; rkmer=0;}else{len++;}
3294 				if(verbose){
3295 					String fwd=new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k));
3296 					String rev=AminoAcid.reverseComplementBases(fwd);
3297 					String fwd2=kmerToString(kmer, Tools.min(len, k));
3298 					outstream.println("fwd="+fwd+", fwd2="+fwd2+", rev="+rev+", kmer="+kmer+", rkmer="+rkmer);
3299 					outstream.println("b="+(char)b+", x="+x+", x2="+x2+", bitsPerBase="+bitsPerBase+", shift2="+shift2);
3300 					if(!amino){
3301 						assert(AminoAcid.stringToKmer(fwd)==kmer) : fwd+", "+AminoAcid.stringToKmer(fwd)+", "+kmer+", "+len;
3302 						if(len>=k){
3303 							assert(rcomp(kmer, Tools.min(len, k))==rkmer);
3304 							assert(rcomp(rkmer, Tools.min(len, k))==kmer);
3305 							assert(AminoAcid.kmerToString(kmer, Tools.min(len, k)).equals(fwd));
3306 							assert(AminoAcid.kmerToString(rkmer, Tools.min(len, k)).equals(rev)) : AminoAcid.kmerToString(rkmer, Tools.min(len, k))+" != "+rev+" (rkmer)";
3307 						}
3308 						assert(fwd.equalsIgnoreCase(fwd2)) : fwd+", "+fwd2; //may be unsafe
3309 					}
3310 					outstream.println("Scanning6 i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+", bases="+fwd+", rbases="+rev);
3311 				}
3312 				if(len>=minlen2 && i>=minlen){
3313 					final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
3314 					if(verbose){outstream.println("Testing kmer "+kmer+"; id="+id);}
3315 					if(id>0){
3316 						if(verbose){outstream.println("Found = "+(found+1)+"/"+maxBadKmers);}
3317 						if(found==maxBadKmers){
3318 							if(scaffoldReadCountsT!=null){
3319 								scaffoldReadCountsT[id]++;
3320 								scaffoldBaseCountsT[id]+=bases.length;
3321 							}else{
3322 								scaffoldReadCounts.addAndGet(id, 1);
3323 								scaffoldBaseCounts.addAndGet(id, bases.length);
3324 							}
3325 							if(hitCounts==null){
3326 								return (found=found+1);
3327 							}//Early exit, but prevents generation of histogram that goes over maxBadKmers+1.
3328 						}
3329 						found++;
3330 					}
3331 				}
3332 			}
3333 
3334 			if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;}
3335 			return found;
3336 		}
3337 
3338 
3339 		/**
3340 		 * Counts the number of kmer hits for a read.
3341 		 * @param r Read to process
3342 		 * @param sets Kmer tables
3343 		 * @return Number of hits
3344 		 */
countCoveredBases(final Read r, final AbstractKmerTable[] sets, final int minCoveredBases)3345 		private final int countCoveredBases(final Read r, final AbstractKmerTable[] sets, final int minCoveredBases){
3346 			if(r==null || r.length()<k || storedKmers<1){return 0;}
3347 			if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
3348 			final byte[] bases=r.bases;
3349 			long kmer=0;
3350 			long rkmer=0;
3351 			int found=0;
3352 			int len=0;
3353 			int lastFound=-1;
3354 			boolean recorded=false;
3355 
3356 			final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
3357 			final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
3358 
3359 			/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
3360 			for(int i=start; i<stop; i++){
3361 				byte b=bases[i];
3362 				long x=symbolToNumber0[b];
3363 				long x2=symbolToComplementNumber0[b];
3364 				kmer=((kmer<<bitsPerBase)|x)&mask;
3365 				rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask;
3366 				if(forbidNs && !isFullyDefined(b)){len=0; rkmer=0;}else{len++;}
3367 				if(verbose){outstream.println("Scanning6b i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
3368 				if(len>=minlen2 && i>=minlen){
3369 					final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
3370 					if(verbose){outstream.println("Testing kmer "+kmer+"; id="+id);}
3371 					if(id>0){
3372 
3373 						int extra=Tools.min(k, i-lastFound);
3374 						found+=extra;
3375 						lastFound=i;
3376 
3377 						if(verbose){outstream.println("Found = "+found+"/"+minCoveredBases);}
3378 						if(found>=minCoveredBases){
3379 							if(!recorded){
3380 								if(scaffoldReadCountsT!=null){
3381 									scaffoldReadCountsT[id]++;
3382 									scaffoldBaseCountsT[id]+=bases.length;
3383 								}else{
3384 									scaffoldReadCounts.addAndGet(id, 1);
3385 									scaffoldBaseCounts.addAndGet(id, bases.length);
3386 								}
3387 							}
3388 							if(hitCounts==null){
3389 								return found;
3390 							}
3391 						}
3392 					}
3393 				}
3394 			}
3395 
3396 			if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;}
3397 			return found;
3398 		}
3399 
3400 		/**
3401 		 * Returns the id of the sequence with the most kmer matches to this read, or -1 if none are over maxBadKmers.
3402 		 * @param r Read to process
3403 		 * @param sets Kmer tables
3404 		 * @return id of best match
3405 		 */
findBestMatch(final Read r, final AbstractKmerTable[] sets, final int maxBadKmers)3406 		private final int findBestMatch(final Read r, final AbstractKmerTable[] sets, final int maxBadKmers){
3407 			idList.size=0;
3408 			if(r==null || r.length()<k || storedKmers<1){return -1;}
3409 			if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return -1;}
3410 			final byte[] bases=r.bases;
3411 			long kmer=0;
3412 			long rkmer=0;
3413 			int len=0;
3414 			int found=0;
3415 
3416 			final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
3417 			final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
3418 
3419 			/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
3420 			for(int i=start; i<stop; i++){
3421 				byte b=bases[i];
3422 				long x=symbolToNumber0[b];
3423 				long x2=symbolToComplementNumber0[b];
3424 				kmer=((kmer<<bitsPerBase)|x)&mask;
3425 				rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask;
3426 				if(forbidNs && !isFullyDefined(b)){len=0; rkmer=0;}else{len++;}
3427 				if(verbose){outstream.println("Scanning6 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
3428 				if(len>=minlen2 && i>=minlen){
3429 					if(verbose){outstream.println("Lookup kmer="+AminoAcid.kmerToString(kmer, k)+", rkmer="+AminoAcid.kmerToString(rkmer, k));}
3430 					final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
3431 					if(id>0){
3432 						countArray[id]++;
3433 						if(countArray[id]==1){idList.add(id);}
3434 						found++;
3435 						if(verbose){outstream.println("Found = "+found+"/"+maxBadKmers);}
3436 					}
3437 				}
3438 			}
3439 
3440 			final int id, max;
3441 			if(found>maxBadKmers){
3442 				max=condenseLoose(countArray, idList, countList);
3443 				int id0=-1;
3444 				for(int i=0; i<countList.size; i++){
3445 					if(countList.get(i)==max){
3446 						id0=idList.get(i); break;
3447 					}
3448 				}
3449 				if(rename){rename(r, idList, countList);}
3450 				id=id0;
3451 			}else{
3452 				max=0;
3453 				id=-1;
3454 			}
3455 
3456 			if(found>maxBadKmers){
3457 				if(scaffoldReadCountsT!=null){
3458 					scaffoldReadCountsT[id]++;
3459 					scaffoldBaseCountsT[id]+=bases.length;
3460 				}else{
3461 					scaffoldReadCounts.addAndGet(id, 1);
3462 					scaffoldBaseCounts.addAndGet(id, bases.length);
3463 				}
3464 			}
3465 
3466 			if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;}
3467 			return id;
3468 		}
3469 
3470 		/** Estimates kmer hit counts for kmers longer than k using consecutive matches
3471 		 * @param r
3472 		 * @param sets
3473 		 * @return Number of sets of consecutive hits of exactly length kbig
3474 		 */
countSetKmersBig(final Read r, final AbstractKmerTable[] sets, final int maxBadKmers)3475 		private final int countSetKmersBig(final Read r, final AbstractKmerTable[] sets, final int maxBadKmers){
3476 			if(r==null || r.length()<kbig || storedKmers<1){return 0;}
3477 			if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
3478 			assert(kbig>k);
3479 			final int sub=kbig-k-1;
3480 			assert(sub>=0) : kbig+", "+sub;
3481 			final byte[] bases=r.bases;
3482 			long kmer=0;
3483 			long rkmer=0;
3484 			int found=0;
3485 			int len=0;
3486 
3487 			int bkStart=-1;
3488 			int bkStop=-1;
3489 			int id=-1, lastId=-1;
3490 
3491 			final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
3492 			final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
3493 
3494 			/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
3495 			for(int i=start; i<stop; i++){
3496 				byte b=bases[i];
3497 				long x=symbolToNumber0[b];
3498 				long x2=symbolToComplementNumber0[b];
3499 				kmer=((kmer<<bitsPerBase)|x)&mask;
3500 				rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask;
3501 				if(forbidNs && !isFullyDefined(b)){len=0; rkmer=0;}else{len++;}
3502 				if(verbose){outstream.println("Scanning7 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
3503 				if(len>=minlen2 && i>=minlen){
3504 					id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
3505 					if(verbose){outstream.println("Testing kmer "+kmer+"; id="+id);}
3506 					if(id>0){
3507 						lastId=id;
3508 						if(bkStart==-1){bkStart=i;}
3509 						bkStop=i;
3510 					}else{
3511 						if(bkStart>-1){
3512 							int dif=bkStop-bkStart-sub;
3513 							bkStop=bkStart=-1;
3514 							if(dif>0){
3515 								int old=found;
3516 								found+=dif;
3517 								if(found>maxBadKmers && old<=maxBadKmers){
3518 									if(scaffoldReadCountsT!=null){
3519 										scaffoldReadCountsT[lastId]++;
3520 										scaffoldBaseCountsT[lastId]+=bases.length;
3521 									}else{
3522 										scaffoldReadCounts.addAndGet(lastId, 1);
3523 										scaffoldBaseCounts.addAndGet(lastId, bases.length);
3524 									}
3525 									if(hitCounts==null){
3526 										return found;
3527 									}//Early exit, but prevents generation of histogram that goes over maxBadKmers+1.
3528 								}
3529 							}
3530 						}
3531 					}
3532 				}
3533 			}
3534 
3535 			// This catches the case where valid kmers extend to the end of the read
3536 			if(bkStart>-1){
3537 				int dif=bkStop-bkStart-sub;
3538 				bkStop=bkStart=-1;
3539 				if(dif>0){
3540 					int old=found;
3541 					found+=dif;
3542 					if(found>maxBadKmers && old<=maxBadKmers){
3543 						if(scaffoldReadCountsT!=null){
3544 							scaffoldReadCountsT[lastId]++;
3545 							scaffoldBaseCountsT[lastId]+=bases.length;
3546 						}else{
3547 							scaffoldReadCounts.addAndGet(lastId, 1);
3548 							scaffoldBaseCounts.addAndGet(lastId, bases.length);
3549 						}
3550 					}
3551 				}
3552 			}
3553 
3554 			if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;}
3555 			return found;
3556 		}
3557 
3558 		/**
3559 		 * Trim a read to remove matching kmers and everything to their left or right.
3560 		 * @param r Read to process
3561 		 * @param sets Kmer tables
3562 		 * @return Number of bases trimmed
3563 		 */
ktrim(final Read r, final AbstractKmerTable[] sets)3564 		private final int ktrim(final Read r, final AbstractKmerTable[] sets){
3565 			assert(ktrimLeft || ktrimRight);
3566 			if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k)) || storedKmers<1){return 0;}
3567 			if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
3568 			if(verbose){outstream.println("KTrimming read "+r.id);}
3569 			final byte[] bases=r.bases, quals=r.quality;
3570 			long kmer=0;
3571 			long rkmer=0;
3572 			int found=0;
3573 			int len=0;
3574 			int id0=-1; //ID of first kmer found.
3575 
3576 			int minLoc=999999999, minLocExclusive=999999999;
3577 			int maxLoc=-1, maxLocExclusive=-1;
3578 			final int initialLength=r.length();
3579 
3580 			final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
3581 			final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
3582 
3583 			//Scan for normal kmers
3584 			for(int i=start; i<stop; i++){
3585 				byte b=bases[i];
3586 				long x=symbolToNumber0[b];
3587 				long x2=symbolToComplementNumber0[b];
3588 				kmer=((kmer<<bitsPerBase)|x)&mask;
3589 				rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask;
3590 				if(forbidNs && !isFullyDefined(b)){len=0; rkmer=0;}else{len++;}
3591 				if(verbose){outstream.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
3592 				if(len>=minlen2 && i>=minlen){
3593 					final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
3594 					if(id>0){
3595 						if(id0<0){id0=id;}
3596 						minLoc=Tools.min(minLoc, i-k+1);
3597 						assert(minLoc>=0);
3598 						maxLoc=i;
3599 						found++;
3600 					}
3601 				}
3602 			}
3603 
3604 			if(minLoc!=minLocExclusive){minLocExclusive=minLoc+k;}
3605 			if(maxLoc!=maxLocExclusive){maxLocExclusive=maxLoc-k;}
3606 
3607 			//If nothing was found, scan for short kmers.  Only used for trimming.
3608 			if(useShortKmers && found==0){
3609 				assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+", mink="+mink;
3610 
3611 				//Look for short kmers on left side
3612 				if(ktrimLeft){
3613 					kmer=0;
3614 					rkmer=0;
3615 					len=0;
3616 					final int lim=Tools.min(k, stop);
3617 					for(int i=start; i<lim; i++){
3618 						byte b=bases[i];
3619 						long x=symbolToNumber0[b];
3620 						long x2=symbolToComplementNumber0[b];
3621 						kmer=((kmer<<bitsPerBase)|x)&mask;
3622 						rkmer=rkmer|(x2<<(bitsPerBase*len));
3623 						len++;
3624 						if(verbose){outstream.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
3625 						if(len>=mink){
3626 
3627 							if(verbose){
3628 								outstream.println("Looking for left kmer  "+kmerToString(kmer, len));
3629 								outstream.println("Looking for left rkmer "+kmerToString(rkmer, len));
3630 							}
3631 
3632 							final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
3633 							if(id>0){
3634 								if(id0<0){id0=id;}
3635 								if(verbose){outstream.println("Found "+kmer);}
3636 								minLoc=0;
3637 								minLocExclusive=Tools.min(minLocExclusive, i+1);
3638 								maxLoc=Tools.max(maxLoc, i);
3639 								maxLocExclusive=Tools.max(maxLocExclusive, 0);
3640 								found++;
3641 							}
3642 						}
3643 					}
3644 				}
3645 
3646 				//Look for short kmers on right side
3647 				if(ktrimRight){
3648 					kmer=0;
3649 					rkmer=0;
3650 					len=0;
3651 					final int lim=Tools.max(-1, stop-k);
3652 					for(int i=stop-1; i>lim; i--){
3653 						byte b=bases[i];
3654 						long x=symbolToNumber0[b];
3655 						long x2=symbolToComplementNumber0[b];
3656 						kmer=kmer|(x<<(bitsPerBase*len));
3657 						rkmer=((rkmer<<bitsPerBase)|x2)&mask;
3658 						len++;
3659 						if(verbose){outstream.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
3660 						if(len>=mink){
3661 							if(verbose){
3662 								outstream.println("Looking for right kmer "+
3663 										AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; value="+toValue(kmer, rkmer, lengthMasks[len])+"; kmask="+lengthMasks[len]);
3664 							}
3665 							final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
3666 							if(id>0){
3667 								if(id0<0){id0=id;}
3668 								if(verbose){outstream.println("Found "+kmer);}
3669 								minLoc=i;
3670 								minLocExclusive=Tools.min(minLocExclusive, bases.length);
3671 								maxLoc=bases.length-1;
3672 								maxLocExclusive=Tools.max(maxLocExclusive, i-1);
3673 								found++;
3674 							}
3675 						}
3676 					}
3677 				}
3678 			}
3679 
3680 
3681 			if(verbose){outstream.println("found="+found+", minLoc="+minLoc+", maxLoc="+maxLoc+", minLocExclusive="+minLocExclusive+", maxLocExclusive="+maxLocExclusive);}
3682 
3683 			if(found==0){return 0;}
3684 			assert(found>0) : "Overflow in 'found' variable.";
3685 
3686 			{//Increment counter for the scaffold whose kmer was first detected
3687 				if(scaffoldReadCountsT!=null){
3688 					scaffoldReadCountsT[id0]++;
3689 					scaffoldBaseCountsT[id0]+=bases.length;
3690 				}else{
3691 					scaffoldReadCounts.addAndGet(id0, 1);
3692 					scaffoldBaseCounts.addAndGet(id0, bases.length);
3693 				}
3694 			}
3695 
3696 			if(trimPad!=0){
3697 				maxLoc=Tools.mid(0, maxLoc+trimPad, bases.length);
3698 				minLoc=Tools.mid(0, minLoc-trimPad, bases.length);
3699 				maxLocExclusive=Tools.mid(0, maxLocExclusive+trimPad, bases.length);
3700 				minLocExclusive=Tools.mid(0, minLocExclusive-trimPad, bases.length);
3701 			}
3702 
3703 			if(ktrimLeft){ //Trim from the read start to the rightmost kmer base
3704 				if(verbose){outstream.println("Left trimming to "+(ktrimExclusive ? maxLocExclusive+1 : maxLoc+1)+", "+0);}
3705 				int x=TrimRead.trimToPosition(r, ktrimExclusive ? maxLocExclusive+1 : maxLoc+1, bases.length-1, 1);
3706 				if(verbose){outstream.println("Trimmed "+x+" bases: "+new String(r.bases));}
3707 				return x;
3708 			}else{ //Trim from the leftmost kmer base to the read stop
3709 				assert(ktrimRight);
3710 				if(verbose){outstream.println("Right trimming to "+0+", "+(ktrimExclusive ? minLocExclusive-1 : minLoc-1));}
3711 				int x=TrimRead.trimToPosition(r, 0, ktrimExclusive ? minLocExclusive-1 : minLoc-1, 1);
3712 				if(verbose){outstream.println("Trimmed "+x+" bases: "+new String(r.bases));}
3713 				return x;
3714 			}
3715 		}
3716 
3717 
3718 		/**
3719 		 * Mask a read to cover matching kmers.
3720 		 * @param r Read to process
3721 		 * @param sets Kmer tables
3722 		 * @return Number of bases masked
3723 		 */
kmask(final Read r, final AbstractKmerTable[] sets)3724 		private final int kmask(final Read r, final AbstractKmerTable[] sets){
3725 			assert(ktrimN);
3726 			if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k)) || storedKmers<1){return 0;}
3727 			if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
3728 			if(verbose){outstream.println("KMasking read "+r.id);}
3729 			final byte[] bases=r.bases, quals=r.quality;
3730 			if(bases==null || bases.length<k){return 0;}
3731 			long kmer=0;
3732 			long rkmer=0;
3733 			int found=0;
3734 			int len=0;
3735 			int id0=-1; //ID of first kmer found.
3736 
3737 			final BitSet bs=new BitSet(bases.length+trimPad+1);
3738 			if(kmaskFullyCovered){bs.set(0, bases.length);}
3739 
3740 			final int minus=k-1-trimPad;
3741 			final int plus=trimPad+1;
3742 
3743 			final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
3744 			final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
3745 
3746 			//Scan for normal kmers
3747 			for(int i=start; i<stop; i++){
3748 				byte b=bases[i];
3749 				long x=symbolToNumber0[b];
3750 				long x2=symbolToComplementNumber0[b];
3751 				kmer=((kmer<<bitsPerBase)|x)&mask;
3752 				rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask;
3753 				if(forbidNs && !isFullyDefined(b)){len=0; rkmer=0;}else{len++;}
3754 				if(verbose){outstream.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
3755 
3756 				if(i>=minlen){
3757 					final int id;
3758 					if(len>=minlen2){
3759 						id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
3760 					}else{
3761 						id=-1;
3762 					}
3763 					if(id>0){
3764 						if(id0<0){id0=id;}
3765 						if(verbose){
3766 							outstream.println("a: Found "+kmer);
3767 							outstream.println("Setting "+Tools.max(0, i-minus)+", "+(i+plus));
3768 							outstream.println("i="+i+", minus="+minus+", plus="+plus+", trimpad="+trimPad+", k="+k);
3769 						}
3770 						if(!kmaskFullyCovered){bs.set(Tools.max(0, i-minus), i+plus);}
3771 						found++;
3772 					}else if(kmaskFullyCovered){
3773 						bs.clear(Tools.max(0, i-minus), i+plus);
3774 					}
3775 				}
3776 			}
3777 
3778 			//If nothing was found, scan for short kmers.
3779 			if(useShortKmers){
3780 				assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+", mink="+mink;
3781 
3782 				//Look for short kmers on left side
3783 				{
3784 					kmer=0;
3785 					rkmer=0;
3786 					len=0;
3787 					int len2=0;
3788 					final int lim=Tools.min(k, stop);
3789 					for(int i=start; i<lim; i++){
3790 						byte b=bases[i];
3791 						long x=symbolToNumber0[b];
3792 						long x2=symbolToComplementNumber0[b];
3793 						kmer=((kmer<<bitsPerBase)|x)&mask;
3794 						rkmer=rkmer|(x2<<(bitsPerBase*len));
3795 						len++;
3796 						len2++;
3797 						if(verbose){outstream.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
3798 
3799 						if(len2>=minminlen){
3800 							if(verbose){
3801 								outstream.println("Looking for left kmer  "+kmerToString(kmer, len));
3802 								outstream.println("Looking for left rkmer "+kmerToString(rkmer, len));
3803 							}
3804 							final int id;
3805 							if(len>=mink){
3806 								id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
3807 							}else{
3808 								id=-1;
3809 							}
3810 							if(id>0){
3811 								if(id0<0){id0=id;}
3812 								if(verbose){
3813 									outstream.println("b: Found "+kmer);
3814 									outstream.println("Setting "+0+", "+Tools.min(bases.length, i+trimPad+1));
3815 								}
3816 								if(!kmaskFullyCovered){bs.set(0, Tools.min(bases.length, i+trimPad+1));}
3817 								found++;
3818 							}else if(kmaskFullyCovered){
3819 								bs.clear(0, Tools.min(bases.length, i+trimPad+1));
3820 							}
3821 						}
3822 					}
3823 				}
3824 
3825 				//Look for short kmers on right side
3826 				{
3827 					kmer=0;
3828 					rkmer=0;
3829 					len=0;
3830 					int len2=0;
3831 					final int lim=Tools.max(-1, stop-k);
3832 					for(int i=stop-1; i>lim; i--){
3833 						byte b=bases[i];
3834 						long x=symbolToNumber0[b];
3835 						long x2=symbolToComplementNumber0[b];
3836 						kmer=kmer|(x<<(bitsPerBase*len));
3837 						rkmer=((rkmer<<bitsPerBase)|x2)&mask;
3838 						len++;
3839 						len2++;
3840 						if(verbose){outstream.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
3841 
3842 						if(len2>=minminlen){
3843 							if(verbose){
3844 								outstream.println("Looking for right kmer "+
3845 										AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; value="+toValue(kmer, rkmer, lengthMasks[len])+"; kmask="+lengthMasks[len]);
3846 							}
3847 							final int id;
3848 							if(len>=mink){
3849 								id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
3850 							}else{
3851 								id=-1;
3852 							}
3853 							if(id>0){
3854 								if(id0<0){id0=id;}
3855 								if(verbose){
3856 									outstream.println("c: Found "+kmer);
3857 									outstream.println("Setting "+Tools.max(0, i-trimPad)+", "+bases.length);
3858 								}
3859 								if(!kmaskFullyCovered){bs.set(Tools.max(0, i-trimPad), bases.length);}
3860 								found++;
3861 							}else if(kmaskFullyCovered){
3862 								bs.clear(Tools.max(0, i-trimPad), bases.length);
3863 							}
3864 						}
3865 					}
3866 				}
3867 			}
3868 
3869 
3870 			if(verbose){outstream.println("found="+found+", bitset="+bs);}
3871 
3872 			if(found==0){return 0;}
3873 			assert(found>0) : "Overflow in 'found' variable.";
3874 
3875 			{//Increment counter for the scaffold whose kmer was first detected
3876 				if(scaffoldReadCountsT!=null){
3877 					scaffoldReadCountsT[id0]++;
3878 					scaffoldBaseCountsT[id0]+=bases.length;
3879 				}else{
3880 					scaffoldReadCounts.addAndGet(id0, 1);
3881 					scaffoldBaseCounts.addAndGet(id0, bases.length);
3882 				}
3883 			}
3884 //			int y=r.countNocalls();
3885 			int cardinality=bs.cardinality();
3886 //			assert(cardinality>0);
3887 
3888 			//Replace kmer hit zone with the trim symbol
3889 			for(int i=0; i<bases.length; i++){
3890 				if(bs.get(i)){
3891 					if(kmaskLowercase){
3892 						bases[i]=(byte)Tools.toLowerCase(bases[i]);
3893 					}else{
3894 						bases[i]=trimSymbol;
3895 						if(quals!=null && trimSymbol=='N'){quals[i]=0;}
3896 					}
3897 				}
3898 			}
3899 //			assert(cardinality==r.countNocalls() || y>0) : cardinality+", "+r.countNocalls()+"\n"+r.length()+"\n"+bs+"\n"+r;//123
3900 			return cardinality;
3901 		}
3902 
3903 
3904 		/**
3905 		 * Mask a read to cover matching kmers.
3906 		 * @param r Read to process
3907 		 * @param sets Kmer tables
3908 		 * @return Number of bases masked
3909 		 */
ksplit(final Read r, final AbstractKmerTable[] sets)3910 		private final boolean ksplit(final Read r, final AbstractKmerTable[] sets){
3911 			if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k)) || storedKmers<1){return false;}
3912 			assert(r.mate==null) : "Kmer splitting should only be performed on unpaired reads.";
3913 			assert(ksplit);
3914 			if(verbose){outstream.println("KSplitting read "+r.id);}
3915 			final byte[] bases=r.bases, quals=r.quality;
3916 			if(bases==null || bases.length<k){return false;}
3917 			long kmer=0;
3918 			long rkmer=0;
3919 			long found=0;
3920 			int len=0;
3921 			int id0=-1; //ID of first kmer found.
3922 			int leftmost=Integer.MAX_VALUE, rightmost=-1;
3923 
3924 			final int minus=k-1-trimPad;
3925 			final int plus=trimPad;
3926 
3927 			final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
3928 			final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
3929 
3930 			//Scan for normal kmers
3931 			for(int i=start; i<stop; i++){
3932 				byte b=bases[i];
3933 				long x=symbolToNumber0[b];
3934 				long x2=symbolToComplementNumber0[b];
3935 				kmer=((kmer<<bitsPerBase)|x)&mask;
3936 				rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask;
3937 				if(forbidNs && !isFullyDefined(b)){len=0; rkmer=0;}else{len++;}
3938 				if(verbose){outstream.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
3939 
3940 				if(i>=minlen){
3941 					final int id;
3942 					if(len>=minlen2){
3943 						id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
3944 					}else{
3945 						id=-1;
3946 					}
3947 					if(id>0){
3948 						if(id0<0){id0=id;}
3949 						if(verbose){
3950 							outstream.println("a: Found "+kmer);
3951 							outstream.println("Setting "+Tools.max(0, i-minus)+", "+(i+plus));
3952 							outstream.println("i="+i+", minus="+minus+", plus="+plus+", trimpad="+trimPad+", k="+k);
3953 						}
3954 						leftmost=Tools.min(leftmost, Tools.max(0, i-minus));
3955 						rightmost=Tools.max(rightmost, i+plus);
3956 						found++;
3957 					}
3958 				}
3959 			}
3960 
3961 			//If nothing was found, scan for short kmers.
3962 			if(useShortKmers && id0==-1){
3963 				assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+", mink="+mink;
3964 
3965 				//Look for short kmers on right side
3966 				{
3967 					kmer=0;
3968 					rkmer=0;
3969 					len=0;
3970 					int len2=0;
3971 					final int lim=Tools.max(-1, stop-k);
3972 					for(int i=stop-1; i>lim; i--){
3973 						byte b=bases[i];
3974 						long x=symbolToNumber0[b];
3975 						long x2=symbolToComplementNumber0[b];
3976 						kmer=kmer|(x<<(bitsPerBase*len));
3977 						rkmer=((rkmer<<bitsPerBase)|x2)&mask;
3978 						len++;
3979 						len2++;
3980 						if(verbose){outstream.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
3981 
3982 						if(len2>=minminlen){
3983 							if(verbose){
3984 								outstream.println("Looking for right kmer "+
3985 										AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; value="+toValue(kmer, rkmer, lengthMasks[len])+"; kmask="+lengthMasks[len]);
3986 							}
3987 							final int id;
3988 							if(len>=mink){
3989 								id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
3990 							}else{
3991 								id=-1;
3992 							}
3993 							if(id>0){
3994 								if(id0<0){id0=id;}
3995 								if(verbose){
3996 									outstream.println("b: Found "+kmer);
3997 									outstream.println("Setting "+Tools.max(0, i-trimPad)+", "+bases.length);
3998 								}
3999 								leftmost=Tools.min(leftmost, Tools.max(0, i-trimPad));
4000 								rightmost=bases.length-1;
4001 								found++;
4002 							}
4003 						}
4004 					}
4005 				}
4006 
4007 				//Look for short kmers on left side
4008 				if(id0==-1){
4009 					kmer=0;
4010 					rkmer=0;
4011 					len=0;
4012 					int len2=0;
4013 					final int lim=Tools.min(k, stop);
4014 					for(int i=start; i<lim; i++){
4015 						byte b=bases[i];
4016 						long x=symbolToNumber0[b];
4017 						long x2=symbolToComplementNumber0[b];
4018 						kmer=((kmer<<bitsPerBase)|x)&mask;
4019 						rkmer=rkmer|(x2<<(bitsPerBase*len));
4020 						len++;
4021 						len2++;
4022 						if(verbose){outstream.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
4023 
4024 						if(len2>=minminlen){
4025 							if(verbose){
4026 								outstream.println("Looking for left kmer  "+kmerToString(kmer, len));
4027 								outstream.println("Looking for left rkmer "+kmerToString(rkmer, len));
4028 							}
4029 							final int id;
4030 							if(len>=mink){
4031 								id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
4032 							}else{
4033 								id=-1;
4034 							}
4035 							if(id>0){
4036 								if(id0<0){id0=id;}
4037 								if(verbose){
4038 									outstream.println("c: Found "+kmer);
4039 									outstream.println("Setting "+0+", "+(i+trimPad));
4040 								}
4041 								leftmost=0;
4042 								rightmost=Tools.max(rightmost, i+trimPad);
4043 								found++;
4044 							}
4045 						}
4046 					}
4047 				}
4048 			}
4049 
4050 
4051 			if(verbose){outstream.println("found="+found);}
4052 
4053 			if(found==0){return false;}
4054 
4055 			{//Increment counter for the scaffold whose kmer was first detected
4056 				if(scaffoldReadCountsT!=null){
4057 					scaffoldReadCountsT[id0]++;
4058 					scaffoldBaseCountsT[id0]+=bases.length;
4059 				}else{
4060 					scaffoldReadCounts.addAndGet(id0, 1);
4061 					scaffoldBaseCounts.addAndGet(id0, bases.length);
4062 				}
4063 			}
4064 
4065 			if(leftmost==0){
4066 				TrimRead.trimToPosition(r, rightmost+1, bases.length-1, 1);
4067 				return false;
4068 			}else if(rightmost==bases.length-1){
4069 				TrimRead.trimToPosition(r, 0, leftmost-1, 1);
4070 				return false;
4071 			}else{
4072 				Read r2=r.subRead(rightmost+1, bases.length-1);
4073 				TrimRead.trimToPosition(r, 0, leftmost-1, 1);
4074 				r.mate=r2;
4075 				r2.mate=r;
4076 				r2.setPairnum(1);
4077 				return true;
4078 			}
4079 		}
4080 
4081 		/**
4082 		 * @param r
4083 		 * @param idList
4084 		 * @param countList
4085 		 */
rename(Read r, IntList idList, IntList countList)4086 		private void rename(Read r, IntList idList, IntList countList) {
4087 			if(r==null || idList.size<1){return;}
4088 			StringBuilder sb=new StringBuilder();
4089 			if(r.id==null){sb.append(r.numericID);}
4090 			else{sb.append(r.id);}
4091 			for(int i=0; i<idList.size; i++){
4092 				int id=idList.get(i);
4093 				int count=countList.get(i);
4094 				sb.append('\t');
4095 				sb.append(scaffoldNames.get(id));
4096 				sb.append('=');
4097 				sb.append(count);
4098 			}
4099 			r.id=sb.toString();
4100 		}
4101 
4102 		/**
4103 		 * Pack a list of counts from an array to an IntList.
4104 		 * @param loose Counter array
4105 		 * @param packed Unique values
4106 		 * @param counts Counts of values
4107 		 * @return
4108 		 */
condenseLoose(int[] loose, IntList packed, IntList counts)4109 		private int condenseLoose(int[] loose, IntList packed, IntList counts){
4110 			counts.size=0;
4111 			if(packed.size<1){return 0;}
4112 
4113 			int max=0;
4114 			for(int i=0; i<packed.size; i++){
4115 				final int p=packed.get(i);
4116 				final int c=loose[p];
4117 				counts.add(c);
4118 				loose[p]=0;
4119 				max=Tools.max(max, c);
4120 			}
4121 			return max;
4122 		}
4123 
expectedErrors(Read r1, Read r2)4124 		private float expectedErrors(Read r1, Read r2){
4125 			float a=(r1==null ? 0 : r1.expectedErrors(false, -1));
4126 			float b=(r2==null ? 0 : r2.expectedErrors(false, -1));
4127 			return Tools.max(a, b);
4128 		}
4129 
4130 		/*--------------------------------------------------------------*/
4131 		/*----------------        Entropy Methods       ----------------*/
4132 		/*--------------------------------------------------------------*/
4133 
maskLowEntropy(final Read r, BitSet bs, EntropyTracker et)4134 		private int maskLowEntropy(final Read r, BitSet bs, EntropyTracker et){
4135 			final int window=et.windowBases();
4136 			if(r==null || r.length()<window){return 0;}
4137 			final byte[] bases=r.bases;
4138 			if(bs==null){bs=new BitSet(r.length());}
4139 			else{bs.clear();}
4140 
4141 			et.clear();
4142 			for(int i=0, min=window-1; i<bases.length; i++){
4143 				et.add(bases[i]);
4144 				if(i>=min && et.ns()<1 && !et.passes()){bs.set(et.leftPos(), et.rightPos()+1);}
4145 			}
4146 
4147 			return maskFromBitset(r, bs, entropyMaskLowercase);
4148 		}
4149 
trimLowEntropy(final Read r, BitSet bs, EntropyTracker et)4150 		private int trimLowEntropy(final Read r, BitSet bs, EntropyTracker et){
4151 			final int window=et.windowBases();
4152 			System.err.println("Trimming "+r.id+", len "+r.length()+", window "+window);
4153 			if(r==null || r.length()<window){return 0;}
4154 			final byte[] bases=r.bases;
4155 			if(bs==null){bs=new BitSet(r.length());}
4156 			else{bs.clear();}
4157 
4158 			et.clear();
4159 			for(int i=0, min=window-1; i<bases.length; i++){
4160 				et.add(bases[i]);
4161 				if(i>=min && et.ns()<1 && !et.passes()){bs.set(et.leftPos(), et.rightPos()+1);}
4162 			}
4163 
4164 			//Now, trim from bitset - which could be spun out into a function
4165 			int left=0, right=0;
4166 			final int len=bases.length;
4167 			for(int i=0; i<len; i++){
4168 				if(bs.get(i)){left++;}
4169 				else {break;}
4170 			}
4171 			for(int i=len-1; i>=0; i--){
4172 				if(bs.get(i)){right++;}
4173 				else {break;}
4174 			}
4175 //			System.err.println(new String(bases)+"\nleft="+left+", right="+right);
4176 			if(left==0 && right==0){return 0;}
4177 			return TrimRead.trimByAmount(r, left, right, 1);
4178 		}
4179 
markLowEntropy(final Read r, EntropyTracker et)4180 		private void markLowEntropy(final Read r, EntropyTracker et){
4181 			final int window=et.windowBases();
4182 			if(r==null || r.length()<window){return;}
4183 			final byte[] bases=r.bases;
4184 
4185 			float[] values=new float[r.length()];
4186 			Arrays.fill(values, 1);
4187 
4188 			et.clear();
4189 			for(int i=0, min=window-1; i<bases.length; i++){
4190 				et.add(bases[i]);
4191 				if(i>=min && et.ns()<1){
4192 					float e=et.calcEntropy();
4193 					for(int j=et.leftPos(), max=et.rightPos(); j<=max; j++){
4194 						values[j]=Tools.min(e, values[j]);
4195 					}
4196 				}
4197 			}
4198 
4199 			if(r.quality==null){
4200 				r.quality=new byte[r.length()];
4201 			}
4202 			for(int i=0; i<values.length; i++){
4203 				byte q=(byte)(values[i]*41);
4204 				r.quality[i]=q;
4205 			}
4206 		}
4207 
maskFromBitset(final Read r, final BitSet bs, final boolean lowercase)4208 		private int maskFromBitset(final Read r, final BitSet bs, final boolean lowercase){
4209 			final byte[] bases=r.bases;
4210 			final byte[] quals=r.quality;
4211 			int sum=0;
4212 			if(!lowercase){
4213 				for(int i=bs.nextSetBit(0); i>=0; i=bs.nextSetBit(i+1)){
4214 					if(bases[i]!='N'){
4215 						sum++;
4216 						bases[i]='N';
4217 						if(quals!=null){quals[i]=0;}
4218 					}
4219 				}
4220 			}else{
4221 				for(int i=bs.nextSetBit(0); i>=0; i=bs.nextSetBit(i+1)){
4222 					if(!Tools.isLowerCase(bases[i])){
4223 						 if(bases[i]!='N'){sum++;}
4224 						 bases[i]=(byte)Tools.toLowerCase(bases[i]);
4225 						 //Don't change quality
4226 					}
4227 				}
4228 			}
4229 			return sum;
4230 		}
4231 
passesVariantFilter(Read r)4232 		public final boolean passesVariantFilter(Read r){
4233 			if(!r.mapped() || r.bases==null || r.samline==null || r.match==null){return true;}
4234 			//TODO: Add Vars as well, like in FilterSam
4235 			if(Read.countSubs(r.match)<=maxBadSubs){return true;}
4236 			ArrayList<Var> list=CallVariants.findUniqueSubs(r, r.samline, varMap, scafMap, maxBadSubAlleleDepth, maxBadAlleleFraction, minBadSubReadDepth, minBadSubEDist);
4237 			return list==null || list.size()<=maxBadSubs;
4238 		}
4239 
4240 		/*--------------------------------------------------------------*/
4241 
4242 		/** Input read stream */
4243 		private final ConcurrentReadInputStream cris;
4244 		/** Output read streams */
4245 		private final ConcurrentReadOutputStream ros, rosb, ross;
4246 
4247 		private final FlowcellCoordinate flowCoords;
4248 
4249 		private final ReadStats readstats;
4250 		private final int[] overlapVector;
4251 		private final int[] countArray;
4252 
4253 		private final IntList idList;
4254 		private final IntList countList;
4255 
4256 		//These "*T" fields are used to store counts on a per-thread basis.
4257 
4258 		long[] hitCountsT;
4259 		long[] scaffoldReadCountsT;
4260 		long[] scaffoldBaseCountsT;
4261 
4262 		final EntropyTracker eTrackerT;
4263 		final PolymerTracker pTrackerT;
4264 
4265 		private float[] aprob, bprob;
4266 
4267 		long readsInT=0;
4268 		long basesInT=0;
4269 		long readsOutuT=0;
4270 		long basesOutuT=0;
4271 
4272 		long readsOutmT=0;
4273 		long basesOutmT=0;
4274 
4275 		final long maxBasesOutmT;
4276 		final long maxBasesOutuT;
4277 
4278 		long readsQTrimmedT=0;
4279 		long basesQTrimmedT=0;
4280 		long readsFTrimmedT=0;
4281 		long basesFTrimmedT=0;
4282 		long readsQFilteredT=0;
4283 		long basesQFilteredT=0;
4284 		long readsNFilteredT=0;
4285 		long basesNFilteredT=0;
4286 		long readsEFilteredT=0;
4287 		long basesEFilteredT=0;
4288 		long readsPolyTrimmedT=0;
4289 		long basesPolyTrimmedT=0;
4290 
4291 		long readsKTrimmedT=0;
4292 		long basesKTrimmedT=0;
4293 		long readsKFilteredT=0;
4294 		long basesKFilteredT=0;
4295 
4296 		long readsTrimmedBySwiftT=0;
4297 		long basesTrimmedBySwiftT=0;
4298 
4299 		long readsTrimmedByOverlapT=0;
4300 		long basesTrimmedByOverlapT=0;
4301 
4302 		long badGcBasesT=0;
4303 		long badGcReadsT=0;
4304 
4305 		long badHeaderBasesT=0;
4306 		long badHeaderReadsT=0;
4307 
4308 		boolean finishedSuccessfully=false;
4309 	}
4310 
4311 	/*--------------------------------------------------------------*/
4312 	/*----------------        Static Methods        ----------------*/
4313 	/*--------------------------------------------------------------*/
4314 
4315 	/** Current available memory */
freeMemory()4316 	private static final long freeMemory(){
4317 		Runtime rt=Runtime.getRuntime();
4318 		return rt.freeMemory();
4319 	}
4320 
4321 	/**
4322 	 * Transforms a kmer into a canonical value stored in the table.  Expected to be inlined.
4323 	 * @param kmer Forward kmer
4324 	 * @param rkmer Reverse kmer
4325 	 * @param lengthMask Bitmask with single '1' set to left of kmer
4326 	 * @return Canonical value
4327 	 */
toValue(long kmer, long rkmer, long lengthMask)4328 	final long toValue(long kmer, long rkmer, long lengthMask){
4329 		assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) :
4330 			"\n"+Long.toBinaryString(lengthMask)+
4331 			"\n"+Long.toBinaryString(kmer)+
4332 			"\n"+Long.toBinaryString(rkmer)+
4333 			"\n"+Long.toBinaryString(rcomp(kmer, k));
4334 		if(verbose){outstream.println("toValue("+AminoAcid.kmerToString(kmer, k)+", "+AminoAcid.kmerToString(rkmer, k)+", "+lengthMask+")");}
4335 		final long value=(rcomp ? Tools.max(kmer, rkmer) : kmer);
4336 		if(verbose){outstream.println("value="+AminoAcid.kmerToString(value, k)+" = "+value);}
4337 		final long ret=(value&middleMask)|lengthMask;
4338 		if(verbose){outstream.println("ret="+AminoAcid.kmerToString(ret, k)+" = "+ret);}
4339 		return ret;
4340 	}
4341 
rcomp(long kmer, int len)4342 	final long rcomp(long kmer, int len){
4343 		return amino ? kmer : AminoAcid.reverseComplementBinaryFast(kmer, len);
4344 	}
4345 
passesSpeed(long key)4346 	final boolean passesSpeed(long key){
4347 		return speed<1 || ((key&Long.MAX_VALUE)%17)>=speed;
4348 	}
4349 
failsSpeed(long key)4350 	final boolean failsSpeed(long key){
4351 		return speed>0 && ((key&Long.MAX_VALUE)%17)<speed;
4352 	}
4353 
trimPolyA(final Read r, final int minPoly)4354 	public static int trimPolyA(final Read r, final int minPoly){
4355 		assert(minPoly>0);
4356 		if(r==null || r.length()<minPoly){return 0;}
4357 
4358 		int left=Tools.max(r.countLeft((byte)'A'), r.countLeft((byte)'T'));
4359 		int right=Tools.max(r.countRight((byte)'A'), r.countRight((byte)'T'));
4360 
4361 		if(left<minPoly){left=0;}
4362 		if(right<minPoly){right=0;}
4363 		int trimmed=0;
4364 		if(left>0 || right>0){
4365 			trimmed=TrimRead.trimByAmount(r, left, right, 1);
4366 		}
4367 		return trimmed;
4368 	}
4369 
trimPoly(final Read r, final int minPolyLeft, final int minPolyRight, final byte c)4370 	public static int trimPoly(final Read r, final int minPolyLeft, final int minPolyRight, final byte c){
4371 		assert(minPolyLeft>0 || minPolyRight>0);
4372 		if(r==null){return 0;}
4373 
4374 		int left=minPolyLeft>0 ? r.countLeft(c) : 0;
4375 		int right=minPolyRight>0 ? r.countRight(c) : 0;
4376 
4377 		if(left<minPolyLeft){left=0;}
4378 		if(right<minPolyRight){right=0;}
4379 		int trimmed=0;
4380 		if(left>0 || right>0){
4381 			trimmed=TrimRead.trimByAmount(r, left, right, 1);
4382 		}
4383 		return trimmed;
4384 	}
4385 
trimSwift(Read r)4386 	private static int trimSwift(Read r){
4387 		int left=0, right=0, trimmed=0;
4388 		if(r.pairnum()==0){
4389 			for(int i=r.length()-1; i>=0; i--){
4390 				byte b=r.bases[i];
4391 				if(b=='C' || b=='T' || b=='N'){right++;}
4392 				else{break;}
4393 			}
4394 
4395 		}else{
4396 			for(int i=0; i<r.length(); i++){
4397 				byte b=r.bases[i];
4398 				if(b=='G' || b=='A' || b=='N'){left++;}
4399 				else{break;}
4400 			}
4401 		}
4402 		if(left>0 || right>0){
4403 			trimmed=TrimRead.trimByAmount(r, left, right, 1);
4404 		}
4405 		return trimmed;
4406 	}
4407 
parseEnd(String s)4408 	private static int parseEnd(String s){
4409 		if(s==null){return RIGHTLEFT;}
4410 		if(s.equalsIgnoreCase("right") || s.equalsIgnoreCase("r")){return RIGHT;}
4411 		if(s.equalsIgnoreCase("left") || s.equalsIgnoreCase("l")){return LEFT;}
4412 		if(s.equalsIgnoreCase("rl") || s.equalsIgnoreCase("lr") || s.equalsIgnoreCase("both") || s.equalsIgnoreCase("b")){return RIGHTLEFT;}
4413 		return Parse.parseBoolean(s) ? RIGHTLEFT : 0;
4414 	}
4415 
4416 	/*--------------------------------------------------------------*/
4417 	/*----------------            Fields            ----------------*/
4418 	/*--------------------------------------------------------------*/
4419 
4420 	//TODO: Document
4421 	boolean silent=false;
4422 	boolean json=false;
4423 
4424 	boolean swift=false; //https://issues.jgi-psf.org/browse/AUTOQC-2193
4425 
4426 	/** For calculating kmer cardinality in input */
4427 	final CardinalityTracker loglogIn;
4428 	/** For calculating kmer cardinality in output */
4429 	final CardinalityTracker loglogOut;
4430 	/** Requires (and sets) cardinality tracking.  This is for input kmers. */
4431 	String khistIn=null;
4432 	/** Requires (and sets) cardinality tracking.  This is for output kmers. */
4433 	String khistOut=null;
4434 
4435 	/** Has this class encountered errors while processing? */
4436 	public boolean errorState=false;
4437 
4438 	/** Fraction of available memory preallocated to arrays */
4439 	private double preallocFraction=1.0;
4440 	/** Initial size of data structures */
4441 	private int initialSize=-1;
4442 
4443 	/** Hold kmers.  A kmer X such that X%WAYS=Y will be stored in keySets[Y] */
4444 	final AbstractKmerTable[] keySets;
4445 	/** A scaffold's name is stored at scaffoldNames.get(id).
4446 	 * scaffoldNames[0] is reserved, so the first id is 1. */
4447 	final ArrayList<String> scaffoldNames=new ArrayList<String>();
4448 	/** Names of reference files (refNames[0] is valid). */
4449 	private ArrayList<String> refNames=new ArrayList<String>();
4450 	private final ArrayList<String> altRefNames=new ArrayList<String>();
4451 	/** Number of scaffolds per reference. */
4452 	private int[] refScafCounts;
4453 	/** scaffoldCounts[id] stores the number of reads with kmer matches to that scaffold */
4454 	AtomicLongArray scaffoldReadCounts;
4455 	/** scaffoldBaseCounts[id] stores the number of bases with kmer matches to that scaffold */
4456 	AtomicLongArray scaffoldBaseCounts;
4457 	/** Set to false to force threads to share atomic counter arrays. */
4458 	private boolean ALLOW_LOCAL_ARRAYS=true;
4459 	/** scaffoldLengths[id] stores the length of that scaffold */
4460 	private IntList scaffoldLengths=new IntList();
4461 	/** hitCounts[x] stores the number of reads with exactly x kmer matches */
4462 	long[] hitCounts;
4463 	/** Array of reference files from which to load kmers */
4464 	private String[] ref=null;
4465 	/** Alternate reference to be used if main reference has no kmers */
4466 	private String[] altref=null;
4467 	/** Array of literal strings from which to load kmers */
4468 	private String[] literal=null;
4469 	/** Optional reference for sam file */
4470 	private String samref=null;
4471 
4472 	/** Input reads */
4473 	private String in1=null, in2=null;
4474 	/** Input FileFormats */
4475 	private final FileFormat ffin1, ffin2;
4476 	/** Input qual files */
4477 	private String qfin1=null, qfin2=null;
4478 	/** Output qual files */
4479 	private String qfout1=null, qfout2=null;
4480 	/** Output reads (unmatched and at least minlen) */
4481 	private String out1=null, out2=null;
4482 	/** Output reads (matched or shorter than minlen) */
4483 	private String outb1=null, outb2=null;
4484 	/** Output FileFormats */
4485 	private final FileFormat ffout1, ffout2, ffoutb1, ffoutb2, ffouts;
4486 	/** Output reads whose mate was discarded */
4487 	private String outsingle=null;
4488 	/** Statistics output files */
4489 	private String outstats=null, outrqc=null, outrpkm=null, outrefstats=null, polymerStatsFile=null;
4490 	@Deprecated
4491 	/** duk-style statistics */
4492 	private String outduk=null;
4493 
4494 	final boolean tossJunk;
4495 
4496 	/** Dump kmers here. */
4497 	private String dump=null;
4498 
4499 	/** Quit after this many bases written to outm */
4500 	long maxBasesOutm=-1;
4501 	/** Quit after this many bases written to outu */
4502 	long maxBasesOutu=-1;
4503 
4504 	/** Maximum input reads (or pairs) to process.  Does not apply to references.  -1 means unlimited. */
4505 	private long maxReads=-1;
4506 	/** Process this fraction of input reads. */
4507 	private float samplerate=1f;
4508 	/** Set samplerate seed to this value. */
4509 	private long sampleseed=-1;
4510 
4511 	/** Output reads in input order.  May reduce speed. */
4512 	private final boolean ordered;
4513 	/** Attempt to match kmers shorter than normal k on read ends when doing kTrimming. */
4514 	boolean useShortKmers=false;
4515 	/** Make the middle base in a kmer a wildcard to improve sensitivity */
4516 	boolean maskMiddle=true;
4517 
4518 	/** Store reference kmers with up to this many substitutions */
4519 	int hammingDistance=0;
4520 	/** Search for query kmers with up to this many substitutions */
4521 	int qHammingDistance=0;
4522 	/** Store reference kmers with up to this many edits (including indels) */
4523 	int editDistance=0;
4524 	/** Store short reference kmers with up to this many substitutions */
4525 	int hammingDistance2=-1;
4526 	/** Search for short query kmers with up to this many substitutions */
4527 	int qHammingDistance2=-1;
4528 	/** Store short reference kmers with up to this many edits (including indels) */
4529 	int editDistance2=-1;
4530 	/** Never skip more than this many consecutive kmers when hashing reference. */
4531 	int maxSkip=1;
4532 	/** Always skip at least this many consecutive kmers when hashing reference.
4533 	 * 1 means every kmer is used, 2 means every other, etc. */
4534 	int minSkip=1;
4535 
4536 	/** Trim this much extra around matched kmers */
4537 	int trimPad;
4538 
4539 	/*--------------------------------------------------------------*/
4540 	/*----------------      Flowcell Filtering      ----------------*/
4541 	/*--------------------------------------------------------------*/
4542 
4543 	//TODO: Document
4544 	int xMinLoc=-1;
4545 	int yMinLoc=-1;
4546 	int xMaxLoc=-1;
4547 	int yMaxLoc=-1;
4548 	final boolean locationFilter;
4549 
4550 	/*--------------------------------------------------------------*/
4551 	/*----------------       Variant-Related        ----------------*/
4552 	/*--------------------------------------------------------------*/
4553 
4554 	//TODO: Document
4555 	private String varFile=null;
4556 	private String vcfFile=null;
4557 	private VarMap varMap=null;
4558 //	private HashSet<VarKey> varKeySet=null;
4559 	private ScafMap scafMap=null;
4560 	private boolean fixVariants=false;
4561 	private boolean unfixVariants=true;
4562 
4563 	/** Optional file for quality score recalibration */
4564 	private String samFile=null;
4565 
4566 	/** Filter reads with unsupported substitutions */
4567 	private boolean filterVars=false;
4568 	/** Maximum allowed unsupported substitutions in a read */
4569 	private int maxBadSubs=2;
4570 	/** Maximum variant depth for a variant to be considered unsupported */
4571 	private int maxBadSubAlleleDepth=1;
4572 	/** Minimum read depth for a variant to be considered unsupported */
4573 	private int minBadSubReadDepth=2;
4574 	//TODO
4575 	private int minBadSubEDist=0;
4576 	//TODO
4577 	private float maxBadAlleleFraction=0;
4578 
4579 	/*--------------------------------------------------------------*/
4580 	/*----------------        Entropy Fields        ----------------*/
4581 	/*--------------------------------------------------------------*/
4582 
4583 	/** Minimum entropy to be considered "complex", on a scale of 0-1 */
4584 	float entropyCutoff=-1;
4585 	/** Mask entropy with a highpass filter */
4586 	boolean entropyHighpass=true;
4587 
4588 	/** Change the quality scores to be proportional to the entropy */
4589 	boolean entropyMark=false;
4590 	/** Mask low-entropy areas (e.g., with N) */
4591 	boolean entropyMask=false;
4592 	/** Trim only trailing or leading low-entropy areas, ignoring middle areas */
4593 	int entropyTrim=0;
4594 	/** Convert low-entropy areas to lower case */
4595 	boolean entropyMaskLowercase=false;
4596 
4597 	/** Perform entropy calculation */
4598 	final boolean calcEntropy;
4599 
4600 
4601 	/*--------------------------------------------------------------*/
4602 	/*----------------          Statistics          ----------------*/
4603 	/*--------------------------------------------------------------*/
4604 
4605 	/** Stores JSON output */
4606 	JsonObject jsonStats;
4607 
4608 	long readsIn=0;
4609 	long basesIn=0;
4610 	long readsOut=0;
4611 	long basesOut=0;
4612 
4613 	long readsQTrimmed=0;
4614 	long basesQTrimmed=0;
4615 	long readsFTrimmed=0;
4616 	long basesFTrimmed=0;
4617 	long readsQFiltered=0;
4618 	long basesQFiltered=0;
4619 	long readsEFiltered=0;
4620 	long basesEFiltered=0;
4621 	long readsNFiltered=0;
4622 	long basesNFiltered=0;
4623 
4624 	long readsPolyTrimmed=0;
4625 	long basesPolyTrimmed=0;
4626 
4627 	long readsKTrimmed=0;
4628 	long basesKTrimmed=0;
4629 	long readsKFiltered=0;
4630 	long basesKFiltered=0;
4631 
4632 	long badGcReads;
4633 	long badGcBases;
4634 
4635 	long badHeaderReads=0;
4636 	long badHeaderBases=0;
4637 
4638 	long readsTrimmedByOverlap;
4639 	long basesTrimmedByOverlap;
4640 
4641 	long readsTrimmedBySwift;
4642 	long basesTrimmedBySwift;
4643 
4644 	long refReads=0;
4645 	long refBases=0;
4646 	long refKmers=0;
4647 
4648 //	public long modsum=0; //123
4649 
4650 	long storedKmers=0;
4651 
4652 	/*--------------------------------------------------------------*/
4653 	/*----------------         Homopolymers         ----------------*/
4654 	/*--------------------------------------------------------------*/
4655 
4656 	boolean countPolymers=false;
4657 	byte polymerChar1=-1;
4658 	byte polymerChar2=-1;
4659 	/** Minimum length to consider a homopolymer, for the purpose of statistics */
4660 	int polymerLength=20;
4661 
4662 	/** Tracks homopolymer statistics */
4663 	PolymerTracker pTracker;
4664 
4665 	/*--------------------------------------------------------------*/
4666 	/*----------------       Final Primitives       ----------------*/
4667 	/*--------------------------------------------------------------*/
4668 
4669 	/** Don't look for kmers in read 1 */
4670 	final boolean skipR1;
4671 	/** Don't look for kmers in read 2 */
4672 	final boolean skipR2;
4673 	/** Correct errors via read overlap */
4674 	final boolean ecc;
4675 	/** True if a ReadStats object is being used for collecting data */
4676 	final boolean makeReadStats;
4677 
4678 	/** Look for reverse-complements as well as forward kmers.  Default: true */
4679 	final boolean rcomp;
4680 	/** Don't allow a read 'N' to match a reference 'A'.
4681 	 * Reduces sensitivity when hdist>0 or edist>0.  Default: false. */
4682 	final boolean forbidNs;
4683 	/** AND bitmask with 0's at the middle base */
4684 	final long middleMask;
4685 	/** Use HashForest data structure */
4686 	private final boolean useForest;
4687 	/** Use KmerTable data structure */
4688 	private final boolean useTable;
4689 	/** Use HashArray data structure (default) */
4690 	private final boolean useArray;
4691 
4692 	/** Normal kmer length */
4693 	final int k;
4694 	/** k-1; used in some expressions */
4695 	final int k2;
4696 	/** Emulated kmer greater than k */
4697 	final int kbig;
4698 	/** Effective kmer size */
4699 	final int keff;
4700 	/** Shortest kmer to use for trimming */
4701 	final int mink;
4702 	/** A read may contain up to this many kmers before being considered a match.  Default: 0 */
4703 	final int maxBadKmers0;
4704 	/** A read must share at least this fraction of its kmers to be considered a match.  Default: 0 */
4705 	final float minKmerFraction;
4706 	/** Reference kmers must cover at least this fraction of read bases to be considered a match.  Default: 0 */
4707 	final float minCoveredFraction;
4708 
4709 	/** Recalibrate quality scores using matrices */
4710 	final boolean recalibrateQuality;
4711 	/** Quality-trim the left side */
4712 	final boolean qtrimLeft;
4713 	/** Quality-trim the right side */
4714 	final boolean qtrimRight;
4715 	/** Trim soft-clipped bases */
4716 	final boolean trimClip;
4717 	/** Trim poly-A tails of at least this length */
4718 	final int trimPolyA;
4719 
4720 	/** Trim poly-G prefixes of at least this length */
4721 	final int trimPolyGLeft;
4722 	/** Trim poly-G tails of at least this length */
4723 	final int trimPolyGRight;
4724 	/** Remove reads with poly-G prefixes of at least this length */
4725 	final int filterPolyG;
4726 
4727 	/** Trim poly-C prefixes of at least this length */
4728 	final int trimPolyCLeft;
4729 	/** Trim poly-C tails of at least this length */
4730 	final int trimPolyCRight;
4731 	/** Remove reads with poly-C prefixes of at least this length */
4732 	final int filterPolyC;
4733 
4734 	/** Trim bases at this quality or below.  Default: 4 */
4735 	final float trimq;
4736 	/** Error rate for trimming (derived from trimq) */
4737 	private final float trimE;
4738 	/** Throw away reads below this average quality after trimming.  Default: 0 */
4739 	final float minAvgQuality;
4740 	/** Throw away reads with any base below this quality after trimming.  Default: 0 */
4741 	final byte minBaseQuality;
4742 	/** If positive, calculate average quality from the first X bases only.  Default: 0 */
4743 	final int minAvgQualityBases;
4744 	/** Throw away reads failing chastity filter (:Y: in read header) */
4745 	final boolean chastityFilter;
4746 	/** Crash if a barcode is encountered that contains Ns or is not in the table */
4747 	final boolean failBadBarcodes;
4748 	/** Remove reads with Ns in barcodes or that are not in the table */
4749 	final boolean removeBadBarcodes;
4750 	/** Fail reads missing a barcode */
4751 	final boolean failIfNoBarcode;
4752 	/** A set of valid barcodes; null if unused */
4753 	final HashSet<String> barcodes;
4754 	/** Throw away reads containing more than this many Ns.  Default: -1 (disabled) */
4755 	final int maxNs;
4756 	/** Throw away reads containing without at least this many consecutive called bases. */
4757 	final int minConsecutiveBases;
4758 	/** Throw away reads containing fewer than this fraction of any particular base. */
4759 	final float minBaseFrequency;
4760 	/** Throw away reads shorter than this after trimming.  Default: 10 */
4761 	final int minReadLength;
4762 	/** Throw away reads longer than this after trimming.  Default: Integer.MAX_VALUE */
4763 	final int maxReadLength;
4764 	/** Toss reads shorter than this fraction of initial length, after trimming */
4765 	final float minLenFraction;
4766 	/** Filter reads by whether or not they have matching kmers */
4767 	private final boolean kfilter;
4768 	/** Trim matching kmers and all bases to the left */
4769 	final boolean ktrimLeft;
4770 	/** Trim matching kmers and all bases to the right */
4771 	boolean ktrimRight;
4772 	/** Don't trim, but replace matching kmers with a symbol (default N) */
4773 	final boolean ktrimN;
4774 	/** Exclude kmer itself when ktrimming */
4775 	final boolean ktrimExclusive;
4776 	/** Split into two reads around the kmer */
4777 	final boolean ksplit;
4778 	/** Replace bases covered by matched kmers with this symbol */
4779 	final byte trimSymbol;
4780 	/** Convert kmer-masked bases to lowercase */
4781 	final boolean kmaskLowercase;
4782 	/** Only mask fully-covered bases **/
4783 	final boolean kmaskFullyCovered;
4784 	/** Output over-trimmed reads to outbad (outmatch).  If false, they are discarded. */
4785 	final boolean addTrimmedToBad;
4786 	/** Find the sequence that shares the most kmer matches when filtering. */
4787 	final boolean findBestMatch;
4788 	/** Trim pairs to the same length, when adapter-trimming */
4789 	final boolean trimPairsEvenly;
4790 	/** Trim left bases of the read to this position (exclusive, 0-based) */
4791 	final int forceTrimLeft;
4792 	/** Trim right bases of the read after this position (exclusive, 0-based) */
4793 	final int forceTrimRight;
4794 	/** Trim this many rightmost bases of the read */
4795 	final int forceTrimRight2;
4796 	/** Trim right bases of the read modulo this value.
4797 	 * e.g. forceTrimModulo=50 would trim the last 3bp from a 153bp read. */
4798 	final int forceTrimModulo;
4799 
4800 	/** Discard reads with GC below this. */
4801 	final float minGC;
4802 	/** Discard reads with GC above this. */
4803 	final float maxGC;
4804 	/** Discard reads outside of GC bounds. */
4805 	final boolean filterGC;
4806 	/** Average GC for paired reads. */
4807 	final boolean usePairGC;
4808 
4809 	/** If positive, only look for kmer matches in the leftmost X bases */
4810 	int restrictLeft;
4811 	/** If positive, only look for kmer matches the rightmost X bases */
4812 	int restrictRight;
4813 
4814 	/** Skip this many initial input reads */
4815 	private final long skipreads;
4816 
4817 	/** Pairs go to outbad if either of them is bad, as opposed to requiring both to be bad.
4818 	 * Default: true. */
4819 	final boolean removePairsIfEitherBad;
4820 
4821 	/** Rather than discarding, trim failures to 1bp.
4822 	 * Default: false. */
4823 	final boolean trimFailuresTo1bp;
4824 
4825 	/** Print only statistics for scaffolds that matched at least one read
4826 	 * Default: true. */
4827 	private final boolean printNonZeroOnly;
4828 
4829 	/** Rename reads to indicate what they matched.
4830 	 * Default: false. */
4831 	final boolean rename;
4832 	/** Use names of reference files instead of scaffolds.
4833 	 * Default: false. */
4834 	private final boolean useRefNames;
4835 
4836 	/** Fraction of kmers to skip, 0 to 16 out of 17 */
4837 	final int speed;
4838 
4839 	/** Skip this many kmers when examining the read.  Default 1.
4840 	 * 1 means every kmer is used, 2 means every other, etc. */
4841 	final int qSkip;
4842 
4843 	/** noAccel is true if speed and qSkip are disabled, accel is the opposite. */
4844 	final boolean noAccel;
4845 	private final boolean accel;
4846 
4847 	private boolean pairedToSingle=false;
4848 
4849 	/*--------------------------------------------------------------*/
4850 	/*-----------        Symbol-Specific Constants        ----------*/
4851 	/*--------------------------------------------------------------*/
4852 
4853 	/** True for amino acid data, false for nucleotide data */
4854 	final boolean amino;
4855 	final int maxSupportedK;
4856 	final int bitsPerBase;
4857 	final int maxSymbol;
4858 	final int symbols;
4859 	final int symbolArrayLen;
4860 	final int symbolSpace;
4861 	final long symbolMask;
4862 
4863 	final int minlen;
4864 	final int minminlen;
4865 	final int minlen2;
4866 	final int shift;
4867 	final int shift2;
4868 	final long mask;
4869 	final long kmask;
4870 
4871 	/** x&clearMasks[i] will clear base i */
4872 	final long[] clearMasks;
4873 	/** x|setMasks[i][j] will set base i to j */
4874 	final long[][] setMasks;
4875 	/** x&leftMasks[i] will clear all bases to the right of i (exclusive) */
4876 	final long[] leftMasks;
4877 	/** x&rightMasks[i] will clear all bases to the left of i (inclusive) */
4878 	final long[] rightMasks;
4879 	/** x|kMasks[i] will set the bit to the left of the leftmost base */
4880 	final long[] lengthMasks;
4881 
4882 	/** Symbol code; -1 for undefined */
4883 	final byte[] symbolToNumber;
4884 	/** Symbol code; 0 for undefined */
4885 	final byte[] symbolToNumber0;
4886 	/** Complementary symbol code; 0 for undefined */
4887 	final byte[] symbolToComplementNumber0;
4888 
4889 	/** For verbose / debugging output */
kmerToString(long kmer, int k)4890 	final String kmerToString(long kmer, int k){
4891 		return amino ? AminoAcid.kmerToStringAA(kmer, k) : AminoAcid.kmerToString(kmer, k);
4892 	}
4893 
4894 	/** Returns true if the symbol is not degenerate (e.g., 'N') for the alphabet in use. */
isFullyDefined(byte symbol)4895 	final boolean isFullyDefined(byte symbol){
4896 		return symbol>=0 && symbolToNumber[symbol]>=0;
4897 	}
4898 
4899 	/*--------------------------------------------------------------*/
4900 	/*----------------         BBMerge Flags        ----------------*/
4901 	/*--------------------------------------------------------------*/
4902 
4903 	/** Trim implied adapters based on overlap, for reads with insert size shorter than read length */
4904 	final boolean trimByOverlap;
4905 	final boolean useQualityForOverlap;
4906 	final boolean strictOverlap;
4907 
4908 	int minOverlap0=7;
4909 	int minOverlap=14;
4910 	int minInsert0=16;
4911 	int minInsert=40;
4912 
4913 	final float maxRatio;
4914 	final float ratioMargin;
4915 	final float ratioOffset;
4916 	final float efilterRatio;
4917 	final float efilterOffset;
4918 	final float pfilterRatio;
4919 	final float meeFilter;
4920 
4921 	/*--------------------------------------------------------------*/
4922 	/*----------------        Histogram Flags       ----------------*/
4923 	/*--------------------------------------------------------------*/
4924 
4925 	/**
4926 	 * Generate histograms from the reads before rather than after processing;
4927 	 * default is true.  Khist is handled independently.
4928 	 */
4929 	final boolean histogramsBeforeProcessing;
4930 
4931 	final boolean MAKE_QUALITY_ACCURACY;
4932 	final boolean MAKE_QUALITY_HISTOGRAM;
4933 	final boolean MAKE_MATCH_HISTOGRAM;
4934 	final boolean MAKE_BASE_HISTOGRAM;
4935 
4936 	final boolean MAKE_EHIST;
4937 	final boolean MAKE_INDELHIST;
4938 	final boolean MAKE_LHIST;
4939 	final boolean MAKE_GCHIST;
4940 	final boolean MAKE_ENTROPYHIST;
4941 	final boolean MAKE_IDHIST;
4942 
4943 	final boolean MAKE_IHIST;
4944 
4945 	/*--------------------------------------------------------------*/
4946 	/*----------------         Static Fields        ----------------*/
4947 	/*--------------------------------------------------------------*/
4948 
4949 	/** Number of tables (and threads, during loading) */
4950 	private static final int WAYS=7; //123
4951 	/** Default initial size of data structures */
4952 	private static final int initialSizeDefault=128000;
4953 	/** Verbose messages */
4954 	public static final boolean verbose=false; //123
4955 
4956 	/** Ends for some operations like entropytrim; could be migrated over to other operations */
4957 	private static final int RIGHT=1, LEFT=2, RIGHTLEFT=3;
4958 
4959 	/** Number of reads output in the last run */
4960 	public static long lastReadsOut;
4961 	/** Print messages to this stream */
4962 	private static PrintStream outstream=System.err;
4963 	/** Permission to overwrite existing files */
4964 	public static boolean overwrite=true;
4965 	/** Permission to append to existing files */
4966 	public static boolean append=false;
4967 	/** Print speed statistics upon completion */
4968 	public static boolean showSpeed=true;
4969 	/** Display progress messages such as memory usage */
4970 	public static boolean DISPLAY_PROGRESS=true;
4971 	/** Number of ProcessThreads */
4972 	public static int THREADS=Shared.threads();
4973 	/** Indicates end of input stream */
4974 	static final ArrayList<Read> POISON=new ArrayList<Read>(0);
4975 	/** Number of columns for statistics output, 3 or 5 */
4976 	public static int STATS_COLUMNS=3;
4977 	/** Release memory used by kmer storage after processing reads */
4978 	public static boolean RELEASE_TABLES=true;
4979 	/** Max value of hitCount array */
4980 	public static final int HITCOUNT_LEN=1000;
4981 	/** Make unambiguous copies of ref sequences with ambiguous bases */
4982 	public static boolean REPLICATE_AMBIGUOUS=false;
4983 
4984 	/** Stores some data for statistics when running RQCFilter; not used otherwise. */
4985 	public static HashMap<String, Long> RQC_MAP=null;
4986 
4987 }
4988