1 package kmer;
2 
3 import java.io.File;
4 import java.util.ArrayList;
5 import java.util.Arrays;
6 import java.util.BitSet;
7 import java.util.concurrent.atomic.AtomicLong;
8 
9 import assemble.Contig;
10 import bloom.KmerCountAbstract;
11 import dna.AminoAcid;
12 import fileIO.ByteStreamWriter;
13 import fileIO.FileFormat;
14 import fileIO.ReadWrite;
15 import jgi.BBMerge;
16 import shared.Parse;
17 import shared.Parser;
18 import shared.PreParser;
19 import shared.Primes;
20 import shared.ReadStats;
21 import shared.Shared;
22 import shared.Timer;
23 import shared.Tools;
24 import shared.TrimRead;
25 import stream.ConcurrentReadInputStream;
26 import stream.FastaReadInputStream;
27 import stream.Read;
28 import structures.ByteBuilder;
29 import structures.IntList;
30 import structures.ListNum;
31 import structures.LongList;
32 import ukmer.Kmer;
33 
34 
35 /**
36  * Loads and holds kmers for Tadpole
37  * @author Brian Bushnell
38  * @date Jun 22, 2015
39  *
40  */
41 public class KmerTableSet extends AbstractKmerTableSet {
42 
43 	/**
44 	 * Code entrance from the command line.
45 	 * @param args Command line arguments
46 	 */
main(String[] args)47 	public static void main(String[] args){
48 		Timer t=new Timer(), t2=new Timer();
49 		t.start();
50 		t2.start();
51 		KmerTableSet set=new KmerTableSet(args);
52 		t2.stop();
53 		outstream.println("Initialization Time:      \t"+t2);
54 
55 		///And run it
56 		set.process(t);
57 
58 		//Close the print stream if it was redirected
59 		Shared.closeStream(outstream);
60 	}
61 
62 
63 	/**
64 	 * Constructor.
65 	 * @param args Command line arguments
66 	 */
KmerTableSet(String[] args)67 	private KmerTableSet(String[] args){
68 		this(args, 12);//+5 if using ownership and building contigs
69 	}
70 
71 
72 	/**
73 	 * Constructor.
74 	 * @param args Command line arguments
75 	 */
KmerTableSet(String[] args, final int bytesPerKmer_)76 	public KmerTableSet(String[] args, final int bytesPerKmer_){
77 		{//Preparse block for help, config files, and outstream
78 			PreParser pp=new PreParser(args, getClass(), false);
79 			args=pp.args;
80 			outstream=pp.outstream;
81 		}//TODO - no easy way to close outstream
82 
83 		/* Initialize local variables with defaults */
84 		Parser parser=new Parser();
85 		boolean prealloc_=false;
86 		int k_=31;
87 		int ways_=-1;
88 		int filterMax_=2;
89 		boolean ecco_=false, merge_=false;
90 		boolean rcomp_=true;
91 		double minProb_=defaultMinprob;
92 		int tableType_=AbstractKmerTable.ARRAY1D;
93 		/* Parse arguments */
94 		for(int i=0; i<args.length; i++){
95 
96 			final String arg=args[i];
97 			String[] split=arg.split("=");
98 			String a=split[0].toLowerCase();
99 			String b=split.length>1 ? split[1] : null;
100 
101 			if(Parser.parseCommonStatic(arg, a, b)){
102 				//do nothing
103 			}else if(Parser.parseZip(arg, a, b)){
104 				//do nothing
105 			}else if(Parser.parseQuality(arg, a, b)){
106 				//do nothing
107 			}else if(Parser.parseFasta(arg, a, b)){
108 				//do nothing
109 			}else if(parser.parseInterleaved(arg, a, b)){
110 				//do nothing
111 			}else if(parser.parseTrim(arg, a, b)){
112 				//do nothing
113 			}else if(a.equals("in") || a.equals("in1")){
114 				in1.clear();
115 				if(b!=null){
116 					String[] s=b.split(",");
117 					for(String ss : s){
118 						in1.add(ss);
119 					}
120 				}
121 			}else if(a.equals("in2")){
122 				in2.clear();
123 				if(b!=null){
124 					String[] s=b.split(",");
125 					for(String ss : s){
126 						in2.add(ss);
127 					}
128 				}
129 			}else if(a.equals("extra")){
130 				extra.clear();
131 				if(b!=null){
132 					String[] s=b.split(",");
133 					for(String ss : s){
134 						extra.add(ss);
135 					}
136 				}
137 			}else if(a.equals("append") || a.equals("app")){
138 				append=ReadStats.append=Parse.parseBoolean(b);
139 			}else if(a.equals("overwrite") || a.equals("ow")){
140 				overwrite=Parse.parseBoolean(b);
141 			}else if(a.equals("initialsize")){
142 				initialSize=Parse.parseIntKMG(b);
143 			}else if(a.equals("showstats") || a.equals("stats")){
144 				showStats=Parse.parseBoolean(b);
145 			}else if(a.equals("ways")){
146 				ways_=Parse.parseIntKMG(b);
147 			}else if(a.equals("buflen") || a.equals("bufflen") || a.equals("bufferlength")){
148 				buflen=Parse.parseIntKMG(b);
149 			}else if(a.equals("k")){
150 				assert(b!=null) : "\nk needs an integer value from 1 to 31, such as k=27.  Default is 31.\n";
151 				k_=Parse.parseIntKMG(b);
152 			}else if(a.equals("threads") || a.equals("t")){
153 				THREADS=(b==null || b.equalsIgnoreCase("auto") ? Shared.threads() : Integer.parseInt(b));
154 			}else if(a.equals("showspeed") || a.equals("ss")){
155 				showSpeed=Parse.parseBoolean(b);
156 			}else if(a.equals("ecco")){
157 				ecco_=Parse.parseBoolean(b);
158 			}else if(a.equals("merge")){
159 				merge_=Parse.parseBoolean(b);
160 			}else if(a.equals("verbose")){
161 //				assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
162 				verbose=Parse.parseBoolean(b);
163 			}else if(a.equals("verbose2")){
164 //				assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
165 				verbose2=Parse.parseBoolean(b);
166 			}else if(a.equals("minprob")){
167 				minProb_=Double.parseDouble(b);
168 			}else if(a.equals("minprobprefilter") || a.equals("mpp")){
169 				minProbPrefilter=Parse.parseBoolean(b);
170 			}else if(a.equals("minprobmain") || a.equals("mpm")){
171 				minProbMain=Parse.parseBoolean(b);
172 			}else if(a.equals("reads") || a.startsWith("maxreads")){
173 				maxReads=Parse.parseKMG(b);
174 			}else if(a.equals("prealloc") || a.equals("preallocate")){
175 				if(b==null || b.length()<1 || Character.isLetter(b.charAt(0))){
176 					prealloc_=Parse.parseBoolean(b);
177 				}else{
178 					preallocFraction=Tools.max(0, Double.parseDouble(b));
179 					prealloc_=(preallocFraction>0);
180 				}
181 			}else if(a.equals("prefilter")){
182 				if(b==null || b.length()<1 || !Tools.isDigit(b.charAt(0))){
183 					prefilter=Parse.parseBoolean(b);
184 				}else{
185 					filterMax_=Parse.parseIntKMG(b);
186 					prefilter=filterMax_>0;
187 				}
188 			}else if(a.equals("prefiltersize") || a.equals("prefilterfraction") || a.equals("pff")){
189 				prefilterFraction=Tools.max(0, Double.parseDouble(b));
190 				assert(prefilterFraction<=1) : "prefiltersize must be 0-1, a fraction of total memory.";
191 				prefilter=prefilterFraction>0;
192 			}else if(a.equals("prehashes") || a.equals("hashes")){
193 				prehashes=Parse.parseIntKMG(b);
194 			}else if(a.equals("prefilterpasses") || a.equals("prepasses")){
195 				assert(b!=null) : "Bad parameter: "+arg;
196 				if(b.equalsIgnoreCase("auto")){
197 					prepasses=-1;
198 				}else{
199 					prepasses=Parse.parseIntKMG(b);
200 				}
201 			}else if(a.equals("onepass")){
202 				onePass=Parse.parseBoolean(b);
203 			}else if(a.equals("passes")){
204 				int passes=Parse.parseIntKMG(b);
205 				onePass=(passes<2);
206 			}else if(a.equals("rcomp")){
207 				rcomp_=Parse.parseBoolean(b);
208 			}else if(a.equals("tabletype")){
209 				tableType_=Integer.parseInt(b);
210 			}
211 
212 			else if(a.equalsIgnoreCase("filterMemoryOverride") || a.equalsIgnoreCase("filterMemory") ||
213 					a.equalsIgnoreCase("prefilterMemory") || a.equalsIgnoreCase("filtermem")){
214 				filterMemoryOverride=Parse.parseKMG(b);
215 			}
216 
217 			else if(IGNORE_UNKNOWN_ARGS){
218 				//Do nothing
219 			}else{
220 				throw new RuntimeException("Unknown parameter "+args[i]);
221 			}
222 		}
223 
224 		{//Process parser fields
225 			Parser.processQuality();
226 
227 			qtrimLeft=parser.qtrimLeft;
228 			qtrimRight=parser.qtrimRight;
229 			trimq=parser.trimq;
230 			trimE=parser.trimE();
231 
232 			minAvgQuality=parser.minAvgQuality;
233 			minAvgQualityBases=parser.minAvgQualityBases;
234 			amino=Shared.AMINO_IN;
235 			if(amino){k_=Tools.min(k_, 12);}
236 		}
237 
238 		if(prepasses==0 || !prefilter){
239 			prepasses=0;
240 			prefilter=false;
241 		}
242 
243 
244 //		assert(false) : prepasses+", "+onePass+", "+prefilter;
245 
246 		{
247 			long memory=Runtime.getRuntime().maxMemory();
248 			double xmsRatio=Shared.xmsRatio();
249 //			long tmemory=Runtime.getRuntime().totalMemory();
250 			usableMemory=(long)Tools.max(((memory-96000000)*(xmsRatio>0.97 ? 0.82 : 0.72)), memory*0.45);
251 			if(prepasses==0 || !prefilter){
252 				filterMemory0=filterMemory1=0;
253 			}else if(filterMemoryOverride>0){
254 				filterMemory0=filterMemory1=filterMemoryOverride;
255 			}else{
256 				double low=Tools.min(prefilterFraction, 1-prefilterFraction);
257 				double high=1-low;
258 				if(prepasses<0 || (prepasses&1)==1){//odd passes
259 					filterMemory0=(long)(usableMemory*low);
260 					filterMemory1=(long)(usableMemory*high);
261 				}else{//even passes
262 					filterMemory0=(long)(usableMemory*high);
263 					filterMemory1=(long)(usableMemory*low);
264 				}
265 			}
266 			tableMemory=(long)(usableMemory*.95-filterMemory0);
267 		}
268 
269 		tableType=tableType_;
270 		prealloc=prealloc_;
271 		bytesPerKmer=bytesPerKmer_;
272 		if(ways_<1){
273 			long maxKmers=(2*tableMemory)/bytesPerKmer;
274 			long minWays=Tools.min(10000, maxKmers/Integer.MAX_VALUE);
275 			ways_=(int)Tools.max(31, (int)(THREADS*2.5), minWays);
276 			ways_=(int)Primes.primeAtLeast(ways_);
277 			assert(ways_>0);
278 //			System.err.println("ways="+ways_);
279 		}
280 
281 		/* Set final variables; post-process and validate argument combinations */
282 
283 		onePass=onePass&prefilter;
284 		ways=ways_;
285 		filterMax=Tools.min(filterMax_, 0x7FFFFFFF);
286 		ecco=ecco_;
287 		merge=merge_;
288 		minProb=(float)minProb_;
289 		rcomp=rcomp_;
290 //		assert(false) : tableMemory+", "+bytesPerKmer+", "+prealloc+", "+preallocFraction;
291 		estimatedKmerCapacity=(long)((tableMemory*1.0/bytesPerKmer)*((prealloc ? preallocFraction : 0.81))*(HashArray.maxLoadFactorFinal*0.97));
292 
293 //		System.err.println("tableMemory="+tableMemory+", bytesPerKmer="+bytesPerKmer+", estimatedKmerCapacity="+estimatedKmerCapacity);
294 		KmerCountAbstract.minProb=(minProbPrefilter ? minProb : 0);
295 		k=k_;
296 		k2=k-1;
297 		int bitsPerBase=(amino ? 5 : 2);
298 		int mask=(amino ? 31 : 3);
299 		coreMask=(AbstractKmerTableSet.MASK_CORE ? ~(((-1L)<<(bitsPerBase*(k_-1)))|mask) : -1L);
300 
301 		if(k<1 || k>31){throw new RuntimeException("\nk needs an integer value from 1 to 31, such as k=27.  Default is 31.\n");}
302 
303 		if(initialSize<1){
304 			final long memOverWays=tableMemory/(bytesPerKmer*ways);
305 			final double mem2=(prealloc ? preallocFraction : 1)*tableMemory;
306 			initialSize=(prealloc || memOverWays<initialSizeDefault ? (int)Tools.min(2142000000, (long)(mem2/(bytesPerKmer*ways))) : initialSizeDefault);
307 			if(initialSize!=initialSizeDefault){
308 				System.err.println("Initial size set to "+initialSize);
309 			}
310 		}
311 
312 		/* Adjust I/O settings and filenames */
313 
314 		assert(FastaReadInputStream.settingsOK());
315 
316 		if(in1.isEmpty()){
317 			//throw new RuntimeException("Error - at least one input file is required.");
318 		}
319 
320 		for(int i=0; i<in1.size(); i++){
321 			String s=in1.get(i);
322 			if(s!=null && s.contains("#") && !new File(s).exists()){
323 				int pound=s.lastIndexOf('#');
324 				String a=s.substring(0, pound);
325 				String b=s.substring(pound+1);
326 				in1.set(i, a+1+b);
327 				in2.add(a+2+b);
328 			}
329 		}
330 
331 		if(!extra.isEmpty()){
332 			ArrayList<String> temp=(ArrayList<String>) extra.clone();
333 			extra.clear();
334 			for(String s : temp){
335 				if(s!=null && s.contains("#") && !new File(s).exists()){
336 					int pound=s.lastIndexOf('#');
337 					String a=s.substring(0, pound);
338 					String b=s.substring(pound+1);
339 					extra.add(a);
340 					extra.add(b);
341 				}else{
342 					extra.add(s);
343 				}
344 			}
345 		}
346 
347 		{
348 			boolean allowDuplicates=true;
349 			if(!Tools.testInputFiles(allowDuplicates, true, in1, in2, extra)){
350 				throw new RuntimeException("\nCan't read some input files.\n");
351 			}
352 		}
353 		assert(THREADS>0);
354 
355 		if(DISPLAY_PROGRESS){
356 //			assert(false);
357 			outstream.println("Initial:");
358 			outstream.println("Ways="+ways+", initialSize="+initialSize+", prefilter="+(prefilter ? "t" : "f")+", prealloc="+(prealloc ? (""+preallocFraction) : "f"));
359 			Shared.printMemory();
360 			outstream.println();
361 		}
362 	}
363 
364 	/*--------------------------------------------------------------*/
365 	/*----------------         Outer Methods        ----------------*/
366 	/*--------------------------------------------------------------*/
367 
368 	@Override
clear()369 	public void clear(){
370 		tables=null;
371 	}
372 
373 	/*--------------------------------------------------------------*/
374 	/*----------------         Inner Methods        ----------------*/
375 	/*--------------------------------------------------------------*/
376 
377 	@Override
allocateTables()378 	public void allocateTables(){
379 		assert(tables==null);
380 		tables=null;
381 		final long coreMask=(MASK_CORE ? ~(((-1L)<<(2*(k-1)))|3) : -1L);
382 
383 		ScheduleMaker scheduleMaker=new ScheduleMaker(ways, bytesPerKmer, prealloc,
384 				(prealloc ? preallocFraction : 1.0), -1, (prefilter ? prepasses : 0), prefilterFraction, filterMemoryOverride);
385 		int[] schedule=scheduleMaker.makeSchedule();
386 		tables=AbstractKmerTable.preallocate(ways, tableType, schedule, coreMask);
387 
388 //		tables=AbstractKmerTable.preallocate(ways, tableType, initialSize, coreMask, (!prealloc || preallocFraction<1));
389 	}
390 
391 	/**
392 	 * Load reads into tables, using multiple LoadThread.
393 	 */
394 	@Override
loadKmers(String fname1, String fname2)395 	public long loadKmers(String fname1, String fname2){
396 
397 		/* Create read input stream */
398 		final ConcurrentReadInputStream cris;
399 		{
400 			FileFormat ff1=FileFormat.testInput(fname1, FileFormat.FASTQ, null, true, true);
401 			FileFormat ff2=FileFormat.testInput(fname2, FileFormat.FASTQ, null, true, true);
402 			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, false, ff1, ff2);
403 			cris.start(); //4567
404 		}
405 
406 //		/* Optionally skip the first reads, since initial reads may have lower quality */
407 //		if(skipreads>0){
408 //			long skipped=0;
409 //
410 //			ListNum<Read> ln=cris.nextList();
411 //			ArrayList<Read> reads=(ln!=null ? ln.list : null);
412 //
413 //			while(skipped<skipreads && reads!=null && reads.size()>0){
414 //				skipped+=reads.size();
415 //
416 //				cris.returnList(ln);
417 //				ln=cris.nextList();
418 //				reads=(ln!=null ? ln.list : null);
419 //			}
420 //			cris.returnList(ln);
421 //			if(reads==null || reads.isEmpty()){
422 //				ReadWrite.closeStreams(cris);
423 //				System.err.println("Skipped all of the reads.");
424 //				System.exit(0);
425 //			}
426 //		}
427 
428 		/* Create ProcessThreads */
429 		ArrayList<LoadThread> alpt=new ArrayList<LoadThread>(THREADS);
430 		for(int i=0; i<THREADS; i++){alpt.add(new LoadThread(cris));}
431 		for(LoadThread pt : alpt){pt.start();}
432 
433 		long added=0;
434 
435 		/* Wait for threads to die, and gather statistics */
436 		for(LoadThread pt : alpt){
437 			while(pt.getState()!=Thread.State.TERMINATED){
438 				try {
439 					pt.join();
440 				} catch (InterruptedException e) {
441 					// TODO Auto-generated catch block
442 					e.printStackTrace();
443 				}
444 			}
445 			added+=pt.added;
446 
447 			readsIn+=pt.readsInT;
448 			basesIn+=pt.basesInT;
449 			lowqReads+=pt.lowqReadsT;
450 			lowqBases+=pt.lowqBasesT;
451 			readsTrimmed+=pt.readsTrimmedT;
452 			basesTrimmed+=pt.basesTrimmedT;
453 			kmersIn+=pt.kmersInT;
454 		}
455 
456 		/* Shut down I/O streams; capture error status */
457 		errorState|=ReadWrite.closeStreams(cris);
458 		return added;
459 	}
460 
461 	/*--------------------------------------------------------------*/
462 	/*----------------         Inner Classes        ----------------*/
463 	/*--------------------------------------------------------------*/
464 
465 	/**
466 	 * Loads kmers.
467 	 */
468 	private class LoadThread extends Thread{
469 
470 		/**
471 		 * Constructor
472 		 * @param cris_ Read input stream
473 		 */
LoadThread(ConcurrentReadInputStream cris_)474 		public LoadThread(ConcurrentReadInputStream cris_){
475 			cris=cris_;
476 			table=new HashBuffer(tables, buflen, k, false, true);
477 		}
478 
479 		@Override
run()480 		public void run(){
481 			ListNum<Read> ln=cris.nextList();
482 			ArrayList<Read> reads=(ln!=null ? ln.list : null);
483 
484 			//While there are more reads lists...
485 			while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
486 
487 				//For each read (or pair) in the list...
488 				for(int i=0; i<reads.size(); i++){
489 					Read r1=reads.get(i);
490 					Read r2=r1.mate;
491 
492 					if(!r1.validated()){r1.validate(true);}
493 					if(r2!=null && !r2.validated()){r2.validate(true);}
494 
495 					if(verbose){System.err.println("Considering read "+r1.id+" "+new String(r1.bases));}
496 
497 					readsInT++;
498 					basesInT+=r1.length();
499 					if(r2!=null){
500 						readsInT++;
501 						basesInT+=r2.length();
502 					}
503 
504 					if(maxNs<Integer.MAX_VALUE){
505 						if(r1!=null && r1.countUndefined()>maxNs){r1.setDiscarded(true);}
506 						if(r2!=null && r2.countUndefined()>maxNs){r2.setDiscarded(true);}
507 					}
508 
509 					//Determine whether to discard the reads based on average quality
510 					if(minAvgQuality>0){
511 						if(r1!=null && r1.quality!=null && r1.avgQuality(false, minAvgQualityBases)<minAvgQuality){r1.setDiscarded(true);}
512 						if(r2!=null && r2.quality!=null && r2.avgQuality(false, minAvgQualityBases)<minAvgQuality){r2.setDiscarded(true);}
513 					}
514 
515 					if(r1!=null){
516 						if(qtrimLeft || qtrimRight){
517 							int x=TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq, trimE, 1, false);
518 							basesTrimmedT+=x;
519 							readsTrimmedT+=(x>0 ? 1 : 0);
520 						}
521 						if(r1.length()<k){r1.setDiscarded(true);}
522 					}
523 					if(r2!=null){
524 						if(qtrimLeft || qtrimRight){
525 							int x=TrimRead.trimFast(r2, qtrimLeft, qtrimRight, trimq, trimE, 1, false);
526 							basesTrimmedT+=x;
527 							readsTrimmedT+=(x>0 ? 1 : 0);
528 						}
529 						if(r2.length()<k){r2.setDiscarded(true);}
530 					}
531 
532 					if((ecco || merge) && r1!=null && r2!=null && !r1.discarded() && !r2.discarded()){
533 						if(merge){
534 							final int insert=BBMerge.findOverlapStrict(r1, r2, false);
535 							if(insert>0){
536 								r2.reverseComplement();
537 								r1=r1.joinRead(insert);
538 								r2=null;
539 							}
540 						}else if(ecco){
541 							BBMerge.findOverlapStrict(r1, r2, true);
542 						}
543 					}
544 
545 					if(minLen>0){
546 						if(r1!=null && r1.length()<minLen){r1.setDiscarded(true);}
547 						if(r2!=null && r2.length()<minLen){r2.setDiscarded(true);}
548 					}
549 
550 					if(r1!=null){
551 						if(r1.discarded()){
552 							lowqBasesT+=r1.length();
553 							lowqReadsT++;
554 						}else{
555 							long temp=addKmersToTable(r1);
556 							added+=temp;
557 							if(verbose){System.err.println("A: Added "+temp);}
558 						}
559 					}
560 					if(r2!=null){
561 						if(r2.discarded()){
562 							lowqBasesT+=r2.length();
563 							lowqReadsT++;
564 						}else{
565 							long temp=addKmersToTable(r2);
566 							added+=temp;
567 							if(verbose){System.err.println("B: Added "+temp);}
568 						}
569 					}
570 				}
571 
572 				//Fetch a new read list
573 				cris.returnList(ln);
574 				ln=cris.nextList();
575 				reads=(ln!=null ? ln.list : null);
576 			}
577 			cris.returnList(ln);
578 			long temp=table.flush();
579 			if(verbose){System.err.println("Flush: Added "+temp);}
580 			added+=temp;
581 		}
582 
addKmersToTableAA(final Read r)583 		private final int addKmersToTableAA(final Read r){
584 			if(r==null || r.bases==null){return 0;}
585 			final float minProb2=(minProbMain ? minProb : 0);
586 			final byte[] bases=r.bases;
587 			final byte[] quals=r.quality;
588 			final int shift=5*k;
589 			final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
590 			long kmer=0;
591 			int created=0;
592 			int len=0;
593 
594 			if(bases==null || bases.length<k){return -1;}
595 
596 			/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
597 			float prob=1;
598 			for(int i=0; i<bases.length; i++){
599 				final byte b=bases[i];
600 				final long x=AminoAcid.acidToNumber[b];
601 
602 				//Update kmers
603 				kmer=((kmer<<5)|x)&mask;
604 
605 				if(minProb2>0 && quals!=null){//Update probability
606 					prob=prob*PROB_CORRECT[quals[i]];
607 					if(len>k){
608 						byte oldq=quals[i-k];
609 						prob=prob*PROB_CORRECT_INVERSE[oldq];
610 					}
611 				}
612 
613 				//Handle Ns
614 				if(x<0){
615 					len=0;
616 					kmer=0;
617 					prob=1;
618 				}else{len++;}
619 
620 				if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
621 				if(len>=k && prob>=minProb2){
622 					kmersInT++;
623 					final long key=kmer;
624 					if(!prefilter || prefilterArray.read(key)>filterMax2){
625 						int temp=table.incrementAndReturnNumCreated(key, 1);
626 						created+=temp;
627 						if(verbose){System.err.println("C: Added "+temp);}
628 					}
629 				}
630 			}
631 
632 			return created;
633 		}
634 
addKmersToTable(final Read r)635 		private final int addKmersToTable(final Read r){
636 			if(amino){return addKmersToTableAA(r);}
637 			if(onePass){return addKmersToTable_onePass(r);}
638 			if(r==null || r.bases==null){return 0;}
639 			final float minProb2=(minProbMain ? minProb : 0);
640 			final byte[] bases=r.bases;
641 			final byte[] quals=r.quality;
642 			final int shift=2*k;
643 			final int shift2=shift-2;
644 			final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
645 			long kmer=0;
646 			long rkmer=0;
647 			int created=0;
648 			int len=0;
649 
650 			if(bases==null || bases.length<k){return -1;}
651 
652 			/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
653 			float prob=1;
654 			for(int i=0; i<bases.length; i++){
655 				final byte b=bases[i];
656 				final long x=AminoAcid.baseToNumber[b];
657 				final long x2=AminoAcid.baseToComplementNumber[b];
658 
659 				//Update kmers
660 				kmer=((kmer<<2)|x)&mask;
661 				rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
662 
663 				if(minProb2>0 && quals!=null){//Update probability
664 					prob=prob*PROB_CORRECT[quals[i]];
665 					if(len>k){
666 						byte oldq=quals[i-k];
667 						prob=prob*PROB_CORRECT_INVERSE[oldq];
668 					}
669 				}
670 
671 				//Handle Ns
672 				if(x<0){
673 					len=0;
674 					kmer=rkmer=0;
675 					prob=1;
676 				}else{len++;}
677 
678 				if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
679 				if(len>=k && prob>=minProb2){
680 					kmersInT++;
681 					final long key=toValue(kmer, rkmer);
682 					if(!prefilter || prefilterArray.read(key)>filterMax2){
683 						int temp=table.incrementAndReturnNumCreated(key, 1);
684 						created+=temp;
685 						if(verbose){System.err.println("C: Added "+temp);}
686 					}
687 				}
688 			}
689 
690 			return created;
691 		}
692 
693 
addKmersToTable_onePass(final Read r)694 		private final int addKmersToTable_onePass(final Read r){
695 			assert(prefilter);
696 			if(r==null || r.bases==null){return 0;}
697 			final byte[] bases=r.bases;
698 			final byte[] quals=r.quality;
699 			final int shift=2*k;
700 			final int shift2=shift-2;
701 			final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
702 			long kmer=0;
703 			long rkmer=0;
704 			int created=0;
705 			int len=0;
706 
707 			if(bases==null || bases.length<k){return -1;}
708 
709 			/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
710 			float prob=1;
711 			for(int i=0; i<bases.length; i++){
712 				final byte b=bases[i];
713 				final long x=AminoAcid.baseToNumber[b];
714 				final long x2=AminoAcid.baseToComplementNumber[b];
715 
716 				//Update kmers
717 				kmer=((kmer<<2)|x)&mask;
718 				rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
719 
720 				if(minProb>0 && quals!=null){//Update probability
721 					prob=prob*PROB_CORRECT[quals[i]];
722 					if(len>k){
723 						byte oldq=quals[i-k];
724 						prob=prob*PROB_CORRECT_INVERSE[oldq];
725 					}
726 				}
727 
728 				//Handle Ns
729 				if(x<0){
730 					len=0;
731 					kmer=rkmer=0;
732 					prob=1;
733 				}else{len++;}
734 
735 				if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
736 				if(len>=k){
737 					final long key=toValue(kmer, rkmer);
738 					int count=prefilterArray.incrementAndReturnUnincremented(key, 1);
739 					if(count>=filterMax2){
740 						int temp=table.incrementAndReturnNumCreated(key, 1);
741 						created+=temp;
742 						if(verbose){System.err.println("D: Added "+temp);}
743 					}
744 				}
745 			}
746 			return created;
747 		}
748 
749 		/*--------------------------------------------------------------*/
750 
751 		/** Input read stream */
752 		private final ConcurrentReadInputStream cris;
753 
754 		private final HashBuffer table;
755 
756 		public long added=0;
757 
758 		private long readsInT=0;
759 		private long basesInT=0;
760 		private long lowqReadsT=0;
761 		private long lowqBasesT=0;
762 		private long readsTrimmedT=0;
763 		private long basesTrimmedT=0;
764 		private long kmersInT=0;
765 
766 	}
767 
768 
769 	/*--------------------------------------------------------------*/
770 	/*----------------          Convenience         ----------------*/
771 	/*--------------------------------------------------------------*/
772 
regenerateKmers(byte[] bases, LongList kmers, IntList counts, final int a)773 	public void regenerateKmers(byte[] bases, LongList kmers, IntList counts, final int a){
774 		final int loc=a+k;
775 		final int lim=Tools.min(counts.size, a+k+1);
776 		final int shift=2*k;
777 		final int shift2=shift-2;
778 		final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
779 		long kmer=kmers.get(a);
780 		long rkmer=rcomp(kmer);
781 		int len=k;
782 
783 //		assert(false) : a+", "+loc+", "+lim;
784 
785 		/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
786 		for(int i=loc, j=a+1; j<lim; i++, j++){
787 			final byte b=bases[i];
788 			final long x=AminoAcid.baseToNumber[b];
789 			final long x2=AminoAcid.baseToComplementNumber[b];
790 			kmer=((kmer<<2)|x)&mask;
791 			rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
792 			if(x<0){
793 				len=0;
794 				kmer=rkmer=0;
795 			}else{len++;}
796 
797 			if(len>=k){
798 				assert(kmers.get(j)!=kmer);
799 				kmers.set(j, kmer);
800 				int count=getCount(kmer, rkmer);
801 				counts.set(j, count);
802 			}else{
803 				kmers.set(j, -1);
804 				counts.set(j, 0);
805 			}
806 		}
807 	}
808 
809 	@Override
regenerateCounts(byte[] bases, IntList counts, final Kmer dummy, BitSet changed)810 	public int regenerateCounts(byte[] bases, IntList counts, final Kmer dummy, BitSet changed){
811 		assert(!changed.isEmpty());
812 		final int firstBase=changed.nextSetBit(0), lastBase=changed.length()-1;
813 		final int ca=firstBase-k;
814 //		final int b=changed.nextSetBit(0);ca+kbig; //first base changed
815 		final int firstCount=Tools.max(firstBase-k+1, 0), lastCount=Tools.min(counts.size-1, lastBase); //count limit
816 //		System.err.println("ca="+ca+", b="+b+", lim="+lim);
817 //		System.err.println("Regen from count "+(ca+1)+"-"+lim);
818 
819 		final int shift=2*k;
820 		final int shift2=shift-2;
821 		final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
822 		long kmer=0, rkmer=0;
823 		int len=0;
824 		int valid=0;
825 
826 //		System.err.println("ca="+ca+", b="+b+", lim="+lim+", "+counts);
827 
828 		for(int i=Tools.max(0, firstBase-k+1), lim=Tools.min(lastBase+k-1, bases.length-1); i<=lim; i++){
829 			final byte base=bases[i];
830 			final long x=AminoAcid.baseToNumber[base];
831 			final long x2=AminoAcid.baseToComplementNumber[base];
832 			kmer=((kmer<<2)|x)&mask;
833 			rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
834 
835 			if(x<0){
836 				len=0;
837 				kmer=rkmer=0;
838 			}else{
839 				len++;
840 			}
841 
842 			final int c=i-k+1;
843 			if(i>=firstBase){
844 				if(len>=k){
845 					valid++;
846 					int count=getCount(kmer, rkmer);
847 					counts.set(c, count);
848 				}else if(c>=0){
849 					counts.set(c, 0);
850 				}
851 			}
852 		}
853 
854 		return valid;
855 	}
856 
857 	@Override
fillSpecificCounts(byte[] bases, IntList counts, BitSet positions, final Kmer dummy)858 	public int fillSpecificCounts(byte[] bases, IntList counts, BitSet positions, final Kmer dummy){
859 		final int b=k-1;
860 		final int lim=(positions==null ? bases.length : Tools.min(bases.length, positions.length()+k-1));
861 		final int shift=2*k;
862 		final int shift2=shift-2;
863 		final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
864 		long kmer=0, rkmer=0;
865 		int len=0;
866 		int valid=0;
867 
868 		counts.clear();
869 
870 		for(int i=0, j=-b; i<lim; i++, j++){
871 			final byte base=bases[i];
872 			final long x=AminoAcid.baseToNumber[base];
873 			final long x2=AminoAcid.baseToComplementNumber[base];
874 			kmer=((kmer<<2)|x)&mask;
875 			rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
876 
877 			if(x<0){
878 				len=0;
879 				kmer=rkmer=0;
880 			}else{
881 				len++;
882 			}
883 
884 			if(j>=0){
885 				if(len>=k && (positions==null || positions.get(j))){
886 					valid++;
887 					int count=getCount(kmer, rkmer);
888 					assert(i-k+1==counts.size()) : "j="+j+", counts="+counts.size()+", b="+(b)+", (i-k+1)="+(i-k+1);
889 					assert(j==counts.size());
890 					counts.add(Tools.max(count, 0));
891 //					counts.set(i-k+1, count);
892 				}else{
893 					counts.add(0);
894 //					counts.set(i-k+1, 0);
895 				}
896 			}
897 		}
898 
899 //		assert((counts.size==0 && bases.length<k) || counts.size==bases.length-k+1) : bases.length+", "+k+", "+counts.size;
900 		assert((counts.size==0 && bases.length<k) || counts.size==lim-k+1) : bases.length+", "+k+", "+counts.size;
901 
902 		return valid;
903 	}
904 
regenerateCounts(byte[] bases, IntList counts, final int ca)905 	public int regenerateCounts(byte[] bases, IntList counts, final int ca){
906 		final int b=ca+k-1;
907 		final int lim=Tools.min(bases.length, b+k+1);
908 		final int shift=2*k;
909 		final int shift2=shift-2;
910 		final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
911 		long kmer=0, rkmer=0;
912 		int len=0;
913 		int valid=0;
914 
915 		final int clen=counts.size;
916 
917 		/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts.
918 		 * i is an index in the base array, j is an index in the count array. */
919 		for(int i=b-k+1; i<lim; i++){
920 			final byte base=bases[i];
921 			final long x=AminoAcid.baseToNumber[base];
922 			final long x2=AminoAcid.baseToComplementNumber[base];
923 			kmer=((kmer<<2)|x)&mask;
924 			rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
925 
926 			if(x<0){
927 				len=0;
928 				kmer=rkmer=0;
929 			}else{
930 				len++;
931 			}
932 
933 			if(i>=b){
934 				if(len>=k){
935 					valid++;
936 					int count=getCount(kmer, rkmer);
937 					counts.set(i-k+1, count);
938 				}else{
939 					counts.set(i-k+1, 0);
940 				}
941 			}
942 		}
943 
944 		assert((counts.size==0 && bases.length<k) || counts.size==bases.length-k+1) : bases.length+", "+k+", "+counts.size;
945 		assert(clen==counts.size);
946 
947 		return valid;
948 	}
949 
950 	/** Returns number of valid kmers */
fillKmers(byte[] bases, LongList kmers)951 	public int fillKmers(byte[] bases, LongList kmers){
952 		final int blen=bases.length;
953 		if(blen<k){return 0;}
954 		final int min=k-1;
955 		final int shift=2*k;
956 		final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
957 		long kmer=0;
958 		int len=0;
959 		int valid=0;
960 
961 		kmers.clear();
962 
963 		/* Loop through the bases, maintaining a forward kmer via bitshifts */
964 		for(int i=0; i<blen; i++){
965 			final byte b=bases[i];
966 			assert(b>=0) : Arrays.toString(bases);
967 			final long x=AminoAcid.baseToNumber[b];
968 			kmer=((kmer<<2)|x)&mask;
969 			if(x<0){
970 				len=0;
971 				kmer=0;
972 			}else{len++;}
973 			if(i>=min){
974 				if(len>=k){
975 					kmers.add(kmer);
976 					valid++;
977 				}else{
978 					kmers.add(-1);
979 				}
980 			}
981 		}
982 		return valid;
983 	}
984 
fillCounts(LongList kmers, IntList counts)985 	public void fillCounts(LongList kmers, IntList counts){
986 		counts.clear();
987 		for(int i=0; i<kmers.size; i++){
988 			long kmer=kmers.get(i);
989 			if(kmer>=0){
990 				long rkmer=rcomp(kmer);
991 				int count=getCount(kmer, rkmer);
992 				counts.add(count);
993 			}else{
994 				counts.add(0);
995 			}
996 		}
997 	}
998 
999 
1000 	/*--------------------------------------------------------------*/
1001 	/*----------------        Helper Methods        ----------------*/
1002 	/*--------------------------------------------------------------*/
1003 
1004 	@Override
regenerate(final int limit)1005 	public long regenerate(final int limit){
1006 		long sum=0;
1007 		for(AbstractKmerTable akt : tables){
1008 			sum+=akt.regenerate(limit);
1009 		}
1010 		return sum;
1011 	}
1012 
getTableForKey(long key)1013 	public HashArray1D getTableForKey(long key){
1014 		return (HashArray1D) tables[kmerToWay(key)];
1015 	}
1016 
1017 	@Override
getTable(int tnum)1018 	public HashArray1D getTable(int tnum){
1019 		return (HashArray1D) tables[tnum];
1020 	}
1021 
1022 	@Override
fillHistogram(int histMax)1023 	public long[] fillHistogram(int histMax) {
1024 		return HistogramMaker.fillHistogram(tables, histMax);
1025 	}
1026 
1027 	@Override
countGC(long[] gcCounts, int max)1028 	public void countGC(long[] gcCounts, int max) {
1029 		for(AbstractKmerTable set : tables){
1030 			set.countGC(gcCounts, max);
1031 		}
1032 	}
1033 
1034 	@Override
initializeOwnership()1035 	public void initializeOwnership(){
1036 		OwnershipThread.initialize(tables);
1037 	}
1038 
1039 	@Override
clearOwnership()1040 	public void clearOwnership(){
1041 		OwnershipThread.clear(tables);
1042 	}
1043 
rightmostKmer(final ByteBuilder bb)1044 	public long rightmostKmer(final ByteBuilder bb){
1045 		return rightmostKmer(bb.array, bb.length());
1046 	}
1047 
rightmostKmer(final byte[] bases, final int blen)1048 	public long rightmostKmer(final byte[] bases, final int blen){
1049 		if(blen<k){return -1;}
1050 		final int shift=2*k;
1051 		final int shift2=shift-2;
1052 		final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
1053 		long kmer=0;
1054 		long rkmer=0;
1055 		int len=0;
1056 
1057 		/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts, to get the rightmost kmer */
1058 		{
1059 			for(int i=blen-k; i<blen; i++){
1060 				final byte b=bases[i];
1061 				final long x=AminoAcid.baseToNumber[b];
1062 				final long x2=AminoAcid.baseToComplementNumber[b];
1063 				kmer=((kmer<<2)|x)&mask;
1064 				rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
1065 				if(x<0){
1066 					len=0;
1067 					kmer=rkmer=0;
1068 				}else{len++;}
1069 				if(verbose){outstream.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
1070 			}
1071 		}
1072 
1073 		if(len<k){return -1;}
1074 		else{assert(len==k);}
1075 		return kmer;
1076 	}
1077 
leftmostKmer(final ByteBuilder bb)1078 	public long leftmostKmer(final ByteBuilder bb){
1079 		return leftmostKmer(bb.array, bb.length());
1080 	}
1081 
leftmostKmer(final byte[] bases, final int blen)1082 	public long leftmostKmer(final byte[] bases, final int blen){
1083 		if(blen<k){return -1;}
1084 		final int shift=2*k;
1085 		final int shift2=shift-2;
1086 		final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
1087 		long kmer=0;
1088 		long rkmer=0;
1089 		int len=0;
1090 
1091 		/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts, to get the leftmost kmer */
1092 		{
1093 			for(int i=0; i<k; i++){
1094 				final byte b=bases[i];
1095 				final long x=AminoAcid.baseToNumber[b];
1096 				final long x2=AminoAcid.baseToComplementNumber[b];
1097 				kmer=((kmer<<2)|x)&mask;
1098 				rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
1099 				if(x<0){
1100 					len=0;
1101 					kmer=rkmer=0;
1102 				}else{len++;}
1103 				if(verbose){outstream.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
1104 			}
1105 		}
1106 
1107 		if(len<k){return -1;}
1108 		else{assert(len==k);}
1109 		return kmer;
1110 	}
1111 
doubleClaim(final ByteBuilder bb, final int id)1112 	public boolean doubleClaim(final ByteBuilder bb, final int id){
1113 		return doubleClaim(bb.array, bb.length(), id);
1114 	}
1115 
1116 	/** Ensures there can be only one owner. */
doubleClaim(final byte[] bases, final int blength, final int id)1117 	public boolean doubleClaim(final byte[] bases, final int blength, final int id){
1118 		boolean success=claim(bases, blength, id, true);
1119 		if(verbose){outstream.println("success1="+success+", id="+id+", blength="+blength);}
1120 		if(!success){return false;}
1121 		success=claim(bases, blength, id+CLAIM_OFFSET, true);
1122 		if(verbose){outstream.println("success2="+success+", id="+id+", blength="+blength);}
1123 		return success;
1124 	}
1125 
claim(final ByteBuilder bb, final int id, final boolean exitEarly)1126 	public boolean claim(final ByteBuilder bb, final int id, final boolean exitEarly){
1127 		return claim(bb.array, bb.length(), id, exitEarly);
1128 	}
1129 
calcCoverage(final byte[] bases, final int blength)1130 	public float calcCoverage(final byte[] bases, final int blength){
1131 		final int shift=2*k;
1132 		final int shift2=shift-2;
1133 		final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
1134 		long kmer=0, rkmer=0;
1135 		int len=0;
1136 		long sum=0, max=0;
1137 		int kmers=0;
1138 
1139 		/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
1140 		for(int i=0; i<blength; i++){
1141 			final byte b=bases[i];
1142 			final long x=AminoAcid.baseToNumber[b];
1143 			final long x2=AminoAcid.baseToComplementNumber[b];
1144 			kmer=((kmer<<2)|x)&mask;
1145 			rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
1146 			if(x<0){
1147 				len=0;
1148 				kmer=rkmer=0;
1149 			}else{len++;}
1150 			if(len>=k){
1151 				int count=getCount(kmer, rkmer);
1152 				sum+=count;
1153 				max=Tools.max(count, max);
1154 				kmers++;
1155 			}
1156 		}
1157 		return sum==0 ? 0 : sum/(float)kmers;
1158 	}
1159 
calcCoverage(final Contig contig)1160 	public float calcCoverage(final Contig contig){
1161 		final byte[] bases=contig.bases;
1162 		if(bases.length<k){return 0;}
1163 		final int shift=2*k;
1164 		final int shift2=shift-2;
1165 		final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
1166 		long kmer=0, rkmer=0;
1167 		int len=0;
1168 		long sum=0;
1169 		int max=0, min=Integer.MAX_VALUE;
1170 		int kmers=0;
1171 
1172 		/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
1173 		for(int i=0; i<bases.length; i++){
1174 			final byte b=bases[i];
1175 			final long x=AminoAcid.baseToNumber[b];
1176 			final long x2=AminoAcid.baseToComplementNumber[b];
1177 			kmer=((kmer<<2)|x)&mask;
1178 			rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
1179 			if(x<0){
1180 				len=0;
1181 				kmer=rkmer=0;
1182 			}else{len++;}
1183 			if(len>=k){
1184 				int count=getCount(kmer, rkmer);
1185 				sum+=count;
1186 				max=Tools.max(count, max);
1187 				min=Tools.min(count, min);
1188 				kmers++;
1189 			}
1190 		}
1191 		contig.coverage=sum==0 ? 0 : sum/(float)kmers;
1192 		contig.maxCov=max;
1193 		contig.minCov=sum==0 ? 0 : min;
1194 		return contig.coverage;
1195 	}
1196 
claim(final byte[] bases, final int blength, final int id, boolean exitEarly)1197 	public boolean claim(final byte[] bases, final int blength, final int id, boolean exitEarly){
1198 		if(verbose){outstream.println("Thread "+id+" claim start.");}
1199 		final int shift=2*k;
1200 		final int shift2=shift-2;
1201 		final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
1202 		long kmer=0, rkmer=0;
1203 		int len=0;
1204 		boolean success=true;
1205 		/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
1206 		for(int i=0; i<blength && success; i++){
1207 			final byte b=bases[i];
1208 			final long x=AminoAcid.baseToNumber[b];
1209 			final long x2=AminoAcid.baseToComplementNumber[b];
1210 			kmer=((kmer<<2)|x)&mask;
1211 			rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
1212 			if(x<0){
1213 				len=0;
1214 				kmer=rkmer=0;
1215 			}else{len++;}
1216 			if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
1217 			if(len>=k){
1218 				success=claim(kmer, rkmer, id/*, rid, i*/);
1219 				success=(success || !exitEarly);
1220 			}
1221 		}
1222 		return success;
1223 	}
1224 
claim(final long kmer, final long rkmer, final int id )1225 	public boolean claim(final long kmer, final long rkmer, final int id/*, final long rid, final int pos*/){
1226 		//TODO: rid and pos are just for debugging.
1227 		final long key=toValue(kmer, rkmer);
1228 		final int way=kmerToWay(key);
1229 		final AbstractKmerTable table=tables[way];
1230 		final int count=table.getValue(key);
1231 		assert(count==-1 || count>0) : count;
1232 //		if(verbose  /*|| true*/){outstream.println("Count="+count+".");}
1233 		if(count<0){return true;}
1234 		assert(count>0) : count;
1235 		final int owner=table.setOwner(key, id);
1236 		if(verbose){outstream.println("owner="+owner+".");}
1237 //		assert(owner==id) : id+", "+owner+", "+rid+", "+pos;
1238 		return owner==id;
1239 	}
1240 
release(ByteBuilder bb, final int id)1241 	public void release(ByteBuilder bb, final int id){
1242 		release(bb.array, bb.length(), id);
1243 	}
1244 
release(final byte[] bases, final int blength, final int id)1245 	public void release(final byte[] bases, final int blength, final int id){
1246 		if(verbose  /*|| true*/){outstream.println("*Thread "+id+" release start.");}
1247 		final int shift=2*k;
1248 		final int shift2=shift-2;
1249 		final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
1250 		long kmer=0, rkmer=0;
1251 		int len=0;
1252 		/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
1253 		for(int i=0; i<blength; i++){
1254 			final byte b=bases[i];
1255 			final long x=AminoAcid.baseToNumber[b];
1256 			final long x2=AminoAcid.baseToComplementNumber[b];
1257 			kmer=((kmer<<2)|x)&mask;
1258 			rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
1259 			if(x<0){
1260 				len=0;
1261 				kmer=rkmer=0;
1262 			}else{len++;}
1263 			if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
1264 			if(len>=k){
1265 				release(kmer, rkmer, id);
1266 			}
1267 		}
1268 	}
1269 
release(final long kmer, final long rkmer, final int id)1270 	public boolean release(final long kmer, final long rkmer, final int id){
1271 		return release(toValue(kmer, rkmer), id);
1272 	}
1273 
release(final long key, final int id)1274 	public boolean release(final long key, final int id){
1275 		final int way=kmerToWay(key);
1276 		final AbstractKmerTable table=tables[way];
1277 		final int count=table.getValue(key);
1278 //		if(verbose  /*|| true*/){outstream.println("Count="+count+".");}
1279 		if(count<1){return true;}
1280 		return table.clearOwner(key, id);
1281 	}
1282 
findOwner(ByteBuilder bb, final int id)1283 	public int findOwner(ByteBuilder bb, final int id){
1284 		return findOwner(bb.array, bb.length(), id);
1285 	}
1286 
findOwner(final byte[] bases, final int blength, final int id)1287 	public int findOwner(final byte[] bases, final int blength, final int id){
1288 		final int shift=2*k;
1289 		final int shift2=shift-2;
1290 		final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
1291 		long kmer=0, rkmer=0;
1292 		int len=0;
1293 		int maxOwner=-1;
1294 		/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
1295 		for(int i=0; i<blength; i++){
1296 			final byte b=bases[i];
1297 			final long x=AminoAcid.baseToNumber[b];
1298 			final long x2=AminoAcid.baseToComplementNumber[b];
1299 			kmer=((kmer<<2)|x)&mask;
1300 			rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
1301 			if(x<0){
1302 				len=0;
1303 				kmer=rkmer=0;
1304 			}else{len++;}
1305 			if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
1306 			if(len>=k){
1307 				int owner=findOwner(kmer, rkmer);
1308 				maxOwner=Tools.max(owner, maxOwner);
1309 				if(maxOwner>id){break;}
1310 			}
1311 		}
1312 		return maxOwner;
1313 	}
1314 
findOwner(final long kmer)1315 	public int findOwner(final long kmer){
1316 		return findOwner(kmer, rcomp(kmer));
1317 	}
1318 
findOwner(final long kmer, final long rkmer)1319 	public int findOwner(final long kmer, final long rkmer){
1320 		final long key=toValue(kmer, rkmer);
1321 		final int way=kmerToWay(key);
1322 		final AbstractKmerTable table=tables[way];
1323 		final int count=table.getValue(key);
1324 		if(count<0){return -1;}
1325 		final int owner=table.getOwner(key);
1326 		return owner;
1327 	}
1328 
getCount(long kmer, long rkmer)1329 	public int getCount(long kmer, long rkmer){
1330 		long key=toValue(kmer, rkmer);
1331 		int way=kmerToWay(key);
1332 		return tables[way].getValue(key);
1333 	}
1334 
getCount(long key)1335 	public int getCount(long key){
1336 		int way=kmerToWay(key);
1337 		return tables[way].getValue(key);
1338 	}
1339 
1340 	/*--------------------------------------------------------------*/
1341 	/*----------------          Fill Counts         ----------------*/
1342 	/*--------------------------------------------------------------*/
1343 
fillRightCounts(long kmer, long rkmer, int[] counts, long mask, int shift2)1344 	public int fillRightCounts(long kmer, long rkmer, int[] counts, long mask, int shift2){
1345 		if(FAST_FILL && MASK_CORE && k>2/*((k&1)==1)*/){
1346 			return fillRightCounts_fast(kmer, rkmer, counts, mask, shift2);
1347 		}else{
1348 			return fillRightCounts_safe(kmer, rkmer, counts, mask, shift2);
1349 		}
1350 	}
1351 
fillRightCounts_fast(final long kmer0, final long rkmer0, int[] counts, long mask, int shift2)1352 	public int fillRightCounts_fast(final long kmer0, final long rkmer0, int[] counts,
1353 			long mask, int shift2){
1354 //		assert((k&1)==1) : k;
1355 		assert(MASK_CORE);
1356 		final long kmer=(kmer0<<2)&mask;
1357 		final long rkmer=(rkmer0>>>2);
1358 		final long a=kmer&coreMask, b=rkmer&coreMask;
1359 
1360 		int max=-1, maxPos=0;
1361 		if(a==b){
1362 			return fillRightCounts_safe(kmer0, rkmer0, counts, mask, shift2);
1363 		}else if(a>b){
1364 //			return fillRightCounts_safe(kmer0, rkmer0, counts, mask, shift2);
1365 			final long core=a;
1366 			final int way=kmerToWay(core);
1367 //			final AbstractKmerTable table=tables[way];
1368 			final HashArray table=(HashArray) tables[way];
1369 			final int cell=table.kmerToCell(kmer);
1370 			for(int i=0; i<=3; i++){
1371 				final long key=kmer|i;
1372 				final int count=Tools.max(0, table.getValue(key, cell));
1373 //				final int count=Tools.max(0, table.getValue(key));
1374 				counts[i]=count;
1375 				if(count>max){
1376 					max=count;
1377 					maxPos=i;
1378 				}
1379 			}
1380 		}else{
1381 //			return fillRightCounts_safe(kmer0, rkmer0, counts, mask, shift2);
1382 			//Use a higher increment for the left bits
1383 			final long core=b;
1384 			final int way=kmerToWay(core);
1385 //			final AbstractKmerTable table=tables[way];
1386 			final HashArray table=(HashArray) tables[way];
1387 			final int cell=table.kmerToCell(rkmer);
1388 			final long incr=1L<<(2*(k-1));
1389 			long j=3*incr;
1390 			for(int i=0; i<=3; i++, j-=incr){
1391 				final long key=rkmer|j;
1392 				final int count=Tools.max(0, table.getValue(key, cell));
1393 //				final int count=Tools.max(0, table.getValue(key));
1394 				counts[i]=count;
1395 				if(count>max){
1396 					max=count;
1397 					maxPos=i;
1398 				}
1399 			}
1400 		}
1401 		return maxPos;
1402 	}
1403 
1404 	//TODO: Change this to take advantage of coreMask
1405 	//Requires special handling of core palindromes.
1406 	//Thus it would be easiest to just handle odd kmers, and K is normally 31 anyway.
fillRightCounts_safe(long kmer, long rkmer, int[] counts, long mask, int shift2)1407 	public int fillRightCounts_safe(long kmer, long rkmer, int[] counts, long mask, int shift2){
1408 		assert(kmer==rcomp(rkmer));
1409 		if(verbose){outstream.println("fillRightCounts:   "+toText(kmer)+",   "+toText(rkmer));}
1410 		kmer=(kmer<<2)&mask;
1411 		rkmer=(rkmer>>>2);
1412 		int max=-1, maxPos=0;
1413 
1414 		for(int i=0; i<=3; i++){
1415 			long kmer2=kmer|((long)i);
1416 			long rkmer2=rkmer|(((long)AminoAcid.numberToComplement[i])<<shift2);
1417 			if(verbose){outstream.println("kmer:               "+toText(kmer2)+", "+toText(rkmer2));}
1418 			assert(kmer2==(kmer2&mask));
1419 			assert(rkmer2==(rkmer2&mask));
1420 			assert(kmer2==rcomp(rkmer2));
1421 			long key=toValue(kmer2, rkmer2);
1422 			int way=kmerToWay(key);
1423 			int count=tables[way].getValue(key);
1424 			assert(count==NOT_PRESENT || count>=0);
1425 			count=Tools.max(count, 0);
1426 			counts[i]=count;
1427 			if(count>max){
1428 				max=count;
1429 				maxPos=i;
1430 			}
1431 		}
1432 		return maxPos;
1433 	}
1434 
1435 	/** For KmerCompressor. */
fillRightCountsRcompOnly(long kmer, long rkmer, int[] counts, long mask, int shift2)1436 	public int fillRightCountsRcompOnly(long kmer, long rkmer, int[] counts, long mask, int shift2){
1437 		assert(kmer==rcomp(rkmer));
1438 		if(verbose){outstream.println("fillRightCounts:   "+toText(kmer)+",   "+toText(rkmer));}
1439 		kmer=(kmer<<2)&mask;
1440 		rkmer=(rkmer>>>2);
1441 		int max=-1, maxPos=0;
1442 
1443 		for(int i=0; i<=3; i++){
1444 			long kmer2=kmer|((long)i);
1445 			long rkmer2=rkmer|(((long)AminoAcid.numberToComplement[i])<<shift2);
1446 			if(verbose){outstream.println("kmer:               "+toText(kmer2)+", "+toText(rkmer2));}
1447 			assert(kmer2==(kmer2&mask));
1448 			assert(rkmer2==(rkmer2&mask));
1449 			assert(kmer2==rcomp(rkmer2));
1450 			long key=rkmer2;
1451 			int way=kmerToWay(key);
1452 			int count=tables[way].getValue(key);
1453 			assert(count==NOT_PRESENT || count>=0);
1454 			count=Tools.max(count, 0);
1455 			counts[i]=count;
1456 			if(count>max){
1457 				max=count;
1458 				maxPos=i;
1459 			}
1460 		}
1461 		return maxPos;
1462 	}
1463 
fillLeftCounts(long kmer, long rkmer, int[] counts, long mask, int shift2)1464 	public int fillLeftCounts(long kmer, long rkmer, int[] counts, long mask, int shift2){
1465 		if(FAST_FILL && MASK_CORE && k>2/*((k&1)==1)*/){
1466 			return fillLeftCounts_fast(kmer, rkmer, counts, mask, shift2);
1467 		}else{
1468 			return fillLeftCounts_safe(kmer, rkmer, counts, mask, shift2);
1469 		}
1470 	}
1471 
fillLeftCounts_fast(final long kmer0, final long rkmer0, int[] counts, long mask, int shift2)1472 	public int fillLeftCounts_fast(final long kmer0, final long rkmer0, int[] counts,
1473 			long mask, int shift2){
1474 //		assert((k&1)==1) : k;
1475 		assert(MASK_CORE);
1476 		final long rkmer=(rkmer0<<2)&mask;
1477 		final long kmer=(kmer0>>>2);
1478 		final long a=kmer&coreMask, b=rkmer&coreMask;
1479 
1480 		int max=-1, maxPos=0;
1481 		if(a==b){
1482 			return fillLeftCounts_safe(kmer0, rkmer0, counts, mask, shift2);
1483 		}else if(a>b){
1484 //			return fillLeftCounts_safe(kmer0, rkmer0, counts, mask, shift2);
1485 
1486 			final long core=a;
1487 			final int way=kmerToWay(core);
1488 //			final AbstractKmerTable table=tables[way];
1489 			final HashArray table=(HashArray) tables[way];
1490 			final int cell=table.kmerToCell(kmer);
1491 			final long incr=1L<<(2*(k-1));
1492 			long j=0;//Must be declared as a long, outside of loop
1493 			for(int i=0; i<=3; i++, j+=incr){
1494 //				final long rkmer2=rkmer|((long)AminoAcid.numberToComplement[i]);
1495 //				final long kmer2=kmer|(((long)i)<<shift2);
1496 //				final long key2=toValue(rkmer2, kmer2);
1497 //				int way2=kmerToWay(key2);
1498 //				int count2=tables[way2].getValue(key2);
1499 //				count2=Tools.max(count2, 0);
1500 
1501 				final long key=kmer|j;
1502 				final int count=Tools.max(0, table.getValue(key, cell));
1503 //				int count=Tools.max(0, table.getValue(key));
1504 
1505 //				assert(way==way2) : i+", "+way+", "+way2;
1506 //				assert(key==key2) : i+", "+Long.toBinaryString(kmer)+", "+
1507 //					Long.toBinaryString(key)+", "+Long.toBinaryString(key2)+
1508 //					", "+Long.toBinaryString(j)+", "+Long.toBinaryString(incr);
1509 //				assert(count==count2) : i+", "+count+", "+count2;
1510 
1511 				counts[i]=count;
1512 				if(count>max){
1513 					max=count;
1514 					maxPos=i;
1515 				}
1516 			}
1517 			return maxPos;
1518 		}else{
1519 //			return fillLeftCounts_safe(kmer0, rkmer0, counts, mask, shift2);
1520 			final long core=b;
1521 			final int way=kmerToWay(core);
1522 //			final AbstractKmerTable table=tables[way];
1523 			final HashArray table=(HashArray) tables[way];
1524 			final int cell=table.kmerToCell(rkmer);
1525 			for(int i=0, j=3; i<=3; i++, j--){
1526 				final long key=rkmer|j;
1527 				final int count=Tools.max(0, table.getValue(key, cell));
1528 //				final int count=Tools.max(0, table.getValue(key));
1529 
1530 				counts[i]=count;
1531 				if(count>max){
1532 					max=count;
1533 					maxPos=i;
1534 				}
1535 			}
1536 			return maxPos;
1537 		}
1538 	}
1539 
fillLeftCounts_safe(final long kmer0, final long rkmer0, int[] counts, long mask, int shift2)1540 	public int fillLeftCounts_safe(final long kmer0, final long rkmer0, int[] counts, long mask, int shift2){
1541 		assert(kmer0==rcomp(rkmer0));
1542 		if(verbose){outstream.println("fillLeftCounts:    "+toText(kmer0)+",   "+toText(rkmer0));}
1543 		long rkmer=(rkmer0<<2)&mask;
1544 		long kmer=(kmer0>>>2);
1545 		int max=-1, maxPos=0;
1546 //		assert(false) : shift2+", "+k;
1547 		for(int i=0; i<=3; i++){
1548 			final long rkmer2=rkmer|((long)AminoAcid.numberToComplement[i]);
1549 			final long kmer2=kmer|(((long)i)<<shift2);
1550 			if(verbose){outstream.println("kmer:             "+toText(kmer2)+",     "+toText(rkmer2));}
1551 			assert(kmer2==(kmer2&mask));
1552 			assert(rkmer2==(rkmer2&mask));
1553 			assert(kmer2==rcomp(rkmer2)) : "\n"+"kmer:      \t"+toText(rcomp(rkmer2))+", "+toText(rcomp(kmer2));
1554 			long key=toValue(rkmer2, kmer2);
1555 			int way=kmerToWay(key);
1556 			int count=tables[way].getValue(key);
1557 			assert(count==NOT_PRESENT || count>=0);
1558 			count=Tools.max(count, 0);
1559 			counts[i]=count;
1560 			if(count>max){
1561 				max=count;
1562 				maxPos=i;
1563 			}
1564 		}
1565 		return maxPos;
1566 	}
1567 
1568 	/*--------------------------------------------------------------*/
1569 	/*----------------       Printing Methods       ----------------*/
1570 	/*--------------------------------------------------------------*/
1571 
1572 	@Override
dumpKmersAsBytes(String fname, int mincount, int maxcount, boolean printTime, AtomicLong remaining)1573 	public boolean dumpKmersAsBytes(String fname, int mincount, int maxcount, boolean printTime, AtomicLong remaining){
1574 		if(fname==null){return false;}
1575 		Timer t=new Timer();
1576 
1577 		ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, append, true);
1578 		bsw.start();
1579 		for(AbstractKmerTable set : tables){
1580 			set.dumpKmersAsBytes(bsw, k, mincount, maxcount, remaining);
1581 		}
1582 		bsw.poisonAndWait();
1583 
1584 		t.stop();
1585 		if(printTime){outstream.println("Kmer Dump Time:             \t"+t);}
1586 		return bsw.errorState;
1587 	}
1588 
1589 	@Override
dumpKmersAsBytes_MT(String fname, int mincount, int maxcount, boolean printTime, AtomicLong remaining)1590 	public boolean dumpKmersAsBytes_MT(String fname, int mincount, int maxcount, boolean printTime, AtomicLong remaining){
1591 		final int threads=Tools.min(Shared.threads(), tables.length);
1592 		if(threads<3 || DumpThread.NUM_THREADS==1){return dumpKmersAsBytes(fname, mincount, maxcount, printTime, remaining);}
1593 
1594 		if(fname==null){return false;}
1595 		Timer t=new Timer();
1596 
1597 		ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, append, true);
1598 		bsw.start();
1599 		DumpThread.dump(k, mincount, maxcount, tables, bsw, remaining);
1600 		bsw.poisonAndWait();
1601 
1602 		t.stop();
1603 		if(printTime){outstream.println("Kmer Dump Time:             \t"+t);}
1604 		return bsw.errorState;
1605 	}
1606 
1607 	/*--------------------------------------------------------------*/
1608 	/*----------------        Recall Methods        ----------------*/
1609 	/*--------------------------------------------------------------*/
1610 
rcomp(long kmer)1611 	private final long rcomp(long kmer){return AminoAcid.reverseComplementBinaryFast(kmer, k);}
toText(long kmer)1612 	private final StringBuilder toText(long kmer){return AbstractKmerTable.toText(kmer, k);}
1613 
1614 	/*--------------------------------------------------------------*/
1615 	/*----------------        Static Methods        ----------------*/
1616 	/*--------------------------------------------------------------*/
1617 
1618 	/**
1619 	 * Transforms a kmer into a canonical value stored in the table.  Expected to be inlined.
1620 	 * @param kmer Forward kmer
1621 	 * @param rkmer Reverse kmer
1622 	 * @return Canonical value
1623 	 */
toValue(long kmer, long rkmer)1624 	public final long toValue(long kmer, long rkmer){
1625 //		long value=(rcomp ? Tools.max(kmer, rkmer) : kmer);
1626 //		return value;
1627 		if(!rcomp){return kmer;}
1628 		final long a=kmer&coreMask, b=rkmer&coreMask;
1629 		return a>b ? kmer : a<b ? rkmer : Tools.max(kmer, rkmer);
1630 	}
1631 
1632 	/*--------------------------------------------------------------*/
1633 	/*----------------       Final Primitives       ----------------*/
1634 	/*--------------------------------------------------------------*/
1635 
1636 	@Override
kbig()1637 	public int kbig(){return k;}
1638 	@Override
filterMemory(int pass)1639 	public long filterMemory(int pass){return ((pass&1)==0) ? filterMemory0 : filterMemory1;}
1640 	@Override
ecco()1641 	public boolean ecco(){return ecco;}
1642 	@Override
qtrimLeft()1643 	public boolean qtrimLeft(){return qtrimLeft;}
1644 	@Override
qtrimRight()1645 	public boolean qtrimRight(){return qtrimRight;}
1646 	@Override
minAvgQuality()1647 	public float minAvgQuality(){return minAvgQuality;}
1648 	@Override
tableMemory()1649 	public long tableMemory(){return tableMemory;}
1650 	@Override
estimatedKmerCapacity()1651 	public long estimatedKmerCapacity(){return estimatedKmerCapacity;}
1652 	@Override
ways()1653 	public int ways(){return ways;}
1654 	@Override
rcomp()1655 	public boolean rcomp(){return rcomp;}
1656 
kmerToWay(final long kmer)1657 	public final int kmerToWay(final long kmer){
1658 		final int way=(int)((kmer&coreMask)%ways);
1659 		return way;
1660 	}
1661 
1662 	/** Hold kmers.  A kmer X such that X%WAYS=Y will be stored in tables[Y] */
1663 	private AbstractKmerTable[] tables;
1664 
tables()1665 	public AbstractKmerTable[] tables(){return tables;}
1666 
1667 	public long filterMemoryOverride=0;
1668 
1669 	public final int tableType; //AbstractKmerTable.ARRAY1D;
1670 
1671 	private final int bytesPerKmer;
1672 
1673 	private final long usableMemory;
1674 	private final long filterMemory0;
1675 	private final long filterMemory1;
1676 	private final long tableMemory;
1677 	private final long estimatedKmerCapacity;
1678 
1679 	/** Number of tables (and threads, during loading) */
1680 	private final boolean prealloc;
1681 
1682 	/** Number of tables (and threads, during loading) */
1683 	public final int ways;
1684 
1685 	/** Normal kmer length */
1686 	public final int k;
1687 	/** k-1; used in some expressions */
1688 	public final int k2;
1689 
1690 	public final long coreMask;
1691 
1692 	/** Look for reverse-complements as well as forward kmers.  Default: true */
1693 	private final boolean rcomp;
1694 
1695 	/** Quality-trim the left side */
1696 	public final boolean qtrimLeft;
1697 	/** Quality-trim the right side */
1698 	public final boolean qtrimRight;
1699 	/** Trim bases at this quality or below.  Default: 4 */
1700 	public final float trimq;
1701 	/** Error rate for trimming (derived from trimq) */
1702 	private final float trimE;
1703 
1704 	/** Throw away reads below this average quality after trimming.  Default: 0 */
1705 	public final float minAvgQuality;
1706 	/** If positive, calculate average quality from the first X bases only.  Default: 0 */
1707 	public final int minAvgQualityBases;
1708 
1709 	/** Ignore kmers with probability of correctness less than this */
1710 	public final float minProb;
1711 
1712 	/** Correct via overlap */
1713 	private final boolean ecco;
1714 
1715 	/** Attempt to merge via overlap prior to counting kmers */
1716 	private final boolean merge;
1717 
1718 	/*--------------------------------------------------------------*/
1719 	/*----------------            Walker            ----------------*/
1720 	/*--------------------------------------------------------------*/
1721 
walk()1722 	public Walker walk(){
1723 		return new WalkerKST();
1724 	}
1725 
1726 	public class WalkerKST extends Walker {
1727 
WalkerKST()1728 		WalkerKST(){
1729 			w=tables[0].walk();
1730 		}
1731 
1732 		/**
1733 		 * Fills this object with the next key and value.
1734 		 * @return True if successful.
1735 		 */
next()1736 		public boolean next(){
1737 			if(w==null){return false;}
1738 			if(w.next()){return true;}
1739 			if(tnum<tables.length){tnum++;}
1740 			w=(tnum<tables.length ? tables[tnum].walk() : null);
1741 			return w==null ? false : w.next();
1742 		}
1743 
kmer()1744 		public long kmer(){return w.kmer();}
value()1745 		public int value(){return w.value();}
1746 
1747 		private Walker w=null;
1748 
1749 		/** current table number */
1750 		private int tnum=0;
1751 	}
1752 
1753 }
1754