1 package kmer;
2 
3 import java.util.Arrays;
4 import java.util.concurrent.atomic.AtomicIntegerArray;
5 import java.util.concurrent.atomic.AtomicLong;
6 import java.util.concurrent.locks.Lock;
7 import java.util.concurrent.locks.ReentrantLock;
8 
9 import fileIO.ByteStreamWriter;
10 import fileIO.TextStreamWriter;
11 import shared.Primes;
12 import shared.Tools;
13 import structures.ByteBuilder;
14 import structures.SuperLongList;
15 
16 /**
17  * Stores kmers in a long[] and values in an int[][], with a victim cache.
18  * @author Brian Bushnell
19  * @date Nov 7, 2014
20  *
21  */
22 public abstract class HashArray extends AbstractKmerTable {
23 
24 	/*--------------------------------------------------------------*/
25 	/*----------------        Initialization        ----------------*/
26 	/*--------------------------------------------------------------*/
27 
HashArray(int[] schedule_, long coreMask_, boolean twod_)28 	HashArray(int[] schedule_, long coreMask_, boolean twod_){
29 		schedule=schedule_;
30 		autoResize=schedule.length>1;
31 		prime=schedule[0];
32 
33 		sizeLimit=(long)((schedule.length==1 ? maxLoadFactorFinal : maxLoadFactor)*prime);
34 		array=allocLong1D(prime+extra);
35 		victims=new HashForest(Tools.max(10, prime/victimRatio), autoResize, twod_);
36 		Arrays.fill(array, NOT_PRESENT);
37 		twoD=twod_;
38 		coreMask=coreMask_;
39 //		coreMask2=coreMask_|3;
40 		coreMask2=coreMask_; //Simplifies fast fill
41 	}
42 
HashArray(int initialSize, long coreMask_, boolean autoResize_, boolean twod_)43 	HashArray(int initialSize, long coreMask_, boolean autoResize_, boolean twod_){
44 		schedule=null;
45 		prime=initialSize;
46 		sizeLimit=(long)(maxLoadFactor*prime);
47 		array=allocLong1D(prime+extra);
48 		victims=new HashForest(Tools.max(10, initialSize/victimRatio), autoResize_, twod_);
49 		Arrays.fill(array, NOT_PRESENT);
50 		autoResize=autoResize_;
51 		twoD=twod_;
52 		coreMask=coreMask_;
53 //		coreMask2=coreMask_|3;
54 		coreMask2=coreMask_; //Simplifies fast fill
55 	}
56 
57 	/*--------------------------------------------------------------*/
58 	/*----------------        Public Methods        ----------------*/
59 	/*--------------------------------------------------------------*/
60 
61 	/*--------------------------------------------------------------*/
62 	/*----------------         Test Methods         ----------------*/
63 	/*--------------------------------------------------------------*/
64 
65 //	public final int set_Test(final long kmer, final int v){
66 //		assert(TESTMODE);
67 //		final int x;
68 //		if(TWOD){
69 //			int[] old=getValues(kmer, new int[1]);
70 //			assert(old==null || contains(kmer, old));
71 //			if(verbose){System.err.println("Fetched "+Arrays.toString(old));}
72 //			x=set0(kmer, v);
73 //			assert(old==null || contains(kmer, old)) : "old="+Arrays.toString(old)+", v="+v+", kmer="+kmer+
74 //				", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1])));
75 //			assert(contains(kmer, v));
76 //		}else{
77 //			int old=getValue(kmer);
78 //			assert(old==0 || old==-1 || contains(kmer, old));
79 //			x=set0(kmer, v);
80 //			assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
81 //			assert(v==old || !contains(kmer, old));
82 //		}
83 //		return x;
84 //	}
85 //
86 //	public final int set_Test(final long kmer, final int v[]){
87 //		assert(TESTMODE);
88 //		final int x;
89 //		if(TWOD){
90 //			final int[] singleton=new int[1];
91 //			int[] old=getValues(kmer, singleton);
92 //			assert(old==null || contains(kmer, old));
93 //			if(verbose){System.err.println("Before: old="+Arrays.toString(old)+", v="+Arrays.toString(v));}
94 //			x=set0(kmer, v);
95 //			if(verbose){System.err.println("After:  old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", get()="+Arrays.toString(getValues(kmer, singleton)));}
96 //			assert(old==null || contains(kmer, old)) : "old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", kmer="+kmer+
97 //				", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1])));
98 //			assert(contains(kmer, v)) : "old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", kmer="+kmer+
99 //				", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1])));
100 //		}else{
101 //			int old=getValue(kmer);
102 //			assert(old==0 || old==-1 || contains(kmer, old));
103 //			x=set0(kmer, v);
104 //			assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
105 //			assert(v[0]==old || !contains(kmer, old));
106 //		}
107 //		return x;
108 //	}
109 //
110 //	public final int setIfNotPresent_Test(long kmer, int v){
111 //		assert(TESTMODE);
112 //		final int x;
113 //		if(TWOD){
114 ////			int[] vals=getValues(kmer, null);
115 ////			assert(vals==null || contains(kmer, vals));
116 ////			x=setIfNotPresent(kmer, v);
117 ////			assert(contains(kmer, vals));
118 ////			assert(contains(kmer, v));
119 //			x=0;
120 //			assert(false);
121 //		}else{
122 //			int old=getValue(kmer);
123 //			assert(old==0 || old==-1 || contains(kmer, old));
124 //			x=setIfNotPresent0(kmer, v);
125 //			assert((old<1 && contains(kmer, v)) || (old>0 && contains(kmer, old))) : kmer+", "+old+", "+v;
126 //		}
127 //		return x;
128 //	}
129 
130 	/*--------------------------------------------------------------*/
131 
kmerToCell(long kmer)132 	public final int kmerToCell(long kmer){
133 		final int cell=(int)((kmer&coreMask)%prime);
134 		return cell;
135 	}
136 
137 	@Override
set(final long kmer, final int[] v, final int vlen)138 	public final int set(final long kmer, final int[] v, final int vlen){
139 		int cell=kmerToCell(kmer);
140 
141 		for(final int max=cell+extra; cell<max; cell++){
142 			long n=array[cell];
143 			if(n==kmer){
144 				if(verbose){System.err.println("A2: Adding "+kmer+", "+Arrays.toString(v)+", "+cell);}
145 				insertValue(kmer, v, cell, vlen);
146 				if(verbose){System.err.println("A2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
147 				return 0;
148 			}else if(n==NOT_PRESENT){
149 				if(verbose){System.err.println("B2: Adding "+kmer+", "+Arrays.toString(v)+", "+cell);}
150 				array[cell]=kmer;
151 				insertValue(kmer, v, cell, vlen);
152 				if(verbose){System.err.println("B2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
153 				size++;
154 				if(autoResize && size+victims.size>sizeLimit){resize();}
155 				return 1;
156 			}
157 		}
158 		if(verbose){System.err.println("C2: Adding "+kmer+", "+v+", "+cell);}
159 		final int x=victims.set(kmer, v, vlen);
160 		if(autoResize && size+victims.size>sizeLimit){resize();}
161 		if(verbose){System.err.println("C2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
162 		return x;
163 	}
164 
165 	@Override
set(final long kmer, final int v)166 	public final int set(final long kmer, final int v){
167 		int cell=kmerToCell(kmer);
168 
169 //		assert(TESTMODE);
170 //		ll.add(kmer);
171 //		il.add(v);
172 
173 		for(final int max=cell+extra; cell<max; cell++){
174 			long n=array[cell];
175 			if(n==kmer){
176 				if(verbose){System.err.println("A1: Adding "+kmer+", "+v+", "+cell);}
177 				insertValue(kmer, v, cell);
178 				if(verbose){System.err.println("A1: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
179 				return 0;
180 			}else if(n==NOT_PRESENT){
181 				if(verbose){System.err.println("B1: Adding "+kmer+", "+v+", "+cell);}
182 				array[cell]=kmer;
183 				insertValue(kmer, v, cell);
184 				if(verbose){System.err.println("B1: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
185 				size++;
186 				if(autoResize && size+victims.size>sizeLimit){resize();}
187 				return 1;
188 			}
189 		}
190 		if(verbose){System.err.println("C1: Adding "+kmer+", "+v+", "+cell+
191 				"; victims.get(kmer)="+Arrays.toString(victims.getValues(kmer, new int[1])));}
192 		final int x=victims.set(kmer, v);
193 		if(autoResize && size+victims.size>sizeLimit){resize();}
194 		if(verbose){System.err.println("C1: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1]))+
195 				"; victims.get(kmer)="+Arrays.toString(victims.getValues(kmer, new int[1])));}
196 		return x;
197 	}
198 
199 	@Override
setIfNotPresent(long kmer, int value)200 	public final int setIfNotPresent(long kmer, int value){
201 		int cell=kmerToCell(kmer);
202 		for(final int max=cell+extra; cell<max; cell++){
203 			long n=array[cell];
204 			if(n==kmer){
205 				return 0;
206 			}else if(n==NOT_PRESENT){
207 				array[cell]=kmer;
208 				insertValue(kmer, value, cell);
209 				size++;
210 				if(autoResize && size+victims.size>sizeLimit){resize();}
211 				return 1;
212 			}
213 		}
214 //		System.err.println("size="+size+", prime="+prime+", limit="+sizeLimit);
215 		int x=victims.setIfNotPresent(kmer, value);
216 		if(autoResize && size+victims.size>sizeLimit){resize();}
217 		return x;
218 	}
219 
220 	@Override
getValue(long kmer)221 	public final int getValue(long kmer){
222 		int cell=findKmer(kmer);
223 		if(cell==NOT_PRESENT){return NOT_PRESENT;}
224 		if(cell==HASH_COLLISION){return victims.getValue(kmer);}
225 		return readCellValue(cell);
226 	}
227 
getValue(long kmer, int startCell)228 	public final int getValue(long kmer, int startCell){
229 		int cell=findKmer(kmer, startCell);
230 		if(cell==NOT_PRESENT){return NOT_PRESENT;}
231 		if(cell==HASH_COLLISION){return victims.getValue(kmer);}
232 		return readCellValue(cell);
233 	}
234 
235 	@Override
getValues(long kmer, int[] singleton)236 	public final int[] getValues(long kmer, int[] singleton){
237 		int cell=findKmer(kmer);
238 		if(cell==NOT_PRESENT){
239 			singleton[0]=NOT_PRESENT;
240 			return singleton;
241 		}
242 		if(cell==HASH_COLLISION){return victims.getValues(kmer, singleton);}
243 		return readCellValues(cell, singleton);
244 	}
245 
246 	@Override
contains(long kmer)247 	public final boolean contains(long kmer){
248 		int cell=findKmer(kmer);
249 		if(cell==NOT_PRESENT){return false;}
250 		if(cell==HASH_COLLISION){return victims.contains(kmer);}
251 		return true;
252 	}
253 
getKmer(int cell)254 	public final long getKmer(int cell) {
255 		return array[cell];
256 	}
257 
258 	/*--------------------------------------------------------------*/
259 	/*----------------          Ownership           ----------------*/
260 	/*--------------------------------------------------------------*/
261 
262 	@Override
initializeOwnership()263 	public final void initializeOwnership(){
264 		assert(owners==null);
265 		owners=allocAtomicInt(array.length);
266 		for(int i=0; i<array.length; i++){
267 			owners.set(i, NO_OWNER);
268 		}
269 		victims.initializeOwnership();
270 	}
271 
272 	@Override
clearOwnership()273 	public final void clearOwnership(){
274 		owners=null;
275 		victims.clearOwnership();
276 	}
277 
278 	@Override
setOwner(final long kmer, final int newOwner)279 	public final int setOwner(final long kmer, final int newOwner){
280 		final int cell=findKmer(kmer);
281 		assert(cell!=NOT_PRESENT);
282 		if(cell==HASH_COLLISION){return victims.setOwner(kmer, newOwner);}
283 		return setOwner(kmer, newOwner, cell);
284 	}
285 
setOwner(final long kmer, final int newOwner, final int cell)286 	public final int setOwner(final long kmer, final int newOwner, final int cell){
287 		assert(array[cell]==kmer);
288 		final int original=owners.get(cell);
289 		int current=original;
290 		while(current<newOwner){
291 			boolean success=owners.compareAndSet(cell, current, newOwner);
292 			if(!success){current=owners.get(cell);}
293 			else{current=newOwner;}
294 		}
295 		assert(current>=original) : "original="+original+", current="+current+", newOwner="+newOwner+", re-read="+owners.get(cell);
296 		return current;
297 	}
298 
299 	@Override
clearOwner(final long kmer, final int owner)300 	public final boolean clearOwner(final long kmer, final int owner){
301 		final int cell=findKmer(kmer);
302 		assert(cell!=NOT_PRESENT);
303 		if(cell==HASH_COLLISION){return victims.clearOwner(kmer, owner);}
304 		return clearOwner(kmer, owner, cell);
305 	}
306 
clearOwner(final long kmer, final int owner, final int cell)307 	public final boolean clearOwner(final long kmer, final int owner, final int cell){
308 		assert(array[cell]==kmer);
309 		boolean success=owners.compareAndSet(cell, owner, NO_OWNER);
310 		return success;
311 	}
312 
313 	@Override
getOwner(final long kmer)314 	public final int getOwner(final long kmer){
315 		final int cell=findKmer(kmer);
316 		assert(cell!=NOT_PRESENT);
317 		if(cell==HASH_COLLISION){return victims.getOwner(kmer);}
318 		return getCellOwner(cell);
319 	}
320 
getCellOwner(final int cell)321 	public final int getCellOwner(final int cell){
322 		return owners.get(cell);
323 	}
324 
325 	/*--------------------------------------------------------------*/
326 	/*----------------      Nonpublic Methods       ----------------*/
327 	/*--------------------------------------------------------------*/
328 
insertValue(final long kmer, final int v, final int cell)329 	protected abstract void insertValue(final long kmer, final int v, final int cell);
330 
331 //	protected abstract void insertValue(final long kmer, final int[] vals, final int cell);
332 
333 	/** This is for IntList3 support with HashArrayHybridFast */
insertValue(final long kmer, final int[] vals, final int cell, final int vlen)334 	protected abstract void insertValue(final long kmer, final int[] vals, final int cell, final int vlen);
335 
readCellValue(int cell)336 	protected abstract int readCellValue(int cell);
readCellValues(int cell, int[] singleton)337 	protected abstract int[] readCellValues(int cell, int[] singleton);
338 
339 	@Override
get(long kmer)340 	final Object get(long kmer){
341 		throw new RuntimeException("Unimplemented.");
342 	}
343 
findKmer(long kmer)344 	final int findKmer(long kmer){
345 		return findKmer(kmer, kmerToCell(kmer));
346 	}
347 
findKmer(final long kmer, final int startCell)348 	final int findKmer(final long kmer, final int startCell){
349 		int cell=startCell;
350 		for(final int max=cell+extra; cell<max; cell++){
351 			final long n=array[cell];
352 			if(n==kmer){return cell;}
353 			else if(n==NOT_PRESENT){return NOT_PRESENT;}
354 		}
355 		return HASH_COLLISION;
356 	}
357 
findKmerOrEmpty(long kmer)358 	final int findKmerOrEmpty(long kmer){
359 		int cell=kmerToCell(kmer);
360 		for(final int max=cell+extra; cell<max; cell++){
361 			final long n=array[cell];
362 			if(n==kmer || n==NOT_PRESENT){return cell;}
363 		}
364 		return HASH_COLLISION;
365 	}
366 
367 	/*--------------------------------------------------------------*/
368 	/*----------------   Resizing and Rebalancing   ----------------*/
369 	/*--------------------------------------------------------------*/
370 
371 	@Override
canResize()372 	final boolean canResize() {return true;}
373 
374 	@Override
size()375 	final public long size() {return size;}
376 
377 	@Override
arrayLength()378 	final public int arrayLength() {return array.length;}
379 
380 	@Override
resize()381 	protected abstract void resize();
382 
383 	/*--------------------------------------------------------------*/
384 	/*----------------         Info Dumping         ----------------*/
385 	/*--------------------------------------------------------------*/
386 
387 	@Override
dumpKmersAsText(TextStreamWriter tsw, int k, int mincount, int maxcount)388 	public final boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount, int maxcount){
389 		if(twoD){
390 			final int[] singleton=new int[1];
391 			for(int i=0; i<array.length; i++){
392 				long kmer=array[i];
393 				if(kmer!=NOT_PRESENT){
394 					tsw.print(toText(kmer, readCellValues(i, singleton), k).append('\n'));
395 				}
396 			}
397 		}else{
398 			for(int i=0; i<array.length; i++){
399 				long kmer=array[i];
400 				if(kmer!=NOT_PRESENT && (mincount<2 || readCellValue(i)>=mincount)){
401 					tsw.print(toText(kmer, readCellValue(i), k).append('\n'));
402 				}
403 			}
404 		}
405 		if(victims!=null){
406 			victims.dumpKmersAsText(tsw, k, mincount, maxcount);
407 		}
408 		return true;
409 	}
410 
411 	@Override
dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount, int maxcount, AtomicLong remaining)412 	public final boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount, int maxcount, AtomicLong remaining){
413 		if(twoD){
414 			final int[] singleton=new int[1];
415 			for(int i=0; i<array.length; i++){
416 				long kmer=array[i];
417 				if(kmer!=NOT_PRESENT){
418 					if(remaining!=null && remaining.decrementAndGet()<0){return true;}
419 					bsw.printlnKmer(kmer, readCellValues(i, singleton), k);
420 				}
421 			}
422 		}else{
423 			for(int i=0; i<array.length; i++){
424 				long kmer=array[i];
425 				if(kmer!=NOT_PRESENT && (mincount<2 || readCellValue(i)>=mincount)){
426 					if(remaining!=null && remaining.decrementAndGet()<0){return true;}
427 					bsw.printlnKmer(kmer, readCellValue(i), k);
428 				}
429 			}
430 		}
431 		if(victims!=null){
432 			victims.dumpKmersAsBytes(bsw, k, mincount, maxcount, remaining);
433 		}
434 		return true;
435 	}
436 
437 	@Override
dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount, int maxcount, AtomicLong remaining)438 	public final boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount, int maxcount, AtomicLong remaining){
439 		if(twoD){
440 			final int[] singleton=new int[1];
441 			for(int i=0; i<array.length; i++){
442 				long kmer=array[i];
443 				if(kmer!=NOT_PRESENT){
444 					if(remaining!=null && remaining.decrementAndGet()<0){return true;}
445 					toBytes(kmer, readCellValues(i, singleton), k, bb);
446 					bb.nl();
447 					if(bb.length()>=16000){
448 						ByteBuilder bb2=new ByteBuilder(bb);
449 						synchronized(bsw){bsw.addJob(bb2);}
450 						bb.clear();
451 					}
452 				}
453 			}
454 		}else{
455 			for(int i=0; i<array.length; i++){
456 				long kmer=array[i];
457 				if(kmer!=NOT_PRESENT && (mincount<2 || readCellValue(i)>=mincount)){
458 					if(remaining!=null && remaining.decrementAndGet()<0){return true;}
459 					toBytes(kmer, readCellValue(i), k, bb);
460 					bb.nl();
461 					if(bb.length()>=16000){
462 						ByteBuilder bb2=new ByteBuilder(bb);
463 						synchronized(bsw){bsw.addJob(bb2);}
464 						bb.clear();
465 					}
466 				}
467 			}
468 		}
469 		if(victims!=null){
470 			victims.dumpKmersAsBytes_MT(bsw, bb, k, mincount, maxcount, remaining);
471 		}
472 		return true;
473 	}
474 
475 	@Override
fillHistogram(long[] ca, int max)476 	public final void fillHistogram(long[] ca, int max){
477 		for(int i=0; i<array.length; i++){
478 			long kmer=array[i];
479 			if(kmer!=NOT_PRESENT){
480 				int count=Tools.min(readCellValue(i), max);
481 				ca[count]++;
482 			}
483 		}
484 		if(victims!=null){
485 			victims.fillHistogram(ca, max);
486 		}
487 	}
488 
489 	@Override
fillHistogram(SuperLongList sll)490 	public void fillHistogram(SuperLongList sll){
491 		for(int i=0; i<array.length; i++){
492 			long kmer=array[i];
493 			if(kmer!=NOT_PRESENT){
494 				int count=readCellValue(i);
495 				sll.add(count);
496 			}
497 		}
498 		if(victims!=null){
499 			victims.fillHistogram(sll);
500 		}
501 	}
502 
503 	@Override
countGC(long[] gcCounts, int max)504 	public final void countGC(long[] gcCounts, int max){
505 		for(int i=0; i<array.length; i++){
506 			long kmer=array[i];
507 			if(kmer!=NOT_PRESENT){
508 				int count=readCellValue(i);
509 				int index=Tools.min(count, max);
510 				gcCounts[index]+=gc(kmer);
511 			}
512 		}
513 		if(victims!=null){
514 			victims.countGC(gcCounts, max);
515 		}
516 	}
517 
victims()518 	public HashForest victims(){
519 		return victims;
520 	}
521 
522 	/*--------------------------------------------------------------*/
523 	/*----------------            Fields            ----------------*/
524 	/*--------------------------------------------------------------*/
525 
526 	AtomicIntegerArray owners;
527 	long[] array;
528 	int prime;
529 	long size=0;
530 	long sizeLimit;
531 	final HashForest victims;
532 	final boolean autoResize;
533 	public final boolean twoD;
534 	private final Lock lock=new ReentrantLock();
535 	private final long coreMask;//for ways
536 	private final long coreMask2;//for cells
537 
nextScheduleSize()538 	protected int nextScheduleSize(){
539 		if(schedulePos<schedule.length-1){schedulePos++;}
540 		return schedule[schedulePos];
541 	}
542 
atMaxSize()543 	protected boolean atMaxSize(){
544 		return schedulePos>=schedule.length-1;
545 	}
546 
547 	protected final int[] schedule;
548 	private int schedulePos=0;
549 
array()550 	public long[] array(){return array;}
551 
owners()552 	public AtomicIntegerArray owners() {return owners;}
553 	@Override
getLock()554 	final Lock getLock(){return lock;}
555 
556 	/*--------------------------------------------------------------*/
557 	/*----------------        Static Fields         ----------------*/
558 	/*--------------------------------------------------------------*/
559 
560 	final static int victimRatio=16; //Initial divisor for victim cache size; it self-resizes.
561 	final static int extra=60; //Amazingly, increasing this gave increasing returns past 300.  Old default was 21.  Could allow higher maxLoadFactorFinal and smaller victim cache.
562 	final static int maxPrime=Primes.primeAtMost(Integer.MAX_VALUE-extra-20);
563 	final static float resizeMult=2f; //Resize by a minimum of this much; not needed for schedule
564 	final static float minLoadFactor=0.58f; //Resize by enough to get the load above this factor; not needed for schedule
565 	final static float maxLoadFactor=0.88f; //Reaching this load triggers resizing
566 	final static float maxLoadFactorFinal=0.95f; //Reaching this load triggers killing
567 	final static float minLoadMult=1/minLoadFactor;
568 	final static float maxLoadMult=1/maxLoadFactor;
569 
570 }
571