1 package kmer; 2 3 import java.util.Arrays; 4 import java.util.concurrent.atomic.AtomicIntegerArray; 5 import java.util.concurrent.atomic.AtomicLong; 6 import java.util.concurrent.locks.Lock; 7 import java.util.concurrent.locks.ReentrantLock; 8 9 import fileIO.ByteStreamWriter; 10 import fileIO.TextStreamWriter; 11 import shared.Primes; 12 import shared.Tools; 13 import structures.ByteBuilder; 14 import structures.SuperLongList; 15 16 /** 17 * Stores kmers in a long[] and values in an int[][], with a victim cache. 18 * @author Brian Bushnell 19 * @date Nov 7, 2014 20 * 21 */ 22 public abstract class HashArray extends AbstractKmerTable { 23 24 /*--------------------------------------------------------------*/ 25 /*---------------- Initialization ----------------*/ 26 /*--------------------------------------------------------------*/ 27 HashArray(int[] schedule_, long coreMask_, boolean twod_)28 HashArray(int[] schedule_, long coreMask_, boolean twod_){ 29 schedule=schedule_; 30 autoResize=schedule.length>1; 31 prime=schedule[0]; 32 33 sizeLimit=(long)((schedule.length==1 ? maxLoadFactorFinal : maxLoadFactor)*prime); 34 array=allocLong1D(prime+extra); 35 victims=new HashForest(Tools.max(10, prime/victimRatio), autoResize, twod_); 36 Arrays.fill(array, NOT_PRESENT); 37 twoD=twod_; 38 coreMask=coreMask_; 39 // coreMask2=coreMask_|3; 40 coreMask2=coreMask_; //Simplifies fast fill 41 } 42 HashArray(int initialSize, long coreMask_, boolean autoResize_, boolean twod_)43 HashArray(int initialSize, long coreMask_, boolean autoResize_, boolean twod_){ 44 schedule=null; 45 prime=initialSize; 46 sizeLimit=(long)(maxLoadFactor*prime); 47 array=allocLong1D(prime+extra); 48 victims=new HashForest(Tools.max(10, initialSize/victimRatio), autoResize_, twod_); 49 Arrays.fill(array, NOT_PRESENT); 50 autoResize=autoResize_; 51 twoD=twod_; 52 coreMask=coreMask_; 53 // coreMask2=coreMask_|3; 54 coreMask2=coreMask_; //Simplifies fast fill 55 } 56 57 /*--------------------------------------------------------------*/ 58 /*---------------- Public Methods ----------------*/ 59 /*--------------------------------------------------------------*/ 60 61 /*--------------------------------------------------------------*/ 62 /*---------------- Test Methods ----------------*/ 63 /*--------------------------------------------------------------*/ 64 65 // public final int set_Test(final long kmer, final int v){ 66 // assert(TESTMODE); 67 // final int x; 68 // if(TWOD){ 69 // int[] old=getValues(kmer, new int[1]); 70 // assert(old==null || contains(kmer, old)); 71 // if(verbose){System.err.println("Fetched "+Arrays.toString(old));} 72 // x=set0(kmer, v); 73 // assert(old==null || contains(kmer, old)) : "old="+Arrays.toString(old)+", v="+v+", kmer="+kmer+ 74 // ", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1]))); 75 // assert(contains(kmer, v)); 76 // }else{ 77 // int old=getValue(kmer); 78 // assert(old==0 || old==-1 || contains(kmer, old)); 79 // x=set0(kmer, v); 80 // assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer); 81 // assert(v==old || !contains(kmer, old)); 82 // } 83 // return x; 84 // } 85 // 86 // public final int set_Test(final long kmer, final int v[]){ 87 // assert(TESTMODE); 88 // final int x; 89 // if(TWOD){ 90 // final int[] singleton=new int[1]; 91 // int[] old=getValues(kmer, singleton); 92 // assert(old==null || contains(kmer, old)); 93 // if(verbose){System.err.println("Before: old="+Arrays.toString(old)+", v="+Arrays.toString(v));} 94 // x=set0(kmer, v); 95 // if(verbose){System.err.println("After: old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", get()="+Arrays.toString(getValues(kmer, singleton)));} 96 // assert(old==null || contains(kmer, old)) : "old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", kmer="+kmer+ 97 // ", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1]))); 98 // assert(contains(kmer, v)) : "old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", kmer="+kmer+ 99 // ", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1]))); 100 // }else{ 101 // int old=getValue(kmer); 102 // assert(old==0 || old==-1 || contains(kmer, old)); 103 // x=set0(kmer, v); 104 // assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer); 105 // assert(v[0]==old || !contains(kmer, old)); 106 // } 107 // return x; 108 // } 109 // 110 // public final int setIfNotPresent_Test(long kmer, int v){ 111 // assert(TESTMODE); 112 // final int x; 113 // if(TWOD){ 114 //// int[] vals=getValues(kmer, null); 115 //// assert(vals==null || contains(kmer, vals)); 116 //// x=setIfNotPresent(kmer, v); 117 //// assert(contains(kmer, vals)); 118 //// assert(contains(kmer, v)); 119 // x=0; 120 // assert(false); 121 // }else{ 122 // int old=getValue(kmer); 123 // assert(old==0 || old==-1 || contains(kmer, old)); 124 // x=setIfNotPresent0(kmer, v); 125 // assert((old<1 && contains(kmer, v)) || (old>0 && contains(kmer, old))) : kmer+", "+old+", "+v; 126 // } 127 // return x; 128 // } 129 130 /*--------------------------------------------------------------*/ 131 kmerToCell(long kmer)132 public final int kmerToCell(long kmer){ 133 final int cell=(int)((kmer&coreMask)%prime); 134 return cell; 135 } 136 137 @Override set(final long kmer, final int[] v, final int vlen)138 public final int set(final long kmer, final int[] v, final int vlen){ 139 int cell=kmerToCell(kmer); 140 141 for(final int max=cell+extra; cell<max; cell++){ 142 long n=array[cell]; 143 if(n==kmer){ 144 if(verbose){System.err.println("A2: Adding "+kmer+", "+Arrays.toString(v)+", "+cell);} 145 insertValue(kmer, v, cell, vlen); 146 if(verbose){System.err.println("A2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));} 147 return 0; 148 }else if(n==NOT_PRESENT){ 149 if(verbose){System.err.println("B2: Adding "+kmer+", "+Arrays.toString(v)+", "+cell);} 150 array[cell]=kmer; 151 insertValue(kmer, v, cell, vlen); 152 if(verbose){System.err.println("B2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));} 153 size++; 154 if(autoResize && size+victims.size>sizeLimit){resize();} 155 return 1; 156 } 157 } 158 if(verbose){System.err.println("C2: Adding "+kmer+", "+v+", "+cell);} 159 final int x=victims.set(kmer, v, vlen); 160 if(autoResize && size+victims.size>sizeLimit){resize();} 161 if(verbose){System.err.println("C2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));} 162 return x; 163 } 164 165 @Override set(final long kmer, final int v)166 public final int set(final long kmer, final int v){ 167 int cell=kmerToCell(kmer); 168 169 // assert(TESTMODE); 170 // ll.add(kmer); 171 // il.add(v); 172 173 for(final int max=cell+extra; cell<max; cell++){ 174 long n=array[cell]; 175 if(n==kmer){ 176 if(verbose){System.err.println("A1: Adding "+kmer+", "+v+", "+cell);} 177 insertValue(kmer, v, cell); 178 if(verbose){System.err.println("A1: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));} 179 return 0; 180 }else if(n==NOT_PRESENT){ 181 if(verbose){System.err.println("B1: Adding "+kmer+", "+v+", "+cell);} 182 array[cell]=kmer; 183 insertValue(kmer, v, cell); 184 if(verbose){System.err.println("B1: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));} 185 size++; 186 if(autoResize && size+victims.size>sizeLimit){resize();} 187 return 1; 188 } 189 } 190 if(verbose){System.err.println("C1: Adding "+kmer+", "+v+", "+cell+ 191 "; victims.get(kmer)="+Arrays.toString(victims.getValues(kmer, new int[1])));} 192 final int x=victims.set(kmer, v); 193 if(autoResize && size+victims.size>sizeLimit){resize();} 194 if(verbose){System.err.println("C1: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1]))+ 195 "; victims.get(kmer)="+Arrays.toString(victims.getValues(kmer, new int[1])));} 196 return x; 197 } 198 199 @Override setIfNotPresent(long kmer, int value)200 public final int setIfNotPresent(long kmer, int value){ 201 int cell=kmerToCell(kmer); 202 for(final int max=cell+extra; cell<max; cell++){ 203 long n=array[cell]; 204 if(n==kmer){ 205 return 0; 206 }else if(n==NOT_PRESENT){ 207 array[cell]=kmer; 208 insertValue(kmer, value, cell); 209 size++; 210 if(autoResize && size+victims.size>sizeLimit){resize();} 211 return 1; 212 } 213 } 214 // System.err.println("size="+size+", prime="+prime+", limit="+sizeLimit); 215 int x=victims.setIfNotPresent(kmer, value); 216 if(autoResize && size+victims.size>sizeLimit){resize();} 217 return x; 218 } 219 220 @Override getValue(long kmer)221 public final int getValue(long kmer){ 222 int cell=findKmer(kmer); 223 if(cell==NOT_PRESENT){return NOT_PRESENT;} 224 if(cell==HASH_COLLISION){return victims.getValue(kmer);} 225 return readCellValue(cell); 226 } 227 getValue(long kmer, int startCell)228 public final int getValue(long kmer, int startCell){ 229 int cell=findKmer(kmer, startCell); 230 if(cell==NOT_PRESENT){return NOT_PRESENT;} 231 if(cell==HASH_COLLISION){return victims.getValue(kmer);} 232 return readCellValue(cell); 233 } 234 235 @Override getValues(long kmer, int[] singleton)236 public final int[] getValues(long kmer, int[] singleton){ 237 int cell=findKmer(kmer); 238 if(cell==NOT_PRESENT){ 239 singleton[0]=NOT_PRESENT; 240 return singleton; 241 } 242 if(cell==HASH_COLLISION){return victims.getValues(kmer, singleton);} 243 return readCellValues(cell, singleton); 244 } 245 246 @Override contains(long kmer)247 public final boolean contains(long kmer){ 248 int cell=findKmer(kmer); 249 if(cell==NOT_PRESENT){return false;} 250 if(cell==HASH_COLLISION){return victims.contains(kmer);} 251 return true; 252 } 253 getKmer(int cell)254 public final long getKmer(int cell) { 255 return array[cell]; 256 } 257 258 /*--------------------------------------------------------------*/ 259 /*---------------- Ownership ----------------*/ 260 /*--------------------------------------------------------------*/ 261 262 @Override initializeOwnership()263 public final void initializeOwnership(){ 264 assert(owners==null); 265 owners=allocAtomicInt(array.length); 266 for(int i=0; i<array.length; i++){ 267 owners.set(i, NO_OWNER); 268 } 269 victims.initializeOwnership(); 270 } 271 272 @Override clearOwnership()273 public final void clearOwnership(){ 274 owners=null; 275 victims.clearOwnership(); 276 } 277 278 @Override setOwner(final long kmer, final int newOwner)279 public final int setOwner(final long kmer, final int newOwner){ 280 final int cell=findKmer(kmer); 281 assert(cell!=NOT_PRESENT); 282 if(cell==HASH_COLLISION){return victims.setOwner(kmer, newOwner);} 283 return setOwner(kmer, newOwner, cell); 284 } 285 setOwner(final long kmer, final int newOwner, final int cell)286 public final int setOwner(final long kmer, final int newOwner, final int cell){ 287 assert(array[cell]==kmer); 288 final int original=owners.get(cell); 289 int current=original; 290 while(current<newOwner){ 291 boolean success=owners.compareAndSet(cell, current, newOwner); 292 if(!success){current=owners.get(cell);} 293 else{current=newOwner;} 294 } 295 assert(current>=original) : "original="+original+", current="+current+", newOwner="+newOwner+", re-read="+owners.get(cell); 296 return current; 297 } 298 299 @Override clearOwner(final long kmer, final int owner)300 public final boolean clearOwner(final long kmer, final int owner){ 301 final int cell=findKmer(kmer); 302 assert(cell!=NOT_PRESENT); 303 if(cell==HASH_COLLISION){return victims.clearOwner(kmer, owner);} 304 return clearOwner(kmer, owner, cell); 305 } 306 clearOwner(final long kmer, final int owner, final int cell)307 public final boolean clearOwner(final long kmer, final int owner, final int cell){ 308 assert(array[cell]==kmer); 309 boolean success=owners.compareAndSet(cell, owner, NO_OWNER); 310 return success; 311 } 312 313 @Override getOwner(final long kmer)314 public final int getOwner(final long kmer){ 315 final int cell=findKmer(kmer); 316 assert(cell!=NOT_PRESENT); 317 if(cell==HASH_COLLISION){return victims.getOwner(kmer);} 318 return getCellOwner(cell); 319 } 320 getCellOwner(final int cell)321 public final int getCellOwner(final int cell){ 322 return owners.get(cell); 323 } 324 325 /*--------------------------------------------------------------*/ 326 /*---------------- Nonpublic Methods ----------------*/ 327 /*--------------------------------------------------------------*/ 328 insertValue(final long kmer, final int v, final int cell)329 protected abstract void insertValue(final long kmer, final int v, final int cell); 330 331 // protected abstract void insertValue(final long kmer, final int[] vals, final int cell); 332 333 /** This is for IntList3 support with HashArrayHybridFast */ insertValue(final long kmer, final int[] vals, final int cell, final int vlen)334 protected abstract void insertValue(final long kmer, final int[] vals, final int cell, final int vlen); 335 readCellValue(int cell)336 protected abstract int readCellValue(int cell); readCellValues(int cell, int[] singleton)337 protected abstract int[] readCellValues(int cell, int[] singleton); 338 339 @Override get(long kmer)340 final Object get(long kmer){ 341 throw new RuntimeException("Unimplemented."); 342 } 343 findKmer(long kmer)344 final int findKmer(long kmer){ 345 return findKmer(kmer, kmerToCell(kmer)); 346 } 347 findKmer(final long kmer, final int startCell)348 final int findKmer(final long kmer, final int startCell){ 349 int cell=startCell; 350 for(final int max=cell+extra; cell<max; cell++){ 351 final long n=array[cell]; 352 if(n==kmer){return cell;} 353 else if(n==NOT_PRESENT){return NOT_PRESENT;} 354 } 355 return HASH_COLLISION; 356 } 357 findKmerOrEmpty(long kmer)358 final int findKmerOrEmpty(long kmer){ 359 int cell=kmerToCell(kmer); 360 for(final int max=cell+extra; cell<max; cell++){ 361 final long n=array[cell]; 362 if(n==kmer || n==NOT_PRESENT){return cell;} 363 } 364 return HASH_COLLISION; 365 } 366 367 /*--------------------------------------------------------------*/ 368 /*---------------- Resizing and Rebalancing ----------------*/ 369 /*--------------------------------------------------------------*/ 370 371 @Override canResize()372 final boolean canResize() {return true;} 373 374 @Override size()375 final public long size() {return size;} 376 377 @Override arrayLength()378 final public int arrayLength() {return array.length;} 379 380 @Override resize()381 protected abstract void resize(); 382 383 /*--------------------------------------------------------------*/ 384 /*---------------- Info Dumping ----------------*/ 385 /*--------------------------------------------------------------*/ 386 387 @Override dumpKmersAsText(TextStreamWriter tsw, int k, int mincount, int maxcount)388 public final boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount, int maxcount){ 389 if(twoD){ 390 final int[] singleton=new int[1]; 391 for(int i=0; i<array.length; i++){ 392 long kmer=array[i]; 393 if(kmer!=NOT_PRESENT){ 394 tsw.print(toText(kmer, readCellValues(i, singleton), k).append('\n')); 395 } 396 } 397 }else{ 398 for(int i=0; i<array.length; i++){ 399 long kmer=array[i]; 400 if(kmer!=NOT_PRESENT && (mincount<2 || readCellValue(i)>=mincount)){ 401 tsw.print(toText(kmer, readCellValue(i), k).append('\n')); 402 } 403 } 404 } 405 if(victims!=null){ 406 victims.dumpKmersAsText(tsw, k, mincount, maxcount); 407 } 408 return true; 409 } 410 411 @Override dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount, int maxcount, AtomicLong remaining)412 public final boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount, int maxcount, AtomicLong remaining){ 413 if(twoD){ 414 final int[] singleton=new int[1]; 415 for(int i=0; i<array.length; i++){ 416 long kmer=array[i]; 417 if(kmer!=NOT_PRESENT){ 418 if(remaining!=null && remaining.decrementAndGet()<0){return true;} 419 bsw.printlnKmer(kmer, readCellValues(i, singleton), k); 420 } 421 } 422 }else{ 423 for(int i=0; i<array.length; i++){ 424 long kmer=array[i]; 425 if(kmer!=NOT_PRESENT && (mincount<2 || readCellValue(i)>=mincount)){ 426 if(remaining!=null && remaining.decrementAndGet()<0){return true;} 427 bsw.printlnKmer(kmer, readCellValue(i), k); 428 } 429 } 430 } 431 if(victims!=null){ 432 victims.dumpKmersAsBytes(bsw, k, mincount, maxcount, remaining); 433 } 434 return true; 435 } 436 437 @Override dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount, int maxcount, AtomicLong remaining)438 public final boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount, int maxcount, AtomicLong remaining){ 439 if(twoD){ 440 final int[] singleton=new int[1]; 441 for(int i=0; i<array.length; i++){ 442 long kmer=array[i]; 443 if(kmer!=NOT_PRESENT){ 444 if(remaining!=null && remaining.decrementAndGet()<0){return true;} 445 toBytes(kmer, readCellValues(i, singleton), k, bb); 446 bb.nl(); 447 if(bb.length()>=16000){ 448 ByteBuilder bb2=new ByteBuilder(bb); 449 synchronized(bsw){bsw.addJob(bb2);} 450 bb.clear(); 451 } 452 } 453 } 454 }else{ 455 for(int i=0; i<array.length; i++){ 456 long kmer=array[i]; 457 if(kmer!=NOT_PRESENT && (mincount<2 || readCellValue(i)>=mincount)){ 458 if(remaining!=null && remaining.decrementAndGet()<0){return true;} 459 toBytes(kmer, readCellValue(i), k, bb); 460 bb.nl(); 461 if(bb.length()>=16000){ 462 ByteBuilder bb2=new ByteBuilder(bb); 463 synchronized(bsw){bsw.addJob(bb2);} 464 bb.clear(); 465 } 466 } 467 } 468 } 469 if(victims!=null){ 470 victims.dumpKmersAsBytes_MT(bsw, bb, k, mincount, maxcount, remaining); 471 } 472 return true; 473 } 474 475 @Override fillHistogram(long[] ca, int max)476 public final void fillHistogram(long[] ca, int max){ 477 for(int i=0; i<array.length; i++){ 478 long kmer=array[i]; 479 if(kmer!=NOT_PRESENT){ 480 int count=Tools.min(readCellValue(i), max); 481 ca[count]++; 482 } 483 } 484 if(victims!=null){ 485 victims.fillHistogram(ca, max); 486 } 487 } 488 489 @Override fillHistogram(SuperLongList sll)490 public void fillHistogram(SuperLongList sll){ 491 for(int i=0; i<array.length; i++){ 492 long kmer=array[i]; 493 if(kmer!=NOT_PRESENT){ 494 int count=readCellValue(i); 495 sll.add(count); 496 } 497 } 498 if(victims!=null){ 499 victims.fillHistogram(sll); 500 } 501 } 502 503 @Override countGC(long[] gcCounts, int max)504 public final void countGC(long[] gcCounts, int max){ 505 for(int i=0; i<array.length; i++){ 506 long kmer=array[i]; 507 if(kmer!=NOT_PRESENT){ 508 int count=readCellValue(i); 509 int index=Tools.min(count, max); 510 gcCounts[index]+=gc(kmer); 511 } 512 } 513 if(victims!=null){ 514 victims.countGC(gcCounts, max); 515 } 516 } 517 victims()518 public HashForest victims(){ 519 return victims; 520 } 521 522 /*--------------------------------------------------------------*/ 523 /*---------------- Fields ----------------*/ 524 /*--------------------------------------------------------------*/ 525 526 AtomicIntegerArray owners; 527 long[] array; 528 int prime; 529 long size=0; 530 long sizeLimit; 531 final HashForest victims; 532 final boolean autoResize; 533 public final boolean twoD; 534 private final Lock lock=new ReentrantLock(); 535 private final long coreMask;//for ways 536 private final long coreMask2;//for cells 537 nextScheduleSize()538 protected int nextScheduleSize(){ 539 if(schedulePos<schedule.length-1){schedulePos++;} 540 return schedule[schedulePos]; 541 } 542 atMaxSize()543 protected boolean atMaxSize(){ 544 return schedulePos>=schedule.length-1; 545 } 546 547 protected final int[] schedule; 548 private int schedulePos=0; 549 array()550 public long[] array(){return array;} 551 owners()552 public AtomicIntegerArray owners() {return owners;} 553 @Override getLock()554 final Lock getLock(){return lock;} 555 556 /*--------------------------------------------------------------*/ 557 /*---------------- Static Fields ----------------*/ 558 /*--------------------------------------------------------------*/ 559 560 final static int victimRatio=16; //Initial divisor for victim cache size; it self-resizes. 561 final static int extra=60; //Amazingly, increasing this gave increasing returns past 300. Old default was 21. Could allow higher maxLoadFactorFinal and smaller victim cache. 562 final static int maxPrime=Primes.primeAtMost(Integer.MAX_VALUE-extra-20); 563 final static float resizeMult=2f; //Resize by a minimum of this much; not needed for schedule 564 final static float minLoadFactor=0.58f; //Resize by enough to get the load above this factor; not needed for schedule 565 final static float maxLoadFactor=0.88f; //Reaching this load triggers resizing 566 final static float maxLoadFactorFinal=0.95f; //Reaching this load triggers killing 567 final static float minLoadMult=1/minLoadFactor; 568 final static float maxLoadMult=1/maxLoadFactor; 569 570 } 571