1 package cardinality; 2 3 import java.util.Arrays; 4 import java.util.Random; 5 6 import dna.AminoAcid; 7 import fileIO.ByteStreamWriter; 8 import jgi.Dedupe; 9 import shared.Parser; 10 import shared.Shared; 11 import shared.Tools; 12 import stream.Read; 13 import structures.LongList; 14 import structures.SuperLongList; 15 import ukmer.Kmer; 16 17 /** 18 * Abstract superclass for cardinality-tracking structures like LogLog. 19 * @author Brian Bushnell 20 * @date Feb 20, 2020 21 * 22 */ 23 public abstract class CardinalityTracker { 24 25 /*--------------------------------------------------------------*/ 26 /*---------------- Initialization ----------------*/ 27 /*--------------------------------------------------------------*/ 28 29 /** 30 * Factory method; creates a tracker using default settings. 31 * Subclass is determined by static Parser.loglogType field. 32 */ makeTracker()33 public static CardinalityTracker makeTracker(){ 34 if(trackCounts || "BBLog".equalsIgnoreCase(Parser.loglogType)){ 35 return new BBLog();//Fastest, most accurate 36 }else if("LogLog".equalsIgnoreCase(Parser.loglogType)){ 37 return new LogLog();//Least accurate 38 }else if("LogLog2".equalsIgnoreCase(Parser.loglogType)){ 39 return new LogLog2();//Slowest, uses mantissa 40 }else if("LogLog16".equalsIgnoreCase(Parser.loglogType)){ 41 return new LogLog16();//Uses 10-bit mantissa 42 }else if("LogLog8".equalsIgnoreCase(Parser.loglogType)){ 43 return new LogLog8();//Lowest memory 44 } 45 assert(false) : "TODO: "+Parser.loglogType; 46 throw new RuntimeException(Parser.loglogType); 47 } 48 49 /** 50 * Factory method; creates a tracker using parsed settings. 51 * Subclass is determined by static Parser.loglogType field. 52 */ makeTracker(Parser p)53 public static CardinalityTracker makeTracker(Parser p){ 54 if(trackCounts || "BBLog".equalsIgnoreCase(Parser.loglogType)){ 55 return new BBLog(p); 56 }else if("LogLog".equalsIgnoreCase(Parser.loglogType)){ 57 return new LogLog(p); 58 }else if("LogLog2".equalsIgnoreCase(Parser.loglogType)){ 59 return new LogLog2(p); 60 }else if("LogLog16".equalsIgnoreCase(Parser.loglogType)){ 61 return new LogLog16(p); 62 }else if("LogLog8".equalsIgnoreCase(Parser.loglogType)){ 63 return new LogLog8(p); 64 } 65 assert(false) : "TODO: "+Parser.loglogType; 66 throw new RuntimeException(Parser.loglogType); 67 } 68 69 /** 70 * Factory method; creates a tracker using specified settings. 71 * Subclass is determined by static Parser.loglogType field. 72 */ makeTracker(int buckets_, int k_, long seed, float minProb_)73 public static CardinalityTracker makeTracker(int buckets_, int k_, long seed, float minProb_){ 74 if(trackCounts || "BBLog".equalsIgnoreCase(Parser.loglogType)){ 75 return new BBLog(buckets_, k_, seed, minProb_); 76 }else if("LogLog".equalsIgnoreCase(Parser.loglogType)){ 77 return new LogLog(buckets_, k_, seed, minProb_); 78 }else if("LogLog2".equalsIgnoreCase(Parser.loglogType)){ 79 return new LogLog2(buckets_, k_, seed, minProb_); 80 }else if("LogLog16".equalsIgnoreCase(Parser.loglogType)){ 81 return new LogLog16(buckets_, k_, seed, minProb_); 82 }else if("LogLog8".equalsIgnoreCase(Parser.loglogType)){ 83 return new LogLog8(buckets_, k_, seed, minProb_); 84 } 85 assert(false) : "TODO: "+Parser.loglogType; 86 throw new RuntimeException(Parser.loglogType); 87 } 88 89 /*--------------------------------------------------------------*/ 90 /*---------------- Constructors ----------------*/ 91 /*--------------------------------------------------------------*/ 92 93 /** Create a tracker with parsed parameters. */ CardinalityTracker(Parser p)94 public CardinalityTracker(Parser p){ 95 this(p.loglogbuckets, p.loglogk, p.loglogseed, p.loglogMinprob); 96 } 97 98 /** 99 * Create a tracker with specified parameters. 100 * @param buckets_ Number of buckets (counters) 101 * @param k_ Kmer length 102 * @param seed Random number generator seed; -1 for a random seed 103 * @param minProb_ Ignore kmers with under this probability of being correct 104 */ CardinalityTracker(int buckets_, int k_, long seed, float minProb_)105 public CardinalityTracker(int buckets_, int k_, long seed, float minProb_){ 106 // if((buckets_&1)==0){buckets_=(int)Primes.primeAtLeast(buckets_);} //Legacy code, needed modulo operation 107 buckets=powerOf2AtLeast(buckets_); 108 assert(buckets>0 && Integer.bitCount(buckets)==1) : "Buckets must be a power of 2: "+buckets; 109 bucketMask=buckets-1; 110 k=Kmer.getKbig(k_); 111 minProb=minProb_; 112 113 //For old hash function 114 // tables=new long[numTables][][]; 115 // for(int i=0; i<numTables; i++){ 116 // tables[i]=makeCodes(steps, bits, (seed<0 ? -1 : seed+i)); 117 // } 118 119 Random randy=Shared.threadLocalRandom(seed<0 ? -1 : seed); 120 hashXor=randy.nextLong(); 121 } 122 123 /** 124 * Return the lowest power of 2 that is >= target. 125 * Provided because buckets must currently be a power of 2. 126 */ powerOf2AtLeast(int target)127 public static final int powerOf2AtLeast(int target){ 128 if(target<1){return 1;} 129 int ret=1, limit=Tools.min(target, 0x40000000); 130 while(ret<limit){ret<<=1;} 131 return ret; 132 } 133 134 /*--------------------------------------------------------------*/ 135 /*---------------- Methods ----------------*/ 136 /*--------------------------------------------------------------*/ 137 138 /** Deprecated; use LogLogWrapper */ main(String[] args)139 public static final void main(String[] args){ 140 LogLogWrapper llw=new LogLogWrapper(args); 141 142 final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR; 143 Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4; 144 145 llw.process(); 146 147 Read.VALIDATE_IN_CONSTRUCTOR=vic; 148 } 149 150 /** 151 * Old table-based hash function. 152 * Slower than the new function. 153 */ 154 public final long hash(final long value0, final long[][] table){ 155 long value=value0, code=0; 156 long mask=(bits>63 ? -1L : ~((-1L)<<bits)); 157 158 for(int i=0; i<steps; i++){//I could also do while value!=0 159 int x=(int)(value&mask); 160 value>>=bits; 161 code=code^table[i][x]; 162 } 163 return code; 164 } 165 166 /** Taken from Thomas Wang, Jan 1997: 167 * http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm 168 * 169 * This is much faster than the table version. Results seem similar though. 170 */ hash64shift(long key)171 public long hash64shift(long key){ 172 key^=hashXor; 173 key = (~key) + (key << 21); // key = (key << 21) - key - 1; 174 key = key ^ (key >>> 24); 175 key = (key + (key << 3)) + (key << 8); // key * 265 176 key = key ^ (key >>> 14); 177 key = (key + (key << 2)) + (key << 4); // key * 21 178 key = key ^ (key >>> 28); 179 key = key + (key << 31); 180 return key; 181 } 182 183 /** Hash and add a number to this tracker */ add(long number)184 public final void add(long number){ 185 hashAndStore(number); 186 } 187 188 /** 189 * Hash and track the Read 190 * */ hash(Read r)191 public final void hash(Read r){ 192 if(r==null){return;} 193 if(r.length()>=k){hash(r.bases, r.quality);} 194 if(r.mateLength()>=k){hash(r.mate.bases, r.mate.quality);} 195 } 196 197 /** 198 * Hash and track the sequence 199 * */ hash(byte[] bases, byte[] quals)200 public final void hash(byte[] bases, byte[] quals){ 201 if(k<32){hashSmall(bases, quals);} 202 else{hashBig(bases, quals);} 203 } 204 205 /** 206 * Hash and track the sequence using short kmers 0<k<32 207 * */ hashSmall(byte[] bases, byte[] quals)208 public final void hashSmall(byte[] bases, byte[] quals){ 209 final int shift=2*k; 210 final int shift2=shift-2; 211 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 212 int len=0; 213 214 long kmer=0, rkmer=0; 215 216 if(minProb>0 && quals!=null){//Debranched loop 217 assert(quals.length==bases.length) : quals.length+", "+bases.length; 218 float prob=1; 219 for(int i=0; i<bases.length; i++){ 220 byte b=bases[i]; 221 long x=AminoAcid.baseToNumber[b]; 222 long x2=AminoAcid.baseToComplementNumber[b]; 223 kmer=((kmer<<2)|x)&mask; 224 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 225 226 {//Update probability 227 byte q=quals[i]; 228 prob=prob*PROB_CORRECT[q]; 229 if(len>k){ 230 byte oldq=quals[i-k]; 231 prob=prob*PROB_CORRECT_INVERSE[oldq]; 232 } 233 } 234 if(x>=0){ 235 len++; 236 }else{ 237 len=0; 238 kmer=rkmer=0; 239 prob=1; 240 } 241 if(len>=k && prob>=minProb){ 242 add(Tools.max(kmer, rkmer)); 243 } 244 } 245 }else{ 246 247 for(int i=0; i<bases.length; i++){ 248 byte b=bases[i]; 249 long x=AminoAcid.baseToNumber[b]; 250 long x2=AminoAcid.baseToComplementNumber[b]; 251 kmer=((kmer<<2)|x)&mask; 252 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 253 254 if(x>=0){ 255 len++; 256 }else{ 257 len=0; 258 kmer=rkmer=0; 259 } 260 if(len>=k){ 261 add(Tools.max(kmer, rkmer)); 262 } 263 } 264 } 265 } 266 267 /** 268 * Hash and track the sequence using long kmers >31 269 * */ hashBig(byte[] bases, byte[] quals)270 public final void hashBig(byte[] bases, byte[] quals){ 271 272 Kmer kmer=getLocalKmer(); 273 int len=0; 274 float prob=1; 275 276 for(int i=0; i<bases.length; i++){ 277 byte b=bases[i]; 278 long x=Dedupe.baseToNumber[b]; 279 kmer.addRightNumeric(x); 280 if(minProb>0 && quals!=null){//Update probability 281 prob=prob*PROB_CORRECT[quals[i]]; 282 if(len>k){ 283 byte oldq=quals[i-k]; 284 prob=prob*PROB_CORRECT_INVERSE[oldq]; 285 } 286 } 287 if(AminoAcid.isFullyDefined(b)){ 288 len++; 289 }else{ 290 len=0; 291 prob=1; 292 } 293 if(len>=k && prob>=minProb){ 294 add(kmer.xor()); 295 } 296 } 297 } 298 299 /** 300 * Make a table of random bitmasks for hashing. 301 * Superceded by new hash method. 302 * */ makeCodes(int length, int bits, long seed)303 private static final long[][] makeCodes(int length, int bits, long seed){ 304 if(true) {return null;}//Short circuit 305 Random randy=Shared.threadLocalRandom(seed); 306 int modes=1<<bits; 307 long[][] r=new long[length][modes]; 308 for(int i=0; i<length; i++){ 309 for(int j=0; j<modes; j++){ 310 long x=randy.nextLong(); 311 while(Long.bitCount(x)>33){ 312 x&=(~(1L<<randy.nextInt(64))); 313 } 314 while(Long.bitCount(x)<31){ 315 x|=(1L<<randy.nextInt(64)); 316 } 317 r[i][j]=x; 318 319 } 320 } 321 return r; 322 } 323 compensationFactorBuckets()324 public final float compensationFactorBuckets(){ 325 assert(Integer.bitCount(buckets)==1) : buckets; 326 int zeros=Integer.numberOfTrailingZeros(buckets); 327 return compensationFactorLogBuckets(zeros); 328 } 329 330 /** 331 * Multiplier to compensate for overestimating genome size 332 * when the number of buckets is too small, 333 * as a function of log2(buckets) 334 * @param logBuckets 335 * @return Multiplier for final estimate 336 */ compensationFactorLogBuckets(int logBuckets)337 public final float compensationFactorLogBuckets(int logBuckets){ 338 float[] array=compensationFactorLogBucketsArray(); 339 return (array!=null && logBuckets<array.length) ? array[logBuckets] : 1/(1+(1<<logBuckets)); 340 } 341 toFrequency()342 public SuperLongList toFrequency(){ 343 SuperLongList list=new SuperLongList(1000); 344 int[] counts=getCounts(); 345 for(int x : counts){ 346 if(x>0){list.add(x);} 347 } 348 list.sort(); 349 return list; 350 } 351 352 /** 353 * Print a kmer frequency histogram. 354 * @param path File to which to print 355 * @param overwrite 356 * @param append 357 * @param supersample Adjust counts for the effect of subsampling 358 */ printKhist(String path, boolean overwrite, boolean append, boolean supersample, int decimals)359 public void printKhist(String path, boolean overwrite, boolean append, boolean supersample, int decimals){ 360 SuperLongList sll=toFrequency(); 361 ByteStreamWriter bsw=new ByteStreamWriter(path, overwrite, append, false); 362 bsw.start(); 363 bsw.print("#Depth\tCount\n"); 364 final double mult=Tools.max(1.0, (supersample ? cardinality()/(double)buckets : 1)); 365 final long[] array=sll.array(); 366 final LongList list=sll.list(); 367 368 for(int depth=0; depth<array.length; depth++){ 369 long count=array[depth]; 370 if(count>0){ 371 bsw.print(depth).tab(); 372 if(supersample){ 373 if(decimals>0){ 374 bsw.print(count*mult, decimals).nl(); 375 }else{ 376 bsw.print(Tools.max(1, Math.round(count*mult))).nl(); 377 } 378 }else{ 379 bsw.print(count).nl(); 380 } 381 } 382 } 383 int count=0; 384 long prevDepth=-1; 385 for(int i=0; i<list.size; i++){ 386 long depth=list.get(i); 387 if(depth!=prevDepth && count>0){ 388 assert(depth>prevDepth); 389 bsw.print(prevDepth).tab(); 390 if(supersample){ 391 if(decimals>0){ 392 bsw.print(count*mult, decimals).nl(); 393 }else{ 394 bsw.print(Tools.max(1, Math.round(count*mult))).nl(); 395 } 396 }else{ 397 bsw.print(count).nl(); 398 } 399 count=0; 400 }else{ 401 count++; 402 } 403 prevDepth=depth; 404 } 405 if(count>0){ 406 bsw.print(prevDepth).tab(); 407 if(supersample){ 408 if(decimals>0){ 409 bsw.print(count*mult, decimals).nl(); 410 }else{ 411 bsw.print(Tools.max(1, Math.round(count*mult))).nl(); 412 } 413 }else{ 414 bsw.print(count).nl(); 415 } 416 } 417 bsw.poisonAndWait(); 418 } 419 420 /** Sum of counts array, if present */ countSum()421 public final long countSum(){ 422 int[] counts=getCounts(); 423 return counts==null ? 0 : Tools.sum(counts); 424 } 425 426 /*--------------------------------------------------------------*/ 427 /*---------------- Abstract Methods ----------------*/ 428 /*--------------------------------------------------------------*/ 429 430 /*--------------------------------------------------------------*/ 431 /*---------------- Abstract Methods ----------------*/ 432 /*--------------------------------------------------------------*/ 433 434 /** Calculate cardinality estimate from this tracker */ cardinality()435 public abstract long cardinality(); 436 437 /** Counts array, if present. 438 * Should be overridden for classes that track counts. */ getCounts()439 public int[] getCounts(){ 440 return null; 441 } 442 443 /** Add another tracker to this one */ add(CardinalityTracker log)444 public abstract void add(CardinalityTracker log); 445 446 /** Generate a 64-bit hashcode from a number, and add it to this tracker */ hashAndStore(final long number)447 public abstract void hashAndStore(final long number); 448 449 /** TODO: Deprecate? 450 * Designed to compensate for overestimate with small numbers of buckets. 451 * Appears to be handled by using the harmonic mean in the case of multiple trials. */ compensationFactorLogBucketsArray()452 public abstract float[] compensationFactorLogBucketsArray(); 453 454 /*--------------------------------------------------------------*/ 455 /*---------------- Fields ----------------*/ 456 /*--------------------------------------------------------------*/ 457 458 /** Kmer length */ 459 public final int k; 460 461 /** Ignore kmers under this probability of correctness */ 462 public final float minProb; 463 464 /** 465 * Number of buckets for tracking. 466 * Must be a power of 2. 467 * Larger is slower and uses more memory, but more accurate. 468 */ 469 public final int buckets; 470 471 /** Mask for lower bits of hashcode to yield bucket; bucketMask==buckets-1. */ 472 final int bucketMask; 473 474 /** Reusable kmer for hashing. */ 475 private final ThreadLocal<Kmer> localKmer=new ThreadLocal<Kmer>(); 476 477 /** The hash function will yield a different mapping for each mask here. */ 478 private final long hashXor; 479 480 /** For long kmer mode */ getLocalKmer()481 protected Kmer getLocalKmer(){ 482 Kmer kmer=localKmer.get(); 483 if(kmer==null){ 484 localKmer.set(new Kmer(k)); 485 kmer=localKmer.get(); 486 } 487 kmer.clearFast(); 488 return kmer; 489 } 490 491 /*--------------------------------------------------------------*/ 492 /*---------------- Deprecated Table Fields ----------------*/ 493 /*--------------------------------------------------------------*/ 494 495 static final int numTables=4; 496 static final int numTablesMask=numTables-1; 497 /** Bits hashed per cycle; no longer needed */ 498 private static final int bits=8; 499 private static final int steps=(63+bits)/bits;; 500 // final long[][][] tables; 501 502 /*--------------------------------------------------------------*/ 503 /*---------------- Statics ----------------*/ 504 /*--------------------------------------------------------------*/ 505 506 /** Converts quality score to probability of correctness */ 507 public static final float[] PROB_CORRECT=Arrays.copyOf(align2.QualityTools.PROB_CORRECT, 128); 508 /** Inverse of PROB_CORRECT */ 509 public static final float[] PROB_CORRECT_INVERSE=Arrays.copyOf(align2.QualityTools.PROB_CORRECT_INVERSE, 128); 510 511 /** Atomic mode allows less memory when multithreaded, but lower peak concurrency due to contention. */ 512 public static final boolean atomic=false;//non-atomic is faster. 513 /** Track number of times the value in each bucket was seen. */ 514 public static boolean trackCounts=false; 515 /** Ignores kmers such that x%SKIPMOD!=0, to skip expensive hash and store functions */ 516 static final long SKIPMOD=1;//No longer used; requires a modulo operation 517 /** Records the last cardinality tracked, for use in static contexts (RQCFilter) */ 518 public static long lastCardinality=-1; 519 // /** Ignore hashed values above this, to skip expensive read and store functions. */ 520 // static final long maxHashedValue=((-1L)>>>3);//No longer used 521 522 /** These determine how to combine multiple buckets to yield a value. 523 * Mean is best even though mwa is pretty close. Median is much worse. */ 524 public static boolean USE_MEAN=true;//Arithmetic mean 525 public static boolean USE_MEDIAN=false; 526 public static boolean USE_MWA=false;//Median-weighted-average 527 public static boolean USE_HMEAN=false;//Harmonic mean 528 public static boolean USE_GMEAN=false;//Geometric mean 529 530 } 531