1 package cardinality;
2 
3 import java.util.Arrays;
4 import java.util.Random;
5 
6 import dna.AminoAcid;
7 import fileIO.ByteStreamWriter;
8 import jgi.Dedupe;
9 import shared.Parser;
10 import shared.Shared;
11 import shared.Tools;
12 import stream.Read;
13 import structures.LongList;
14 import structures.SuperLongList;
15 import ukmer.Kmer;
16 
17 /**
18  * Abstract superclass for cardinality-tracking structures like LogLog.
19  * @author Brian Bushnell
20  * @date Feb 20, 2020
21  *
22  */
23 public abstract class CardinalityTracker {
24 
25 	/*--------------------------------------------------------------*/
26 	/*----------------        Initialization        ----------------*/
27 	/*--------------------------------------------------------------*/
28 
29 	/**
30 	 * Factory method; creates a tracker using default settings.
31 	 * Subclass is determined by static Parser.loglogType field.
32 	 */
makeTracker()33 	public static CardinalityTracker makeTracker(){
34 		if(trackCounts || "BBLog".equalsIgnoreCase(Parser.loglogType)){
35 			return new BBLog();//Fastest, most accurate
36 		}else if("LogLog".equalsIgnoreCase(Parser.loglogType)){
37 			return new LogLog();//Least accurate
38 		}else if("LogLog2".equalsIgnoreCase(Parser.loglogType)){
39 			return new LogLog2();//Slowest, uses mantissa
40 		}else if("LogLog16".equalsIgnoreCase(Parser.loglogType)){
41 			return new LogLog16();//Uses 10-bit mantissa
42 		}else if("LogLog8".equalsIgnoreCase(Parser.loglogType)){
43 			return new LogLog8();//Lowest memory
44 		}
45 		assert(false) : "TODO: "+Parser.loglogType;
46 		throw new RuntimeException(Parser.loglogType);
47 	}
48 
49 	/**
50 	 * Factory method; creates a tracker using parsed settings.
51 	 * Subclass is determined by static Parser.loglogType field.
52 	 */
makeTracker(Parser p)53 	public static CardinalityTracker makeTracker(Parser p){
54 		if(trackCounts || "BBLog".equalsIgnoreCase(Parser.loglogType)){
55 			return new BBLog(p);
56 		}else if("LogLog".equalsIgnoreCase(Parser.loglogType)){
57 			return new LogLog(p);
58 		}else if("LogLog2".equalsIgnoreCase(Parser.loglogType)){
59 			return new LogLog2(p);
60 		}else if("LogLog16".equalsIgnoreCase(Parser.loglogType)){
61 			return new LogLog16(p);
62 		}else if("LogLog8".equalsIgnoreCase(Parser.loglogType)){
63 			return new LogLog8(p);
64 		}
65 		assert(false) : "TODO: "+Parser.loglogType;
66 		throw new RuntimeException(Parser.loglogType);
67 	}
68 
69 	/**
70 	 * Factory method; creates a tracker using specified settings.
71 	 * Subclass is determined by static Parser.loglogType field.
72 	 */
makeTracker(int buckets_, int k_, long seed, float minProb_)73 	public static CardinalityTracker makeTracker(int buckets_, int k_, long seed, float minProb_){
74 		if(trackCounts || "BBLog".equalsIgnoreCase(Parser.loglogType)){
75 			return new BBLog(buckets_, k_, seed, minProb_);
76 		}else if("LogLog".equalsIgnoreCase(Parser.loglogType)){
77 			return new LogLog(buckets_, k_, seed, minProb_);
78 		}else if("LogLog2".equalsIgnoreCase(Parser.loglogType)){
79 			return new LogLog2(buckets_, k_, seed, minProb_);
80 		}else if("LogLog16".equalsIgnoreCase(Parser.loglogType)){
81 			return new LogLog16(buckets_, k_, seed, minProb_);
82 		}else if("LogLog8".equalsIgnoreCase(Parser.loglogType)){
83 			return new LogLog8(buckets_, k_, seed, minProb_);
84 		}
85 		assert(false) : "TODO: "+Parser.loglogType;
86 		throw new RuntimeException(Parser.loglogType);
87 	}
88 
89 	/*--------------------------------------------------------------*/
90 	/*----------------         Constructors         ----------------*/
91 	/*--------------------------------------------------------------*/
92 
93 	/** Create a tracker with parsed parameters. */
CardinalityTracker(Parser p)94 	public CardinalityTracker(Parser p){
95 		this(p.loglogbuckets, p.loglogk, p.loglogseed, p.loglogMinprob);
96 	}
97 
98 	/**
99 	 * Create a tracker with specified parameters.
100 	 * @param buckets_ Number of buckets (counters)
101 	 * @param k_ Kmer length
102 	 * @param seed Random number generator seed; -1 for a random seed
103 	 * @param minProb_ Ignore kmers with under this probability of being correct
104 	 */
CardinalityTracker(int buckets_, int k_, long seed, float minProb_)105 	public CardinalityTracker(int buckets_, int k_, long seed, float minProb_){
106 //		if((buckets_&1)==0){buckets_=(int)Primes.primeAtLeast(buckets_);} //Legacy code, needed modulo operation
107 		buckets=powerOf2AtLeast(buckets_);
108 		assert(buckets>0 && Integer.bitCount(buckets)==1) : "Buckets must be a power of 2: "+buckets;
109 		bucketMask=buckets-1;
110 		k=Kmer.getKbig(k_);
111 		minProb=minProb_;
112 
113 		//For old hash function
114 //		tables=new long[numTables][][];
115 //		for(int i=0; i<numTables; i++){
116 //			tables[i]=makeCodes(steps, bits, (seed<0 ? -1 : seed+i));
117 //		}
118 
119 		Random randy=Shared.threadLocalRandom(seed<0 ? -1 : seed);
120 		hashXor=randy.nextLong();
121 	}
122 
123 	/**
124 	 * Return the lowest power of 2 that is >= target.
125 	 * Provided because buckets must currently be a power of 2.
126 	 */
powerOf2AtLeast(int target)127 	public static final int powerOf2AtLeast(int target){
128 		if(target<1){return 1;}
129 		int ret=1, limit=Tools.min(target, 0x40000000);
130 		while(ret<limit){ret<<=1;}
131 		return ret;
132 	}
133 
134 	/*--------------------------------------------------------------*/
135 	/*----------------           Methods            ----------------*/
136 	/*--------------------------------------------------------------*/
137 
138 	/** Deprecated; use LogLogWrapper */
main(String[] args)139 	public static final void main(String[] args){
140 		LogLogWrapper llw=new LogLogWrapper(args);
141 
142 		final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
143 		Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4;
144 
145 		llw.process();
146 
147 		Read.VALIDATE_IN_CONSTRUCTOR=vic;
148 	}
149 
150 	/**
151 	 * Old table-based hash function.
152 	 * Slower than the new function.
153 	 */
154 	public final long hash(final long value0, final long[][] table){
155 		long value=value0, code=0;
156 		long mask=(bits>63 ? -1L : ~((-1L)<<bits));
157 
158 		for(int i=0; i<steps; i++){//I could also do while value!=0
159 			int x=(int)(value&mask);
160 			value>>=bits;
161 			code=code^table[i][x];
162 		}
163 		return code;
164 	}
165 
166 	/** Taken from Thomas Wang, Jan 1997:
167 	 * http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
168 	 *
169 	 *  This is much faster than the table version.  Results seem similar though.
170 	 */
hash64shift(long key)171 	public long hash64shift(long key){
172 		key^=hashXor;
173 		key = (~key) + (key << 21); // key = (key << 21) - key - 1;
174 		key = key ^ (key >>> 24);
175 		key = (key + (key << 3)) + (key << 8); // key * 265
176 		key = key ^ (key >>> 14);
177 		key = (key + (key << 2)) + (key << 4); // key * 21
178 		key = key ^ (key >>> 28);
179 		key = key + (key << 31);
180 		return key;
181 	}
182 
183 	/** Hash and add a number to this tracker */
add(long number)184 	public final void add(long number){
185 		hashAndStore(number);
186 	}
187 
188 	/**
189 	 * Hash and track the Read
190 	 * */
hash(Read r)191 	public final void hash(Read r){
192 		if(r==null){return;}
193 		if(r.length()>=k){hash(r.bases, r.quality);}
194 		if(r.mateLength()>=k){hash(r.mate.bases, r.mate.quality);}
195 	}
196 
197 	/**
198 	 * Hash and track the sequence
199 	 * */
hash(byte[] bases, byte[] quals)200 	public final void hash(byte[] bases, byte[] quals){
201 		if(k<32){hashSmall(bases, quals);}
202 		else{hashBig(bases, quals);}
203 	}
204 
205 	/**
206 	 * Hash and track the sequence using short kmers 0<k<32
207 	 * */
hashSmall(byte[] bases, byte[] quals)208 	public final void hashSmall(byte[] bases, byte[] quals){
209 		final int shift=2*k;
210 		final int shift2=shift-2;
211 		final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
212 		int len=0;
213 
214 		long kmer=0, rkmer=0;
215 
216 		if(minProb>0 && quals!=null){//Debranched loop
217 			assert(quals.length==bases.length) : quals.length+", "+bases.length;
218 			float prob=1;
219 			for(int i=0; i<bases.length; i++){
220 				byte b=bases[i];
221 				long x=AminoAcid.baseToNumber[b];
222 				long x2=AminoAcid.baseToComplementNumber[b];
223 				kmer=((kmer<<2)|x)&mask;
224 				rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
225 
226 				{//Update probability
227 					byte q=quals[i];
228 					prob=prob*PROB_CORRECT[q];
229 					if(len>k){
230 						byte oldq=quals[i-k];
231 						prob=prob*PROB_CORRECT_INVERSE[oldq];
232 					}
233 				}
234 				if(x>=0){
235 					len++;
236 				}else{
237 					len=0;
238 					kmer=rkmer=0;
239 					prob=1;
240 				}
241 				if(len>=k && prob>=minProb){
242 					add(Tools.max(kmer, rkmer));
243 				}
244 			}
245 		}else{
246 
247 			for(int i=0; i<bases.length; i++){
248 				byte b=bases[i];
249 				long x=AminoAcid.baseToNumber[b];
250 				long x2=AminoAcid.baseToComplementNumber[b];
251 				kmer=((kmer<<2)|x)&mask;
252 				rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
253 
254 				if(x>=0){
255 					len++;
256 				}else{
257 					len=0;
258 					kmer=rkmer=0;
259 				}
260 				if(len>=k){
261 					add(Tools.max(kmer, rkmer));
262 				}
263 			}
264 		}
265 	}
266 
267 	/**
268 	 * Hash and track the sequence using long kmers >31
269 	 * */
hashBig(byte[] bases, byte[] quals)270 	public final void hashBig(byte[] bases, byte[] quals){
271 
272 		Kmer kmer=getLocalKmer();
273 		int len=0;
274 		float prob=1;
275 
276 		for(int i=0; i<bases.length; i++){
277 			byte b=bases[i];
278 			long x=Dedupe.baseToNumber[b];
279 			kmer.addRightNumeric(x);
280 			if(minProb>0 && quals!=null){//Update probability
281 				prob=prob*PROB_CORRECT[quals[i]];
282 				if(len>k){
283 					byte oldq=quals[i-k];
284 					prob=prob*PROB_CORRECT_INVERSE[oldq];
285 				}
286 			}
287 			if(AminoAcid.isFullyDefined(b)){
288 				len++;
289 			}else{
290 				len=0;
291 				prob=1;
292 			}
293 			if(len>=k && prob>=minProb){
294 				add(kmer.xor());
295 			}
296 		}
297 	}
298 
299 	/**
300 	 * Make a table of random bitmasks for hashing.
301 	 * Superceded by new hash method.
302 	 * */
makeCodes(int length, int bits, long seed)303 	private static final long[][] makeCodes(int length, int bits, long seed){
304 		if(true) {return null;}//Short circuit
305 		Random randy=Shared.threadLocalRandom(seed);
306 		int modes=1<<bits;
307 		long[][] r=new long[length][modes];
308 		for(int i=0; i<length; i++){
309 			for(int j=0; j<modes; j++){
310 				long x=randy.nextLong();
311 				while(Long.bitCount(x)>33){
312 					x&=(~(1L<<randy.nextInt(64)));
313 				}
314 				while(Long.bitCount(x)<31){
315 					x|=(1L<<randy.nextInt(64));
316 				}
317 				r[i][j]=x;
318 
319 			}
320 		}
321 		return r;
322 	}
323 
compensationFactorBuckets()324 	public final float compensationFactorBuckets(){
325 		assert(Integer.bitCount(buckets)==1) : buckets;
326 		int zeros=Integer.numberOfTrailingZeros(buckets);
327 		return compensationFactorLogBuckets(zeros);
328 	}
329 
330 	/**
331 	 * Multiplier to compensate for overestimating genome size
332 	 * when the number of buckets is too small,
333 	 * as a function of log2(buckets)
334 	 * @param logBuckets
335 	 * @return Multiplier for final estimate
336 	 */
compensationFactorLogBuckets(int logBuckets)337 	public final float compensationFactorLogBuckets(int logBuckets){
338 		float[] array=compensationFactorLogBucketsArray();
339 		return (array!=null && logBuckets<array.length) ? array[logBuckets] : 1/(1+(1<<logBuckets));
340 	}
341 
toFrequency()342 	public SuperLongList toFrequency(){
343 		SuperLongList list=new SuperLongList(1000);
344 		int[] counts=getCounts();
345 		for(int x : counts){
346 			if(x>0){list.add(x);}
347 		}
348 		list.sort();
349 		return list;
350 	}
351 
352 	/**
353 	 * Print a kmer frequency histogram.
354 	 * @param path File to which to print
355 	 * @param overwrite
356 	 * @param append
357 	 * @param supersample Adjust counts for the effect of subsampling
358 	 */
printKhist(String path, boolean overwrite, boolean append, boolean supersample, int decimals)359 	public void printKhist(String path, boolean overwrite, boolean append, boolean supersample, int decimals){
360 		SuperLongList sll=toFrequency();
361 		ByteStreamWriter bsw=new ByteStreamWriter(path, overwrite, append, false);
362 		bsw.start();
363 		bsw.print("#Depth\tCount\n");
364 		final double mult=Tools.max(1.0, (supersample ? cardinality()/(double)buckets : 1));
365 		final long[] array=sll.array();
366 		final LongList list=sll.list();
367 
368 		for(int depth=0; depth<array.length; depth++){
369 			long count=array[depth];
370 			if(count>0){
371 				bsw.print(depth).tab();
372 				if(supersample){
373 					if(decimals>0){
374 						bsw.print(count*mult, decimals).nl();
375 					}else{
376 						bsw.print(Tools.max(1, Math.round(count*mult))).nl();
377 					}
378 				}else{
379 					bsw.print(count).nl();
380 				}
381 			}
382 		}
383 		int count=0;
384 		long prevDepth=-1;
385 		for(int i=0; i<list.size; i++){
386 			long depth=list.get(i);
387 			if(depth!=prevDepth && count>0){
388 				assert(depth>prevDepth);
389 				bsw.print(prevDepth).tab();
390 				if(supersample){
391 					if(decimals>0){
392 						bsw.print(count*mult, decimals).nl();
393 					}else{
394 						bsw.print(Tools.max(1, Math.round(count*mult))).nl();
395 					}
396 				}else{
397 					bsw.print(count).nl();
398 				}
399 				count=0;
400 			}else{
401 				count++;
402 			}
403 			prevDepth=depth;
404 		}
405 		if(count>0){
406 			bsw.print(prevDepth).tab();
407 			if(supersample){
408 				if(decimals>0){
409 					bsw.print(count*mult, decimals).nl();
410 				}else{
411 					bsw.print(Tools.max(1, Math.round(count*mult))).nl();
412 				}
413 			}else{
414 				bsw.print(count).nl();
415 			}
416 		}
417 		bsw.poisonAndWait();
418 	}
419 
420 	/** Sum of counts array, if present */
countSum()421 	public final long countSum(){
422 		int[] counts=getCounts();
423 		return counts==null ? 0 : Tools.sum(counts);
424 	}
425 
426 	/*--------------------------------------------------------------*/
427 	/*----------------       Abstract Methods       ----------------*/
428 	/*--------------------------------------------------------------*/
429 
430 	/*--------------------------------------------------------------*/
431 	/*----------------       Abstract Methods       ----------------*/
432 	/*--------------------------------------------------------------*/
433 
434 	/** Calculate cardinality estimate from this tracker */
cardinality()435 	public abstract long cardinality();
436 
437 	/** Counts array, if present.
438 	 * Should be overridden for classes that track counts. */
getCounts()439 	public int[] getCounts(){
440 		return null;
441 	}
442 
443 	/** Add another tracker to this one */
add(CardinalityTracker log)444 	public abstract void add(CardinalityTracker log);
445 
446 	/** Generate a 64-bit hashcode from a number, and add it to this tracker */
hashAndStore(final long number)447 	public abstract void hashAndStore(final long number);
448 
449 	/** TODO: Deprecate?
450 	 * Designed to compensate for overestimate with small numbers of buckets.
451 	 * Appears to be handled by using the harmonic mean in the case of multiple trials. */
compensationFactorLogBucketsArray()452 	public abstract float[] compensationFactorLogBucketsArray();
453 
454 	/*--------------------------------------------------------------*/
455 	/*----------------            Fields            ----------------*/
456 	/*--------------------------------------------------------------*/
457 
458 	/** Kmer length */
459 	public final int k;
460 
461 	/** Ignore kmers under this probability of correctness */
462 	public final float minProb;
463 
464 	/**
465 	 * Number of buckets for tracking.
466 	 * Must be a power of 2.
467 	 * Larger is slower and uses more memory, but more accurate.
468 	 */
469 	public final int buckets;
470 
471 	/** Mask for lower bits of hashcode to yield bucket; bucketMask==buckets-1. */
472 	final int bucketMask;
473 
474 	/** Reusable kmer for hashing. */
475 	private final ThreadLocal<Kmer> localKmer=new ThreadLocal<Kmer>();
476 
477 	/** The hash function will yield a different mapping for each mask here. */
478 	private final long hashXor;
479 
480 	/** For long kmer mode */
getLocalKmer()481 	protected Kmer getLocalKmer(){
482 		Kmer kmer=localKmer.get();
483 		if(kmer==null){
484 			localKmer.set(new Kmer(k));
485 			kmer=localKmer.get();
486 		}
487 		kmer.clearFast();
488 		return kmer;
489 	}
490 
491 	/*--------------------------------------------------------------*/
492 	/*----------------    Deprecated Table Fields   ----------------*/
493 	/*--------------------------------------------------------------*/
494 
495 	static final int numTables=4;
496 	static final int numTablesMask=numTables-1;
497 	/** Bits hashed per cycle; no longer needed */
498 	private static final int bits=8;
499 	private static final int steps=(63+bits)/bits;;
500 //	final long[][][] tables;
501 
502 	/*--------------------------------------------------------------*/
503 	/*----------------            Statics           ----------------*/
504 	/*--------------------------------------------------------------*/
505 
506 	/** Converts quality score to probability of correctness */
507 	public static final float[] PROB_CORRECT=Arrays.copyOf(align2.QualityTools.PROB_CORRECT, 128);
508 	/** Inverse of PROB_CORRECT */
509 	public static final float[] PROB_CORRECT_INVERSE=Arrays.copyOf(align2.QualityTools.PROB_CORRECT_INVERSE, 128);
510 
511 	/** Atomic mode allows less memory when multithreaded, but lower peak concurrency due to contention. */
512 	public static final boolean atomic=false;//non-atomic is faster.
513 	/** Track number of times the value in each bucket was seen. */
514 	public static boolean trackCounts=false;
515 	/** Ignores kmers such that x%SKIPMOD!=0, to skip expensive hash and store functions */
516 	static final long SKIPMOD=1;//No longer used; requires a modulo operation
517 	/** Records the last cardinality tracked, for use in static contexts (RQCFilter) */
518 	public static long lastCardinality=-1;
519 //	/** Ignore hashed values above this, to skip expensive read and store functions. */
520 //	static final long maxHashedValue=((-1L)>>>3);//No longer used
521 
522 	/** These determine how to combine multiple buckets to yield a value.
523 	 * Mean is best even though mwa is pretty close.  Median is much worse. */
524 	public static boolean USE_MEAN=true;//Arithmetic mean
525 	public static boolean USE_MEDIAN=false;
526 	public static boolean USE_MWA=false;//Median-weighted-average
527 	public static boolean USE_HMEAN=false;//Harmonic mean
528 	public static boolean USE_GMEAN=false;//Geometric mean
529 
530 }
531