1package compress
2
3import "math"
4
5// Estimate returns a normalized compressibility estimate of block b.
6// Values close to zero are likely uncompressible.
7// Values above 0.1 are likely to be compressible.
8// Values above 0.5 are very compressible.
9// Very small lengths will return 0.
10func Estimate(b []byte) float64 {
11	if len(b) < 16 {
12		return 0
13	}
14
15	// Correctly predicted order 1
16	hits := 0
17	lastMatch := false
18	var o1 [256]byte
19	var hist [256]int
20	c1 := byte(0)
21	for _, c := range b {
22		if c == o1[c1] {
23			// We only count a hit if there was two correct predictions in a row.
24			if lastMatch {
25				hits++
26			}
27			lastMatch = true
28		} else {
29			lastMatch = false
30		}
31		o1[c1] = c
32		c1 = c
33		hist[c]++
34	}
35
36	// Use x^0.6 to give better spread
37	prediction := math.Pow(float64(hits)/float64(len(b)), 0.6)
38
39	// Calculate histogram distribution
40	variance := float64(0)
41	avg := float64(len(b)) / 256
42
43	for _, v := range hist {
44		Δ := float64(v) - avg
45		variance += Δ * Δ
46	}
47
48	stddev := math.Sqrt(float64(variance)) / float64(len(b))
49	exp := math.Sqrt(1 / float64(len(b)))
50
51	// Subtract expected stddev
52	stddev -= exp
53	if stddev < 0 {
54		stddev = 0
55	}
56	stddev *= 1 + exp
57
58	// Use x^0.4 to give better spread
59	entropy := math.Pow(stddev, 0.4)
60
61	// 50/50 weight between prediction and histogram distribution
62	return math.Pow((prediction+entropy)/2, 0.9)
63}
64
65// ShannonEntropyBits returns the number of bits minimum required to represent
66// an entropy encoding of the input bytes.
67// https://en.wiktionary.org/wiki/Shannon_entropy
68func ShannonEntropyBits(b []byte) int {
69	if len(b) == 0 {
70		return 0
71	}
72	var hist [256]int
73	for _, c := range b {
74		hist[c]++
75	}
76	shannon := float64(0)
77	invTotal := 1.0 / float64(len(b))
78	for _, v := range hist[:] {
79		if v > 0 {
80			n := float64(v)
81			shannon += math.Ceil(-math.Log2(n*invTotal) * n)
82		}
83	}
84	return int(math.Ceil(shannon))
85}
86