1package compress 2 3import "math" 4 5// Estimate returns a normalized compressibility estimate of block b. 6// Values close to zero are likely uncompressible. 7// Values above 0.1 are likely to be compressible. 8// Values above 0.5 are very compressible. 9// Very small lengths will return 0. 10func Estimate(b []byte) float64 { 11 if len(b) < 16 { 12 return 0 13 } 14 15 // Correctly predicted order 1 16 hits := 0 17 lastMatch := false 18 var o1 [256]byte 19 var hist [256]int 20 c1 := byte(0) 21 for _, c := range b { 22 if c == o1[c1] { 23 // We only count a hit if there was two correct predictions in a row. 24 if lastMatch { 25 hits++ 26 } 27 lastMatch = true 28 } else { 29 lastMatch = false 30 } 31 o1[c1] = c 32 c1 = c 33 hist[c]++ 34 } 35 36 // Use x^0.6 to give better spread 37 prediction := math.Pow(float64(hits)/float64(len(b)), 0.6) 38 39 // Calculate histogram distribution 40 variance := float64(0) 41 avg := float64(len(b)) / 256 42 43 for _, v := range hist { 44 Δ := float64(v) - avg 45 variance += Δ * Δ 46 } 47 48 stddev := math.Sqrt(float64(variance)) / float64(len(b)) 49 exp := math.Sqrt(1 / float64(len(b))) 50 51 // Subtract expected stddev 52 stddev -= exp 53 if stddev < 0 { 54 stddev = 0 55 } 56 stddev *= 1 + exp 57 58 // Use x^0.4 to give better spread 59 entropy := math.Pow(stddev, 0.4) 60 61 // 50/50 weight between prediction and histogram distribution 62 return math.Pow((prediction+entropy)/2, 0.9) 63} 64 65// ShannonEntropyBits returns the number of bits minimum required to represent 66// an entropy encoding of the input bytes. 67// https://en.wiktionary.org/wiki/Shannon_entropy 68func ShannonEntropyBits(b []byte) int { 69 if len(b) == 0 { 70 return 0 71 } 72 var hist [256]int 73 for _, c := range b { 74 hist[c]++ 75 } 76 shannon := float64(0) 77 invTotal := 1.0 / float64(len(b)) 78 for _, v := range hist[:] { 79 if v > 0 { 80 n := float64(v) 81 shannon += math.Ceil(-math.Log2(n*invTotal) * n) 82 } 83 } 84 return int(math.Ceil(shannon)) 85} 86