1// Copyright 2015, Joe Tsai. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE.md file. 4 5// +build ignore 6 7// meta_stats is used to measure the efficiency of meta encoding. 8// 9// The XFLATE meta encoding is a block-based format that allows in-band encoding 10// of arbitrary meta data into a DEFLATE stream. This is possible because when 11// XFLATE meta encoding is decoded using a DEFLATE decoder, it produces zero 12// output. This allows XFLATE to add information on top of DEFLATE in a 13// backwards compatible manner. 14// 15// The meta encoding works by using dynamic DEFLATE blocks and encoding bits of 16// information into the HLIT tree definition. To ensure that the meta encoding 17// produces no output when decompressed, these dynamic blocks are terminated 18// immediately with an end-of-block (EOB) marker. Since Huffman tree definitions 19// were not designed to encode arbitrary data into them, there is some overhead 20// encoding meta data into them in a way that still produces valid trees. 21// 22// The purpose of this program is to measure the efficiency of the meta encoding 23// by computing the ratio of the raw input to the encoded output. Since the HLIT 24// tree can only handle some maximum number of symbols, the number of bytes that 25// can be encoded into a single dynamic block is capped at some limit. 26// This program also attempts to determine what that limit is. 27package main 28 29import ( 30 "flag" 31 "fmt" 32 "io/ioutil" 33 "log" 34 "math" 35 "math/big" 36 "math/rand" 37 38 "github.com/dsnet/compress/xflate/internal/meta" 39) 40 41func init() { log.SetFlags(log.Lshortfile) } 42 43func main() { 44 // It would be impractical to try all possible input strings. 45 // Thus, we sample random strings from the domain by performing 46 // numSamples trials per size class. 47 numSamples := flag.Int("n", 256, "number of strings to sample per size class") 48 randSeed := flag.Int("s", 0, "random number generator seed") 49 flag.Parse() 50 51 // Compute the combination nCk. 52 comb := func(n, k int64) *big.Int { 53 val := big.NewInt(1) 54 for j := int64(0); j < k; j++ { 55 // Compute: val = (val*(n-j)) / (j+1) 56 val.Div(new(big.Int).Mul(val, big.NewInt(n-j)), big.NewInt(j+1)) 57 } 58 return val 59 } 60 61 // Convert big integer to float64. 62 rtof := func(r *big.Rat) float64 { f, _ := r.Float64(); return f } 63 itof := func(i *big.Int) float64 { return rtof(new(big.Rat).SetInt(i)) } 64 65 // Print titles of each category to compute metrics on: 66 // NumBytes: The length of the input string. 67 // FullDomain: Are all possible strings of this length encodable? 68 // Coverage: What percentage of all possible strings of this length can be encoded? 69 // Efficiency: The efficiency of encoding; this compares NumBytes to EncSize. 70 // EncSize: The size of the output when encoding a string of this length. 71 fmt.Println("NumBytes FullDomain Coverage EncSize[min<=avg<=max] Efficiency[max=>avg=>min]") 72 73 var buf []byte 74 mw := new(meta.Writer) 75 rand := rand.New(rand.NewSource(int64(*randSeed))) 76 for numBytes := meta.MinRawBytes; numBytes <= meta.MaxRawBytes; numBytes++ { 77 numBits := numBytes * 8 78 79 // Whether a string is encodable or not is entirely based on the number 80 // of one bits and zero bits in the string. Thus, we gather results from 81 // every possible size class. 82 encodable := big.NewInt(0) // Total number of input strings that are encodable 83 encMin, encMax, encTotal := math.MaxInt8, math.MinInt8, 0.0 84 for zeros := 0; zeros <= numBits; zeros++ { 85 ones := numBits - zeros 86 87 // If huffLen is 0, then that means that a string with this many 88 // zero bits and bits is not encodable. 89 if huffLen, _ := computeHuffLen(zeros, ones); huffLen == 0 { 90 continue 91 } 92 93 // The total number of unique strings with the given number of zero 94 // bits and one bits is the combination nCk where n is the total 95 // number of bits and k is the the total number of one bits. 96 num := comb(int64(numBits), int64(ones)) 97 encodable.Add(encodable, num) 98 99 // For numSamples trials, keep track of the minimum, average, and 100 // maximum size of the encoded output. 101 encAvg := 0.0 102 for i := 0; i < *numSamples; i++ { 103 // Generate a random string permutation. 104 buf = buf[:0] 105 perm := rand.Perm(numBits) 106 for i := 0; i < numBits/8; i++ { 107 var b byte 108 for j := 0; j < 8; j++ { 109 if perm[8*i+j] >= zeros { 110 b |= 1 << uint(j) 111 } 112 } 113 buf = append(buf, b) 114 } 115 116 // Encode the string and compute the output length. 117 mw.Reset(ioutil.Discard) 118 if _, err := mw.Write(buf); err != nil { 119 log.Fatal(err) 120 } 121 if err := mw.Close(); err != nil { 122 log.Fatal(err) 123 } 124 if mw.NumBlocks != 1 { 125 log.Fatal("unexpected extra blocks") 126 } 127 128 cnt := int(mw.OutputOffset) 129 if encMin > cnt { 130 encMin = cnt 131 } 132 if encMax < cnt { 133 encMax = cnt 134 } 135 encAvg += float64(cnt) / float64(*numSamples) 136 } 137 138 // Weighted total based on the number of strings. 139 encTotal += itof(num) * encAvg 140 } 141 142 // If no input string is encodable, don't bother printing results. 143 if encodable.Cmp(new(big.Int)) == 0 { 144 continue 145 } 146 147 encAvg := encTotal / itof(encodable) // encAvg := encTotal / encodable 148 domain := new(big.Int).Lsh(big.NewInt(1), uint(numBits)) // domain := 1 << numBits 149 fullDomain := encodable.Cmp(domain) == 0 // fullDomain := encodable == domain 150 coverage := 100.0 * rtof(new(big.Rat).SetFrac(encodable, domain)) // coverage := 100.0 * (encodable / domain) 151 maxEff := 100.0 * (float64(numBytes) / float64(encMin)) // maxEff := 100.0 * (numBytes / encMin) 152 avgEff := 100.0 * (float64(numBytes) / float64(encAvg)) // avgEff := 100.0 * (numBytes / encAvg) 153 minEff := 100.0 * (float64(numBytes) / float64(encMax)) // minEff := 100.0 * (numBytes / encMax) 154 155 fmt.Printf("%8d%12v%9.2f%% [%2d <= %4.2f <= %2d] [%5.1f%% => %4.1f%% => %4.1f%%]\n", 156 numBytes, fullDomain, coverage, encMin, encAvg, encMax, maxEff, avgEff, minEff) 157 } 158} 159 160// computeHuffLen computes the shortest Huffman length to encode the data. 161// If the input data is too large, then 0 is returned. 162// 163// This is copied from Writer.computeHuffLen to avoid visibility issues. 164func computeHuffLen(zeros, ones int) (huffLen uint, inv bool) { 165 const ( 166 maxSyms = 257 // Maximum number of literal codes (with EOB marker) 167 minHuffLen = 1 // Minimum number of bits for each Huffman code 168 maxHuffLen = 7 // Maximum number of bits for each Huffman code 169 ) 170 171 if inv = ones > zeros; inv { 172 zeros, ones = ones, zeros 173 } 174 for huffLen = minHuffLen; huffLen <= maxHuffLen; huffLen++ { 175 maxOnes := 1 << uint(huffLen) 176 if maxSyms-maxOnes >= zeros+8 && maxOnes >= ones+8 { 177 return huffLen, inv 178 } 179 } 180 return 0, false 181} 182