1// Copyright 2015, Joe Tsai. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE.md file.
4
5// +build ignore
6
7// meta_stats is used to measure the efficiency of meta encoding.
8//
9// The XFLATE meta encoding is a block-based format that allows in-band encoding
10// of arbitrary meta data into a DEFLATE stream. This is possible because when
11// XFLATE meta encoding is decoded using a DEFLATE decoder, it produces zero
12// output. This allows XFLATE to add information on top of DEFLATE in a
13// backwards compatible manner.
14//
15// The meta encoding works by using dynamic DEFLATE blocks and encoding bits of
16// information into the HLIT tree definition. To ensure that the meta encoding
17// produces no output when decompressed, these dynamic blocks are terminated
18// immediately with an end-of-block (EOB) marker. Since Huffman tree definitions
19// were not designed to encode arbitrary data into them, there is some overhead
20// encoding meta data into them in a way that still produces valid trees.
21//
22// The purpose of this program is to measure the efficiency of the meta encoding
23// by computing the ratio of the raw input to the encoded output. Since the HLIT
24// tree can only handle some maximum number of symbols, the number of bytes that
25// can be encoded into a single dynamic block is capped at some limit.
26// This program also attempts to determine what that limit is.
27package main
28
29import (
30	"flag"
31	"fmt"
32	"io/ioutil"
33	"log"
34	"math"
35	"math/big"
36	"math/rand"
37
38	"github.com/dsnet/compress/xflate/internal/meta"
39)
40
41func init() { log.SetFlags(log.Lshortfile) }
42
43func main() {
44	// It would be impractical to try all possible input strings.
45	// Thus, we sample random strings from the domain by performing
46	// numSamples trials per size class.
47	numSamples := flag.Int("n", 256, "number of strings to sample per size class")
48	randSeed := flag.Int("s", 0, "random number generator seed")
49	flag.Parse()
50
51	// Compute the combination nCk.
52	comb := func(n, k int64) *big.Int {
53		val := big.NewInt(1)
54		for j := int64(0); j < k; j++ {
55			// Compute: val = (val*(n-j)) / (j+1)
56			val.Div(new(big.Int).Mul(val, big.NewInt(n-j)), big.NewInt(j+1))
57		}
58		return val
59	}
60
61	// Convert big integer to float64.
62	rtof := func(r *big.Rat) float64 { f, _ := r.Float64(); return f }
63	itof := func(i *big.Int) float64 { return rtof(new(big.Rat).SetInt(i)) }
64
65	// Print titles of each category to compute metrics on:
66	//	NumBytes: The length of the input string.
67	//	FullDomain: Are all possible strings of this length encodable?
68	//	Coverage: What percentage of all possible strings of this length can be encoded?
69	//	Efficiency: The efficiency of encoding; this compares NumBytes to EncSize.
70	//	EncSize: The size of the output when encoding a string of this length.
71	fmt.Println("NumBytes  FullDomain  Coverage  EncSize[min<=avg<=max]   Efficiency[max=>avg=>min]")
72
73	var buf []byte
74	mw := new(meta.Writer)
75	rand := rand.New(rand.NewSource(int64(*randSeed)))
76	for numBytes := meta.MinRawBytes; numBytes <= meta.MaxRawBytes; numBytes++ {
77		numBits := numBytes * 8
78
79		// Whether a string is encodable or not is entirely based on the number
80		// of one bits and zero bits in the string. Thus, we gather results from
81		// every possible size class.
82		encodable := big.NewInt(0) // Total number of input strings that are encodable
83		encMin, encMax, encTotal := math.MaxInt8, math.MinInt8, 0.0
84		for zeros := 0; zeros <= numBits; zeros++ {
85			ones := numBits - zeros
86
87			// If huffLen is 0, then that means that a string with this many
88			// zero bits and bits is not encodable.
89			if huffLen, _ := computeHuffLen(zeros, ones); huffLen == 0 {
90				continue
91			}
92
93			// The total number of unique strings with the given number of zero
94			// bits and one bits is the combination nCk where n is the total
95			// number of bits and k is the the total number of one bits.
96			num := comb(int64(numBits), int64(ones))
97			encodable.Add(encodable, num)
98
99			// For numSamples trials, keep track of the minimum, average, and
100			// maximum size of the encoded output.
101			encAvg := 0.0
102			for i := 0; i < *numSamples; i++ {
103				// Generate a random string permutation.
104				buf = buf[:0]
105				perm := rand.Perm(numBits)
106				for i := 0; i < numBits/8; i++ {
107					var b byte
108					for j := 0; j < 8; j++ {
109						if perm[8*i+j] >= zeros {
110							b |= 1 << uint(j)
111						}
112					}
113					buf = append(buf, b)
114				}
115
116				// Encode the string and compute the output length.
117				mw.Reset(ioutil.Discard)
118				if _, err := mw.Write(buf); err != nil {
119					log.Fatal(err)
120				}
121				if err := mw.Close(); err != nil {
122					log.Fatal(err)
123				}
124				if mw.NumBlocks != 1 {
125					log.Fatal("unexpected extra blocks")
126				}
127
128				cnt := int(mw.OutputOffset)
129				if encMin > cnt {
130					encMin = cnt
131				}
132				if encMax < cnt {
133					encMax = cnt
134				}
135				encAvg += float64(cnt) / float64(*numSamples)
136			}
137
138			// Weighted total based on the number of strings.
139			encTotal += itof(num) * encAvg
140		}
141
142		// If no input string is encodable, don't bother printing results.
143		if encodable.Cmp(new(big.Int)) == 0 {
144			continue
145		}
146
147		encAvg := encTotal / itof(encodable)                              // encAvg     := encTotal / encodable
148		domain := new(big.Int).Lsh(big.NewInt(1), uint(numBits))          // domain     := 1 << numBits
149		fullDomain := encodable.Cmp(domain) == 0                          // fullDomain := encodable == domain
150		coverage := 100.0 * rtof(new(big.Rat).SetFrac(encodable, domain)) // coverage   := 100.0 * (encodable / domain)
151		maxEff := 100.0 * (float64(numBytes) / float64(encMin))           // maxEff     := 100.0 *  (numBytes / encMin)
152		avgEff := 100.0 * (float64(numBytes) / float64(encAvg))           // avgEff     := 100.0 *  (numBytes / encAvg)
153		minEff := 100.0 * (float64(numBytes) / float64(encMax))           // minEff     := 100.0 *  (numBytes / encMax)
154
155		fmt.Printf("%8d%12v%9.2f%%     [%2d <= %4.2f <= %2d]  [%5.1f%% => %4.1f%% => %4.1f%%]\n",
156			numBytes, fullDomain, coverage, encMin, encAvg, encMax, maxEff, avgEff, minEff)
157	}
158}
159
160// computeHuffLen computes the shortest Huffman length to encode the data.
161// If the input data is too large, then 0 is returned.
162//
163// This is copied from Writer.computeHuffLen to avoid visibility issues.
164func computeHuffLen(zeros, ones int) (huffLen uint, inv bool) {
165	const (
166		maxSyms    = 257 // Maximum number of literal codes (with EOB marker)
167		minHuffLen = 1   // Minimum number of bits for each Huffman code
168		maxHuffLen = 7   // Maximum number of bits for each Huffman code
169	)
170
171	if inv = ones > zeros; inv {
172		zeros, ones = ones, zeros
173	}
174	for huffLen = minHuffLen; huffLen <= maxHuffLen; huffLen++ {
175		maxOnes := 1 << uint(huffLen)
176		if maxSyms-maxOnes >= zeros+8 && maxOnes >= ones+8 {
177			return huffLen, inv
178		}
179	}
180	return 0, false
181}
182