1// Copyright 2015, Joe Tsai. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE.md file.
4
//go:build ignore
// +build ignore
6
7// xflate_stats is used to investigate trade-offs in XFLATE design.
8//
9// The XFLATE format extends the DEFLATE format in order to allow for limited
10// random access reading. This is achieved by individually compressing the
11// input data in chunks (so that each chunk can be decompressed by itself),
12// and by storing an index recording the size of every chunk (so that each
13// chunk can be located).
14//
15// However, this adds overhead and diminishes the compression ratio slightly.
16// In order to investigate the cost of chunking, this program computes the total
17// compressed size when the input is compressed as a single stream or when
18// compressed in chunks of some fixed size.
19//
20// Chunking diminishes compression ratio because the LZ77 dictionary is reset
21// for each chunk, reducing the amount of savings that could have been gained
22// by a backwards match to previous data. Secondly, the XFLATE format requires
23// that each chunk terminates with a SYNC_FLUSH marker of ~5 bytes.
24//
25// In addition to the costs of chunking itself, there is also the cost of
26// storing the index. The index is a list of (rawSize, compSize) tuples where
27// each record contains information about the uncompressed input size (rawSize)
28// and compressed output size (compSize) of each chunk.
29//
30// In order to reduce the index size, multiple techniques were explored:
31//	* Compressing the index itself using DEFLATE
32//	* Row-oriented vs. column-oriented layout of records; that is, row-oriented
33//	layout has each (rawSize, compSize) tuple laid out one after the other,
34//	while column-oriented has all rawSizes laid out as an array followed by all
35//	compSizes laid out as an array
36//	* Fixed-length vs. variable-length integers; that is, to store size values
37//	as uint64s or to use some variable-length encoding
38//	* Regular encoding vs. Delta encoding; that is, delta encoding encodes each
39//	value as the difference between that value and the previous value
40//
41// However, some of these techniques do not make sense unless done in
42// conjunction with another technique. For example:
43//	* Row-oriented vs. column-oriented is useless without compression
44//	* Delta encoding does not make sense without variable-length integers
45//
46// For this reason, the following table shows what techniques were used for the
47// given format names listed below:
48//
49//	         +-------------- Raw: The index is uncompressed
50//	         |   +---------- Row: The index is row-oriented
51//	         |   |   +------ Fix: Index records use fixed-length uint64s
52//	         |   |   |   +-- Reg: Index records are not delta encoded
53//	         |   |   |   |
54//	        Raw Row Fix Reg
55//	RawFix  [X] [?] [X] [X]
56//	RawVar  [X] [?] [ ] [X]
57//	RawDlt  [X] [?] [ ] [ ]
58//	RowFix  [ ] [X] [X] [X]
59//	RowVar  [ ] [X] [ ] [X]
60//	RowDlt  [ ] [X] [ ] [ ]
61//	ColFix  [ ] [ ] [X] [X]
62//	ColVar  [ ] [ ] [ ] [X]
63//	ColDlt  [ ] [ ] [ ] [ ]
64//
65// As seen, the Raw* formats do not care about the orientation of the index
66// since it is not compressed. Also, all *Dlt formats uses delta encoding
67// in conjunction with variable-length encoding.
68//
69// In summary, the greatest factor for overhead cost is the chunk size.
70// Smaller chunks reduce efficiency due to more frequent LZ77 dictionary resets
71// and also increases the total number of chunks needed, thus also increasing
72// the index size. On the other hand, larger chunks also diminishes the
73// effectiveness of random access reading since more data must be discarded to
74// start reading from some given offset. Thus, the choice of chunk size is not
75// hard-coded by the XFLATE format and this tool can help identify the
76// appropriate chunk size.
77package main
78
79import (
80	"bytes"
81	"compress/flate"
82	"encoding/binary"
83	"flag"
84	"fmt"
85	"io"
86	"log"
87	"os"
88
89	"github.com/dsnet/compress/xflate/internal/meta"
90)
91
92func init() { log.SetFlags(log.Lshortfile) }
93
// main parses the command-line flags, compresses the input both as a single
// stream and as fixed-size chunks, and then prints size statistics for the
// compressed data and for each of the candidate index formats described in
// the file header.
func main() {
	inputFile := flag.String("f", "-", "path to input file")
	chunkSize := flag.Int("n", 64*1024, "compress the input in n-sized chunks")
	compLevel := flag.Int("l", flate.DefaultCompression, "compression level")
	metaEncode := flag.Bool("m", false, "encode index using XFLATE meta encoding")
	flag.Parse()

	// Open the input file ("-" selects stdin).
	var r io.Reader
	if *inputFile != "-" {
		f, err := os.Open(*inputFile)
		if err != nil {
			log.Fatal(err)
		}
		defer f.Close()
		r = f
	} else {
		r = os.Stdin
	}

	// Compute the streamed and chunked record values. chnkRec accumulates the
	// per-chunk records into a single total for comparison against strmRec.
	// NOTE(review): the loop variable r shadows the io.Reader above; the
	// reader is fully consumed by computeRecords, so this is harmless.
	var chnkRec indexRecord
	strmRec, chnkRecs := computeRecords(r, *compLevel, *chunkSize)
	for _, r := range chnkRecs {
		chnkRec.rawSize += r.rawSize
		chnkRec.compSize += r.compSize
	}

	// Serialization buffers for the six index layouts:
	//	brf/bcf: row/column oriented, fixed-length uint64s
	//	brv/bcv: row/column oriented, unsigned variable-length integers
	//	brd/bcd: row/column oriented, delta-encoded signed varints
	var n int
	var pr indexRecord // Previous record
	var brf, brv, brd, bcf, bcv, bcd bytes.Buffer
	buf := make([]byte, binary.MaxVarintLen64) // Scratch; also large enough for PutUint64 (8 bytes)

	// Row-based index format: each (rawSize, compSize) tuple is laid out one
	// after the other.
	pr = indexRecord{}
	for _, r := range chnkRecs {
		binary.LittleEndian.PutUint64(buf, uint64(r.rawSize))
		brf.Write(buf[:8])
		binary.LittleEndian.PutUint64(buf, uint64(r.compSize))
		brf.Write(buf[:8])

		n = binary.PutUvarint(buf, uint64(r.rawSize))
		brv.Write(buf[:n])
		n = binary.PutUvarint(buf, uint64(r.compSize))
		brv.Write(buf[:n])

		// Delta encoding stores the signed difference from the previous record.
		n = binary.PutVarint(buf, r.rawSize-pr.rawSize)
		brd.Write(buf[:n])
		n = binary.PutVarint(buf, r.compSize-pr.compSize)
		brd.Write(buf[:n])

		pr = r
	}

	// Column-based index format: all rawSizes are laid out first, followed by
	// all compSizes.
	pr = indexRecord{}
	for _, r := range chnkRecs {
		binary.LittleEndian.PutUint64(buf, uint64(r.rawSize))
		bcf.Write(buf[:8])

		n = binary.PutUvarint(buf, uint64(r.rawSize))
		bcv.Write(buf[:n])

		n = binary.PutVarint(buf, r.rawSize-pr.rawSize)
		bcd.Write(buf[:n])

		pr.rawSize = r.rawSize
	}
	// pr.compSize is still zero here (the loop above only touched rawSize),
	// so the first compSize delta below is relative to zero, mirroring the
	// row-based loop.
	for _, r := range chnkRecs {
		binary.LittleEndian.PutUint64(buf, uint64(r.compSize))
		bcf.Write(buf[:8])

		n = binary.PutUvarint(buf, uint64(r.compSize))
		bcv.Write(buf[:n])

		n = binary.PutVarint(buf, r.compSize-pr.compSize)
		bcd.Write(buf[:n])

		pr.compSize = r.compSize
	}

	// pf reports a as a percentage of b.
	pf := func(a, b int64) float64 { return 100.0 * (float64(a) / float64(b)) }
	// me optionally applies XFLATE meta encoding to a serialized index.
	me := func(b []byte) []byte { return b }
	if *metaEncode {
		me = encode
	}

	// Print basic statistics about the input and output. All percentages are
	// relative to ns, the compressed size of the single stream.
	ns, nc := strmRec.compSize, chnkRec.compSize
	ps, pc := pf(ns-ns, ns), pf(nc-ns, ns)
	fmt.Printf("rawSize:          %d\n", strmRec.rawSize)   // Uncompressed input size
	fmt.Printf("strmRec.compSize: %d (%+0.2f%%)\n", ns, ps) // Total compressed size as a single stream
	fmt.Printf("chnkRec.compSize: %d (%+0.2f%%)\n", nc, pc) // Total compressed size as individual chunks
	fmt.Printf("chunkSize:        %d\n", *chunkSize)        // Individual chunk size
	fmt.Printf("numChunks:        %d\n", len(chnkRecs))     // Total number of chunks
	fmt.Println()

	// Uncompressed index formats.
	nf := int64(len(me(brf.Bytes())))
	nv := int64(len(me(brv.Bytes())))
	nd := int64(len(me(brd.Bytes())))
	fmt.Printf("RawFix: %d (%+0.2f%%)\n", nf, pf(nf, ns)) // Fixed-length uint64s
	fmt.Printf("RawVar: %d (%+0.2f%%)\n", nv, pf(nv, ns)) // Variable-length integers (VLI)
	fmt.Printf("RawDlt: %d (%+0.2f%%)\n", nd, pf(nd, ns)) // VLI with delta encoding
	fmt.Println()

	// Compressed row-oriented index formats.
	nrf := int64(len(me(compress(brf.Bytes(), *compLevel))))
	nrv := int64(len(me(compress(brv.Bytes(), *compLevel))))
	nrd := int64(len(me(compress(brd.Bytes(), *compLevel))))
	fmt.Printf("RowFix: %d (%+0.2f%%)\n", nrf, pf(nrf, ns)) // Fixed-length uint64s
	fmt.Printf("RowVar: %d (%+0.2f%%)\n", nrv, pf(nrv, ns)) // Variable-length integers (VLI)
	fmt.Printf("RowDlt: %d (%+0.2f%%)\n", nrd, pf(nrd, ns)) // VLI with delta encoding
	fmt.Println()

	// Compressed column-oriented index formats.
	ncf := int64(len(me(compress(bcf.Bytes(), *compLevel))))
	ncv := int64(len(me(compress(bcv.Bytes(), *compLevel))))
	ncd := int64(len(me(compress(bcd.Bytes(), *compLevel))))
	fmt.Printf("ColFix: %d (%+0.2f%%)\n", ncf, pf(ncf, ns)) // Fixed-length uint64s
	fmt.Printf("ColVar: %d (%+0.2f%%)\n", ncv, pf(ncv, ns)) // Variable-length integers (VLI)
	fmt.Printf("ColDlt: %d (%+0.2f%%)\n", ncd, pf(ncd, ns)) // VLI with delta encoding
	fmt.Println()
}
218
// countWriter tallies the number of bytes written to it; the data itself
// is discarded. It is used to measure compressed output sizes without
// retaining the output.
type countWriter int64

// Write implements io.Writer by adding len(b) to the running count and
// reporting the full slice as written.
func (cw *countWriter) Write(b []byte) (int, error) {
	*cw += countWriter(len(b))
	return len(b), nil
}
226
// indexRecord describes a single compressed unit (a chunk or a whole stream)
// by pairing its uncompressed input size with its compressed output size.
type indexRecord struct {
	rawSize  int64 // Size when decompressed
	compSize int64 // Size when compressed
}
231
232// computeRecords computes the records the raw input size and the compressed
233// output size. strmRec is a single record for when the input is compressed as
234// a single stream. chnkRecs is a list of records for when the input is
235// compressed individually as chunks.
236func computeRecords(r io.Reader, lvl, chnkSize int) (strmRec indexRecord, chnkRecs []indexRecord) {
237	var cw1, cw2 countWriter
238	zw1, _ := flate.NewWriter(&cw1, lvl) // Streamed compressor
239	zw2, _ := flate.NewWriter(&cw2, lvl) // Chunked compressor
240	buf := make([]byte, chnkSize)
241	for {
242		// Read a full chunks worth of data.
243		cnt, err := io.ReadFull(r, buf)
244		strmRec.rawSize += int64(cnt)
245		if err == io.EOF {
246			break
247		}
248
249		// Write chunk to both compressors.
250		if _, err := zw1.Write(buf[:cnt]); err != nil {
251			log.Fatal(err)
252		}
253		if _, err := zw2.Write(buf[:cnt]); err != nil {
254			log.Fatal(err)
255		}
256
257		// Flush the chunked compressor, append the record, and reset.
258		if err := zw2.Flush(); err != nil {
259			log.Fatal(err)
260		}
261		chnkRecs = append(chnkRecs, indexRecord{rawSize: int64(cnt), compSize: int64(cw2)})
262		cw2 = 0
263		zw2.Reset(&cw2)
264
265		if err == io.ErrUnexpectedEOF {
266			break
267		}
268		if err != nil {
269			log.Fatal(err)
270		}
271	}
272
273	// Flush the streamed compressor and record the compressed size.
274	if err := zw1.Flush(); err != nil {
275		log.Fatal(err)
276	}
277	strmRec.compSize = int64(cw1)
278	return strmRec, chnkRecs
279}
280
281// compress compresses the input buffer at the given level.
282func compress(b []byte, lvl int) []byte {
283	var buf bytes.Buffer
284	w, err := flate.NewWriter(&buf, lvl)
285	if err != nil {
286		log.Fatal(err)
287	}
288	if _, err := io.Copy(w, bytes.NewReader(b)); err != nil {
289		log.Fatal(err)
290	}
291	if err := w.Close(); err != nil {
292		log.Fatal(err)
293	}
294	return buf.Bytes()
295}
296
297// encode encodes the input using XFLATE's meta encoding.
298func encode(b []byte) []byte {
299	var buf bytes.Buffer
300	mw := meta.NewWriter(&buf)
301	mw.FinalMode = meta.FinalMeta
302	if _, err := io.Copy(mw, bytes.NewReader(b)); err != nil {
303		log.Fatal(err)
304	}
305	if err := mw.Close(); err != nil {
306		log.Fatal(err)
307	}
308	return buf.Bytes()
309}
310