// Copyright 2015, Joe Tsai. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE.md file.

// +build ignore

// xflate_stats is used to investigate trade-offs in XFLATE design.
//
// The XFLATE format extends the DEFLATE format in order to allow for limited
// random access reading. This is achieved by individually compressing the
// input data in chunks (so that each chunk can be decompressed by itself),
// and by storing an index recording the size of every chunk (so that each
// chunk can be located).
//
// However, this adds overhead and diminishes the compression ratio slightly.
// In order to investigate the cost of chunking, this program computes the total
// compressed size when the input is compressed as a single stream or when
// compressed in chunks of some fixed size.
//
// Chunking diminishes compression ratio because the LZ77 dictionary is reset
// for each chunk, reducing the amount of savings that could have been gained
// by a backwards match to previous data. Secondly, the XFLATE format requires
// that each chunk terminates with a SYNC_FLUSH marker of ~5 bytes.
//
// In addition to the costs of chunking itself, there is also the cost of
// storing the index. The index is a list of (rawSize, compSize) tuples where
// each record contains information about the uncompressed input size (rawSize)
// and compressed output size (compSize) of each chunk.
//
// In order to reduce the index size, multiple techniques were explored:
//   * Compressing the index itself using DEFLATE
//   * Row-oriented vs. column-oriented layout of records; that is, row-oriented
//     layout has each (rawSize, compSize) tuple laid out one after the other,
//     while column-oriented has all rawSizes laid out as an array followed by all
//     compSizes laid out as an array
//   * Fixed-length vs. variable-length integers; that is, to store size values
//     as uint64s or to use some variable-length encoding
//   * Regular encoding vs. delta encoding; that is, delta encoding encodes each
//     value as the difference between that value and the previous value
//
// However, some of these techniques do not make sense unless done in
// conjunction with another technique. For example:
//   * Row-oriented vs. column-oriented is useless without compression
//   * Delta encoding does not make sense without variable-length integers
//
// For this reason, the following table shows what techniques were used for the
// given format names listed below:
//
//           +-------------- Raw: The index is uncompressed
//           |   +---------- Row: The index is row-oriented
//           |   |   +------ Fix: Index records use fixed-length uint64s
//           |   |   |   +-- Reg: Index records are not delta encoded
//           |   |   |   |
//          Raw Row Fix Reg
//   RawFix [X] [?] [X] [X]
//   RawVar [X] [?] [ ] [X]
//   RawDlt [X] [?] [ ] [ ]
//   RowFix [ ] [X] [X] [X]
//   RowVar [ ] [X] [ ] [X]
//   RowDlt [ ] [X] [ ] [ ]
//   ColFix [ ] [ ] [X] [X]
//   ColVar [ ] [ ] [ ] [X]
//   ColDlt [ ] [ ] [ ] [ ]
//
// As seen, the Raw* formats do not care about the orientation of the index
// since it is not compressed. Also, all *Dlt formats use delta encoding
// in conjunction with variable-length encoding.
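//
// For a rough sense of scale (the numbers below are illustrative, not
// measured), consider 1 MiB of input split into 64 KiB chunks, giving 16
// index records. With fixed-length uint64s, each record costs 16 bytes, or
// 256 bytes total. With variable-length integers, a value of 65536 fits in
// 3 bytes, so a record typically costs around 6 bytes. With delta encoding,
// the rawSize deltas between full chunks are all 0 and fit in a single byte
// each, leaving mostly the compSize deltas to pay for.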
//
// In summary, the greatest factor for overhead cost is the chunk size.
// Smaller chunks reduce compression efficiency due to more frequent LZ77
// dictionary resets and also increase the total number of chunks needed, thus
// increasing the index size. On the other hand, larger chunks diminish the
// effectiveness of random access reading since more data must be discarded to
// start reading from some given offset. Thus, the choice of chunk size is not
// hard-coded by the XFLATE format and this tool can help identify the
// appropriate chunk size.
package main

import (
	"bytes"
	"compress/flate"
	"encoding/binary"
	"flag"
	"fmt"
	"io"
	"log"
	"os"

	"github.com/dsnet/compress/xflate/internal/meta"
)

func init() { log.SetFlags(log.Lshortfile) }

func main() {
	inputFile := flag.String("f", "-", "path to input file")
	chunkSize := flag.Int("n", 64*1024, "compress the input in n-sized chunks")
	compLevel := flag.Int("l", flate.DefaultCompression, "compression level")
	metaEncode := flag.Bool("m", false, "encode index using XFLATE meta encoding")
	flag.Parse()

	// Open the input file.
	var r io.Reader
	if *inputFile != "-" {
		f, err := os.Open(*inputFile)
		if err != nil {
			log.Fatal(err)
		}
		defer f.Close()
		r = f
	} else {
		r = os.Stdin
	}

	// Compute the streamed and chunked record values.
	var chnkRec indexRecord
	strmRec, chnkRecs := computeRecords(r, *compLevel, *chunkSize)
	for _, r := range chnkRecs {
		chnkRec.rawSize += r.rawSize
		chnkRec.compSize += r.compSize
	}

	var n int
	var pr indexRecord // Previous record
	var brf, brv, brd, bcf, bcv, bcd bytes.Buffer
	buf := make([]byte, binary.MaxVarintLen64)

	// Row-based index format.
	pr = indexRecord{}
	for _, r := range chnkRecs {
		binary.LittleEndian.PutUint64(buf, uint64(r.rawSize))
		brf.Write(buf[:8])
		binary.LittleEndian.PutUint64(buf, uint64(r.compSize))
		brf.Write(buf[:8])

		n = binary.PutUvarint(buf, uint64(r.rawSize))
		brv.Write(buf[:n])
		n = binary.PutUvarint(buf, uint64(r.compSize))
		brv.Write(buf[:n])

		n = binary.PutVarint(buf, r.rawSize-pr.rawSize)
		brd.Write(buf[:n])
		n = binary.PutVarint(buf, r.compSize-pr.compSize)
		brd.Write(buf[:n])

		pr = r
	}

	// Column-based index format.
	pr = indexRecord{}
	for _, r := range chnkRecs {
		binary.LittleEndian.PutUint64(buf, uint64(r.rawSize))
		bcf.Write(buf[:8])

		n = binary.PutUvarint(buf, uint64(r.rawSize))
		bcv.Write(buf[:n])

		n = binary.PutVarint(buf, r.rawSize-pr.rawSize)
		bcd.Write(buf[:n])

		pr.rawSize = r.rawSize
	}
	for _, r := range chnkRecs {
		binary.LittleEndian.PutUint64(buf, uint64(r.compSize))
		bcf.Write(buf[:8])

		n = binary.PutUvarint(buf, uint64(r.compSize))
		bcv.Write(buf[:n])

		n = binary.PutVarint(buf, r.compSize-pr.compSize)
		bcd.Write(buf[:n])

		pr.compSize = r.compSize
	}

	pf := func(a, b int64) float64 { return 100.0 * (float64(a) / float64(b)) }
	me := func(b []byte) []byte { return b }
	if *metaEncode {
		me = encode
	}

	// Print basic statistics about the input and output.
	ns, nc := strmRec.compSize, chnkRec.compSize
	ps, pc := pf(ns-ns, ns), pf(nc-ns, ns)
	fmt.Printf("rawSize: %d\n", strmRec.rawSize)            // Uncompressed input size
	fmt.Printf("strmRec.compSize: %d (%+0.2f%%)\n", ns, ps) // Total compressed size as a single stream
	fmt.Printf("chnkRec.compSize: %d (%+0.2f%%)\n", nc, pc) // Total compressed size as individual chunks
	fmt.Printf("chunkSize: %d\n", *chunkSize)               // Individual chunk size
	fmt.Printf("numChunks: %d\n", len(chnkRecs))            // Total number of chunks
	fmt.Println()

	// Uncompressed index formats.
	nf := int64(len(me(brf.Bytes())))
	nv := int64(len(me(brv.Bytes())))
	nd := int64(len(me(brd.Bytes())))
	fmt.Printf("RawFix: %d (%+0.2f%%)\n", nf, pf(nf, ns)) // Fixed-length uint64s
	fmt.Printf("RawVar: %d (%+0.2f%%)\n", nv, pf(nv, ns)) // Variable-length integers (VLI)
	fmt.Printf("RawDlt: %d (%+0.2f%%)\n", nd, pf(nd, ns)) // VLI with delta encoding
	fmt.Println()

	// Compressed row-oriented index formats.
	nrf := int64(len(me(compress(brf.Bytes(), *compLevel))))
	nrv := int64(len(me(compress(brv.Bytes(), *compLevel))))
	nrd := int64(len(me(compress(brd.Bytes(), *compLevel))))
	fmt.Printf("RowFix: %d (%+0.2f%%)\n", nrf, pf(nrf, ns)) // Fixed-length uint64s
	fmt.Printf("RowVar: %d (%+0.2f%%)\n", nrv, pf(nrv, ns)) // Variable-length integers (VLI)
	fmt.Printf("RowDlt: %d (%+0.2f%%)\n", nrd, pf(nrd, ns)) // VLI with delta encoding
	fmt.Println()

	// Compressed column-oriented index formats.
	ncf := int64(len(me(compress(bcf.Bytes(), *compLevel))))
	ncv := int64(len(me(compress(bcv.Bytes(), *compLevel))))
	ncd := int64(len(me(compress(bcd.Bytes(), *compLevel))))
	fmt.Printf("ColFix: %d (%+0.2f%%)\n", ncf, pf(ncf, ns)) // Fixed-length uint64s
	fmt.Printf("ColVar: %d (%+0.2f%%)\n", ncv, pf(ncv, ns)) // Variable-length integers (VLI)
	fmt.Printf("ColDlt: %d (%+0.2f%%)\n", ncd, pf(ncd, ns)) // VLI with delta encoding
	fmt.Println()
}

// countWriter counts and discards all bytes written to it.
type countWriter int64

func (cw *countWriter) Write(b []byte) (int, error) {
	*(*int64)(cw) += int64(len(b))
	return len(b), nil
}

type indexRecord struct {
	rawSize  int64 // Size when decompressed
	compSize int64 // Size when compressed
}

// computeRecords computes index records for the raw input size and the
// compressed output size. strmRec is a single record for when the input is
// compressed as a single stream. chnkRecs is a list of records for when the
// input is compressed individually as chunks.
func computeRecords(r io.Reader, lvl, chnkSize int) (strmRec indexRecord, chnkRecs []indexRecord) {
	var cw1, cw2 countWriter
	zw1, _ := flate.NewWriter(&cw1, lvl) // Streamed compressor
	zw2, _ := flate.NewWriter(&cw2, lvl) // Chunked compressor
	buf := make([]byte, chnkSize)
	for {
		// Read a full chunk's worth of data.
		cnt, err := io.ReadFull(r, buf)
		strmRec.rawSize += int64(cnt)
		if err == io.EOF {
			break
		}

		// Write the chunk to both compressors.
		if _, err := zw1.Write(buf[:cnt]); err != nil {
			log.Fatal(err)
		}
		if _, err := zw2.Write(buf[:cnt]); err != nil {
			log.Fatal(err)
		}

		// Flush the chunked compressor, append the record, and reset.
		if err := zw2.Flush(); err != nil {
			log.Fatal(err)
		}
		chnkRecs = append(chnkRecs, indexRecord{rawSize: int64(cnt), compSize: int64(cw2)})
		cw2 = 0
		zw2.Reset(&cw2)

		if err == io.ErrUnexpectedEOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}
	}

	// Flush the streamed compressor and record the compressed size.
	if err := zw1.Flush(); err != nil {
		log.Fatal(err)
	}
	strmRec.compSize = int64(cw1)
	return strmRec, chnkRecs
}

// compress compresses the input buffer at the given level.
func compress(b []byte, lvl int) []byte {
	var buf bytes.Buffer
	w, err := flate.NewWriter(&buf, lvl)
	if err != nil {
		log.Fatal(err)
	}
	if _, err := io.Copy(w, bytes.NewReader(b)); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}
	return buf.Bytes()
}

// encode encodes the input using XFLATE's meta encoding.
func encode(b []byte) []byte {
	var buf bytes.Buffer
	mw := meta.NewWriter(&buf)
	mw.FinalMode = meta.FinalMeta
	if _, err := io.Copy(mw, bytes.NewReader(b)); err != nil {
		log.Fatal(err)
	}
	if err := mw.Close(); err != nil {
		log.Fatal(err)
	}
	return buf.Bytes()
}
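
// encodeColDlt is an illustrative sketch of the ColDlt layout described in the
// package comment; it is not called by the tool itself, which builds the same
// bytes inline in main. The index is column-oriented (all rawSizes, then all
// compSizes), with each column delta encoded as signed variable-length
// integers.
func encodeColDlt(recs []indexRecord) []byte {
	var buf bytes.Buffer
	tmp := make([]byte, binary.MaxVarintLen64)

	// Emit the rawSize column as deltas from the previous rawSize.
	var prev int64
	for _, r := range recs {
		n := binary.PutVarint(tmp, r.rawSize-prev)
		buf.Write(tmp[:n])
		prev = r.rawSize
	}

	// Emit the compSize column as deltas from the previous compSize.
	prev = 0
	for _, r := range recs {
		n := binary.PutVarint(tmp, r.compSize-prev)
		buf.Write(tmp[:n])
		prev = r.compSize
	}
	return buf.Bytes()
}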