1// Copyright 2015, Joe Tsai. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE.md file.
4
5package bzip2
6
7import (
8	"io"
9
10	"github.com/dsnet/compress/internal"
11	"github.com/dsnet/compress/internal/errors"
12	"github.com/dsnet/compress/internal/prefix"
13)
14
15type Reader struct {
16	InputOffset  int64 // Total number of bytes read from underlying io.Reader
17	OutputOffset int64 // Total number of bytes emitted from Read
18
19	rd       prefixReader
20	err      error
21	level    int    // The current compression level
22	rdHdrFtr int    // Number of times we read the stream header and footer
23	blkCRC   uint32 // CRC-32 IEEE of each block (as stored)
24	endCRC   uint32 // Checksum of all blocks using bzip2's custom method
25
26	crc crc
27	mtf moveToFront
28	bwt burrowsWheelerTransform
29	rle runLengthEncoding
30
31	// These fields are allocated with Reader and re-used later.
32	treeSels []uint8
33	codes2D  [maxNumTrees][maxNumSyms]prefix.PrefixCode
34	codes1D  [maxNumTrees]prefix.PrefixCodes
35	trees1D  [maxNumTrees]prefix.Decoder
36	syms     []uint16
37
38	fuzzReader // Exported functionality when fuzz testing
39}
40
41type ReaderConfig struct {
42	_ struct{} // Blank field to prevent unkeyed struct literals
43}
44
45func NewReader(r io.Reader, conf *ReaderConfig) (*Reader, error) {
46	zr := new(Reader)
47	zr.Reset(r)
48	return zr, nil
49}
50
51func (zr *Reader) Reset(r io.Reader) error {
52	*zr = Reader{
53		rd: zr.rd,
54
55		mtf: zr.mtf,
56		bwt: zr.bwt,
57		rle: zr.rle,
58
59		treeSels: zr.treeSels,
60		trees1D:  zr.trees1D,
61		syms:     zr.syms,
62	}
63	zr.rd.Init(r)
64	return nil
65}
66
67func (zr *Reader) Read(buf []byte) (int, error) {
68	for {
69		cnt, err := zr.rle.Read(buf)
70		if err != rleDone && zr.err == nil {
71			zr.err = err
72		}
73		if cnt > 0 {
74			zr.crc.update(buf[:cnt])
75			zr.OutputOffset += int64(cnt)
76			return cnt, nil
77		}
78		if zr.err != nil || len(buf) == 0 {
79			return 0, zr.err
80		}
81
82		// Read the next chunk.
83		zr.rd.Offset = zr.InputOffset
84		func() {
85			defer errors.Recover(&zr.err)
86			if zr.rdHdrFtr%2 == 0 {
87				// Check if we are already at EOF.
88				if err := zr.rd.PullBits(1); err != nil {
89					if err == io.ErrUnexpectedEOF && zr.rdHdrFtr > 0 {
90						err = io.EOF // EOF is okay if we read at least one stream
91					}
92					errors.Panic(err)
93				}
94
95				// Read stream header.
96				if zr.rd.ReadBitsBE64(16) != hdrMagic {
97					panicf(errors.Corrupted, "invalid stream magic")
98				}
99				if ver := zr.rd.ReadBitsBE64(8); ver != 'h' {
100					if ver == '0' {
101						panicf(errors.Deprecated, "bzip1 format is not supported")
102					}
103					panicf(errors.Corrupted, "invalid version: %q", ver)
104				}
105				lvl := int(zr.rd.ReadBitsBE64(8)) - '0'
106				if lvl < BestSpeed || lvl > BestCompression {
107					panicf(errors.Corrupted, "invalid block size: %d", lvl*blockSize)
108				}
109				zr.level = lvl
110				zr.rdHdrFtr++
111			} else {
112				// Check and update the CRC.
113				if internal.GoFuzz {
114					zr.updateChecksum(-1, zr.crc.val) // Update with value
115					zr.blkCRC = zr.crc.val            // Suppress CRC failures
116				}
117				if zr.blkCRC != zr.crc.val {
118					panicf(errors.Corrupted, "mismatching block checksum")
119				}
120				zr.endCRC = (zr.endCRC<<1 | zr.endCRC>>31) ^ zr.blkCRC
121			}
122			buf := zr.decodeBlock()
123			zr.rle.Init(buf)
124		}()
125		if zr.InputOffset, err = zr.rd.Flush(); zr.err == nil {
126			zr.err = err
127		}
128		if zr.err != nil {
129			zr.err = errWrap(zr.err, errors.Corrupted)
130			return 0, zr.err
131		}
132	}
133}
134
135func (zr *Reader) Close() error {
136	if zr.err == io.EOF || zr.err == errClosed {
137		zr.rle.Init(nil) // Make sure future reads fail
138		zr.err = errClosed
139		return nil
140	}
141	return zr.err // Return the persistent error
142}
143
144func (zr *Reader) decodeBlock() []byte {
145	if magic := zr.rd.ReadBitsBE64(48); magic != blkMagic {
146		if magic == endMagic {
147			endCRC := uint32(zr.rd.ReadBitsBE64(32))
148			if internal.GoFuzz {
149				zr.updateChecksum(zr.rd.BitsRead()-32, zr.endCRC)
150				endCRC = zr.endCRC // Suppress CRC failures
151			}
152			if zr.endCRC != endCRC {
153				panicf(errors.Corrupted, "mismatching stream checksum")
154			}
155			zr.endCRC = 0
156			zr.rd.ReadPads()
157			zr.rdHdrFtr++
158			return nil
159		}
160		panicf(errors.Corrupted, "invalid block or footer magic")
161	}
162
163	zr.crc.val = 0
164	zr.blkCRC = uint32(zr.rd.ReadBitsBE64(32))
165	if internal.GoFuzz {
166		zr.updateChecksum(zr.rd.BitsRead()-32, 0) // Record offset only
167	}
168	if zr.rd.ReadBitsBE64(1) != 0 {
169		panicf(errors.Deprecated, "block randomization is not supported")
170	}
171
172	// Read BWT related fields.
173	ptr := int(zr.rd.ReadBitsBE64(24)) // BWT origin pointer
174
175	// Read MTF related fields.
176	var dictArr [256]uint8
177	dict := dictArr[:0]
178	bmapHi := uint16(zr.rd.ReadBits(16))
179	for i := 0; i < 256; i, bmapHi = i+16, bmapHi>>1 {
180		if bmapHi&1 > 0 {
181			bmapLo := uint16(zr.rd.ReadBits(16))
182			for j := 0; j < 16; j, bmapLo = j+1, bmapLo>>1 {
183				if bmapLo&1 > 0 {
184					dict = append(dict, uint8(i+j))
185				}
186			}
187		}
188	}
189
190	// Step 1: Prefix encoding.
191	syms := zr.decodePrefix(len(dict))
192
193	// Step 2: Move-to-front transform and run-length encoding.
194	zr.mtf.Init(dict, zr.level*blockSize)
195	buf := zr.mtf.Decode(syms)
196
197	// Step 3: Burrows-Wheeler transformation.
198	if ptr >= len(buf) {
199		panicf(errors.Corrupted, "origin pointer (0x%06x) exceeds block size: %d", ptr, len(buf))
200	}
201	zr.bwt.Decode(buf, ptr)
202
203	return buf
204}
205
206func (zr *Reader) decodePrefix(numSyms int) (syms []uint16) {
207	numSyms += 2 // Remove 0 symbol, add RUNA, RUNB, and EOF symbols
208	if numSyms < 3 {
209		panicf(errors.Corrupted, "not enough prefix symbols: %d", numSyms)
210	}
211
212	// Read information about the trees and tree selectors.
213	var mtf internal.MoveToFront
214	numTrees := int(zr.rd.ReadBitsBE64(3))
215	if numTrees < minNumTrees || numTrees > maxNumTrees {
216		panicf(errors.Corrupted, "invalid number of prefix trees: %d", numTrees)
217	}
218	numSels := int(zr.rd.ReadBitsBE64(15))
219	if cap(zr.treeSels) < numSels {
220		zr.treeSels = make([]uint8, numSels)
221	}
222	treeSels := zr.treeSels[:numSels]
223	for i := range treeSels {
224		sym, ok := zr.rd.TryReadSymbol(&decSel)
225		if !ok {
226			sym = zr.rd.ReadSymbol(&decSel)
227		}
228		if int(sym) >= numTrees {
229			panicf(errors.Corrupted, "invalid prefix tree selector: %d", sym)
230		}
231		treeSels[i] = uint8(sym)
232	}
233	mtf.Decode(treeSels)
234	zr.treeSels = treeSels
235
236	// Initialize prefix codes.
237	for i := range zr.codes2D[:numTrees] {
238		zr.codes1D[i] = zr.codes2D[i][:numSyms]
239	}
240	zr.rd.ReadPrefixCodes(zr.codes1D[:numTrees], zr.trees1D[:numTrees])
241
242	// Read prefix encoded symbols of compressed data.
243	var tree *prefix.Decoder
244	var blkLen, selIdx int
245	syms = zr.syms[:0]
246	for {
247		if blkLen == 0 {
248			blkLen = numBlockSyms
249			if selIdx >= len(treeSels) {
250				panicf(errors.Corrupted, "not enough prefix tree selectors")
251			}
252			tree = &zr.trees1D[treeSels[selIdx]]
253			selIdx++
254		}
255		blkLen--
256		sym, ok := zr.rd.TryReadSymbol(tree)
257		if !ok {
258			sym = zr.rd.ReadSymbol(tree)
259		}
260
261		if int(sym) == numSyms-1 {
262			break // EOF marker
263		}
264		if int(sym) >= numSyms {
265			panicf(errors.Corrupted, "invalid prefix symbol: %d", sym)
266		}
267		if len(syms) >= zr.level*blockSize {
268			panicf(errors.Corrupted, "number of prefix symbols exceeds block size")
269		}
270		syms = append(syms, uint16(sym))
271	}
272	zr.syms = syms
273	return syms
274}
275