1// Copyright 2015, Joe Tsai. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE.md file.
4
5package bzip2
6
7import (
8	"io"
9
10	"github.com/dsnet/compress/internal"
11	"github.com/dsnet/compress/internal/errors"
12	"github.com/dsnet/compress/internal/prefix"
13)
14
15const (
16	minNumTrees = 2
17	maxNumTrees = 6
18
19	maxPrefixBits = 20      // Maximum bit-width of a prefix code
20	maxNumSyms    = 256 + 2 // Maximum number of symbols in the alphabet
21	numBlockSyms  = 50      // Number of bytes in a block
22)
23
24// encSel and decSel are used to handle the prefix encoding for tree selectors.
25// The prefix encoding is as follows:
26//
27//	Code         TreeIdx
28//	0        <=> 0
29//	10       <=> 1
30//	110      <=> 2
31//	1110     <=> 3
32//	11110    <=> 4
33//	111110   <=> 5
34//	111111   <=> 6	Invalid tree index, so should fail
35//
36var encSel, decSel = func() (e prefix.Encoder, d prefix.Decoder) {
37	var selCodes [maxNumTrees + 1]prefix.PrefixCode
38	for i := range selCodes {
39		selCodes[i] = prefix.PrefixCode{Sym: uint32(i), Len: uint32(i + 1)}
40	}
41	selCodes[maxNumTrees] = prefix.PrefixCode{Sym: maxNumTrees, Len: maxNumTrees}
42	prefix.GeneratePrefixes(selCodes[:])
43	e.Init(selCodes[:])
44	d.Init(selCodes[:])
45	return
46}()
47
48type prefixReader struct{ prefix.Reader }
49
50func (pr *prefixReader) Init(r io.Reader) {
51	pr.Reader.Init(r, true)
52}
53
54func (pr *prefixReader) ReadBitsBE64(nb uint) uint64 {
55	if nb <= 32 {
56		v := uint32(pr.ReadBits(nb))
57		return uint64(internal.ReverseUint32N(v, nb))
58	}
59	v0 := internal.ReverseUint32(uint32(pr.ReadBits(32)))
60	v1 := internal.ReverseUint32(uint32(pr.ReadBits(nb - 32)))
61	v := uint64(v0)<<32 | uint64(v1)
62	return v >> (64 - nb)
63}
64
65func (pr *prefixReader) ReadPrefixCodes(codes []prefix.PrefixCodes, trees []prefix.Decoder) {
66	for i, pc := range codes {
67		clen := int(pr.ReadBitsBE64(5))
68		sum := 1 << maxPrefixBits
69		for sym := range pc {
70			for {
71				if clen < 1 || clen > maxPrefixBits {
72					panicf(errors.Corrupted, "invalid prefix bit-length: %d", clen)
73				}
74
75				b, ok := pr.TryReadBits(1)
76				if !ok {
77					b = pr.ReadBits(1)
78				}
79				if b == 0 {
80					break
81				}
82
83				b, ok = pr.TryReadBits(1)
84				if !ok {
85					b = pr.ReadBits(1)
86				}
87				clen -= int(b*2) - 1 // +1 or -1
88			}
89			pc[sym] = prefix.PrefixCode{Sym: uint32(sym), Len: uint32(clen)}
90			sum -= (1 << maxPrefixBits) >> uint(clen)
91		}
92
93		if sum == 0 {
94			// Fast path, but only handles complete trees.
95			if err := prefix.GeneratePrefixes(pc); err != nil {
96				errors.Panic(err) // Using complete trees; should never fail
97			}
98		} else {
99			// Slow path, but handles anything.
100			pc = handleDegenerateCodes(pc) // Never fails, but may fail later
101			codes[i] = pc
102		}
103		trees[i].Init(pc)
104	}
105}
106
107type prefixWriter struct{ prefix.Writer }
108
109func (pw *prefixWriter) Init(w io.Writer) {
110	pw.Writer.Init(w, true)
111}
112
113func (pw *prefixWriter) WriteBitsBE64(v uint64, nb uint) {
114	if nb <= 32 {
115		v := internal.ReverseUint32N(uint32(v), nb)
116		pw.WriteBits(uint(v), nb)
117		return
118	}
119	v <<= (64 - nb)
120	v0 := internal.ReverseUint32(uint32(v >> 32))
121	v1 := internal.ReverseUint32(uint32(v))
122	pw.WriteBits(uint(v0), 32)
123	pw.WriteBits(uint(v1), nb-32)
124	return
125}
126
127func (pw *prefixWriter) WritePrefixCodes(codes []prefix.PrefixCodes, trees []prefix.Encoder) {
128	for i, pc := range codes {
129		if err := prefix.GeneratePrefixes(pc); err != nil {
130			errors.Panic(err) // Using complete trees; should never fail
131		}
132		trees[i].Init(pc)
133
134		clen := int(pc[0].Len)
135		pw.WriteBitsBE64(uint64(clen), 5)
136		for _, c := range pc {
137			for int(c.Len) < clen {
138				pw.WriteBits(3, 2) // 11
139				clen--
140			}
141			for int(c.Len) > clen {
142				pw.WriteBits(1, 2) // 10
143				clen++
144			}
145			pw.WriteBits(0, 1)
146		}
147	}
148}
149
150// handleDegenerateCodes converts a degenerate tree into a canonical tree.
151//
152// For example, when the input is an under-subscribed tree:
153//	input:  []PrefixCode{
154//		{Sym: 0, Len: 3},
155//		{Sym: 1, Len: 4},
156//		{Sym: 2, Len: 3},
157//	}
158//	output: []PrefixCode{
159//		{Sym:   0, Len: 3, Val:  0}, //  000
160//		{Sym:   1, Len: 4, Val:  2}, // 0010
161//		{Sym:   2, Len: 3, Val:  4}, //  100
162//		{Sym: 258, Len: 4, Val: 10}, // 1010
163//		{Sym: 259, Len: 3, Val:  6}, //  110
164//		{Sym: 260, Len: 1, Val:  1}, //    1
165//	}
166//
167// For example, when the input is an over-subscribed tree:
168//	input:  []PrefixCode{
169//		{Sym: 0, Len: 1},
170//		{Sym: 1, Len: 3},
171//		{Sym: 2, Len: 4},
172//		{Sym: 3, Len: 3},
173//		{Sym: 4, Len: 2},
174//	}
175//	output: []PrefixCode{
176//		{Sym: 0, Len: 1, Val: 0}, //   0
177//		{Sym: 1, Len: 3, Val: 3}, // 011
178//		{Sym: 3, Len: 3, Val: 7}, // 111
179//		{Sym: 4, Len: 2, Val: 1}, //  01
180//	}
181func handleDegenerateCodes(codes prefix.PrefixCodes) prefix.PrefixCodes {
182	// Since there is no formal definition for the BZip2 format, there is no
183	// specification that says that the code lengths must form a complete
184	// prefix tree (IE: it is neither over-subscribed nor under-subscribed).
185	// Thus, the original C implementation becomes the reference for how prefix
186	// decoding is done in these edge cases. Unfortunately, the C version does
187	// not error when an invalid tree is used, but rather allows decoding to
188	// continue and only errors if some bit pattern happens to cause an error.
189	// Thus, it is possible for an invalid tree to end up decoding an input
190	// "properly" so long as invalid bit patterns are not present. In order to
191	// replicate this non-specified behavior, we use a ported version of the
192	// C code to generate the codes as a valid canonical tree by substituting
193	// invalid nodes with invalid symbols.
194	//
195	// ====================================================
196	// This program, "bzip2", the associated library "libbzip2", and all
197	// documentation, are copyright (C) 1996-2010 Julian R Seward.  All
198	// rights reserved.
199	//
200	// Redistribution and use in source and binary forms, with or without
201	// modification, are permitted provided that the following conditions
202	// are met:
203	//
204	// 1. Redistributions of source code must retain the above copyright
205	//    notice, this list of conditions and the following disclaimer.
206	//
207	// 2. The origin of this software must not be misrepresented; you must
208	//    not claim that you wrote the original software.  If you use this
209	//    software in a product, an acknowledgment in the product
210	//    documentation would be appreciated but is not required.
211	//
212	// 3. Altered source versions must be plainly marked as such, and must
213	//    not be misrepresented as being the original software.
214	//
215	// 4. The name of the author may not be used to endorse or promote
216	//    products derived from this software without specific prior written
217	//    permission.
218	//
219	// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
220	// OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
221	// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
222	// ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
223	// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
224	// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
225	// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
226	// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
227	// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
228	// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
229	// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
230	//
231	// Julian Seward, jseward@bzip.org
232	// bzip2/libbzip2 version 1.0.6 of 6 September 2010
233	// ====================================================
234	var (
235		limits [maxPrefixBits + 2]int32
236		bases  [maxPrefixBits + 2]int32
237		perms  [maxNumSyms]int32
238
239		minLen = uint32(maxPrefixBits)
240		maxLen = uint32(0)
241	)
242
243	const (
244		statusOkay = iota
245		statusInvalid
246		statusNeedBits
247		statusMaxBits
248	)
249
250	// createTables is the BZ2_hbCreateDecodeTables function from the C code.
251	createTables := func(codes []prefix.PrefixCode) {
252		for _, c := range codes {
253			if c.Len > maxLen {
254				maxLen = c.Len
255			}
256			if c.Len < minLen {
257				minLen = c.Len
258			}
259		}
260
261		var pp int
262		for i := minLen; i <= maxLen; i++ {
263			for j, c := range codes {
264				if c.Len == i {
265					perms[pp] = int32(j)
266					pp++
267				}
268			}
269		}
270
271		var vec int32
272		for _, c := range codes {
273			bases[c.Len+1]++
274		}
275		for i := 1; i < len(bases); i++ {
276			bases[i] += bases[i-1]
277		}
278		for i := minLen; i <= maxLen; i++ {
279			vec += bases[i+1] - bases[i]
280			limits[i] = vec - 1
281			vec <<= 1
282		}
283		for i := minLen + 1; i <= maxLen; i++ {
284			bases[i] = ((limits[i-1] + 1) << 1) - bases[i]
285		}
286	}
287
288	// getSymbol is the GET_MTF_VAL macro from the C code.
289	getSymbol := func(c prefix.PrefixCode) (uint32, int) {
290		v := internal.ReverseUint32(c.Val)
291		n := c.Len
292
293		zn := minLen
294		if zn > n {
295			return 0, statusNeedBits
296		}
297		zvec := int32(v >> (32 - zn))
298		v <<= zn
299		for {
300			if zn > maxLen {
301				return 0, statusMaxBits
302			}
303			if zvec <= limits[zn] {
304				break
305			}
306			zn++
307			if zn > n {
308				return 0, statusNeedBits
309			}
310			zvec = (zvec << 1) | int32(v>>31)
311			v <<= 1
312		}
313		if zvec-bases[zn] < 0 || zvec-bases[zn] >= maxNumSyms {
314			return 0, statusInvalid
315		}
316		return uint32(perms[zvec-bases[zn]]), statusOkay
317	}
318
319	// Step 1: Create the prefix trees using the C algorithm.
320	createTables(codes)
321
322	// Step 2: Starting with the shortest bit pattern, explore the whole tree.
323	// If tree is under-subscribed, the worst-case runtime is O(1<<maxLen).
324	// If tree is over-subscribed, the worst-case runtime is O(maxNumSyms).
325	var pcodesArr [2 * maxNumSyms]prefix.PrefixCode
326	pcodes := pcodesArr[:maxNumSyms]
327	var exploreCode func(prefix.PrefixCode) bool
328	exploreCode = func(c prefix.PrefixCode) (term bool) {
329		sym, status := getSymbol(c)
330		switch status {
331		case statusOkay:
332			// This code is valid, so insert it.
333			c.Sym = sym
334			pcodes[sym] = c
335			term = true
336		case statusInvalid:
337			// This code is invalid, so insert an invalid symbol.
338			c.Sym = uint32(len(pcodes))
339			pcodes = append(pcodes, c)
340			term = true
341		case statusNeedBits:
342			// This code is too short, so explore both children.
343			c.Len++
344			c0, c1 := c, c
345			c1.Val |= 1 << (c.Len - 1)
346
347			b0 := exploreCode(c0)
348			b1 := exploreCode(c1)
349			switch {
350			case !b0 && b1:
351				c0.Sym = uint32(len(pcodes))
352				pcodes = append(pcodes, c0)
353			case !b1 && b0:
354				c1.Sym = uint32(len(pcodes))
355				pcodes = append(pcodes, c1)
356			}
357			term = b0 || b1
358		case statusMaxBits:
359			// This code is too long, so report it upstream.
360			term = false
361		}
362		return term // Did this code terminate?
363	}
364	exploreCode(prefix.PrefixCode{})
365
366	// Step 3: Copy new sparse codes to old output codes.
367	codes = codes[:0]
368	for _, c := range pcodes {
369		if c.Len > 0 {
370			codes = append(codes, c)
371		}
372	}
373	return codes
374}
375