1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package bzip2
6
7import "sort"
8
9// A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a
10// symbol.
11type huffmanTree struct {
12	// nodes contains all the non-leaf nodes in the tree. nodes[0] is the
13	// root of the tree and nextNode contains the index of the next element
14	// of nodes to use when the tree is being constructed.
15	nodes    []huffmanNode
16	nextNode int
17}
18
19// A huffmanNode is a node in the tree. left and right contain indexes into the
20// nodes slice of the tree. If left or right is invalidNodeValue then the child
21// is a left node and its value is in leftValue/rightValue.
22//
23// The symbols are uint16s because bzip2 encodes not only MTF indexes in the
24// tree, but also two magic values for run-length encoding and an EOF symbol.
25// Thus there are more than 256 possible symbols.
26type huffmanNode struct {
27	left, right           uint16
28	leftValue, rightValue uint16
29}
30
31// invalidNodeValue is an invalid index which marks a leaf node in the tree.
32const invalidNodeValue = 0xffff
33
34// Decode reads bits from the given bitReader and navigates the tree until a
35// symbol is found.
36func (t *huffmanTree) Decode(br *bitReader) (v uint16) {
37	nodeIndex := uint16(0) // node 0 is the root of the tree.
38
39	for {
40		node := &t.nodes[nodeIndex]
41
42		var bit uint16
43		if br.bits > 0 {
44			// Get next bit - fast path.
45			br.bits--
46			bit = uint16(br.n>>(br.bits&63)) & 1
47		} else {
48			// Get next bit - slow path.
49			// Use ReadBits to retrieve a single bit
50			// from the underling io.ByteReader.
51			bit = uint16(br.ReadBits(1))
52		}
53
54		// Trick a compiler into generating conditional move instead of branch,
55		// by making both loads unconditional.
56		l, r := node.left, node.right
57
58		if bit == 1 {
59			nodeIndex = l
60		} else {
61			nodeIndex = r
62		}
63
64		if nodeIndex == invalidNodeValue {
65			// We found a leaf. Use the value of bit to decide
66			// whether is a left or a right value.
67			l, r := node.leftValue, node.rightValue
68			if bit == 1 {
69				v = l
70			} else {
71				v = r
72			}
73			return
74		}
75	}
76}
77
78// newHuffmanTree builds a Huffman tree from a slice containing the code
79// lengths of each symbol. The maximum code length is 32 bits.
80func newHuffmanTree(lengths []uint8) (huffmanTree, error) {
81	// There are many possible trees that assign the same code length to
82	// each symbol (consider reflecting a tree down the middle, for
83	// example). Since the code length assignments determine the
84	// efficiency of the tree, each of these trees is equally good. In
85	// order to minimize the amount of information needed to build a tree
86	// bzip2 uses a canonical tree so that it can be reconstructed given
87	// only the code length assignments.
88
89	if len(lengths) < 2 {
90		panic("newHuffmanTree: too few symbols")
91	}
92
93	var t huffmanTree
94
95	// First we sort the code length assignments by ascending code length,
96	// using the symbol value to break ties.
97	pairs := make([]huffmanSymbolLengthPair, len(lengths))
98	for i, length := range lengths {
99		pairs[i].value = uint16(i)
100		pairs[i].length = length
101	}
102
103	sort.Slice(pairs, func(i, j int) bool {
104		if pairs[i].length < pairs[j].length {
105			return true
106		}
107		if pairs[i].length > pairs[j].length {
108			return false
109		}
110		if pairs[i].value < pairs[j].value {
111			return true
112		}
113		return false
114	})
115
116	// Now we assign codes to the symbols, starting with the longest code.
117	// We keep the codes packed into a uint32, at the most-significant end.
118	// So branches are taken from the MSB downwards. This makes it easy to
119	// sort them later.
120	code := uint32(0)
121	length := uint8(32)
122
123	codes := make([]huffmanCode, len(lengths))
124	for i := len(pairs) - 1; i >= 0; i-- {
125		if length > pairs[i].length {
126			length = pairs[i].length
127		}
128		codes[i].code = code
129		codes[i].codeLen = length
130		codes[i].value = pairs[i].value
131		// We need to 'increment' the code, which means treating |code|
132		// like a |length| bit number.
133		code += 1 << (32 - length)
134	}
135
136	// Now we can sort by the code so that the left half of each branch are
137	// grouped together, recursively.
138	sort.Slice(codes, func(i, j int) bool {
139		return codes[i].code < codes[j].code
140	})
141
142	t.nodes = make([]huffmanNode, len(codes))
143	_, err := buildHuffmanNode(&t, codes, 0)
144	return t, err
145}
146
147// huffmanSymbolLengthPair contains a symbol and its code length.
148type huffmanSymbolLengthPair struct {
149	value  uint16
150	length uint8
151}
152
153// huffmanCode contains a symbol, its code and code length.
154type huffmanCode struct {
155	code    uint32
156	codeLen uint8
157	value   uint16
158}
159
160// buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in
161// the Huffman tree at the given level. It returns the index of the newly
162// constructed node.
163func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) {
164	test := uint32(1) << (31 - level)
165
166	// We have to search the list of codes to find the divide between the left and right sides.
167	firstRightIndex := len(codes)
168	for i, code := range codes {
169		if code.code&test != 0 {
170			firstRightIndex = i
171			break
172		}
173	}
174
175	left := codes[:firstRightIndex]
176	right := codes[firstRightIndex:]
177
178	if len(left) == 0 || len(right) == 0 {
179		// There is a superfluous level in the Huffman tree indicating
180		// a bug in the encoder. However, this bug has been observed in
181		// the wild so we handle it.
182
183		// If this function was called recursively then we know that
184		// len(codes) >= 2 because, otherwise, we would have hit the
185		// "leaf node" case, below, and not recursed.
186		//
187		// However, for the initial call it's possible that len(codes)
188		// is zero or one. Both cases are invalid because a zero length
189		// tree cannot encode anything and a length-1 tree can only
190		// encode EOF and so is superfluous. We reject both.
191		if len(codes) < 2 {
192			return 0, StructuralError("empty Huffman tree")
193		}
194
195		// In this case the recursion doesn't always reduce the length
196		// of codes so we need to ensure termination via another
197		// mechanism.
198		if level == 31 {
199			// Since len(codes) >= 2 the only way that the values
200			// can match at all 32 bits is if they are equal, which
201			// is invalid. This ensures that we never enter
202			// infinite recursion.
203			return 0, StructuralError("equal symbols in Huffman tree")
204		}
205
206		if len(left) == 0 {
207			return buildHuffmanNode(t, right, level+1)
208		}
209		return buildHuffmanNode(t, left, level+1)
210	}
211
212	nodeIndex = uint16(t.nextNode)
213	node := &t.nodes[t.nextNode]
214	t.nextNode++
215
216	if len(left) == 1 {
217		// leaf node
218		node.left = invalidNodeValue
219		node.leftValue = left[0].value
220	} else {
221		node.left, err = buildHuffmanNode(t, left, level+1)
222	}
223
224	if err != nil {
225		return
226	}
227
228	if len(right) == 1 {
229		// leaf node
230		node.right = invalidNodeValue
231		node.rightValue = right[0].value
232	} else {
233		node.right, err = buildHuffmanNode(t, right, level+1)
234	}
235
236	return
237}
238