1// Copyright 2015, Joe Tsai. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE.md file.
4
5package bzip2
6
7import (
8	"bytes"
9	"io"
10	"io/ioutil"
11	"testing"
12
13	"github.com/dsnet/compress/internal/errors"
14	"github.com/dsnet/compress/internal/testutil"
15)
16
17func TestReader(t *testing.T) {
18	db := testutil.MustDecodeBitGen
19
20	errFuncs := map[string]func(error) bool{
21		"IsUnexpectedEOF": func(err error) bool { return err == io.ErrUnexpectedEOF },
22		"IsCorrupted":     errors.IsCorrupted,
23		"IsDeprecated":    errors.IsDeprecated,
24	}
25	vectors := []struct {
26		name   string // Sub-test name
27		input  []byte // Test input string
28		output []byte // Expected output string
29		inIdx  int64  // Expected input offset after reading
30		outIdx int64  // Expected output offset after reading
31		errf   string // Name of error checking callback
32	}{{
33		name: "EmptyString",
34		errf: "IsUnexpectedEOF",
35	}, {
36		name:  "EmptyOutput",
37		input: db(`>>> > "BZh9" H48:177245385090 H32:00000000`),
38		inIdx: 14,
39	}, {
40		name: "EmptyOutput9S",
41		input: db(`>>> >
42			"BZh1" H48:177245385090 H32:00000000
43			"BZh2" H48:177245385090 H32:00000000
44			"BZh3" H48:177245385090 H32:00000000
45			"BZh4" H48:177245385090 H32:00000000
46			"BZh5" H48:177245385090 H32:00000000
47			"BZh6" H48:177245385090 H32:00000000
48			"BZh7" H48:177245385090 H32:00000000
49			"BZh8" H48:177245385090 H32:00000000
50			"BZh9" H48:177245385090 H32:00000000
51		`),
52		inIdx: 14 * 9, outIdx: 0,
53	}, {
54		name:  "InvalidStreamMagic",
55		input: db(`>>> > "XX"`),
56		inIdx: 2, outIdx: 0,
57		errf: "IsCorrupted",
58	}, {
59		name:  "InvalidVersion",
60		input: db(`>>> > "BZX1"`),
61		inIdx: 3, outIdx: 0,
62		errf: "IsCorrupted",
63	}, {
64		name:  "DeprecatedVersion",
65		input: db(`>>> > "BZ01"`),
66		inIdx: 3, outIdx: 0,
67		errf: "IsDeprecated",
68	}, {
69		name:  "InvalidLevel",
70		input: db(`>>> > "BZh0"`),
71		inIdx: 4, outIdx: 0,
72		errf: "IsCorrupted",
73	}, {
74		name:  "InvalidBlockMagic",
75		input: db(`>>> > "BZh9" H48:000000000000`),
76		inIdx: 10, outIdx: 0,
77		errf: "IsCorrupted",
78	}, {
79		name:  "DeprecatedRandomization",
80		input: db(`>>> > "BZh9" H48:314159265359 H32:8e9a7706 1 H24:0`),
81		inIdx: 15, outIdx: 0,
82		errf: "IsDeprecated",
83	}, {
84		name:  "Truncated1",
85		input: db(`>>> "BZh9"`),
86		inIdx: 4, outIdx: 0,
87		errf: "IsUnexpectedEOF",
88	}, {
89		name:  "Truncated2",
90		input: db(`>>> > "BZh9" H40:3141592653`),
91		inIdx: 8, outIdx: 0,
92		errf: "IsUnexpectedEOF",
93	}, {
94		name:  "Truncated3",
95		input: db(`>>> > "BZh9" H48:314159265359`),
96		inIdx: 10, outIdx: 0,
97		errf: "IsUnexpectedEOF",
98	}, {
99		name:  "Truncated4",
100		input: db(`>>> > "BZh9" H48:314159265359 H16:8e9a`),
101		inIdx: 10, outIdx: 0,
102		errf: "IsUnexpectedEOF",
103	}, {
104		name:  "Truncated5",
105		input: db(`>>> > "BZh9" H48:314159265359 H32:8e9a7706`),
106		inIdx: 14, outIdx: 0,
107		errf: "IsUnexpectedEOF",
108	}, {
109		name:  "Truncated6",
110		input: db(`>>> > "BZh9" H48:314159265359 H32:8e9a7706 0 H24:3`),
111		inIdx: 18, outIdx: 0,
112		errf: "IsUnexpectedEOF",
113	}, {
114		name:  "Truncated7",
115		input: db(`>>> > "BZh9"  H48:314159265359 H32:8e9a7706 0 H24:3 < H16:00d4 H16:1003`),
116		inIdx: 22, outIdx: 0,
117		errf: "IsUnexpectedEOF",
118	}, {
119		name: "Truncated8",
120		input: db(`>>>
121			"BZh9"
122			> H48:314159265359 H32:8e9a7706 0 H24:3
123			< H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084
124			> D3:2 D15:1 0
125			> D5:4 0 0 0 0 0 110 100 0 110 0 0 100
126		`),
127		inIdx: 33, outIdx: 0,
128		errf: "IsUnexpectedEOF",
129	}, {
130		name: "Truncated9",
131		input: db(`>>>
132			"BZh9"
133			> H48:314159265359 H32:8e9a7706 0 H24:3
134			< H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084
135			> D3:2 D15:1 0
136			> D5:4 0 0 0 0 0 110 100 0 110 0 0 100
137			> D5:4 0 0 0 0 0 0 0 0 110 0 0 0
138			< 1101 000 100 000 100 0111 010 010
139		`),
140		inIdx: 39, outIdx: 0,
141		errf: "IsUnexpectedEOF",
142	}, {
143		name: "Truncated10",
144		input: db(`>>>
145			"BZh9"
146			> H48:314159265359 H32:8e9a7706 0 H24:3
147			< H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084
148			> D3:2 D15:1 0
149			> D5:4 0 0 0 0 0 110 100 0 110 0 0 100
150			> D5:4 0 0 0 0 0 0 0 0 110 0 0 0
151			< 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111
152		`),
153		output: []byte("Hello, world!"),
154		inIdx:  41, outIdx: 13,
155		errf: "IsUnexpectedEOF",
156	}, {
157		name: "HelloWorld",
158		input: db(`>>>
159			"BZh9"
160			> H48:314159265359 H32:8e9a7706 0 H24:3
161			< H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084
162			> D3:2 D15:1 0
163			> D5:4 0 0 0 0 0 110 100 0 110 0 0 100
164			> D5:4 0 0 0 0 0 0 0 0 110 0 0 0
165			< 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111
166			> H48:177245385090 H32:8e9a7706
167		`),
168		output: []byte("Hello, world!"),
169		inIdx:  51, outIdx: 13,
170	}, {
171		name: "HelloWorld2B",
172		input: db(`>>>
173			"BZh9"
174
175			( # Two blocks
176				> H48:314159265359 H32:8e9a7706 0 H24:3
177				< H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084
178				> D3:2 D15:1 0
179				> D5:4 0 0 0 0 0 110 100 0 110 0 0 100
180				> D5:4 0 0 0 0 0 0 0 0 110 0 0 0
181				< 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111
182			)*2
183
184			> H48:177245385090 H32:93ae990b
185		`),
186		output: db(`>>> "Hello, world!"*2`),
187		inIdx:  51*2 - 4 - 10, outIdx: 13 * 2,
188	}, {
189		name: "HelloWorld2S",
190		input: db(`>>>
191			( # Two streams
192				"BZh9"
193
194				> H48:314159265359 H32:8e9a7706 0 H24:3
195				< H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084
196				> D3:2 D15:1 0
197				> D5:4 0 0 0 0 0 110 100 0 110 0 0 100
198				> D5:4 0 0 0 0 0 0 0 0 110 0 0 0
199				< 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111
200
201				> H48:177245385090 H32:8e9a7706
202			)*2
203		`),
204		output: db(`>>> "Hello, world!"*2`),
205		inIdx:  51 * 2, outIdx: 13 * 2,
206	}, {
207		name: "Banana0",
208		input: db(`>>>
209			> "BZh1" H48:314159265359 H32:87f465d8 0 H24:0
210			< H16:0050 H16:0004 H16:4002
211			> D3:2 D15:1 0 D5:2 0 10100 0 1111110 10100 D5:3 0 0 110 0 0
212			< 1111 0 01 0 0 01 011
213			> H48:177245385090 H32:87f465d8
214		`),
215		output: []byte("Banana"),
216		inIdx:  42, outIdx: 6,
217	}, {
218		name: "Banana1",
219		input: db(`>>>
220			> "BZh1" H48:314159265359 H32:71d297e8 0 H24:1
221			< H16:0050 H16:0004 H16:4002
222			> D3:2 D15:1 0 D5:2 0 10100 0 1111110 10100 D5:3 0 0 110 0 0
223			< 1111 0 01 0 0 01 011
224			> H48:177245385090 H32:71d297e8
225		`),
226		output: []byte("aBanan"),
227		inIdx:  42, outIdx: 6,
228	}, {
229		name: "Banana2",
230		input: db(`>>>
231			> "BZh1" H48:314159265359 H32:21185406 0 H24:2
232			< H16:0050 H16:0004 H16:4002
233			> D3:2 D15:1 0 D5:2 0 10100 0 1111110 10100 D5:3 0 0 110 0 0
234			< 1111 0 01 0 0 01 011
235			> H48:177245385090 H32:21185406
236		`),
237		output: []byte("anaBan"),
238		inIdx:  42, outIdx: 6,
239	}, {
240		name: "Banana3",
241		input: db(`>>>
242			> "BZh1" H48:314159265359 H32:be853f46 0 H24:3
243			< H16:0050 H16:0004 H16:4002
244			> D3:2 D15:1 0 D5:2 0 10100 0 1111110 10100 D5:3 0 0 110 0 0
245			< 1111 0 01 0 0 01 011
246			> H48:177245385090 H32:be853f46
247		`),
248		output: []byte("ananaB"),
249		inIdx:  42, outIdx: 6,
250	}, {
251		name: "Banana4",
252		input: db(`>>>
253			> "BZh1" H48:314159265359 H32:35a020df 0 H24:4
254			< H16:0050 H16:0004 H16:4002
255			> D3:2 D15:1 0 D5:2 0 10100 0 1111110 10100 D5:3 0 0 110 0 0
256			< 1111 0 01 0 0 01 011
257			> H48:177245385090 H32:35a020df
258		`),
259		output: []byte("naBana"),
260		inIdx:  42, outIdx: 6,
261	}, {
262		name: "Banana5",
263		input: db(`>>>
264			> "BZh1" H48:314159265359 H32:b599e6fc 0 H24:5
265			< H16:0050 H16:0004 H16:4002
266			> D3:2 D15:1 0 D5:2 0 10100 0 1111110 10100 D5:3 0 0 110 0 0
267			< 1111 0 01 0 0 01 011
268			> H48:177245385090 H32:b599e6fc
269		`),
270		output: []byte("nanaBa"),
271		inIdx:  42, outIdx: 6,
272	}, {
273		// This is invalid since the BWT pointer exceeds the block size.
274		name: "Banana6",
275		input: db(`>>>
276			> "BZh1" H48:314159265359 H32:87f465d8 0 H24:6
277			< H16:0050 H16:0004 H16:4002
278			> D3:2 D15:1 0 D5:2 0 10100 0 1111110 10100 D5:3 0 0 110 0 0
279			< 1111 0 01 0 0 01 011
280			> H48:177245385090 H32:87f465d8
281		`),
282		inIdx: 42 - 10, outIdx: 0,
283		errf: "IsCorrupted",
284	}, {
285		// There must be between 2..6 trees, inclusive. This test uses only 1.
286		name: "MinTrees",
287		input: db(`>>>
288			"BZh1"
289			> H48:314159265359 H32:8e9a7706 0 H24:3
290			< H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084
291			> D3:1 D15:1 0
292			> D5:4 0 0 0 0 0 110 100 0 110 0 0 100
293			< 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111
294			> H48:177245385090 H32:8e9a7706
295		`),
296		inIdx: 28, outIdx: 0,
297		errf: "IsCorrupted",
298	}, {
299		// Define more trees than allowed. The test uses 7.
300		name: "MaxTrees",
301		input: db(`>>>
302			"BZh1"
303			> H48:314159265359 H32:8e9a7706 0 H24:3
304			< H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084
305			> D3:7 D15:1 0
306			> D5:4 0 0 0 0 0 110 100 0 110 0 0 100
307			>(D5:4 0 0 0 0 0 0 0 0 110 0 0 0)*6
308			< 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111
309			> H48:177245385090 H32:8e9a7706
310		`),
311		inIdx: 28, outIdx: 0,
312		errf: "IsCorrupted",
313	}, {
314		// Define more trees and selectors than actually used.
315		name: "SuboptimalTrees",
316		input: db(`>>>
317			"BZh1"
318			> H48:314159265359 H32:8e9a7706 0 H24:3
319			< H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084
320			> D3:6 D15:12 111110 11110 1110 110 10 0 111110 11110 1110 110 10 0
321			>(D5:4 0 0 0 0 0 0 0 0 110 0 0 0)*5
322			> D5:4 0 0 0 0 0 110 100 0 110 0 0 100
323			< 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111
324			> H48:177245385090 H32:8e9a7706
325		`),
326		output: []byte("Hello, world!"),
327		inIdx:  66, outIdx: 13,
328	}, {
329		// Do not define any tree selectors. This should fail when decoding
330		// the prefix codes later on.
331		name: "MinTreeSels",
332		input: db(`>>>
333			"BZh1"
334			> H48:314159265359 H32:8e9a7706 0 H24:3
335			< H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084
336			> D3:2 D15:0 # No selectors defined
337			> D5:4 0 0 0 0 0 110 100 0 110 0 0 100
338			> D5:4 0 0 0 0 0 0 0 0 110 0 0 0
339			< 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111
340			> H48:177245385090 H32:8e9a7706
341		`),
342		inIdx: 35, outIdx: 0,
343		errf: "IsCorrupted",
344	}, {
345		// Define up to 32767 tree selectors, even though only 1 is used.
346		name: "MaxTreeSels",
347		input: db(`>>>
348			"BZh1"
349			> H48:314159265359 H32:8e9a7706 0 H24:3
350			< H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084
351			> D3:2 D15:32767 0*32767 # Define all selectors
352			> D5:4 0 0 0 0 0 110 100 0 110 0 0 100
353			> D5:4 0 0 0 0 0 0 0 0 110 0 0 0
354			< 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111
355			> H48:177245385090 H32:8e9a7706
356		`),
357		output: []byte("Hello, world!"),
358		inIdx:  4147, outIdx: 13,
359	}, {
360		name: "InvalidTreeSels1",
361		input: db(`>>>
362			"BZh1"
363			> H48:314159265359 H32:8e9a7706 0 H24:3
364			< H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084
365			> D3:2 D15:1 110 # Select tree2, which does not exist
366			> D5:4 0 0 0 0 0 0 0 0 110 0 0 0
367			> D5:4 0 0 0 0 0 110 100 0 110 0 0 100
368			< 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111
369			> H48:177245385090 H32:8e9a7706
370		`),
371		inIdx: 30, outIdx: 0,
372		errf: "IsCorrupted",
373	}, {
374		name: "InvalidTreeSels2",
375		input: db(`>>>
376			"BZh1"
377			> H48:314159265359 H32:8e9a7706 0 H24:3
378			< H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084
379			> D3:6 D15:1 111111 # Select tree6, which is invalid
380			> D5:4 0 0 0 0 0 110 100 0 110 0 0 100
381			>(D5:4 0 0 0 0 0 0 0 0 110 0 0 0)*5
382			< 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111
383			> H48:177245385090 H32:8e9a7706
384		`),
385		inIdx: 31, outIdx: 0,
386		errf: "IsCorrupted",
387	}, {
388		name: "JunkPadding",
389		input: db(`>>>
390			"BZh1"
391			> H48:314159265359 H32:b1f7404b 0 H24:0
392			< H16:0001 H16:0001
393			> D3:2 D15:1 0 D5:2 0 0 110 D5:2 0 0 110
394			< 01 0
395			> H48:177245385090 H32:b1f7404b 10101 # Non-zero padding bits
396		`),
397		output: []byte{0x00},
398		inIdx:  37, outIdx: 1,
399	}, {
400		name: "MinSymMap",
401		input: db(`>>>
402			"BZh1"
403			> H48:314159265359 H32:b1f7404b 0 H24:0
404			< H16:0001 H16:0001 # Only one symbol used
405			> D3:2 D15:1 0
406			>(D5:2 0 0 110)*2
407			< 01 0
408			> H48:177245385090 H32:b1f7404b
409		`),
410		output: []byte{0x00},
411		inIdx:  37, outIdx: 1,
412	}, {
413		// This block satisfies the minimum of 3 symbols for prefix encoding.
414		// The data section terminates immediately with an EOF symbol.
415		// However, this is still invalid since a BWT pointer of 0 >= 0.
416		name: "EmptyBlock",
417		input: db(`>>>
418			"BZh1"
419			> H48:314159265359 H32:00000000 0 H24:0 # BWT pointer of 0
420			< H16:0001 H16:0001 # Only one symbol used
421			> D3:2 D15:1 0
422			>(D5:2 0 0 110)*2
423			< 0 # Data ends immediately with EOF
424			> H48:177245385090 H32:00000000
425		`),
426		inIdx: 27, outIdx: 0,
427		errf: "IsCorrupted",
428	}, {
429		// The high-order symbol map says that all groups have symbols,
430		// but only the first group indicates any symbols are set.
431		name: "SuboptimalSymMap1",
432		input: db(`>>>
433			"BZh1"
434			> H48:314159265359 H32:b1f7404b 0 H24:0
435			< H16:ffff H16:0001 H16:0000*15 # All groups used, only one symbol used
436			> D3:2 D15:1 0
437			>(D5:2 0 0 110)*2
438			< 01 0
439			> H48:177245385090 H32:b1f7404b
440		`),
441		output: []byte{0x00},
442		inIdx:  67, outIdx: 1,
443	}, {
444		// The symbol map declares that all symbols are used, even though
445		// only one is actually used.
446		name: "SuboptimalSymMap2",
447		input: db(`>>>
448			"BZh1"
449			> H48:314159265359 H32:b1f7404b 0 H24:0
450			< H16:ffff*17 # All symbols used
451			> D3:2 D15:1 0
452			> D5:2 0 10101010101010100 0*255 1111111111111111110
453			> D5:9 0*4 110 0*253
454			< 01 0
455			> H48:177245385090 H32:b1f7404b
456		`),
457		output: []byte{0x00},
458		inIdx:  135, outIdx: 1,
459	}, {
460		// It is impossible for the format to encode a block with no symbols
461		// since at least one symbol must exist for the EOF symbol.
462		name: "InvalidSymMap",
463		input: db(`>>>
464			"BZh1"
465			> H48:314159265359 H32:b1f7404b 0 H24:0
466			< H16:0000 # Need at least one symbol
467		`),
468		inIdx: 20, outIdx: 0,
469		errf: "IsCorrupted", // Should not be IsUnexpectedEOF
470	}, {
471		name: "InvalidBlockChecksum",
472		input: db(`>>>
473			"BZh9"
474			> H48:314159265359 H32:00000000 0 H24:3 # Bad checksum
475			< H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084
476			> D3:2 D15:1 0
477			> D5:4 0 0 0 0 0 110 100 0 110 0 0 100
478			> D5:4 0 0 0 0 0 0 0 0 110 0 0 0
479			< 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111
480			> H48:177245385090 H32:8e9a7706
481		`),
482		output: []byte("Hello, world!"),
483		inIdx:  51 - 10, outIdx: 13,
484		errf: "IsCorrupted",
485	}, {
486		name: "InvalidStreamChecksum",
487		input: db(`>>>
488			"BZh9"
489			> H48:314159265359 H32:8e9a7706 0 H24:3
490			< H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084
491			> D3:2 D15:1 0
492			> D5:4 0 0 0 0 0 110 100 0 110 0 0 100
493			> D5:4 0 0 0 0 0 0 0 0 110 0 0 0
494			< 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111
495			> H48:177245385090 H32:00000000 # Bad checksum
496		`),
497		output: []byte("Hello, world!"),
498		inIdx:  51, outIdx: 13,
499		errf: "IsCorrupted",
500	}, {
501		// RLE1 stage with maximum repeater length.
502		name: "RLE1-1",
503		input: db(`>>>
504			"BZh1"
505			> H48:314159265359 H32:e1fac440 0 H24:0
506			< H16:8010 H16:0002 H16:8000
507			> D3:2 D15:1 0
508			> D5:2 0 100 11110 10100
509			> D5:2 0 0 0 0
510			< 0 0 01 01 111 # Pre-RLE1: "AAAA\xff"
511			> H48:177245385090 H32:e1fac440
512		`),
513		output: db(`>>> X:41*259`),
514		inIdx:  41, outIdx: 259,
515	}, {
516		// RLE1 stage with minimum repeater length.
517		name: "RLE1-2",
518		input: db(`>>>
519			"BZh1"
520			> H48:314159265359 H32:e16e6571 0 H24:4
521			< H16:0011 H16:0001 H16:0002
522			> D3:2 D15:1 0
523			> D5:2 0 100 11110 10100
524			> D5:2 0 0 0 0
525			< 0 01 01 0 111 # Pre-RLE1: "AAAA\x00"
526			> H48:177245385090 H32:e16e6571
527		`),
528		output: db(`>>> X:41*4`),
529		inIdx:  41, outIdx: 4,
530	}, {
531		// RLE1 stage with missing repeater value.
532		name: "RLE1-3",
533		input: db(`>>>
534			"BZh1"
535			> H48:314159265359 H32:e16e6571 0 H24:3
536			< H16:0010 H16:0002
537			> D3:2 D15:1 0
538			>(D5:2 0 0 110)*2
539			< 11 01 0 # Pre-RLE1: "AAAA"
540			> H48:177245385090 H32:e16e6571
541		`),
542		output: db(`>>> X:41*4`),
543		inIdx:  37 - 10, outIdx: 4,
544		errf: "IsCorrupted",
545	}, {
546		// RLE1 stage with sub-optimal repeater usage.
547		name: "RLE1-4",
548		input: db(`>>>
549			"BZh1"
550			> H48:314159265359 H32:f59a903a 0 H24:9
551			< H16:0011 H16:0001 H16:0002
552			> D3:2 D15:1 0
553			> D5:1 0 10100 110 100
554			> D5:2 0 0 0 0
555			< 01 0 0 0 01 0 111 # Pre-RLE1: "AAAA\x00AAAA\x00"
556			> H48:177245385090 H32:f59a903a
557		`),
558		output: db(`>>> X:41*8`),
559		inIdx:  41, outIdx: 8,
560	}, {
561		// RLE1 stage with sub-optimal repeater usage.
562		name: "RLE1-5",
563		input: db(`>>>
564			"BZh1"
565			> H48:314159265359 H32:f59a903a 0 H24:4
566			< H16:0011 H16:0002 H16:0002
567			> D3:2 D15:1 0
568			> D5:3 0 110 110 10100
569			> D5:2 0 0 0 0
570			< 0 01 01 0 111 # Pre-RLE1: "AAAA\x01AAA"
571			> H48:177245385090 H32:f59a903a
572		`),
573		output: db(`>>> X:41*8`),
574		inIdx:  40, outIdx: 8,
575	}, {
576		name: "RLE2-1",
577		input: db(`>>>
578			"BZh1"
579			> H48:314159265359 H32:6b4f087c 0 H24:000000
580			< H16:0040 H16:0006
581			> D3:2 D15:1 0
582			> D5:1 0 100 100 0
583			> D5:2 0 0 0 0
584			< 01 0 0 0 0 01 0 01 0 01 01 0 0 0 0 01 111 # a*100k
585			> H48:177245385090 H32:6b4f087c
586		`),
587		output: db(`>>> "a"*2020000`),
588		inIdx:  40, outIdx: 2020000,
589	}, {
590		name: "RLE2-2",
591		input: db(`>>>
592			"BZh1"
593			> H48:314159265359 H32:d175ea9d 0 H24:000000
594			< H16:0040 H16:0006
595			> D3:2 D15:1 0
596			> D5:1 0 100 100 0
597			> D5:2 0 0 0 0
598			< 0 01 0 0 0 01 0 01 0 01 01 0 0 0 0 01 111 # a*(100k+1)
599			> H48:177245385090 H32:d175ea9d
600		`),
601		inIdx: 40 - 10, outIdx: 0,
602		errf: "IsCorrupted",
603	}, {
604		name: "RLE2-3",
605		input: db(`>>>
606			"BZh1"
607			> H48:314159265359 H32:6b4f087c 0 H24:000000
608			< H16:0040 H16:0006
609			> D3:2 D15:1 0
610			> D5:1 0 100 100 0
611			> D5:2 0 0 0 0
612			< 0 0 0 0 0 01 0 01 0 01 01 0 0 0 0 01 011 111 # a*(100k-1) b
613			> H48:177245385090 H32:6b4f087c
614		`),
615		output: db(`>>> "a"*2020000`),
616		inIdx:  40, outIdx: 2020000,
617	}, {
618		name: "RLE2-4",
619		input: db(`>>>
620			"BZh1"
621			> H48:314159265359 H32:d175ea9d 0 H24:000000
622			< H16:0040 H16:0006
623			> D3:2 D15:1 0
624			> D5:1 0 100 100 0
625			> D5:2 0 0 0 0
626			< 0 0 0 0 0 01 0 01 0 01 01 0 0 0 0 01 011 011 111 # a*(100k-1) b a
627			> H48:177245385090 H32:d175ea9d
628		`),
629		inIdx: 40 - 10, outIdx: 0,
630		errf: "IsCorrupted",
631	}, {
632		name: "RLE2-5",
633		input: db(`>>>
634			"BZh1"
635			> H48:314159265359 H32:79235035 0 H24:000000
636			< H16:0040 H16:0006
637			> D3:2 D15:1 0
638			> D5:1 0 100 100 0
639			> D5:2 0 0 0 0
640			< 0 0 0 0 0 01 0 01 0 01 01 0 0 0 0 01 011 0 011 111 # a*(100k-1) b*2 a
641			> H48:177245385090 H32:79235035
642		`),
643		inIdx: 41 - 10, outIdx: 0,
644		errf: "IsCorrupted",
645	}, {
646		// This input has a sequence of RUNA and RUNB symbols that tries to
647		// cause an integer overflow in the numeral decoding.
648		name: "RLE2-6",
649		input: db(`>>>
650			"BZh1"
651			> H48:314159265359 H32:6b4f087c 0 H24:000000
652			< H16:0040 H16:0006
653			> D3:2 D15:1 0
654			> D5:1 0 100 100 0
655			> D5:2 0 0 0 0
656			< 0*32 111
657			> H48:177245385090 H32:6b4f087c
658		`),
659		inIdx: 41 - 10, outIdx: 0,
660		errf: "IsCorrupted",
661	}, {
662		// This is valid, but with suboptimal clen value.
663		name: "PrefixBits1",
664		input: db(`>>>
665			"BZh1"
666			> H48:314159265359 H32:b1f7404b 0 H24:0
667			< H16:0001 H16:0001
668			> D3:2 D15:1 0
669			> D5:1 100 0 110 # (1)+1=2 (2)=2 (2)-1=1
670			> D5:2 0 0 110
671			< 01 0
672			> H48:177245385090 H32:b1f7404b
673		`),
674		output: []byte{0x00},
675		inIdx:  37, outIdx: 1,
676	}, {
677		// This is invalid, clen starts at 0.
678		name: "PrefixBits2",
679		input: db(`>>>
680			"BZh1"
681			> H48:314159265359 H32:b1f7404b 0 H24:0
682			< H16:0001 H16:0001
683			> D3:2 D15:1 0
684			> D5:0 10100 0 110 # (0)+2=2 (2)=2 (2)-1=1
685			> D5:2 0 0 110
686			< 01 0
687			> H48:177245385090 H32:b1f7404b
688		`),
689		inIdx: 25, outIdx: 0,
690		errf: "IsCorrupted",
691	}, {
692		// This is valid, although suboptimal, since clen stays within bounds.
693		name: "PrefixBits3",
694		input: db(`>>>
695			"BZh1"
696			> H48:314159265359 H32:b1f7404b 0 H24:0
697			< H16:0001 H16:0001
698			> D3:2 D15:1 0
699			> D5:4 11*3 10*19 11*18 0 0 110 # (4)-3+19-18=2 (2)=2 (2)-1=1
700			> D5:2 0 0 110
701			< 01 0
702			> H48:177245385090 H32:b1f7404b
703		`),
704		output: []byte{0x00},
705		inIdx:  47, outIdx: 1,
706	}, {
707		// This is invalid since clen temporarily goes over max bits,
708		// even though the end value is still "valid".
709		name: "PrefixBits4",
710		input: db(`>>>
711			"BZh1"
712			> H48:314159265359 H32:b1f7404b 0 H24:0
713			< H16:0001 H16:0001
714			> D3:2 D15:1 0
715			> D5:4 11*3 10*20 11*19 0 0 110 # (4)-3+20-19=2 (2)=2 (2)-1=1
716			> D5:2 0 0 110
717			< 01 0
718			> H48:177245385090 H32:b1f7404b
719		`),
720		inIdx: 30, outIdx: 0,
721		errf: "IsCorrupted",
722	}, {
723		// This is invalid since clen temporarily hits zero,
724		// even though the end value is still "valid".
725		name: "PrefixBits5",
726		input: db(`>>>
727			"BZh1"
728			> H48:314159265359 H32:b1f7404b 0 H24:0
729			< H16:0001 H16:0001
730			> D3:2 D15:1 0
731			> D5:4 11*4 10*20 11*18 0 0 110 # (4)-4+20-18=2 (2)=2 (2)-1=1
732			> D5:2 0 0 110
733			< 01 0
734			> H48:177245385090 H32:b1f7404b
735		`),
736		inIdx: 26, outIdx: 0,
737		errf: "IsCorrupted",
738	}, {
739		// This is valid since clen starts at max bits and works down.
740		name: "PrefixBits6",
741		input: db(`>>>
742			"BZh1"
743			> H48:314159265359 H32:b1f7404b 0 H24:0
744			< H16:0001 H16:0001
745			> D3:2 D15:1 0
746			> D5:20 11*18 0 0 110 # (20)-18=2 (2)=2 (2)-1=1
747			> D5:2 0 0 110
748			< 01 0
749			> H48:177245385090 H32:b1f7404b
750		`),
751		output: []byte{0x00},
752		inIdx:  41, outIdx: 1,
753	}, {
754		// This is invalid because clen starts at 21, which violates max bits.
755		name: "PrefixBits7",
756		input: db(`>>>
757			"BZh1"
758			> H48:314159265359 H32:b1f7404b 0 H24:0
759			< H16:0001 H16:0001
760			> D3:2 D15:1 0
761			> D5:21 11*19 0 0 110 # (21)-19=2 (2)=2 (2)-1=1
762			> D5:2 0 0 110
763			< 01 0
764			> H48:177245385090 H32:b1f7404b
765		`),
766		inIdx: 25, outIdx: 0,
767		errf: "IsCorrupted",
768	}, {
769		// There are way more prefix symbols in this block than the format
770		// even allows. The prefix decoder should detect this cause and
771		// report the stream as corrupted.
772		name: "MaxPrefixSymbols",
773		input: db(`>>>
774			"BZh1"
775			> H48:314159265359 H32:b1f7404b 0 H24:0
776			< H16:0001 H16:0001
777			> D3:2 D15:32767 0*32767 # Define all selectors
778			> D5:1 0 100 0
779			> D5:2 0 0 110
780			< H64:0*1000000 11 # 64M symbols
781			> H48:177245385090 H32:b1f7404b
782		`),
783		inIdx: 16622, outIdx: 0,
784		errf: "IsCorrupted",
785	}, {
786		// Use of an over-subscribed tree.
787		name: "PrefixTrees1",
788		input: db(`>>>
789			"BZh1"
790			> H48:314159265359 H32:952735b9 0 H24:000000
791			< H16:0008 H16:03ff
792			> D3:2 D15:1 0
793			> D5:5 0 110 0 0 0 0 0 110 0 0 0 0
794			> D5:4 0 0 0 0 0 0 0 0 110 0 0 0
795			< 110 0101 1101 0011 1011 0111 000 100 010 110 001
796			> H48:177245385090 H32:952735b9
797		`),
798		output: []byte("03791589269"),
799		inIdx:  44, outIdx: 11,
800	}, {
801		// Use of an under-subscribed tree.
802		name: "PrefixTrees2",
803		input: db(`>>>
804			"BZh1"
805			> H48:314159265359 H32:58fdd3b0 0 H24:000000
806			< H16:0008 H16:03ff
807			> D3:2 D15:1 0
808			> D5:5 0 0 0 0 110 0 0 110 0 0 0 0
809			> D5:4 0 0 0 0 0 0 0 0 110 0 0 0
810			< 000 100 00111 1101 11011 10111 0101 010 0011 110 01011 001
811			> H48:177245385090 H32:58fdd3b0
812		`),
813		output: []byte("071876222607"),
814		inIdx:  45, outIdx: 12,
815	}, {
816		// Use of an under-subscribed tree and using an invalid symbol.
817		name: "PrefixTrees3",
818		input: db(`>>>
819			"BZh1"
820			> H48:314159265359 H32:58fdd3b0 0 H24:000000
821			< H16:0008 H16:03ff
822			> D3:2 D15:1 0
823			> D5:5 0 0 0 0 110 0 0 110 0 0 0 0
824			> D5:4 0 0 0 0 0 0 0 0 110 0 0 0
825			< 000 100 00111 1101 11011 10111 0101 010 0011 110 01011 1111 001
826			> H48:177245385090 H32:58fdd3b0
827		`),
828		inIdx: 35, outIdx: 0,
829		errf: "IsCorrupted",
830	}, {
831		// The BWT should be a permutation, but the use of an origin pointer
832		// means that the transformation is not a pure bijective function.
833		// Thus, there are some inputs where the input is not permuted.
834		// This test makes sure we have the same behavior as the C library.
835		// No sane encoder should ever output a BWT transform like this.
836		name: "NonReversibleBWT",
837		input: db(`>>>
838			"BZh6"
839			> H48:314159265359 H32:01007588 0 H24:000000
840			< H16:0040 H16:0006
841			> D3:2 D15:1 0
842			> D5:3 0 110 110 10100
843			> D5:2 0 0 0 0
844			< 011 011 0 0 01 0 0 01 0 0 01 0 0 01 0 111
845			> H48:177245385090 H32:01007588
846		`),
847		output: db(`>>> "a"*404`),
848		inIdx:  40, outIdx: 404,
849	}, {
850		// The next "stream" is only a single byte 0x30, which the Reader
851		// detects as being truncated since it loads 2 bytes for the magic.
852		name:  "Fuzz1",
853		input: db(`>>> > "BZh8" H48:177245385090 H32:00000000 X:30`),
854		inIdx: 14, outIdx: 0,
855		errf: "IsUnexpectedEOF", // Could be IsCorrupted
856	}, {
857		// Compared to Fuzz1, the next "stream" has 2 bytes 0x3030,
858		// which allows the Reader to properly compare with the magic header
859		// and reject the stream as invalid.
860		name:  "Fuzz2",
861		input: db(`>>> > "BZh8" H48:177245385090 H32:00000000 X:3030`),
862		inIdx: 16, outIdx: 0,
863		errf: "IsCorrupted",
864	}}
865
866	for _, v := range vectors {
867		t.Run(v.name, func(t *testing.T) {
868			rd, err := NewReader(bytes.NewReader(v.input), nil)
869			if err != nil {
870				t.Fatalf("unexpected NewReader error: %v", err)
871			}
872			output, err := ioutil.ReadAll(rd)
873			if cerr := rd.Close(); cerr != nil {
874				err = cerr
875			}
876
877			if got, want, ok := testutil.BytesCompare(output, v.output); !ok {
878				t.Errorf("output mismatch:\ngot  %s\nwant %s", got, want)
879			}
880			if rd.InputOffset != v.inIdx {
881				t.Errorf("input offset mismatch: got %d, want %d", rd.InputOffset, v.inIdx)
882			}
883			if rd.OutputOffset != v.outIdx {
884				t.Errorf("output offset mismatch: got %d, want %d", rd.OutputOffset, v.outIdx)
885			}
886			if v.errf != "" && !errFuncs[v.errf](err) {
887				t.Errorf("mismatching error:\ngot %v\nwant %s(err) == true", err, v.errf)
888			} else if v.errf == "" && err != nil {
889				t.Errorf("unexpected error: got %v", err)
890			}
891
892			// If the zcheck flag is set, then we verify that the test vectors
893			// themselves are consistent with what the C bzip2 library outputs.
894			if *zcheck {
895				output, err := cmdDecompress(v.input)
896				if got, want := bool(v.errf == ""), bool(err == nil); got != want {
897					t.Errorf("pass mismatch: got %v, want %v", got, err)
898				}
899				if got, want, ok := testutil.BytesCompare(output, v.output); !ok && err == nil {
900					t.Errorf("output mismatch:\ngot  %s\nwant %s", got, want)
901				}
902			}
903		})
904	}
905}
906
907func BenchmarkDecode(b *testing.B) {
908	runBenchmarks(b, func(b *testing.B, data []byte, lvl int) {
909		b.StopTimer()
910		b.ReportAllocs()
911
912		buf := new(bytes.Buffer)
913		wr, _ := NewWriter(buf, &WriterConfig{Level: lvl})
914		wr.Write(data)
915		wr.Close()
916
917		br := new(bytes.Reader)
918		rd := new(Reader)
919
920		b.SetBytes(int64(len(data)))
921		b.StartTimer()
922		for i := 0; i < b.N; i++ {
923			br.Reset(buf.Bytes())
924			rd.Reset(br)
925
926			n, err := io.Copy(ioutil.Discard, rd)
927			if n != int64(len(data)) || err != nil {
928				b.Fatalf("Copy() = (%d, %v), want (%d, nil)", n, err, len(data))
929			}
930			if err := rd.Close(); err != nil {
931				b.Fatalf("Close() = %v, want nil", err)
932			}
933		}
934	})
935}
936