1// Copyright 2015, Joe Tsai. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE.md file. 4 5package bzip2 6 7import ( 8 "bytes" 9 "io" 10 "io/ioutil" 11 "testing" 12 13 "github.com/dsnet/compress/internal/errors" 14 "github.com/dsnet/compress/internal/testutil" 15) 16 17func TestReader(t *testing.T) { 18 db := testutil.MustDecodeBitGen 19 20 errFuncs := map[string]func(error) bool{ 21 "IsUnexpectedEOF": func(err error) bool { return err == io.ErrUnexpectedEOF }, 22 "IsCorrupted": errors.IsCorrupted, 23 "IsDeprecated": errors.IsDeprecated, 24 } 25 vectors := []struct { 26 name string // Sub-test name 27 input []byte // Test input string 28 output []byte // Expected output string 29 inIdx int64 // Expected input offset after reading 30 outIdx int64 // Expected output offset after reading 31 errf string // Name of error checking callback 32 }{{ 33 name: "EmptyString", 34 errf: "IsUnexpectedEOF", 35 }, { 36 name: "EmptyOutput", 37 input: db(`>>> > "BZh9" H48:177245385090 H32:00000000`), 38 inIdx: 14, 39 }, { 40 name: "EmptyOutput9S", 41 input: db(`>>> > 42 "BZh1" H48:177245385090 H32:00000000 43 "BZh2" H48:177245385090 H32:00000000 44 "BZh3" H48:177245385090 H32:00000000 45 "BZh4" H48:177245385090 H32:00000000 46 "BZh5" H48:177245385090 H32:00000000 47 "BZh6" H48:177245385090 H32:00000000 48 "BZh7" H48:177245385090 H32:00000000 49 "BZh8" H48:177245385090 H32:00000000 50 "BZh9" H48:177245385090 H32:00000000 51 `), 52 inIdx: 14 * 9, outIdx: 0, 53 }, { 54 name: "InvalidStreamMagic", 55 input: db(`>>> > "XX"`), 56 inIdx: 2, outIdx: 0, 57 errf: "IsCorrupted", 58 }, { 59 name: "InvalidVersion", 60 input: db(`>>> > "BZX1"`), 61 inIdx: 3, outIdx: 0, 62 errf: "IsCorrupted", 63 }, { 64 name: "DeprecatedVersion", 65 input: db(`>>> > "BZ01"`), 66 inIdx: 3, outIdx: 0, 67 errf: "IsDeprecated", 68 }, { 69 name: "InvalidLevel", 70 input: db(`>>> > "BZh0"`), 71 inIdx: 4, outIdx: 0, 72 errf: "IsCorrupted", 73 }, { 74 name: "InvalidBlockMagic", 75 input: db(`>>> > "BZh9" H48:000000000000`), 76 inIdx: 10, outIdx: 0, 77 errf: "IsCorrupted", 78 }, { 79 name: "DeprecatedRandomization", 80 input: db(`>>> > "BZh9" H48:314159265359 H32:8e9a7706 1 H24:0`), 81 inIdx: 15, outIdx: 0, 82 errf: "IsDeprecated", 83 }, { 84 name: "Truncated1", 85 input: db(`>>> "BZh9"`), 86 inIdx: 4, outIdx: 0, 87 errf: "IsUnexpectedEOF", 88 }, { 89 name: "Truncated2", 90 input: db(`>>> > "BZh9" H40:3141592653`), 91 inIdx: 8, outIdx: 0, 92 errf: "IsUnexpectedEOF", 93 }, { 94 name: "Truncated3", 95 input: db(`>>> > "BZh9" H48:314159265359`), 96 inIdx: 10, outIdx: 0, 97 errf: "IsUnexpectedEOF", 98 }, { 99 name: "Truncated4", 100 input: db(`>>> > "BZh9" H48:314159265359 H16:8e9a`), 101 inIdx: 10, outIdx: 0, 102 errf: "IsUnexpectedEOF", 103 }, { 104 name: "Truncated5", 105 input: db(`>>> > "BZh9" H48:314159265359 H32:8e9a7706`), 106 inIdx: 14, outIdx: 0, 107 errf: "IsUnexpectedEOF", 108 }, { 109 name: "Truncated6", 110 input: db(`>>> > "BZh9" H48:314159265359 H32:8e9a7706 0 H24:3`), 111 inIdx: 18, outIdx: 0, 112 errf: "IsUnexpectedEOF", 113 }, { 114 name: "Truncated7", 115 input: db(`>>> > "BZh9" H48:314159265359 H32:8e9a7706 0 H24:3 < H16:00d4 H16:1003`), 116 inIdx: 22, outIdx: 0, 117 errf: "IsUnexpectedEOF", 118 }, { 119 name: "Truncated8", 120 input: db(`>>> 121 "BZh9" 122 > H48:314159265359 H32:8e9a7706 0 H24:3 123 < H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084 124 > D3:2 D15:1 0 125 > D5:4 0 0 0 0 0 110 100 0 110 0 0 100 126 `), 127 inIdx: 33, outIdx: 0, 128 errf: "IsUnexpectedEOF", 129 }, { 130 name: "Truncated9", 131 input: db(`>>> 132 "BZh9" 133 > H48:314159265359 H32:8e9a7706 0 H24:3 134 < H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084 135 > D3:2 D15:1 0 136 > D5:4 0 0 0 0 0 110 100 0 110 0 0 100 137 > D5:4 0 0 0 0 0 0 0 0 110 0 0 0 138 < 1101 000 100 000 100 0111 010 010 139 `), 140 inIdx: 39, outIdx: 0, 141 errf: "IsUnexpectedEOF", 142 }, { 143 name: "Truncated10", 144 input: db(`>>> 145 "BZh9" 146 > H48:314159265359 H32:8e9a7706 0 H24:3 147 < H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084 148 > D3:2 D15:1 0 149 > D5:4 0 0 0 0 0 110 100 0 110 0 0 100 150 > D5:4 0 0 0 0 0 0 0 0 110 0 0 0 151 < 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111 152 `), 153 output: []byte("Hello, world!"), 154 inIdx: 41, outIdx: 13, 155 errf: "IsUnexpectedEOF", 156 }, { 157 name: "HelloWorld", 158 input: db(`>>> 159 "BZh9" 160 > H48:314159265359 H32:8e9a7706 0 H24:3 161 < H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084 162 > D3:2 D15:1 0 163 > D5:4 0 0 0 0 0 110 100 0 110 0 0 100 164 > D5:4 0 0 0 0 0 0 0 0 110 0 0 0 165 < 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111 166 > H48:177245385090 H32:8e9a7706 167 `), 168 output: []byte("Hello, world!"), 169 inIdx: 51, outIdx: 13, 170 }, { 171 name: "HelloWorld2B", 172 input: db(`>>> 173 "BZh9" 174 175 ( # Two blocks 176 > H48:314159265359 H32:8e9a7706 0 H24:3 177 < H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084 178 > D3:2 D15:1 0 179 > D5:4 0 0 0 0 0 110 100 0 110 0 0 100 180 > D5:4 0 0 0 0 0 0 0 0 110 0 0 0 181 < 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111 182 )*2 183 184 > H48:177245385090 H32:93ae990b 185 `), 186 output: db(`>>> "Hello, world!"*2`), 187 inIdx: 51*2 - 4 - 10, outIdx: 13 * 2, 188 }, { 189 name: "HelloWorld2S", 190 input: db(`>>> 191 ( # Two streams 192 "BZh9" 193 194 > H48:314159265359 H32:8e9a7706 0 H24:3 195 < H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084 196 > D3:2 D15:1 0 197 > D5:4 0 0 0 0 0 110 100 0 110 0 0 100 198 > D5:4 0 0 0 0 0 0 0 0 110 0 0 0 199 < 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111 200 201 > H48:177245385090 H32:8e9a7706 202 )*2 203 `), 204 output: db(`>>> "Hello, world!"*2`), 205 inIdx: 51 * 2, outIdx: 13 * 2, 206 }, { 207 name: "Banana0", 208 input: db(`>>> 209 > "BZh1" H48:314159265359 H32:87f465d8 0 H24:0 210 < H16:0050 H16:0004 H16:4002 211 > D3:2 D15:1 0 D5:2 0 10100 0 1111110 10100 D5:3 0 0 110 0 0 212 < 1111 0 01 0 0 01 011 213 > H48:177245385090 H32:87f465d8 214 `), 215 output: []byte("Banana"), 216 inIdx: 42, outIdx: 6, 217 }, { 218 name: "Banana1", 219 input: db(`>>> 220 > "BZh1" H48:314159265359 H32:71d297e8 0 H24:1 221 < H16:0050 H16:0004 H16:4002 222 > D3:2 D15:1 0 D5:2 0 10100 0 1111110 10100 D5:3 0 0 110 0 0 223 < 1111 0 01 0 0 01 011 224 > H48:177245385090 H32:71d297e8 225 `), 226 output: []byte("aBanan"), 227 inIdx: 42, outIdx: 6, 228 }, { 229 name: "Banana2", 230 input: db(`>>> 231 > "BZh1" H48:314159265359 H32:21185406 0 H24:2 232 < H16:0050 H16:0004 H16:4002 233 > D3:2 D15:1 0 D5:2 0 10100 0 1111110 10100 D5:3 0 0 110 0 0 234 < 1111 0 01 0 0 01 011 235 > H48:177245385090 H32:21185406 236 `), 237 output: []byte("anaBan"), 238 inIdx: 42, outIdx: 6, 239 }, { 240 name: "Banana3", 241 input: db(`>>> 242 > "BZh1" H48:314159265359 H32:be853f46 0 H24:3 243 < H16:0050 H16:0004 H16:4002 244 > D3:2 D15:1 0 D5:2 0 10100 0 1111110 10100 D5:3 0 0 110 0 0 245 < 1111 0 01 0 0 01 011 246 > H48:177245385090 H32:be853f46 247 `), 248 output: []byte("ananaB"), 249 inIdx: 42, outIdx: 6, 250 }, { 251 name: "Banana4", 252 input: db(`>>> 253 > "BZh1" H48:314159265359 H32:35a020df 0 H24:4 254 < H16:0050 H16:0004 H16:4002 255 > D3:2 D15:1 0 D5:2 0 10100 0 1111110 10100 D5:3 0 0 110 0 0 256 < 1111 0 01 0 0 01 011 257 > H48:177245385090 H32:35a020df 258 `), 259 output: []byte("naBana"), 260 inIdx: 42, outIdx: 6, 261 }, { 262 name: "Banana5", 263 input: db(`>>> 264 > "BZh1" H48:314159265359 H32:b599e6fc 0 H24:5 265 < H16:0050 H16:0004 H16:4002 266 > D3:2 D15:1 0 D5:2 0 10100 0 1111110 10100 D5:3 0 0 110 0 0 267 < 1111 0 01 0 0 01 011 268 > H48:177245385090 H32:b599e6fc 269 `), 270 output: []byte("nanaBa"), 271 inIdx: 42, outIdx: 6, 272 }, { 273 // This is invalid since the BWT pointer exceeds the block size. 274 name: "Banana6", 275 input: db(`>>> 276 > "BZh1" H48:314159265359 H32:87f465d8 0 H24:6 277 < H16:0050 H16:0004 H16:4002 278 > D3:2 D15:1 0 D5:2 0 10100 0 1111110 10100 D5:3 0 0 110 0 0 279 < 1111 0 01 0 0 01 011 280 > H48:177245385090 H32:87f465d8 281 `), 282 inIdx: 42 - 10, outIdx: 0, 283 errf: "IsCorrupted", 284 }, { 285 // There must be between 2..6 trees, inclusive. This test uses only 1. 286 name: "MinTrees", 287 input: db(`>>> 288 "BZh1" 289 > H48:314159265359 H32:8e9a7706 0 H24:3 290 < H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084 291 > D3:1 D15:1 0 292 > D5:4 0 0 0 0 0 110 100 0 110 0 0 100 293 < 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111 294 > H48:177245385090 H32:8e9a7706 295 `), 296 inIdx: 28, outIdx: 0, 297 errf: "IsCorrupted", 298 }, { 299 // Define more trees than allowed. The test uses 7. 300 name: "MaxTrees", 301 input: db(`>>> 302 "BZh1" 303 > H48:314159265359 H32:8e9a7706 0 H24:3 304 < H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084 305 > D3:7 D15:1 0 306 > D5:4 0 0 0 0 0 110 100 0 110 0 0 100 307 >(D5:4 0 0 0 0 0 0 0 0 110 0 0 0)*6 308 < 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111 309 > H48:177245385090 H32:8e9a7706 310 `), 311 inIdx: 28, outIdx: 0, 312 errf: "IsCorrupted", 313 }, { 314 // Define more trees and selectors than actually used. 315 name: "SuboptimalTrees", 316 input: db(`>>> 317 "BZh1" 318 > H48:314159265359 H32:8e9a7706 0 H24:3 319 < H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084 320 > D3:6 D15:12 111110 11110 1110 110 10 0 111110 11110 1110 110 10 0 321 >(D5:4 0 0 0 0 0 0 0 0 110 0 0 0)*5 322 > D5:4 0 0 0 0 0 110 100 0 110 0 0 100 323 < 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111 324 > H48:177245385090 H32:8e9a7706 325 `), 326 output: []byte("Hello, world!"), 327 inIdx: 66, outIdx: 13, 328 }, { 329 // Do not define any tree selectors. This should fail when decoding 330 // the prefix codes later on. 331 name: "MinTreeSels", 332 input: db(`>>> 333 "BZh1" 334 > H48:314159265359 H32:8e9a7706 0 H24:3 335 < H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084 336 > D3:2 D15:0 # No selectors defined 337 > D5:4 0 0 0 0 0 110 100 0 110 0 0 100 338 > D5:4 0 0 0 0 0 0 0 0 110 0 0 0 339 < 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111 340 > H48:177245385090 H32:8e9a7706 341 `), 342 inIdx: 35, outIdx: 0, 343 errf: "IsCorrupted", 344 }, { 345 // Define up to 32767 tree selectors, even though only 1 is used. 346 name: "MaxTreeSels", 347 input: db(`>>> 348 "BZh1" 349 > H48:314159265359 H32:8e9a7706 0 H24:3 350 < H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084 351 > D3:2 D15:32767 0*32767 # Define all selectors 352 > D5:4 0 0 0 0 0 110 100 0 110 0 0 100 353 > D5:4 0 0 0 0 0 0 0 0 110 0 0 0 354 < 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111 355 > H48:177245385090 H32:8e9a7706 356 `), 357 output: []byte("Hello, world!"), 358 inIdx: 4147, outIdx: 13, 359 }, { 360 name: "InvalidTreeSels1", 361 input: db(`>>> 362 "BZh1" 363 > H48:314159265359 H32:8e9a7706 0 H24:3 364 < H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084 365 > D3:2 D15:1 110 # Select tree2, which does not exist 366 > D5:4 0 0 0 0 0 0 0 0 110 0 0 0 367 > D5:4 0 0 0 0 0 110 100 0 110 0 0 100 368 < 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111 369 > H48:177245385090 H32:8e9a7706 370 `), 371 inIdx: 30, outIdx: 0, 372 errf: "IsCorrupted", 373 }, { 374 name: "InvalidTreeSels2", 375 input: db(`>>> 376 "BZh1" 377 > H48:314159265359 H32:8e9a7706 0 H24:3 378 < H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084 379 > D3:6 D15:1 111111 # Select tree6, which is invalid 380 > D5:4 0 0 0 0 0 110 100 0 110 0 0 100 381 >(D5:4 0 0 0 0 0 0 0 0 110 0 0 0)*5 382 < 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111 383 > H48:177245385090 H32:8e9a7706 384 `), 385 inIdx: 31, outIdx: 0, 386 errf: "IsCorrupted", 387 }, { 388 name: "JunkPadding", 389 input: db(`>>> 390 "BZh1" 391 > H48:314159265359 H32:b1f7404b 0 H24:0 392 < H16:0001 H16:0001 393 > D3:2 D15:1 0 D5:2 0 0 110 D5:2 0 0 110 394 < 01 0 395 > H48:177245385090 H32:b1f7404b 10101 # Non-zero padding bits 396 `), 397 output: []byte{0x00}, 398 inIdx: 37, outIdx: 1, 399 }, { 400 name: "MinSymMap", 401 input: db(`>>> 402 "BZh1" 403 > H48:314159265359 H32:b1f7404b 0 H24:0 404 < H16:0001 H16:0001 # Only one symbol used 405 > D3:2 D15:1 0 406 >(D5:2 0 0 110)*2 407 < 01 0 408 > H48:177245385090 H32:b1f7404b 409 `), 410 output: []byte{0x00}, 411 inIdx: 37, outIdx: 1, 412 }, { 413 // This block satisfies the minimum of 3 symbols for prefix encoding. 414 // The data section terminates immediately with an EOF symbol. 415 // However, this is still invalid since a BWT pointer of 0 >= 0. 416 name: "EmptyBlock", 417 input: db(`>>> 418 "BZh1" 419 > H48:314159265359 H32:00000000 0 H24:0 # BWT pointer of 0 420 < H16:0001 H16:0001 # Only one symbol used 421 > D3:2 D15:1 0 422 >(D5:2 0 0 110)*2 423 < 0 # Data ends immediately with EOF 424 > H48:177245385090 H32:00000000 425 `), 426 inIdx: 27, outIdx: 0, 427 errf: "IsCorrupted", 428 }, { 429 // The high-order symbol map says that all groups have symbols, 430 // but only the first group indicates any symbols are set. 431 name: "SuboptimalSymMap1", 432 input: db(`>>> 433 "BZh1" 434 > H48:314159265359 H32:b1f7404b 0 H24:0 435 < H16:ffff H16:0001 H16:0000*15 # All groups used, only one symbol used 436 > D3:2 D15:1 0 437 >(D5:2 0 0 110)*2 438 < 01 0 439 > H48:177245385090 H32:b1f7404b 440 `), 441 output: []byte{0x00}, 442 inIdx: 67, outIdx: 1, 443 }, { 444 // The symbol map declares that all symbols are used, even though 445 // only one is actually used. 446 name: "SuboptimalSymMap2", 447 input: db(`>>> 448 "BZh1" 449 > H48:314159265359 H32:b1f7404b 0 H24:0 450 < H16:ffff*17 # All symbols used 451 > D3:2 D15:1 0 452 > D5:2 0 10101010101010100 0*255 1111111111111111110 453 > D5:9 0*4 110 0*253 454 < 01 0 455 > H48:177245385090 H32:b1f7404b 456 `), 457 output: []byte{0x00}, 458 inIdx: 135, outIdx: 1, 459 }, { 460 // It is impossible for the format to encode a block with no symbols 461 // since at least one symbol must exist for the EOF symbol. 462 name: "InvalidSymMap", 463 input: db(`>>> 464 "BZh1" 465 > H48:314159265359 H32:b1f7404b 0 H24:0 466 < H16:0000 # Need at least one symbol 467 `), 468 inIdx: 20, outIdx: 0, 469 errf: "IsCorrupted", // Should not be IsUnexpectedEOF 470 }, { 471 name: "InvalidBlockChecksum", 472 input: db(`>>> 473 "BZh9" 474 > H48:314159265359 H32:00000000 0 H24:3 # Bad checksum 475 < H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084 476 > D3:2 D15:1 0 477 > D5:4 0 0 0 0 0 110 100 0 110 0 0 100 478 > D5:4 0 0 0 0 0 0 0 0 110 0 0 0 479 < 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111 480 > H48:177245385090 H32:8e9a7706 481 `), 482 output: []byte("Hello, world!"), 483 inIdx: 51 - 10, outIdx: 13, 484 errf: "IsCorrupted", 485 }, { 486 name: "InvalidStreamChecksum", 487 input: db(`>>> 488 "BZh9" 489 > H48:314159265359 H32:8e9a7706 0 H24:3 490 < H16:00d4 H16:1003 H16:0100 H16:9030 H16:0084 491 > D3:2 D15:1 0 492 > D5:4 0 0 0 0 0 110 100 0 110 0 0 100 493 > D5:4 0 0 0 0 0 0 0 0 110 0 0 0 494 < 1101 000 100 000 100 0111 010 010 0011 0001 110 0111 110 1111 495 > H48:177245385090 H32:00000000 # Bad checksum 496 `), 497 output: []byte("Hello, world!"), 498 inIdx: 51, outIdx: 13, 499 errf: "IsCorrupted", 500 }, { 501 // RLE1 stage with maximum repeater length. 502 name: "RLE1-1", 503 input: db(`>>> 504 "BZh1" 505 > H48:314159265359 H32:e1fac440 0 H24:0 506 < H16:8010 H16:0002 H16:8000 507 > D3:2 D15:1 0 508 > D5:2 0 100 11110 10100 509 > D5:2 0 0 0 0 510 < 0 0 01 01 111 # Pre-RLE1: "AAAA\xff" 511 > H48:177245385090 H32:e1fac440 512 `), 513 output: db(`>>> X:41*259`), 514 inIdx: 41, outIdx: 259, 515 }, { 516 // RLE1 stage with minimum repeater length. 517 name: "RLE1-2", 518 input: db(`>>> 519 "BZh1" 520 > H48:314159265359 H32:e16e6571 0 H24:4 521 < H16:0011 H16:0001 H16:0002 522 > D3:2 D15:1 0 523 > D5:2 0 100 11110 10100 524 > D5:2 0 0 0 0 525 < 0 01 01 0 111 # Pre-RLE1: "AAAA\x00" 526 > H48:177245385090 H32:e16e6571 527 `), 528 output: db(`>>> X:41*4`), 529 inIdx: 41, outIdx: 4, 530 }, { 531 // RLE1 stage with missing repeater value. 532 name: "RLE1-3", 533 input: db(`>>> 534 "BZh1" 535 > H48:314159265359 H32:e16e6571 0 H24:3 536 < H16:0010 H16:0002 537 > D3:2 D15:1 0 538 >(D5:2 0 0 110)*2 539 < 11 01 0 # Pre-RLE1: "AAAA" 540 > H48:177245385090 H32:e16e6571 541 `), 542 output: db(`>>> X:41*4`), 543 inIdx: 37 - 10, outIdx: 4, 544 errf: "IsCorrupted", 545 }, { 546 // RLE1 stage with sub-optimal repeater usage. 547 name: "RLE1-4", 548 input: db(`>>> 549 "BZh1" 550 > H48:314159265359 H32:f59a903a 0 H24:9 551 < H16:0011 H16:0001 H16:0002 552 > D3:2 D15:1 0 553 > D5:1 0 10100 110 100 554 > D5:2 0 0 0 0 555 < 01 0 0 0 01 0 111 # Pre-RLE1: "AAAA\x00AAAA\x00" 556 > H48:177245385090 H32:f59a903a 557 `), 558 output: db(`>>> X:41*8`), 559 inIdx: 41, outIdx: 8, 560 }, { 561 // RLE1 stage with sub-optimal repeater usage. 562 name: "RLE1-5", 563 input: db(`>>> 564 "BZh1" 565 > H48:314159265359 H32:f59a903a 0 H24:4 566 < H16:0011 H16:0002 H16:0002 567 > D3:2 D15:1 0 568 > D5:3 0 110 110 10100 569 > D5:2 0 0 0 0 570 < 0 01 01 0 111 # Pre-RLE1: "AAAA\x01AAA" 571 > H48:177245385090 H32:f59a903a 572 `), 573 output: db(`>>> X:41*8`), 574 inIdx: 40, outIdx: 8, 575 }, { 576 name: "RLE2-1", 577 input: db(`>>> 578 "BZh1" 579 > H48:314159265359 H32:6b4f087c 0 H24:000000 580 < H16:0040 H16:0006 581 > D3:2 D15:1 0 582 > D5:1 0 100 100 0 583 > D5:2 0 0 0 0 584 < 01 0 0 0 0 01 0 01 0 01 01 0 0 0 0 01 111 # a*100k 585 > H48:177245385090 H32:6b4f087c 586 `), 587 output: db(`>>> "a"*2020000`), 588 inIdx: 40, outIdx: 2020000, 589 }, { 590 name: "RLE2-2", 591 input: db(`>>> 592 "BZh1" 593 > H48:314159265359 H32:d175ea9d 0 H24:000000 594 < H16:0040 H16:0006 595 > D3:2 D15:1 0 596 > D5:1 0 100 100 0 597 > D5:2 0 0 0 0 598 < 0 01 0 0 0 01 0 01 0 01 01 0 0 0 0 01 111 # a*(100k+1) 599 > H48:177245385090 H32:d175ea9d 600 `), 601 inIdx: 40 - 10, outIdx: 0, 602 errf: "IsCorrupted", 603 }, { 604 name: "RLE2-3", 605 input: db(`>>> 606 "BZh1" 607 > H48:314159265359 H32:6b4f087c 0 H24:000000 608 < H16:0040 H16:0006 609 > D3:2 D15:1 0 610 > D5:1 0 100 100 0 611 > D5:2 0 0 0 0 612 < 0 0 0 0 0 01 0 01 0 01 01 0 0 0 0 01 011 111 # a*(100k-1) b 613 > H48:177245385090 H32:6b4f087c 614 `), 615 output: db(`>>> "a"*2020000`), 616 inIdx: 40, outIdx: 2020000, 617 }, { 618 name: "RLE2-4", 619 input: db(`>>> 620 "BZh1" 621 > H48:314159265359 H32:d175ea9d 0 H24:000000 622 < H16:0040 H16:0006 623 > D3:2 D15:1 0 624 > D5:1 0 100 100 0 625 > D5:2 0 0 0 0 626 < 0 0 0 0 0 01 0 01 0 01 01 0 0 0 0 01 011 011 111 # a*(100k-1) b a 627 > H48:177245385090 H32:d175ea9d 628 `), 629 inIdx: 40 - 10, outIdx: 0, 630 errf: "IsCorrupted", 631 }, { 632 name: "RLE2-5", 633 input: db(`>>> 634 "BZh1" 635 > H48:314159265359 H32:79235035 0 H24:000000 636 < H16:0040 H16:0006 637 > D3:2 D15:1 0 638 > D5:1 0 100 100 0 639 > D5:2 0 0 0 0 640 < 0 0 0 0 0 01 0 01 0 01 01 0 0 0 0 01 011 0 011 111 # a*(100k-1) b*2 a 641 > H48:177245385090 H32:79235035 642 `), 643 inIdx: 41 - 10, outIdx: 0, 644 errf: "IsCorrupted", 645 }, { 646 // This input has a sequence of RUNA and RUNB symbols that tries to 647 // cause an integer overflow in the numeral decoding. 648 name: "RLE2-6", 649 input: db(`>>> 650 "BZh1" 651 > H48:314159265359 H32:6b4f087c 0 H24:000000 652 < H16:0040 H16:0006 653 > D3:2 D15:1 0 654 > D5:1 0 100 100 0 655 > D5:2 0 0 0 0 656 < 0*32 111 657 > H48:177245385090 H32:6b4f087c 658 `), 659 inIdx: 41 - 10, outIdx: 0, 660 errf: "IsCorrupted", 661 }, { 662 // This is valid, but with suboptimal clen value. 663 name: "PrefixBits1", 664 input: db(`>>> 665 "BZh1" 666 > H48:314159265359 H32:b1f7404b 0 H24:0 667 < H16:0001 H16:0001 668 > D3:2 D15:1 0 669 > D5:1 100 0 110 # (1)+1=2 (2)=2 (2)-1=1 670 > D5:2 0 0 110 671 < 01 0 672 > H48:177245385090 H32:b1f7404b 673 `), 674 output: []byte{0x00}, 675 inIdx: 37, outIdx: 1, 676 }, { 677 // This is invalid, clen starts at 0. 678 name: "PrefixBits2", 679 input: db(`>>> 680 "BZh1" 681 > H48:314159265359 H32:b1f7404b 0 H24:0 682 < H16:0001 H16:0001 683 > D3:2 D15:1 0 684 > D5:0 10100 0 110 # (0)+2=2 (2)=2 (2)-1=1 685 > D5:2 0 0 110 686 < 01 0 687 > H48:177245385090 H32:b1f7404b 688 `), 689 inIdx: 25, outIdx: 0, 690 errf: "IsCorrupted", 691 }, { 692 // This is valid, although suboptimal, since clen stays within bounds. 693 name: "PrefixBits3", 694 input: db(`>>> 695 "BZh1" 696 > H48:314159265359 H32:b1f7404b 0 H24:0 697 < H16:0001 H16:0001 698 > D3:2 D15:1 0 699 > D5:4 11*3 10*19 11*18 0 0 110 # (4)-3+19-18=2 (2)=2 (2)-1=1 700 > D5:2 0 0 110 701 < 01 0 702 > H48:177245385090 H32:b1f7404b 703 `), 704 output: []byte{0x00}, 705 inIdx: 47, outIdx: 1, 706 }, { 707 // This is invalid since clen temporarily goes over max bits, 708 // even though the end value is still "valid". 709 name: "PrefixBits4", 710 input: db(`>>> 711 "BZh1" 712 > H48:314159265359 H32:b1f7404b 0 H24:0 713 < H16:0001 H16:0001 714 > D3:2 D15:1 0 715 > D5:4 11*3 10*20 11*19 0 0 110 # (4)-3+20-19=2 (2)=2 (2)-1=1 716 > D5:2 0 0 110 717 < 01 0 718 > H48:177245385090 H32:b1f7404b 719 `), 720 inIdx: 30, outIdx: 0, 721 errf: "IsCorrupted", 722 }, { 723 // This is invalid since clen temporarily hits zero, 724 // even though the end value is still "valid". 725 name: "PrefixBits5", 726 input: db(`>>> 727 "BZh1" 728 > H48:314159265359 H32:b1f7404b 0 H24:0 729 < H16:0001 H16:0001 730 > D3:2 D15:1 0 731 > D5:4 11*4 10*20 11*18 0 0 110 # (4)-4+20-18=2 (2)=2 (2)-1=1 732 > D5:2 0 0 110 733 < 01 0 734 > H48:177245385090 H32:b1f7404b 735 `), 736 inIdx: 26, outIdx: 0, 737 errf: "IsCorrupted", 738 }, { 739 // This is valid since clen starts at max bits and works down. 740 name: "PrefixBits6", 741 input: db(`>>> 742 "BZh1" 743 > H48:314159265359 H32:b1f7404b 0 H24:0 744 < H16:0001 H16:0001 745 > D3:2 D15:1 0 746 > D5:20 11*18 0 0 110 # (20)-18=2 (2)=2 (2)-1=1 747 > D5:2 0 0 110 748 < 01 0 749 > H48:177245385090 H32:b1f7404b 750 `), 751 output: []byte{0x00}, 752 inIdx: 41, outIdx: 1, 753 }, { 754 // This is invalid because clen starts at 21, which violates max bits. 755 name: "PrefixBits7", 756 input: db(`>>> 757 "BZh1" 758 > H48:314159265359 H32:b1f7404b 0 H24:0 759 < H16:0001 H16:0001 760 > D3:2 D15:1 0 761 > D5:21 11*19 0 0 110 # (21)-19=2 (2)=2 (2)-1=1 762 > D5:2 0 0 110 763 < 01 0 764 > H48:177245385090 H32:b1f7404b 765 `), 766 inIdx: 25, outIdx: 0, 767 errf: "IsCorrupted", 768 }, { 769 // There are way more prefix symbols in this block than the format 770 // even allows. The prefix decoder should detect this cause and 771 // report the stream as corrupted. 772 name: "MaxPrefixSymbols", 773 input: db(`>>> 774 "BZh1" 775 > H48:314159265359 H32:b1f7404b 0 H24:0 776 < H16:0001 H16:0001 777 > D3:2 D15:32767 0*32767 # Define all selectors 778 > D5:1 0 100 0 779 > D5:2 0 0 110 780 < H64:0*1000000 11 # 64M symbols 781 > H48:177245385090 H32:b1f7404b 782 `), 783 inIdx: 16622, outIdx: 0, 784 errf: "IsCorrupted", 785 }, { 786 // Use of an over-subscribed tree. 787 name: "PrefixTrees1", 788 input: db(`>>> 789 "BZh1" 790 > H48:314159265359 H32:952735b9 0 H24:000000 791 < H16:0008 H16:03ff 792 > D3:2 D15:1 0 793 > D5:5 0 110 0 0 0 0 0 110 0 0 0 0 794 > D5:4 0 0 0 0 0 0 0 0 110 0 0 0 795 < 110 0101 1101 0011 1011 0111 000 100 010 110 001 796 > H48:177245385090 H32:952735b9 797 `), 798 output: []byte("03791589269"), 799 inIdx: 44, outIdx: 11, 800 }, { 801 // Use of an under-subscribed tree. 802 name: "PrefixTrees2", 803 input: db(`>>> 804 "BZh1" 805 > H48:314159265359 H32:58fdd3b0 0 H24:000000 806 < H16:0008 H16:03ff 807 > D3:2 D15:1 0 808 > D5:5 0 0 0 0 110 0 0 110 0 0 0 0 809 > D5:4 0 0 0 0 0 0 0 0 110 0 0 0 810 < 000 100 00111 1101 11011 10111 0101 010 0011 110 01011 001 811 > H48:177245385090 H32:58fdd3b0 812 `), 813 output: []byte("071876222607"), 814 inIdx: 45, outIdx: 12, 815 }, { 816 // Use of an under-subscribed tree and using an invalid symbol. 817 name: "PrefixTrees3", 818 input: db(`>>> 819 "BZh1" 820 > H48:314159265359 H32:58fdd3b0 0 H24:000000 821 < H16:0008 H16:03ff 822 > D3:2 D15:1 0 823 > D5:5 0 0 0 0 110 0 0 110 0 0 0 0 824 > D5:4 0 0 0 0 0 0 0 0 110 0 0 0 825 < 000 100 00111 1101 11011 10111 0101 010 0011 110 01011 1111 001 826 > H48:177245385090 H32:58fdd3b0 827 `), 828 inIdx: 35, outIdx: 0, 829 errf: "IsCorrupted", 830 }, { 831 // The BWT should be a permutation, but the use of an origin pointer 832 // means that the transformation is not a pure bijective function. 833 // Thus, there are some inputs where the input is not permuted. 834 // This test makes sure we have the same behavior as the C library. 835 // No sane encoder should ever output a BWT transform like this. 836 name: "NonReversibleBWT", 837 input: db(`>>> 838 "BZh6" 839 > H48:314159265359 H32:01007588 0 H24:000000 840 < H16:0040 H16:0006 841 > D3:2 D15:1 0 842 > D5:3 0 110 110 10100 843 > D5:2 0 0 0 0 844 < 011 011 0 0 01 0 0 01 0 0 01 0 0 01 0 111 845 > H48:177245385090 H32:01007588 846 `), 847 output: db(`>>> "a"*404`), 848 inIdx: 40, outIdx: 404, 849 }, { 850 // The next "stream" is only a single byte 0x30, which the Reader 851 // detects as being truncated since it loads 2 bytes for the magic. 852 name: "Fuzz1", 853 input: db(`>>> > "BZh8" H48:177245385090 H32:00000000 X:30`), 854 inIdx: 14, outIdx: 0, 855 errf: "IsUnexpectedEOF", // Could be IsCorrupted 856 }, { 857 // Compared to Fuzz1, the next "stream" has 2 bytes 0x3030, 858 // which allows the Reader to properly compare with the magic header 859 // and reject the stream as invalid. 860 name: "Fuzz2", 861 input: db(`>>> > "BZh8" H48:177245385090 H32:00000000 X:3030`), 862 inIdx: 16, outIdx: 0, 863 errf: "IsCorrupted", 864 }} 865 866 for _, v := range vectors { 867 t.Run(v.name, func(t *testing.T) { 868 rd, err := NewReader(bytes.NewReader(v.input), nil) 869 if err != nil { 870 t.Fatalf("unexpected NewReader error: %v", err) 871 } 872 output, err := ioutil.ReadAll(rd) 873 if cerr := rd.Close(); cerr != nil { 874 err = cerr 875 } 876 877 if got, want, ok := testutil.BytesCompare(output, v.output); !ok { 878 t.Errorf("output mismatch:\ngot %s\nwant %s", got, want) 879 } 880 if rd.InputOffset != v.inIdx { 881 t.Errorf("input offset mismatch: got %d, want %d", rd.InputOffset, v.inIdx) 882 } 883 if rd.OutputOffset != v.outIdx { 884 t.Errorf("output offset mismatch: got %d, want %d", rd.OutputOffset, v.outIdx) 885 } 886 if v.errf != "" && !errFuncs[v.errf](err) { 887 t.Errorf("mismatching error:\ngot %v\nwant %s(err) == true", err, v.errf) 888 } else if v.errf == "" && err != nil { 889 t.Errorf("unexpected error: got %v", err) 890 } 891 892 // If the zcheck flag is set, then we verify that the test vectors 893 // themselves are consistent with what the C bzip2 library outputs. 894 if *zcheck { 895 output, err := cmdDecompress(v.input) 896 if got, want := bool(v.errf == ""), bool(err == nil); got != want { 897 t.Errorf("pass mismatch: got %v, want %v", got, err) 898 } 899 if got, want, ok := testutil.BytesCompare(output, v.output); !ok && err == nil { 900 t.Errorf("output mismatch:\ngot %s\nwant %s", got, want) 901 } 902 } 903 }) 904 } 905} 906 907func BenchmarkDecode(b *testing.B) { 908 runBenchmarks(b, func(b *testing.B, data []byte, lvl int) { 909 b.StopTimer() 910 b.ReportAllocs() 911 912 buf := new(bytes.Buffer) 913 wr, _ := NewWriter(buf, &WriterConfig{Level: lvl}) 914 wr.Write(data) 915 wr.Close() 916 917 br := new(bytes.Reader) 918 rd := new(Reader) 919 920 b.SetBytes(int64(len(data))) 921 b.StartTimer() 922 for i := 0; i < b.N; i++ { 923 br.Reset(buf.Bytes()) 924 rd.Reset(br) 925 926 n, err := io.Copy(ioutil.Discard, rd) 927 if n != int64(len(data)) || err != nil { 928 b.Fatalf("Copy() = (%d, %v), want (%d, nil)", n, err, len(data)) 929 } 930 if err := rd.Close(); err != nil { 931 b.Fatalf("Close() = %v, want nil", err) 932 } 933 } 934 }) 935} 936