1// Copyright 2015, Joe Tsai. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE.md file. 4 5package bzip2 6 7import ( 8 "io" 9 10 "github.com/dsnet/compress/internal" 11 "github.com/dsnet/compress/internal/errors" 12 "github.com/dsnet/compress/internal/prefix" 13) 14 15type Reader struct { 16 InputOffset int64 // Total number of bytes read from underlying io.Reader 17 OutputOffset int64 // Total number of bytes emitted from Read 18 19 rd prefixReader 20 err error 21 level int // The current compression level 22 rdHdrFtr int // Number of times we read the stream header and footer 23 blkCRC uint32 // CRC-32 IEEE of each block (as stored) 24 endCRC uint32 // Checksum of all blocks using bzip2's custom method 25 26 crc crc 27 mtf moveToFront 28 bwt burrowsWheelerTransform 29 rle runLengthEncoding 30 31 // These fields are allocated with Reader and re-used later. 32 treeSels []uint8 33 codes2D [maxNumTrees][maxNumSyms]prefix.PrefixCode 34 codes1D [maxNumTrees]prefix.PrefixCodes 35 trees1D [maxNumTrees]prefix.Decoder 36 syms []uint16 37 38 fuzzReader // Exported functionality when fuzz testing 39} 40 41type ReaderConfig struct { 42 _ struct{} // Blank field to prevent unkeyed struct literals 43} 44 45func NewReader(r io.Reader, conf *ReaderConfig) (*Reader, error) { 46 zr := new(Reader) 47 zr.Reset(r) 48 return zr, nil 49} 50 51func (zr *Reader) Reset(r io.Reader) error { 52 *zr = Reader{ 53 rd: zr.rd, 54 55 mtf: zr.mtf, 56 bwt: zr.bwt, 57 rle: zr.rle, 58 59 treeSels: zr.treeSels, 60 trees1D: zr.trees1D, 61 syms: zr.syms, 62 } 63 zr.rd.Init(r) 64 return nil 65} 66 67func (zr *Reader) Read(buf []byte) (int, error) { 68 for { 69 cnt, err := zr.rle.Read(buf) 70 if err != rleDone && zr.err == nil { 71 zr.err = err 72 } 73 if cnt > 0 { 74 zr.crc.update(buf[:cnt]) 75 zr.OutputOffset += int64(cnt) 76 return cnt, nil 77 } 78 if zr.err != nil || len(buf) == 0 { 79 return 0, zr.err 80 } 81 82 // Read the next chunk. 83 zr.rd.Offset = zr.InputOffset 84 func() { 85 defer errors.Recover(&zr.err) 86 if zr.rdHdrFtr%2 == 0 { 87 // Check if we are already at EOF. 88 if err := zr.rd.PullBits(1); err != nil { 89 if err == io.ErrUnexpectedEOF && zr.rdHdrFtr > 0 { 90 err = io.EOF // EOF is okay if we read at least one stream 91 } 92 errors.Panic(err) 93 } 94 95 // Read stream header. 96 if zr.rd.ReadBitsBE64(16) != hdrMagic { 97 panicf(errors.Corrupted, "invalid stream magic") 98 } 99 if ver := zr.rd.ReadBitsBE64(8); ver != 'h' { 100 if ver == '0' { 101 panicf(errors.Deprecated, "bzip1 format is not supported") 102 } 103 panicf(errors.Corrupted, "invalid version: %q", ver) 104 } 105 lvl := int(zr.rd.ReadBitsBE64(8)) - '0' 106 if lvl < BestSpeed || lvl > BestCompression { 107 panicf(errors.Corrupted, "invalid block size: %d", lvl*blockSize) 108 } 109 zr.level = lvl 110 zr.rdHdrFtr++ 111 } else { 112 // Check and update the CRC. 113 if internal.GoFuzz { 114 zr.updateChecksum(-1, zr.crc.val) // Update with value 115 zr.blkCRC = zr.crc.val // Suppress CRC failures 116 } 117 if zr.blkCRC != zr.crc.val { 118 panicf(errors.Corrupted, "mismatching block checksum") 119 } 120 zr.endCRC = (zr.endCRC<<1 | zr.endCRC>>31) ^ zr.blkCRC 121 } 122 buf := zr.decodeBlock() 123 zr.rle.Init(buf) 124 }() 125 if zr.InputOffset, err = zr.rd.Flush(); zr.err == nil { 126 zr.err = err 127 } 128 if zr.err != nil { 129 zr.err = errWrap(zr.err, errors.Corrupted) 130 return 0, zr.err 131 } 132 } 133} 134 135func (zr *Reader) Close() error { 136 if zr.err == io.EOF || zr.err == errClosed { 137 zr.rle.Init(nil) // Make sure future reads fail 138 zr.err = errClosed 139 return nil 140 } 141 return zr.err // Return the persistent error 142} 143 144func (zr *Reader) decodeBlock() []byte { 145 if magic := zr.rd.ReadBitsBE64(48); magic != blkMagic { 146 if magic == endMagic { 147 endCRC := uint32(zr.rd.ReadBitsBE64(32)) 148 if internal.GoFuzz { 149 zr.updateChecksum(zr.rd.BitsRead()-32, zr.endCRC) 150 endCRC = zr.endCRC // Suppress CRC failures 151 } 152 if zr.endCRC != endCRC { 153 panicf(errors.Corrupted, "mismatching stream checksum") 154 } 155 zr.endCRC = 0 156 zr.rd.ReadPads() 157 zr.rdHdrFtr++ 158 return nil 159 } 160 panicf(errors.Corrupted, "invalid block or footer magic") 161 } 162 163 zr.crc.val = 0 164 zr.blkCRC = uint32(zr.rd.ReadBitsBE64(32)) 165 if internal.GoFuzz { 166 zr.updateChecksum(zr.rd.BitsRead()-32, 0) // Record offset only 167 } 168 if zr.rd.ReadBitsBE64(1) != 0 { 169 panicf(errors.Deprecated, "block randomization is not supported") 170 } 171 172 // Read BWT related fields. 173 ptr := int(zr.rd.ReadBitsBE64(24)) // BWT origin pointer 174 175 // Read MTF related fields. 176 var dictArr [256]uint8 177 dict := dictArr[:0] 178 bmapHi := uint16(zr.rd.ReadBits(16)) 179 for i := 0; i < 256; i, bmapHi = i+16, bmapHi>>1 { 180 if bmapHi&1 > 0 { 181 bmapLo := uint16(zr.rd.ReadBits(16)) 182 for j := 0; j < 16; j, bmapLo = j+1, bmapLo>>1 { 183 if bmapLo&1 > 0 { 184 dict = append(dict, uint8(i+j)) 185 } 186 } 187 } 188 } 189 190 // Step 1: Prefix encoding. 191 syms := zr.decodePrefix(len(dict)) 192 193 // Step 2: Move-to-front transform and run-length encoding. 194 zr.mtf.Init(dict, zr.level*blockSize) 195 buf := zr.mtf.Decode(syms) 196 197 // Step 3: Burrows-Wheeler transformation. 198 if ptr >= len(buf) { 199 panicf(errors.Corrupted, "origin pointer (0x%06x) exceeds block size: %d", ptr, len(buf)) 200 } 201 zr.bwt.Decode(buf, ptr) 202 203 return buf 204} 205 206func (zr *Reader) decodePrefix(numSyms int) (syms []uint16) { 207 numSyms += 2 // Remove 0 symbol, add RUNA, RUNB, and EOF symbols 208 if numSyms < 3 { 209 panicf(errors.Corrupted, "not enough prefix symbols: %d", numSyms) 210 } 211 212 // Read information about the trees and tree selectors. 213 var mtf internal.MoveToFront 214 numTrees := int(zr.rd.ReadBitsBE64(3)) 215 if numTrees < minNumTrees || numTrees > maxNumTrees { 216 panicf(errors.Corrupted, "invalid number of prefix trees: %d", numTrees) 217 } 218 numSels := int(zr.rd.ReadBitsBE64(15)) 219 if cap(zr.treeSels) < numSels { 220 zr.treeSels = make([]uint8, numSels) 221 } 222 treeSels := zr.treeSels[:numSels] 223 for i := range treeSels { 224 sym, ok := zr.rd.TryReadSymbol(&decSel) 225 if !ok { 226 sym = zr.rd.ReadSymbol(&decSel) 227 } 228 if int(sym) >= numTrees { 229 panicf(errors.Corrupted, "invalid prefix tree selector: %d", sym) 230 } 231 treeSels[i] = uint8(sym) 232 } 233 mtf.Decode(treeSels) 234 zr.treeSels = treeSels 235 236 // Initialize prefix codes. 237 for i := range zr.codes2D[:numTrees] { 238 zr.codes1D[i] = zr.codes2D[i][:numSyms] 239 } 240 zr.rd.ReadPrefixCodes(zr.codes1D[:numTrees], zr.trees1D[:numTrees]) 241 242 // Read prefix encoded symbols of compressed data. 243 var tree *prefix.Decoder 244 var blkLen, selIdx int 245 syms = zr.syms[:0] 246 for { 247 if blkLen == 0 { 248 blkLen = numBlockSyms 249 if selIdx >= len(treeSels) { 250 panicf(errors.Corrupted, "not enough prefix tree selectors") 251 } 252 tree = &zr.trees1D[treeSels[selIdx]] 253 selIdx++ 254 } 255 blkLen-- 256 sym, ok := zr.rd.TryReadSymbol(tree) 257 if !ok { 258 sym = zr.rd.ReadSymbol(tree) 259 } 260 261 if int(sym) == numSyms-1 { 262 break // EOF marker 263 } 264 if int(sym) >= numSyms { 265 panicf(errors.Corrupted, "invalid prefix symbol: %d", sym) 266 } 267 if len(syms) >= zr.level*blockSize { 268 panicf(errors.Corrupted, "number of prefix symbols exceeds block size") 269 } 270 syms = append(syms, uint16(sym)) 271 } 272 zr.syms = syms 273 return syms 274} 275