1// Copyright 2015, Joe Tsai. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE.md file. 4 5package bzip2 6 7import ( 8 "io" 9 10 "github.com/dsnet/compress/internal" 11 "github.com/dsnet/compress/internal/errors" 12 "github.com/dsnet/compress/internal/prefix" 13) 14 15const ( 16 minNumTrees = 2 17 maxNumTrees = 6 18 19 maxPrefixBits = 20 // Maximum bit-width of a prefix code 20 maxNumSyms = 256 + 2 // Maximum number of symbols in the alphabet 21 numBlockSyms = 50 // Number of bytes in a block 22) 23 24// encSel and decSel are used to handle the prefix encoding for tree selectors. 25// The prefix encoding is as follows: 26// 27// Code TreeIdx 28// 0 <=> 0 29// 10 <=> 1 30// 110 <=> 2 31// 1110 <=> 3 32// 11110 <=> 4 33// 111110 <=> 5 34// 111111 <=> 6 Invalid tree index, so should fail 35// 36var encSel, decSel = func() (e prefix.Encoder, d prefix.Decoder) { 37 var selCodes [maxNumTrees + 1]prefix.PrefixCode 38 for i := range selCodes { 39 selCodes[i] = prefix.PrefixCode{Sym: uint32(i), Len: uint32(i + 1)} 40 } 41 selCodes[maxNumTrees] = prefix.PrefixCode{Sym: maxNumTrees, Len: maxNumTrees} 42 prefix.GeneratePrefixes(selCodes[:]) 43 e.Init(selCodes[:]) 44 d.Init(selCodes[:]) 45 return 46}() 47 48type prefixReader struct{ prefix.Reader } 49 50func (pr *prefixReader) Init(r io.Reader) { 51 pr.Reader.Init(r, true) 52} 53 54func (pr *prefixReader) ReadBitsBE64(nb uint) uint64 { 55 if nb <= 32 { 56 v := uint32(pr.ReadBits(nb)) 57 return uint64(internal.ReverseUint32N(v, nb)) 58 } 59 v0 := internal.ReverseUint32(uint32(pr.ReadBits(32))) 60 v1 := internal.ReverseUint32(uint32(pr.ReadBits(nb - 32))) 61 v := uint64(v0)<<32 | uint64(v1) 62 return v >> (64 - nb) 63} 64 65func (pr *prefixReader) ReadPrefixCodes(codes []prefix.PrefixCodes, trees []prefix.Decoder) { 66 for i, pc := range codes { 67 clen := int(pr.ReadBitsBE64(5)) 68 sum := 1 << maxPrefixBits 69 for sym := range pc { 70 for { 71 if clen < 1 || clen > maxPrefixBits { 72 panicf(errors.Corrupted, "invalid prefix bit-length: %d", clen) 73 } 74 75 b, ok := pr.TryReadBits(1) 76 if !ok { 77 b = pr.ReadBits(1) 78 } 79 if b == 0 { 80 break 81 } 82 83 b, ok = pr.TryReadBits(1) 84 if !ok { 85 b = pr.ReadBits(1) 86 } 87 clen -= int(b*2) - 1 // +1 or -1 88 } 89 pc[sym] = prefix.PrefixCode{Sym: uint32(sym), Len: uint32(clen)} 90 sum -= (1 << maxPrefixBits) >> uint(clen) 91 } 92 93 if sum == 0 { 94 // Fast path, but only handles complete trees. 95 if err := prefix.GeneratePrefixes(pc); err != nil { 96 errors.Panic(err) // Using complete trees; should never fail 97 } 98 } else { 99 // Slow path, but handles anything. 100 pc = handleDegenerateCodes(pc) // Never fails, but may fail later 101 codes[i] = pc 102 } 103 trees[i].Init(pc) 104 } 105} 106 107type prefixWriter struct{ prefix.Writer } 108 109func (pw *prefixWriter) Init(w io.Writer) { 110 pw.Writer.Init(w, true) 111} 112 113func (pw *prefixWriter) WriteBitsBE64(v uint64, nb uint) { 114 if nb <= 32 { 115 v := internal.ReverseUint32N(uint32(v), nb) 116 pw.WriteBits(uint(v), nb) 117 return 118 } 119 v <<= (64 - nb) 120 v0 := internal.ReverseUint32(uint32(v >> 32)) 121 v1 := internal.ReverseUint32(uint32(v)) 122 pw.WriteBits(uint(v0), 32) 123 pw.WriteBits(uint(v1), nb-32) 124 return 125} 126 127func (pw *prefixWriter) WritePrefixCodes(codes []prefix.PrefixCodes, trees []prefix.Encoder) { 128 for i, pc := range codes { 129 if err := prefix.GeneratePrefixes(pc); err != nil { 130 errors.Panic(err) // Using complete trees; should never fail 131 } 132 trees[i].Init(pc) 133 134 clen := int(pc[0].Len) 135 pw.WriteBitsBE64(uint64(clen), 5) 136 for _, c := range pc { 137 for int(c.Len) < clen { 138 pw.WriteBits(3, 2) // 11 139 clen-- 140 } 141 for int(c.Len) > clen { 142 pw.WriteBits(1, 2) // 10 143 clen++ 144 } 145 pw.WriteBits(0, 1) 146 } 147 } 148} 149 150// handleDegenerateCodes converts a degenerate tree into a canonical tree. 151// 152// For example, when the input is an under-subscribed tree: 153// input: []PrefixCode{ 154// {Sym: 0, Len: 3}, 155// {Sym: 1, Len: 4}, 156// {Sym: 2, Len: 3}, 157// } 158// output: []PrefixCode{ 159// {Sym: 0, Len: 3, Val: 0}, // 000 160// {Sym: 1, Len: 4, Val: 2}, // 0010 161// {Sym: 2, Len: 3, Val: 4}, // 100 162// {Sym: 258, Len: 4, Val: 10}, // 1010 163// {Sym: 259, Len: 3, Val: 6}, // 110 164// {Sym: 260, Len: 1, Val: 1}, // 1 165// } 166// 167// For example, when the input is an over-subscribed tree: 168// input: []PrefixCode{ 169// {Sym: 0, Len: 1}, 170// {Sym: 1, Len: 3}, 171// {Sym: 2, Len: 4}, 172// {Sym: 3, Len: 3}, 173// {Sym: 4, Len: 2}, 174// } 175// output: []PrefixCode{ 176// {Sym: 0, Len: 1, Val: 0}, // 0 177// {Sym: 1, Len: 3, Val: 3}, // 011 178// {Sym: 3, Len: 3, Val: 7}, // 111 179// {Sym: 4, Len: 2, Val: 1}, // 01 180// } 181func handleDegenerateCodes(codes prefix.PrefixCodes) prefix.PrefixCodes { 182 // Since there is no formal definition for the BZip2 format, there is no 183 // specification that says that the code lengths must form a complete 184 // prefix tree (IE: it is neither over-subscribed nor under-subscribed). 185 // Thus, the original C implementation becomes the reference for how prefix 186 // decoding is done in these edge cases. Unfortunately, the C version does 187 // not error when an invalid tree is used, but rather allows decoding to 188 // continue and only errors if some bit pattern happens to cause an error. 189 // Thus, it is possible for an invalid tree to end up decoding an input 190 // "properly" so long as invalid bit patterns are not present. In order to 191 // replicate this non-specified behavior, we use a ported version of the 192 // C code to generate the codes as a valid canonical tree by substituting 193 // invalid nodes with invalid symbols. 194 // 195 // ==================================================== 196 // This program, "bzip2", the associated library "libbzip2", and all 197 // documentation, are copyright (C) 1996-2010 Julian R Seward. All 198 // rights reserved. 199 // 200 // Redistribution and use in source and binary forms, with or without 201 // modification, are permitted provided that the following conditions 202 // are met: 203 // 204 // 1. Redistributions of source code must retain the above copyright 205 // notice, this list of conditions and the following disclaimer. 206 // 207 // 2. The origin of this software must not be misrepresented; you must 208 // not claim that you wrote the original software. If you use this 209 // software in a product, an acknowledgment in the product 210 // documentation would be appreciated but is not required. 211 // 212 // 3. Altered source versions must be plainly marked as such, and must 213 // not be misrepresented as being the original software. 214 // 215 // 4. The name of the author may not be used to endorse or promote 216 // products derived from this software without specific prior written 217 // permission. 218 // 219 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 220 // OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 221 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 222 // ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 223 // DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 224 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 225 // GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 226 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 227 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 228 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 229 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 230 // 231 // Julian Seward, jseward@bzip.org 232 // bzip2/libbzip2 version 1.0.6 of 6 September 2010 233 // ==================================================== 234 var ( 235 limits [maxPrefixBits + 2]int32 236 bases [maxPrefixBits + 2]int32 237 perms [maxNumSyms]int32 238 239 minLen = uint32(maxPrefixBits) 240 maxLen = uint32(0) 241 ) 242 243 const ( 244 statusOkay = iota 245 statusInvalid 246 statusNeedBits 247 statusMaxBits 248 ) 249 250 // createTables is the BZ2_hbCreateDecodeTables function from the C code. 251 createTables := func(codes []prefix.PrefixCode) { 252 for _, c := range codes { 253 if c.Len > maxLen { 254 maxLen = c.Len 255 } 256 if c.Len < minLen { 257 minLen = c.Len 258 } 259 } 260 261 var pp int 262 for i := minLen; i <= maxLen; i++ { 263 for j, c := range codes { 264 if c.Len == i { 265 perms[pp] = int32(j) 266 pp++ 267 } 268 } 269 } 270 271 var vec int32 272 for _, c := range codes { 273 bases[c.Len+1]++ 274 } 275 for i := 1; i < len(bases); i++ { 276 bases[i] += bases[i-1] 277 } 278 for i := minLen; i <= maxLen; i++ { 279 vec += bases[i+1] - bases[i] 280 limits[i] = vec - 1 281 vec <<= 1 282 } 283 for i := minLen + 1; i <= maxLen; i++ { 284 bases[i] = ((limits[i-1] + 1) << 1) - bases[i] 285 } 286 } 287 288 // getSymbol is the GET_MTF_VAL macro from the C code. 289 getSymbol := func(c prefix.PrefixCode) (uint32, int) { 290 v := internal.ReverseUint32(c.Val) 291 n := c.Len 292 293 zn := minLen 294 if zn > n { 295 return 0, statusNeedBits 296 } 297 zvec := int32(v >> (32 - zn)) 298 v <<= zn 299 for { 300 if zn > maxLen { 301 return 0, statusMaxBits 302 } 303 if zvec <= limits[zn] { 304 break 305 } 306 zn++ 307 if zn > n { 308 return 0, statusNeedBits 309 } 310 zvec = (zvec << 1) | int32(v>>31) 311 v <<= 1 312 } 313 if zvec-bases[zn] < 0 || zvec-bases[zn] >= maxNumSyms { 314 return 0, statusInvalid 315 } 316 return uint32(perms[zvec-bases[zn]]), statusOkay 317 } 318 319 // Step 1: Create the prefix trees using the C algorithm. 320 createTables(codes) 321 322 // Step 2: Starting with the shortest bit pattern, explore the whole tree. 323 // If tree is under-subscribed, the worst-case runtime is O(1<<maxLen). 324 // If tree is over-subscribed, the worst-case runtime is O(maxNumSyms). 325 var pcodesArr [2 * maxNumSyms]prefix.PrefixCode 326 pcodes := pcodesArr[:maxNumSyms] 327 var exploreCode func(prefix.PrefixCode) bool 328 exploreCode = func(c prefix.PrefixCode) (term bool) { 329 sym, status := getSymbol(c) 330 switch status { 331 case statusOkay: 332 // This code is valid, so insert it. 333 c.Sym = sym 334 pcodes[sym] = c 335 term = true 336 case statusInvalid: 337 // This code is invalid, so insert an invalid symbol. 338 c.Sym = uint32(len(pcodes)) 339 pcodes = append(pcodes, c) 340 term = true 341 case statusNeedBits: 342 // This code is too short, so explore both children. 343 c.Len++ 344 c0, c1 := c, c 345 c1.Val |= 1 << (c.Len - 1) 346 347 b0 := exploreCode(c0) 348 b1 := exploreCode(c1) 349 switch { 350 case !b0 && b1: 351 c0.Sym = uint32(len(pcodes)) 352 pcodes = append(pcodes, c0) 353 case !b1 && b0: 354 c1.Sym = uint32(len(pcodes)) 355 pcodes = append(pcodes, c1) 356 } 357 term = b0 || b1 358 case statusMaxBits: 359 // This code is too long, so report it upstream. 360 term = false 361 } 362 return term // Did this code terminate? 363 } 364 exploreCode(prefix.PrefixCode{}) 365 366 // Step 3: Copy new sparse codes to old output codes. 367 codes = codes[:0] 368 for _, c := range pcodes { 369 if c.Len > 0 { 370 codes = append(codes, c) 371 } 372 } 373 return codes 374} 375