1// Copyright (c) 2010, Andrei Vieru. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// The lzma package implements reading and writing of LZMA format compressed data.
6// Reference implementation is LZMA SDK version 4.65 originaly developed by Igor
7// Pavlov, available online at:
8//
9//  http://www.7-zip.org/sdk.html
10//
11//
12//
13// Usage examples. Write compressed data to a buffer:
14//
15//  var b bytes.Buffer
16//  w := lzma.NewWriter(&b)
17//  w.Write([]byte("hello, world\n"))
18//  w.Close()
19//
20// read that data back:
21//
22//  r := lzma.NewReader(&b)
23//  io.Copy(os.Stdout, r)
24//  r.Close()
25//
26//
27//
28// If the data is bigger than you'd like to hold into memory, use pipes. Write
29// compressed data to an io.PipeWriter:
30//
31//  pr, pw := io.Pipe()
32//  go func() {
33//  	defer pw.Close()
34//	w := lzma.NewWriter(pw)
35//	defer w.Close()
36//	// the bytes.Buffer would be an io.Reader used to read uncompressed data from
37//	io.Copy(w, bytes.NewBuffer([]byte("hello, world\n")))
38//  }()
39//
40// and read it back:
41//
42//  defer pr.Close()
43//  r := lzma.NewReader(pr)
44//  defer r.Close()
45//  // the os.Stdout would be an io.Writer used to write uncompressed data to
46//  io.Copy(os.Stdout, r)
47//
48//
49//
50package lzma
51
52import (
53	"errors"
54	"io"
55)
56
57const (
58	inBufSize           = 1 << 16
59	outBufSize          = 1 << 16
60	lzmaPropSize        = 5
61	lzmaHeaderSize      = lzmaPropSize + 8
62	lzmaMaxReqInputSize = 20
63
64	kNumRepDistances                = 4
65	kNumStates                      = 12
66	kNumPosSlotBits                 = 6
67	kDicLogSizeMin                  = 0
68	kNumLenToPosStatesBits          = 2
69	kNumLenToPosStates              = 1 << kNumLenToPosStatesBits
70	kMatchMinLen                    = 2
71	kNumAlignBits                   = 4
72	kAlignTableSize                 = 1 << kNumAlignBits
73	kAlignMask                      = kAlignTableSize - 1
74	kStartPosModelIndex             = 4
75	kEndPosModelIndex               = 14
76	kNumPosModels                   = kEndPosModelIndex - kStartPosModelIndex
77	kNumFullDistances               = 1 << (kEndPosModelIndex / 2)
78	kNumLitPosStatesBitsEncodingMax = 4
79	kNumLitContextBitsMax           = 8
80	kNumPosStatesBitsMax            = 4
81	kNumPosStatesMax                = 1 << kNumPosStatesBitsMax
82	kNumLowLenBits                  = 3
83	kNumMidLenBits                  = 3
84	kNumHighLenBits                 = 8
85	kNumLowLenSymbols               = 1 << kNumLowLenBits
86	kNumMidLenSymbols               = 1 << kNumMidLenBits
87	kNumLenSymbols                  = kNumLowLenSymbols + kNumMidLenSymbols + (1 << kNumHighLenBits)
88	kMatchMaxLen                    = kMatchMinLen + kNumLenSymbols - 1
89)
90
91// A streamError reports the presence of corrupt input stream.
92var streamError = errors.New("error in lzma encoded data stream")
93
94// A headerError reports an error in the header of the lzma encoder file.
95var headerError = errors.New("error in lzma header")
96
97// A nReadError reports what its message reads
98var nReadError = errors.New("number of bytes returned by Reader.Read() didn't meet expectances")
99
100// A nWriteError reports what its message reads
101var nWriteError = errors.New("number of bytes returned by Writer.Write() didn't meet expectances")
102
103// TODO: implement this err
104// A dataIntegrityError reports an error encountered while cheching data integrity.
105// -- from lzma.txt:
106// You can use multiple checks to test data integrity after full decompression:
107// 1) Check Result and "status" variable.
108// 2) Check that output(destLen) = uncompressedSize, if you know real uncompressedSize.
109// 3) Check that output(srcLen) = compressedSize, if you know real compressedSize.
110//     You must use correct finish mode in that case.
111//
112//type dataIntegrityError struct {
113//	msg string
114//	// hz
115//}
116
117func stateUpdateChar(index uint32) uint32 {
118	if index < 4 {
119		return 0
120	}
121	if index < 10 {
122		return index - 3
123	}
124	return index - 6
125}
126
127func stateUpdateMatch(index uint32) uint32 {
128	if index < 7 {
129		return 7
130	}
131	return 10
132}
133
134func stateUpdateRep(index uint32) uint32 {
135	if index < 7 {
136		return 8
137	}
138	return 11
139}
140
141func stateUpdateShortRep(index uint32) uint32 {
142	if index < 7 {
143		return 9
144	}
145	return 11
146}
147
148func stateIsCharState(index uint32) bool {
149	if index < 7 {
150		return true
151	}
152	return false
153}
154
155func getLenToPosState(length uint32) uint32 {
156	length -= kMatchMinLen
157	if length < kNumLenToPosStates {
158		return length
159	}
160	return kNumLenToPosStates - 1
161}
162
163// LZMA compressed file format
164// ---------------------------
165// Offset Size 	      Description
166//   0     1   		Special LZMA properties (lc,lp, pb in encoded form)
167//   1     4   		Dictionary size (little endian)
168//   5     8   		Uncompressed size (little endian). Size -1 stands for unknown size
169
170// lzma properties
171type props struct {
172	litContextBits, // lc
173	litPosStateBits, // lp
174	posStateBits uint8 // pb
175	dictSize uint32
176}
177
178func (p *props) decodeProps(buf []byte) {
179	d := buf[0]
180	if d > (9 * 5 * 5) {
181		throw(headerError)
182	}
183	p.litContextBits = d % 9
184	d /= 9
185	p.posStateBits = d / 5
186	p.litPosStateBits = d % 5
187	if p.litContextBits > kNumLitContextBitsMax || p.litPosStateBits > 4 || p.posStateBits > kNumPosStatesBitsMax {
188		throw(headerError)
189	}
190	for i := 0; i < 4; i++ {
191		p.dictSize += uint32(buf[i+1]) << uint32(i*8)
192	}
193}
194
195type decoder struct {
196	// i/o
197	rd     *rangeDecoder // r
198	outWin *lzOutWindow  // w
199
200	// lzma header
201	prop       *props
202	unpackSize int64
203
204	// hz
205	matchDecoders    []uint16
206	repDecoders      []uint16
207	repG0Decoders    []uint16
208	repG1Decoders    []uint16
209	repG2Decoders    []uint16
210	rep0LongDecoders []uint16
211	posSlotCoders    []*rangeBitTreeCoder
212	posDecoders      []uint16
213	posAlignCoder    *rangeBitTreeCoder
214	lenCoder         *lenCoder
215	repLenCoder      *lenCoder
216	litCoder         *litCoder
217	dictSizeCheck    uint32
218	posStateMask     uint32
219}
220
221func (z *decoder) doDecode() {
222	var state uint32 = 0
223	var rep0 uint32 = 0
224	var rep1 uint32 = 0
225	var rep2 uint32 = 0
226	var rep3 uint32 = 0
227	var nowPos uint64 = 0
228	var prevByte byte = 0
229
230	for z.unpackSize < 0 || int64(nowPos) < z.unpackSize {
231		posState := uint32(nowPos) & z.posStateMask
232		if z.rd.decodeBit(z.matchDecoders, state<<kNumPosStatesBitsMax+posState) == 0 {
233			lsc := z.litCoder.getSubCoder(uint32(nowPos), prevByte)
234			if !stateIsCharState(state) {
235				prevByte = lsc.decodeWithMatchByte(z.rd, z.outWin.getByte(rep0))
236			} else {
237				prevByte = lsc.decodeNormal(z.rd)
238			}
239			z.outWin.putByte(prevByte)
240			state = stateUpdateChar(state)
241			nowPos++
242		} else {
243			var length uint32
244			if z.rd.decodeBit(z.repDecoders, state) == 1 {
245				length = 0
246				if z.rd.decodeBit(z.repG0Decoders, state) == 0 {
247					if z.rd.decodeBit(z.rep0LongDecoders, state<<kNumPosStatesBitsMax+posState) == 0 {
248						state = stateUpdateShortRep(state)
249						length = 1
250					}
251				} else {
252					var distance uint32
253					if z.rd.decodeBit(z.repG1Decoders, state) == 0 {
254						distance = rep1
255					} else {
256						if z.rd.decodeBit(z.repG2Decoders, state) == 0 {
257							distance = rep2
258						} else {
259							distance, rep3 = rep3, rep2
260						}
261						rep2 = rep1
262					}
263					rep1, rep0 = rep0, distance
264				}
265				if length == 0 {
266					length = z.repLenCoder.decode(z.rd, posState) + kMatchMinLen
267					state = stateUpdateRep(state)
268				}
269			} else {
270				rep3, rep2, rep1 = rep2, rep1, rep0
271				length = z.lenCoder.decode(z.rd, posState) + kMatchMinLen
272				state = stateUpdateMatch(state)
273				posSlot := z.posSlotCoders[getLenToPosState(length)].decode(z.rd)
274				if posSlot >= kStartPosModelIndex {
275					numDirectBits := posSlot>>1 - 1
276					rep0 = (2 | posSlot&1) << numDirectBits
277					if posSlot < kEndPosModelIndex {
278						rep0 += reverseDecodeIndex(z.rd, z.posDecoders, rep0-posSlot-1, numDirectBits)
279					} else {
280						rep0 += z.rd.decodeDirectBits(numDirectBits-kNumAlignBits) << kNumAlignBits
281						rep0 += z.posAlignCoder.reverseDecode(z.rd)
282						if int32(rep0) < 0 {
283							if rep0 == 0xFFFFFFFF {
284								break
285							}
286							throw(streamError)
287						}
288					}
289				} else {
290					rep0 = posSlot
291				}
292			}
293			if uint64(rep0) >= nowPos || rep0 >= z.dictSizeCheck {
294				throw(streamError)
295			}
296			z.outWin.copyBlock(rep0, length)
297			nowPos += uint64(length)
298			prevByte = z.outWin.getByte(0)
299		}
300	}
301	z.outWin.flush()
302	//if z.unpackSize != -1 {
303	//	if z.outWin.unpacked != z.unpackSize {
304	//		throw(&dataIntegrityError{})
305	//	}
306	//}
307}
308
309func (z *decoder) decoder(r io.Reader, w io.Writer) (err error) {
310	defer handlePanics(&err)
311
312	// read 13 bytes (lzma header)
313	header := make([]byte, lzmaHeaderSize)
314	n, err := r.Read(header)
315	if err != nil {
316		return
317	}
318	if n != lzmaHeaderSize {
319		return nReadError
320	}
321	z.prop = &props{}
322	z.prop.decodeProps(header)
323
324	z.unpackSize = 0
325	for i := 0; i < 8; i++ {
326		b := header[lzmaPropSize+i]
327		z.unpackSize = z.unpackSize | int64(b)<<uint64(8*i)
328	}
329
330	// do not move before r.Read(header)
331	z.rd = newRangeDecoder(r)
332
333	z.dictSizeCheck = maxUInt32(z.prop.dictSize, 1)
334	z.outWin = newLzOutWindow(w, maxUInt32(z.dictSizeCheck, 1<<12))
335
336	z.litCoder = newLitCoder(uint32(z.prop.litPosStateBits), uint32(z.prop.litContextBits))
337	z.lenCoder = newLenCoder(uint32(1 << z.prop.posStateBits))
338	z.repLenCoder = newLenCoder(uint32(1 << z.prop.posStateBits))
339	z.posStateMask = uint32(1<<z.prop.posStateBits - 1)
340	z.matchDecoders = initBitModels(kNumStates << kNumPosStatesBitsMax)
341	z.repDecoders = initBitModels(kNumStates)
342	z.repG0Decoders = initBitModels(kNumStates)
343	z.repG1Decoders = initBitModels(kNumStates)
344	z.repG2Decoders = initBitModels(kNumStates)
345	z.rep0LongDecoders = initBitModels(kNumStates << kNumPosStatesBitsMax)
346	z.posDecoders = initBitModels(kNumFullDistances - kEndPosModelIndex)
347	z.posSlotCoders = make([]*rangeBitTreeCoder, kNumLenToPosStates)
348	for i := 0; i < kNumLenToPosStates; i++ {
349		z.posSlotCoders[i] = newRangeBitTreeCoder(kNumPosSlotBits)
350	}
351	z.posAlignCoder = newRangeBitTreeCoder(kNumAlignBits)
352
353	z.doDecode()
354	return
355}
356
357// NewReader returns a new ReadCloser that can be used to read the uncompressed
358// version of r. It is the caller's responsibility to call Close on the ReadCloser
359// when finished reading.
360//
361func NewReader(r io.Reader) io.ReadCloser {
362	var z decoder
363	pr, pw := io.Pipe()
364	go func() {
365		err := z.decoder(r, pw)
366		pw.CloseWithError(err)
367	}()
368	return pr
369}
370