1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package flate
6
7import (
8	"bytes"
9	"encoding/binary"
10	"fmt"
11	"io"
12	"math"
13)
14
15const (
16	// From top
17	// 2 bits:   type   0 = literal  1=EOF  2=Match   3=Unused
18	// 8 bits:   xlength = length - MIN_MATCH_LENGTH
19	// 5 bits    offsetcode
20	// 16 bits   xoffset = offset - MIN_OFFSET_SIZE, or literal
21	lengthShift         = 22
22	offsetMask          = 1<<lengthShift - 1
23	typeMask            = 3 << 30
24	literalType         = 0 << 30
25	matchType           = 1 << 30
26	matchOffsetOnlyMask = 0xffff
27)
28
29// The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH)
30// is lengthCodes[length - MIN_MATCH_LENGTH]
31var lengthCodes = [256]uint8{
32	0, 1, 2, 3, 4, 5, 6, 7, 8, 8,
33	9, 9, 10, 10, 11, 11, 12, 12, 12, 12,
34	13, 13, 13, 13, 14, 14, 14, 14, 15, 15,
35	15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
36	17, 17, 17, 17, 17, 17, 17, 17, 18, 18,
37	18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
38	19, 19, 19, 19, 20, 20, 20, 20, 20, 20,
39	20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
40	21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
41	21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
42	22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
43	22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
44	23, 23, 23, 23, 23, 23, 23, 23, 24, 24,
45	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
46	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
47	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
48	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
49	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
50	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
51	25, 25, 26, 26, 26, 26, 26, 26, 26, 26,
52	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
53	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
54	26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
55	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
56	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
57	27, 27, 27, 27, 27, 28,
58}
59
60// lengthCodes1 is length codes, but starting at 1.
61var lengthCodes1 = [256]uint8{
62	1, 2, 3, 4, 5, 6, 7, 8, 9, 9,
63	10, 10, 11, 11, 12, 12, 13, 13, 13, 13,
64	14, 14, 14, 14, 15, 15, 15, 15, 16, 16,
65	16, 16, 17, 17, 17, 17, 17, 17, 17, 17,
66	18, 18, 18, 18, 18, 18, 18, 18, 19, 19,
67	19, 19, 19, 19, 19, 19, 20, 20, 20, 20,
68	20, 20, 20, 20, 21, 21, 21, 21, 21, 21,
69	21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
70	22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
71	22, 22, 22, 22, 22, 22, 23, 23, 23, 23,
72	23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
73	23, 23, 24, 24, 24, 24, 24, 24, 24, 24,
74	24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
75	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
76	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
77	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
78	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
79	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
80	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
81	26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
82	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
83	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
84	27, 27, 27, 27, 28, 28, 28, 28, 28, 28,
85	28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
86	28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
87	28, 28, 28, 28, 28, 29,
88}
89
90var offsetCodes = [256]uint32{
91	0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
92	8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
93	10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
94	11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
95	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
96	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
97	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
98	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
99	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
100	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
101	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
102	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
103	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
104	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
105	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
106	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
107}
108
109// offsetCodes14 are offsetCodes, but with 14 added.
110var offsetCodes14 = [256]uint32{
111	14, 15, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
112	22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
113	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
114	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
115	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
116	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
117	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
118	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
119	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
120	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
121	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
122	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
123	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
124	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
125	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
126	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
127}
128
129type token uint32
130
131type tokens struct {
132	nLits     int
133	extraHist [32]uint16  // codes 256->maxnumlit
134	offHist   [32]uint16  // offset codes
135	litHist   [256]uint16 // codes 0->255
136	n         uint16      // Must be able to contain maxStoreBlockSize
137	tokens    [maxStoreBlockSize + 1]token
138}
139
140func (t *tokens) Reset() {
141	if t.n == 0 {
142		return
143	}
144	t.n = 0
145	t.nLits = 0
146	for i := range t.litHist[:] {
147		t.litHist[i] = 0
148	}
149	for i := range t.extraHist[:] {
150		t.extraHist[i] = 0
151	}
152	for i := range t.offHist[:] {
153		t.offHist[i] = 0
154	}
155}
156
157func (t *tokens) Fill() {
158	if t.n == 0 {
159		return
160	}
161	for i, v := range t.litHist[:] {
162		if v == 0 {
163			t.litHist[i] = 1
164			t.nLits++
165		}
166	}
167	for i, v := range t.extraHist[:literalCount-256] {
168		if v == 0 {
169			t.nLits++
170			t.extraHist[i] = 1
171		}
172	}
173	for i, v := range t.offHist[:offsetCodeCount] {
174		if v == 0 {
175			t.offHist[i] = 1
176		}
177	}
178}
179
180func indexTokens(in []token) tokens {
181	var t tokens
182	t.indexTokens(in)
183	return t
184}
185
186func (t *tokens) indexTokens(in []token) {
187	t.Reset()
188	for _, tok := range in {
189		if tok < matchType {
190			t.AddLiteral(tok.literal())
191			continue
192		}
193		t.AddMatch(uint32(tok.length()), tok.offset()&matchOffsetOnlyMask)
194	}
195}
196
197// emitLiteral writes a literal chunk and returns the number of bytes written.
198func emitLiteral(dst *tokens, lit []byte) {
199	ol := int(dst.n)
200	for i, v := range lit {
201		dst.tokens[(i+ol)&maxStoreBlockSize] = token(v)
202		dst.litHist[v]++
203	}
204	dst.n += uint16(len(lit))
205	dst.nLits += len(lit)
206}
207
208func (t *tokens) AddLiteral(lit byte) {
209	t.tokens[t.n] = token(lit)
210	t.litHist[lit]++
211	t.n++
212	t.nLits++
213}
214
215// from https://stackoverflow.com/a/28730362
216func mFastLog2(val float32) float32 {
217	ux := int32(math.Float32bits(val))
218	log2 := (float32)(((ux >> 23) & 255) - 128)
219	ux &= -0x7f800001
220	ux += 127 << 23
221	uval := math.Float32frombits(uint32(ux))
222	log2 += ((-0.34484843)*uval+2.02466578)*uval - 0.67487759
223	return log2
224}
225
226// EstimatedBits will return an minimum size estimated by an *optimal*
227// compression of the block.
228// The size of the block
229func (t *tokens) EstimatedBits() int {
230	shannon := float32(0)
231	bits := int(0)
232	nMatches := 0
233	if t.nLits > 0 {
234		invTotal := 1.0 / float32(t.nLits)
235		for _, v := range t.litHist[:] {
236			if v > 0 {
237				n := float32(v)
238				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
239			}
240		}
241		// Just add 15 for EOB
242		shannon += 15
243		for i, v := range t.extraHist[1 : literalCount-256] {
244			if v > 0 {
245				n := float32(v)
246				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
247				bits += int(lengthExtraBits[i&31]) * int(v)
248				nMatches += int(v)
249			}
250		}
251	}
252	if nMatches > 0 {
253		invTotal := 1.0 / float32(nMatches)
254		for i, v := range t.offHist[:offsetCodeCount] {
255			if v > 0 {
256				n := float32(v)
257				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
258				bits += int(offsetExtraBits[i&31]) * int(v)
259			}
260		}
261	}
262	return int(shannon) + bits
263}
264
265// AddMatch adds a match to the tokens.
266// This function is very sensitive to inlining and right on the border.
267func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
268	if debugDeflate {
269		if xlength >= maxMatchLength+baseMatchLength {
270			panic(fmt.Errorf("invalid length: %v", xlength))
271		}
272		if xoffset >= maxMatchOffset+baseMatchOffset {
273			panic(fmt.Errorf("invalid offset: %v", xoffset))
274		}
275	}
276	oCode := offsetCode(xoffset)
277	xoffset |= oCode << 16
278	t.nLits++
279
280	t.extraHist[lengthCodes1[uint8(xlength)]]++
281	t.offHist[oCode]++
282	t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset)
283	t.n++
284}
285
286// AddMatchLong adds a match to the tokens, potentially longer than max match length.
287// Length should NOT have the base subtracted, only offset should.
288func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
289	if debugDeflate {
290		if xoffset >= maxMatchOffset+baseMatchOffset {
291			panic(fmt.Errorf("invalid offset: %v", xoffset))
292		}
293	}
294	oc := offsetCode(xoffset)
295	xoffset |= oc << 16
296	for xlength > 0 {
297		xl := xlength
298		if xl > 258 {
299			// We need to have at least baseMatchLength left over for next loop.
300			xl = 258 - baseMatchLength
301		}
302		xlength -= xl
303		xl -= baseMatchLength
304		t.nLits++
305		t.extraHist[lengthCodes1[uint8(xl)]]++
306		t.offHist[oc]++
307		t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
308		t.n++
309	}
310}
311
312func (t *tokens) AddEOB() {
313	t.tokens[t.n] = token(endBlockMarker)
314	t.extraHist[0]++
315	t.n++
316}
317
318func (t *tokens) Slice() []token {
319	return t.tokens[:t.n]
320}
321
322// VarInt returns the tokens as varint encoded bytes.
323func (t *tokens) VarInt() []byte {
324	var b = make([]byte, binary.MaxVarintLen32*int(t.n))
325	var off int
326	for _, v := range t.tokens[:t.n] {
327		off += binary.PutUvarint(b[off:], uint64(v))
328	}
329	return b[:off]
330}
331
332// FromVarInt restores t to the varint encoded tokens provided.
333// Any data in t is removed.
334func (t *tokens) FromVarInt(b []byte) error {
335	var buf = bytes.NewReader(b)
336	var toks []token
337	for {
338		r, err := binary.ReadUvarint(buf)
339		if err == io.EOF {
340			break
341		}
342		if err != nil {
343			return err
344		}
345		toks = append(toks, token(r))
346	}
347	t.indexTokens(toks)
348	return nil
349}
350
351// Returns the type of a token
352func (t token) typ() uint32 { return uint32(t) & typeMask }
353
354// Returns the literal of a literal token
355func (t token) literal() uint8 { return uint8(t) }
356
357// Returns the extra offset of a match token
358func (t token) offset() uint32 { return uint32(t) & offsetMask }
359
360func (t token) length() uint8 { return uint8(t >> lengthShift) }
361
362// The code is never more than 8 bits, but is returned as uint32 for convenience.
363func lengthCode(len uint8) uint32 { return uint32(lengthCodes[len]) }
364
365// Returns the offset code corresponding to a specific offset
366func offsetCode(off uint32) uint32 {
367	if false {
368		if off < uint32(len(offsetCodes)) {
369			return offsetCodes[off&255]
370		} else if off>>7 < uint32(len(offsetCodes)) {
371			return offsetCodes[(off>>7)&255] + 14
372		} else {
373			return offsetCodes[(off>>14)&255] + 28
374		}
375	}
376	if off < uint32(len(offsetCodes)) {
377		return offsetCodes[uint8(off)]
378	}
379	return offsetCodes14[uint8(off>>7)]
380}
381