1// Copyright 2015, Joe Tsai. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE.md file.
4
5// Package meta implements the XFLATE meta encoding scheme.
6//
7// The XFLATE meta encoding is a method of encoding arbitrary data into one
8// or more RFC 1951 compliant DEFLATE blocks. This encoding has the special
9// property that when the blocks are decoded by a RFC 1951 compliant
10// decompressor, they produce absolutely no output. However, when decoded with
11// the XFLATE meta decoder, it losslessly produces the original input.
12//
13// The meta encoding works by encoding arbitrary data into the Huffman tree
14// definition of dynamic DEFLATE blocks. These blocks have an empty data section
15// and produce no output. Due to the Huffman definition overhead, the encoded
16// output is usually larger than the input. However, for most input datasets,
17// this encoding scheme is able to achieve an efficiency of at least 50%.
18package meta
19
20import (
21	"fmt"
22
23	"github.com/dsnet/compress/internal/errors"
24	"github.com/dsnet/compress/internal/prefix"
25)
26
27// These are the magic values that begin every single meta block.
28const (
29	magicVals uint32 = 0x05860004
30	magicMask uint32 = 0xfffe3fc6
31)
32
33// ReverseSearch searches for a meta header in reverse. This returns the last
34// index where the header was found. If not found, it returns -1.
35func ReverseSearch(data []byte) int {
36	var magic uint32
37	for i := len(data) - 1; i >= 0; i-- {
38		magic = (magic << 8) | uint32(data[i])
39		if magic&magicMask == magicVals {
40			return i
41		}
42	}
43	return -1
44}
45
46const (
47	maxSyms    = 257 // Maximum number of literal codes (with EOB marker)
48	minHuffLen = 1   // Minimum number of bits for each Huffman code
49	maxHuffLen = 7   // Maximum number of bits for each Huffman code
50	minRepLast = 3   // Minimum number of repeated codes (clen: 16)
51	maxRepLast = 6   // Maximum number of repeated codes (clen: 16)
52	minRepZero = 11  // Minimum number of repeated zeros (clen: 18)
53	maxRepZero = 138 // Maximum number of repeated zeros (clen: 18)
54)
55
56// These are some constants regarding the theoretical and practical limits for
57// the meta encoding of a single block.
58const (
59	// MinRawBytes and MaxRawBytes are the theoretical minimum and maximum
60	// number of bytes a block can encode.
61	MinRawBytes = 0
62	MaxRawBytes = 31
63
64	// MinEncBytes and MaxEncBytes are the theoretical minimum and maximum
65	// number of bytes an encoded block will occupy.
66	MinEncBytes = 12
67	MaxEncBytes = 64
68
69	// EnsureRawBytes is the maximum number of bytes that a single block is
70	// ensured to encode (computed using brute force).
71	EnsureRawBytes = 22
72)
73
74// FinalMode controls or indicates which final bits are set in the last block
75// in the meta stream. In the meta encoding, there are 2 final bits:
76//
77//	Stream: This is the final bit from DEFLATE (as defined in RFC 1951).
78//	and indicates that the entire compression stream has come to an end.
79//	This bit indicate absolute termination of the stream.
80//
81//	Meta: This final bit indicates that the current sequence of meta blocks has
82//	terminated. The decoded data from those blocks form a meta substream.
83//	This bit is used as a form of message framing for the meta encoding format.
84//
85// It invalid for the stream final bit to be set, while the meta final bit is
86// not set. All other combinations are legal.
87type FinalMode int
88
89const (
90	// FinalNil has neither the stream nor meta final bits set.
91	FinalNil FinalMode = iota
92
93	// FinalMeta has the final bit set, but not stream final bit.
94	FinalMeta
95
96	// FinalStream has both the meta and stream final bits set.
97	FinalStream
98)
99
100// The Huffman encoding used by the XFLATE meta encoding uses a partially
101// pre-determined HCLEN table. The symbols are 0, 16, 18, and another symbol
102// between minHuffLen and maxHuffLen, inclusively. The 0 symbol represents a
103// logical zero in the meta encoding, and the symbol between minHuffLen and
104// maxHuffLen represents a logical one. Symbols 16 and 18 are used to provide a
105// primitive form of run-length encoding. The codes that these symbols map to
106// are fixed and are shown below.
107//
108//	Code      Symbol
109//	0    <=>  0                      (symZero)
110//	10   <=>  minHuffLen..maxHuffLen (symOne)
111//	110  <=>  16                     (symRepLast)
112//	111  <=>  18                     (symRepZero)
113//
114// The symZero symbol occupies 1 bit since it is the most commonly needed bit,
115// while symOne occupies 2 bits. Thus, it is cheaper to encode logical zeros
116// than it is to encode logical ones. The invert bit in the meta encoding takes
117// advantage of this fact and allows all data bits to be inverted so that the
118// number zero bits is greater than the number of one bits.
119const (
120	symZero    = iota // Symbol to encode a input zero (clen: 0)
121	symOne            // Symbol to encode a input one  (clen: minHuffLen..maxHuffLen)
122	symRepLast        // Symbol to repeat last symbol  (clen: 16)
123	symRepZero        // Symbol to repeat zero symbol  (clen: 18)
124)
125
126var encHuff, decHuff = func() (enc prefix.Encoder, dec prefix.Decoder) {
127	codes := [4]prefix.PrefixCode{
128		{Sym: symZero, Len: 1},
129		{Sym: symOne, Len: 2},
130		{Sym: symRepLast, Len: 3},
131		{Sym: symRepZero, Len: 3},
132	}
133	prefix.GeneratePrefixes(codes[:])
134	enc.Init(codes[:])
135	dec.Init(codes[:])
136	return
137}()
138
139func errorf(c int, f string, a ...interface{}) error {
140	return errors.Error{Code: c, Pkg: "meta", Msg: fmt.Sprintf(f, a...)}
141}
142
143var errClosed = errorf(errors.Closed, "")
144
145// oneBitsLUT reports the number of set bits in the input byte.
146var oneBitsLUT = func() (lut [256]byte) {
147	for i := range lut[:] {
148		for b := byte(i); b > 0; b >>= 1 {
149			lut[i] += b & 1
150		}
151	}
152	return
153}()
154
155// numBits counts the number of zero and one bits in the byte.
156func numBits(b byte) (zeros, ones int) {
157	ones = int(oneBitsLUT[b])
158	zeros = 8 - ones
159	return
160}
161
162// numPads computes number of bits needed to pad n-bits to a byte alignment.
163func numPads(n uint) uint {
164	return -n & 7
165}
166
167// btoi converts a bool to a integer 0 or 1.
168func btoi(b bool) int {
169	if b {
170		return 1
171	}
172	return 0
173}
174