1 /* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 // vim: expandtab:ts=8:sw=4:softtabstop=4:
3 /**
4  * \file        lzma/lzma.h
5  * \brief       LZMA1 and LZMA2 filters
6  */
7 
8 /*
9  * Author: Lasse Collin
10  *
11  * This file has been put into the public domain.
12  * You can do whatever you want with this file.
13  *
14  * See ../lzma.h for information about liblzma as a whole.
15  */
16 
17 #ifndef LZMA_H_INTERNAL
18 #	error Never include this file directly. Use <lzma.h> instead.
19 #endif
20 
21 
22 /**
23  * \brief       LZMA1 Filter ID
24  *
25  * LZMA1 is exactly what was called just LZMA in LZMA Utils, 7-Zip, and
26  * the LZMA SDK. It's called LZMA1 here to prevent developers from
27  * accidentally using LZMA when they actually want LZMA2.
28  *
29  * LZMA1 shouldn't be used for new applications unless you _really_ know
30  * what you are doing. LZMA2 is almost always a better choice.
31  */
32 #define LZMA_FILTER_LZMA1       LZMA_VLI_C(0x4000000000000001)
33 
34 /**
35  * \brief       LZMA2 Filter ID
36  *
37  * Usually you want this instead of LZMA1. Compared to LZMA1, LZMA2 adds
38  * support for LZMA_SYNC_FLUSH, uncompressed chunks (smaller expansion
39  * when trying to compress incompressible data), the possibility to change
40  * lc/lp/pb in the middle of encoding, and some other internal improvements.
41  */
42 #define LZMA_FILTER_LZMA2       LZMA_VLI_C(0x21)
43 
44 
45 /**
46  * \brief       Match finders
47  *
48  * The match finder has a major effect on both speed and compression ratio.
49  * Usually hash chains are faster than binary trees.
50  *
51  * The memory usage formulas are only rough estimates, which are closest to
52  * reality when dict_size is a power of two. The formulas are more complex
53  * in reality, and can also change a little between liblzma versions. Use
54  * lzma_memusage_encoder() to get a more accurate estimate of memory usage.
55  */
56 typedef enum {
57 	LZMA_MF_HC3     = 0x03,
58 		/**<
59 		 * \brief       Hash Chain with 2- and 3-byte hashing
60 		 *
61 		 * Minimum nice_len: 3
62 		 *
63 		 * Memory usage:
64 		 *  - dict_size <= 16 MiB: dict_size * 7.5
65 		 *  - dict_size > 16 MiB: dict_size * 5.5 + 64 MiB
66 		 */
67 
68 	LZMA_MF_HC4     = 0x04,
69 		/**<
70 		 * \brief       Hash Chain with 2-, 3-, and 4-byte hashing
71 		 *
72 		 * Minimum nice_len: 4
73 		 *
74 		 * Memory usage: dict_size * 7.5
75 		 */
76 
77 	LZMA_MF_BT2     = 0x12,
78 		/**<
79 		 * \brief       Binary Tree with 2-byte hashing
80 		 *
81 		 * Minimum nice_len: 2
82 		 *
83 		 * Memory usage: dict_size * 9.5
84 		 */
85 
86 	LZMA_MF_BT3     = 0x13,
87 		/**<
88 		 * \brief       Binary Tree with 2- and 3-byte hashing
89 		 *
90 		 * Minimum nice_len: 3
91 		 *
92 		 * Memory usage:
93 		 *  - dict_size <= 16 MiB: dict_size * 11.5
94 		 *  - dict_size > 16 MiB: dict_size * 9.5 + 64 MiB
95 		 */
96 
97 	LZMA_MF_BT4     = 0x14
98 		/**<
99 		 * \brief       Binary Tree with 2-, 3-, and 4-byte hashing
100 		 *
101 		 * Minimum nice_len: 4
102 		 *
103 		 * Memory usage: dict_size * 11.5
104 		 */
105 } lzma_match_finder;
106 
107 
108 /**
109  * \brief       Test if given match finder is supported
110  *
111  * Return true if the given match finder is supported by this liblzma build.
112  * Otherwise false is returned. It is safe to call this with a value that
113  * isn't listed in lzma_match_finder enumeration; the return value will be
114  * false.
115  *
116  * There is no way to list which match finders are available in this
117  * particular liblzma version and build. It would be useless, because
118  * a new match finder, which the application developer wasn't aware of,
119  * could require giving additional options to the encoder that the older
120  * match finders don't need.
121  */
122 extern LZMA_API(lzma_bool) lzma_mf_is_supported(lzma_match_finder match_finder)
123 		lzma_nothrow lzma_attr_const;
124 
125 
126 /**
127  * \brief       Compression modes
128  *
129  * This selects the function used to analyze the data produced by the match
130  * finder.
131  */
132 typedef enum {
133 	LZMA_MODE_FAST = 1,
134 		/**<
135 		 * \brief       Fast compression
136 		 *
137 		 * Fast mode is usually at its best when combined with
138 		 * a hash chain match finder (LZMA_MF_HC3 or LZMA_MF_HC4).
139 		 */
140 
141 	LZMA_MODE_NORMAL = 2
142 		/**<
143 		 * \brief       Normal compression
144 		 *
145 		 * This is usually notably slower than fast mode. Use this
146 		 * together with a binary tree match finder (LZMA_MF_BT*) to
147 		 * expose the full potential of the LZMA1 or LZMA2 encoder.
148 		 */
149 } lzma_mode;
150 
151 
152 /**
153  * \brief       Test if given compression mode is supported
154  *
155  * Return true if the given compression mode is supported by this liblzma
156  * build. Otherwise false is returned. It is safe to call this with a value
157  * that isn't listed in lzma_mode enumeration; the return value will be false.
158  *
159  * There is no way to list which modes are available in this particular
160  * liblzma version and build. It would be useless, because a new compression
161  * mode, which the application developer wasn't aware of, could require giving
162  * additional options to the encoder that the older modes don't need.
163  */
164 extern LZMA_API(lzma_bool) lzma_mode_is_supported(lzma_mode mode)
165 		lzma_nothrow lzma_attr_const;
166 
167 
168 /**
169  * \brief       Options specific to the LZMA1 and LZMA2 filters
170  *
171  * Since LZMA1 and LZMA2 share most of the code, it's simplest to share
172  * the options structure too. For encoding, all but the reserved variables
173  * need to be initialized unless specifically mentioned otherwise.
174  *
175  * For raw decoding, both LZMA1 and LZMA2 need dict_size, preset_dict, and
176  * preset_dict_size (if preset_dict != NULL). LZMA1 also needs lc, lp, and pb.
177  */
178 typedef struct {
179 	/**
180 	 * \brief       Dictionary size in bytes
181 	 *
182 	 * Dictionary size indicates how many bytes of the recently processed
183 	 * uncompressed data are kept in memory. One method to reduce the size
184 	 * of the uncompressed data is to store distance-length pairs, which
185 	 * indicate what data to repeat from the dictionary buffer. Thus,
186 	 * the bigger the dictionary, the better the compression ratio
187 	 * usually is.
188 	 *
189 	 * Maximum size of the dictionary depends on multiple things:
190 	 *  - Memory usage limit
191 	 *  - Available address space (not a problem on 64-bit systems)
192 	 *  - Selected match finder (encoder only)
193 	 *
194 	 * Currently the maximum dictionary size for encoding is 1.5 GiB
195 	 * (i.e. (UINT32_C(1) << 30) + (UINT32_C(1) << 29)) even on 64-bit
196 	 * systems for certain match finder implementation reasons. In the
197 	 * future, there may be match finders that support bigger
198 	 * dictionaries.
199 	 *
200 	 * Decoder already supports dictionaries up to 4 GiB - 1 B (i.e.
201 	 * UINT32_MAX), so increasing the maximum dictionary size of the
202 	 * encoder won't cause problems for old decoders.
203 	 *
204 	 * Because extremely small dictionary sizes would have unneeded
205 	 * overhead in the decoder, the minimum dictionary size is 4096 bytes.
206 	 *
207 	 * \note        When decoding, a too big dictionary does no other harm
208 	 *              than wasting memory.
209 	 */
210 	uint32_t dict_size;
211 #	define LZMA_DICT_SIZE_MIN       UINT32_C(4096)
212 #	define LZMA_DICT_SIZE_DEFAULT   (UINT32_C(1) << 23)
213 
214 	/**
215 	 * \brief       Pointer to an initial dictionary
216 	 *
217 	 * It is possible to initialize the LZ77 history window using
218 	 * a preset dictionary. It is useful when compressing many
219 	 * similar, relatively small chunks of data independently from
220 	 * each other. The preset dictionary should contain typical
221 	 * strings that occur in the files being compressed. The most
222 	 * probable strings should be near the end of the preset dictionary.
223 	 *
224 	 * This feature should be used only in special situations. For
225 	 * now, it works correctly only with raw encoding and decoding.
226 	 * Currently none of the container formats supported by
227 	 * liblzma allow preset dictionary when decoding, thus if
228 	 * you create a .xz or .lzma file with preset dictionary, it
229 	 * cannot be decoded with the regular decoder functions. In the
230 	 * future, the .xz format will likely get support for preset
231 	 * dictionary though.
232 	 */
233 	const uint8_t *preset_dict;
234 
235 	/**
236 	 * \brief       Size of the preset dictionary
237 	 *
238 	 * Specifies the size of the preset dictionary. If the size is
239 	 * bigger than dict_size, only the last dict_size bytes are
240 	 * processed.
241 	 *
242 	 * This variable is read only when preset_dict is not NULL.
243 	 * If preset_dict is not NULL but preset_dict_size is zero,
244 	 * no preset dictionary is used (identical to only setting
245 	 * preset_dict to NULL).
246 	 */
247 	uint32_t preset_dict_size;
248 
249 	/**
250 	 * \brief       Number of literal context bits
251 	 *
252 	 * How many of the highest bits of the previous uncompressed
253 	 * eight-bit byte (also known as `literal') are taken into
254 	 * account when predicting the bits of the next literal.
255 	 *
256 	 * \todo        Example
257 	 *
258 	 * There is a limit that applies to literal context bits and literal
259 	 * position bits together: lc + lp <= 4. Without this limit the
260 	 * decoding could become very slow, which could have security related
261 	 * results in some cases like email servers doing virus scanning.
262 	 * This limit also simplifies the internal implementation in liblzma.
263 	 *
264 	 * There may be LZMA1 streams that have lc + lp > 4 (maximum possible
265 	 * lc would be 8). It is not possible to decode such streams with
266 	 * liblzma.
267 	 */
268 	uint32_t lc;
269 #	define LZMA_LCLP_MIN    0
270 #	define LZMA_LCLP_MAX    4
271 #	define LZMA_LC_DEFAULT  3
272 
273 	/**
274 	 * \brief       Number of literal position bits
275 	 *
276 	 * How many of the lowest bits of the current position (number
277 	 * of bytes from the beginning of the uncompressed data) in the
278 	 * uncompressed data are taken into account when predicting the
279 	 * bits of the next literal (a single eight-bit byte).
280 	 *
281 	 * \todo        Example
282 	 */
283 	uint32_t lp;
284 #	define LZMA_LP_DEFAULT  0
285 
286 	/**
287 	 * \brief       Number of position bits
288 	 *
289 	 * How many of the lowest bits of the current position in the
290 	 * uncompressed data are taken into account when estimating
291 	 * probabilities of matches. A match is a sequence of bytes for
292 	 * which a matching sequence is found from the dictionary and
293 	 * thus can be stored as a distance-length pair.
294 	 *
295 	 * Example: If most of the matches occur at byte positions of
296 	 * 8 * n + 3, that is, 3, 11, 19, ... set pb to 3, because 2**3 == 8.
297 	 */
298 	uint32_t pb;
299 #	define LZMA_PB_MIN      0
300 #	define LZMA_PB_MAX      4
301 #	define LZMA_PB_DEFAULT  2
302 
303 	/**
304 	 * \brief       Indicate if the options structure is persistent
305 	 *
306 	 * If this is true, the application must keep this options structure
307 	 * available after the LZMA2 encoder has been initialized. With
308 	 * persistent structure it is possible to change some encoder options
309 	 * in the middle of the encoding process without resetting the encoder.
310 	 *
311 	 * This option is used only by LZMA2. LZMA1 ignores this and it is
312 	 * safe to not initialize this when encoding with LZMA1.
313 	 */
314 	lzma_bool persistent;
315 
316 	/** Compression mode (see lzma_mode) */
317 	lzma_mode mode;
318 
319 	/**
320 	 * \brief       Nice length of a match
321 	 *
322 	 * This determines how many bytes the encoder compares from the match
323 	 * candidates when looking for the best match. Once a match of at
324 	 * least nice_len bytes long is found, the encoder stops looking for
325 	 * better candidates and encodes the match. (Naturally, if the found
326 	 * match is actually longer than nice_len, the actual length is
327 	 * encoded; it's not truncated to nice_len.)
328 	 *
329 	 * Bigger values usually increase the compression ratio and
330 	 * compression time. For most files, 32 to 128 is a good value,
331 	 * which gives very good compression ratio at good speed.
332 	 *
333 	 * The exact minimum value depends on the match finder. The maximum
334 	 * is 273, which is the maximum length of a match that LZMA1 and
335 	 * LZMA2 can encode.
336 	 */
337 	uint32_t nice_len;
338 
339 	/** Match finder ID (see lzma_match_finder) */
340 	lzma_match_finder mf;
341 
342 	/**
343 	 * \brief       Maximum search depth in the match finder
344 	 *
345 	 * For every input byte, match finder searches through the hash chain
346 	 * or binary tree in a loop, each iteration going one step deeper in
347 	 * the chain or tree. The searching stops if
348 	 *  - a match of at least nice_len bytes long is found;
349 	 *  - all match candidates from the hash chain or binary tree have
350 	 *    been checked; or
351 	 *  - maximum search depth is reached.
352 	 *
353 	 * Maximum search depth is needed to prevent the match finder from
354 	 * wasting too much time in case there are lots of short match
355 	 * candidates. On the other hand, stopping the search before all
356 	 * candidates have been checked can reduce compression ratio.
357 	 *
358 	 * Setting depth to zero tells liblzma to use an automatic default
359 	 * value that depends on the selected match finder and nice_len.
360 	 * The default is in the range [10, 200] or so (it may vary between
361 	 * liblzma versions).
362 	 *
363 	 * Using a bigger depth value than the default can increase
364 	 * compression ratio in some cases. There is no strict maximum value,
365 	 * but high values (thousands or millions) should be used with care:
366 	 * the encoder could remain fast enough with typical input, but
367 	 * malicious input could cause the match finder to slow down
368 	 * dramatically, possibly creating a denial of service attack.
369 	 */
370 	uint32_t depth;
371 
372 	/*
373 	 * Reserved space to allow possible future extensions without
374 	 * breaking the ABI. You should not touch these, because the names
375 	 * of these variables may change. These are not, and never will be,
376 	 * used with the currently supported options, so it is safe to leave
377 	 * these uninitialized.
378 	 */
379 	void *reserved_ptr1;
380 	void *reserved_ptr2;
381 	uint32_t reserved_int1;
382 	uint32_t reserved_int2;
383 	uint32_t reserved_int3;
384 	uint32_t reserved_int4;
385 	uint32_t reserved_int5;
386 	uint32_t reserved_int6;
387 	uint32_t reserved_int7;
388 	uint32_t reserved_int8;
389 	lzma_reserved_enum reserved_enum1;
390 	lzma_reserved_enum reserved_enum2;
391 	lzma_reserved_enum reserved_enum3;
392 	lzma_reserved_enum reserved_enum4;
393 
394 } lzma_options_lzma;
395 
396 
397 /**
398  * \brief       Set a compression preset to a lzma_options_lzma structure
399  *
400  * 0 is the fastest and 9 is the slowest. These match the switches -0 .. -9
401  * of the xz command line tool. In addition, it is possible to bitwise-or
402  * flags to the preset. Currently only LZMA_PRESET_EXTREME is supported.
403  * The flags are defined in container.h, because they are also used with
404  * lzma_easy_encoder(). Preset values may change between liblzma versions.
405  *
406  * This function is available only if LZMA1 or LZMA2 encoder has been enabled
407  * when building liblzma.
408  *
409  * NOTE(review): the meaning of the lzma_bool return value is not stated here.
410  */
411 extern LZMA_API(lzma_bool) lzma_lzma_preset(
412 		lzma_options_lzma *options, uint32_t preset) lzma_nothrow;
413