1 /*
2  * deflate_compress.c - a compressor for DEFLATE
3  *
4  * Originally public domain; changes after 2016-09-07 are copyrighted.
5  *
6  * Copyright 2016 Eric Biggers
7  *
8  * Permission is hereby granted, free of charge, to any person
9  * obtaining a copy of this software and associated documentation
10  * files (the "Software"), to deal in the Software without
11  * restriction, including without limitation the rights to use,
12  * copy, modify, merge, publish, distribute, sublicense, and/or sell
13  * copies of the Software, and to permit persons to whom the
14  * Software is furnished to do so, subject to the following
15  * conditions:
16  *
17  * The above copyright notice and this permission notice shall be
18  * included in all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27  * OTHER DEALINGS IN THE SOFTWARE.
28  */
29 
30 #include <stdlib.h>
31 #include <string.h>
32 
33 #include "aligned_malloc.h"
34 #include "deflate_compress.h"
35 #include "deflate_constants.h"
36 #include "unaligned.h"
37 
38 #include "libdeflate.h"
39 
40 /*
41  * By default, the near-optimal parsing algorithm is enabled at compression
42  * level 8 and above.  The near-optimal parsing algorithm produces a compression
43  * ratio significantly better than that of the greedy and lazy algorithms
44  * implemented here, as well as that of zlib at level 9.  However, it is slow.
45  */
46 #define SUPPORT_NEAR_OPTIMAL_PARSING 1
47 
48 /*
49  * Define to 1 to maintain the full map from match offsets to offset slots.
50  * This slightly speeds up translations of match offsets to offset slots, but it
51  * uses 32769 bytes of memory rather than the 512 bytes used by the condensed
52  * map.  The speedup provided by the larger map is most helpful when the
53  * near-optimal parsing algorithm is being used.
54  */
55 #define USE_FULL_OFFSET_SLOT_FAST	SUPPORT_NEAR_OPTIMAL_PARSING
56 
57 /*
58  * DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters
59  * appropriately.
60  */
61 #define MATCHFINDER_WINDOW_ORDER	15
62 
63 #include "hc_matchfinder.h"
64 #if SUPPORT_NEAR_OPTIMAL_PARSING
65 #  include "bt_matchfinder.h"
66 #endif
67 
68 /*
69  * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes,
70  * except if the last block has to be shorter.
71  */
72 #define MIN_BLOCK_LENGTH	10000
73 
74 /*
75  * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but
76  * the final length might be slightly longer due to matches extending beyond
77  * this limit.
78  */
79 #define SOFT_MAX_BLOCK_LENGTH	300000
80 
81 /*
82  * The number of observed matches or literals that represents sufficient data to
83  * decide whether the current block should be terminated or not.
84  */
85 #define NUM_OBSERVATIONS_PER_BLOCK_CHECK       512
86 
87 
88 #if SUPPORT_NEAR_OPTIMAL_PARSING
89 /* Constants specific to the near-optimal parsing algorithm */
90 
91 /*
92  * The maximum number of matches the matchfinder can find at a single position.
93  * Since the matchfinder never finds more than one match for the same length,
94  * assuming one match of each possible length gives a sufficient upper bound.
95  * (This says nothing about whether it is worthwhile to consider so many
96  * matches; this is just defining the worst case.)
97  */
98 #  define MAX_MATCHES_PER_POS	(DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
99 
100 /*
101  * The number of lz_match structures in the match cache, excluding the extra
102  * "overflow" entries.  This value should be high enough so that nearly the
103  * time, all matches found in a given block can fit in the match cache.
104  * However, fallback behavior (immediately terminating the block) on cache
105  * overflow is still required.
106  */
107 #  define CACHE_LENGTH      (SOFT_MAX_BLOCK_LENGTH * 5)
108 
109 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
110 
111 /*
112  * These are the compressor-side limits on the codeword lengths for each Huffman
113  * code.  To make outputting bits slightly faster, some of these limits are
114  * lower than the limits defined by the DEFLATE format.  This does not
115  * significantly affect the compression ratio, at least for the block lengths we
116  * use.
117  */
118 #define MAX_LITLEN_CODEWORD_LEN		14
119 #define MAX_OFFSET_CODEWORD_LEN		DEFLATE_MAX_OFFSET_CODEWORD_LEN
120 #define MAX_PRE_CODEWORD_LEN		DEFLATE_MAX_PRE_CODEWORD_LEN
121 
122 /* Table: length slot => length slot base value  */
123 static const unsigned deflate_length_slot_base[] = {
124 	3   , 4   , 5   , 6   , 7   , 8   , 9   , 10  ,
125 	11  , 13  , 15  , 17  , 19  , 23  , 27  , 31  ,
126 	35  , 43  , 51  , 59  , 67  , 83  , 99  , 115 ,
127 	131 , 163 , 195 , 227 , 258 ,
128 };
129 
130 /* Table: length slot => number of extra length bits  */
131 static const u8 deflate_extra_length_bits[] = {
132 	0   , 0   , 0   , 0   , 0   , 0   , 0   , 0 ,
133 	1   , 1   , 1   , 1   , 2   , 2   , 2   , 2 ,
134 	3   , 3   , 3   , 3   , 4   , 4   , 4   , 4 ,
135 	5   , 5   , 5   , 5   , 0   ,
136 };
137 
138 /* Table: offset slot => offset slot base value  */
139 static const unsigned deflate_offset_slot_base[] = {
140 	1    , 2    , 3    , 4     , 5     , 7     , 9     , 13    ,
141 	17   , 25   , 33   , 49    , 65    , 97    , 129   , 193   ,
142 	257  , 385  , 513  , 769   , 1025  , 1537  , 2049  , 3073  ,
143 	4097 , 6145 , 8193 , 12289 , 16385 , 24577 ,
144 };
145 
146 /* Table: offset slot => number of extra offset bits  */
147 static const u8 deflate_extra_offset_bits[] = {
148 	0    , 0    , 0    , 0     , 1     , 1     , 2     , 2     ,
149 	3    , 3    , 4    , 4     , 5     , 5     , 6     , 6     ,
150 	7    , 7    , 8    , 8     , 9     , 9     , 10    , 10    ,
151 	11   , 11   , 12   , 12    , 13    , 13    ,
152 };
153 
154 /* Table: length => length slot  */
155 static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
156 	0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12,
157 	12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16,
158 	16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
159 	18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
160 	20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
161 	21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
162 	22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
163 	23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
164 	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
165 	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
166 	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
167 	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
168 	26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
169 	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
170 	27, 27, 28,
171 };
172 
173 /* The order in which precode codeword lengths are stored */
174 static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
175 	16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
176 };
177 
178 /* Codewords for the DEFLATE Huffman codes.  */
179 struct deflate_codewords {
180 	u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
181 	u32 offset[DEFLATE_NUM_OFFSET_SYMS];
182 };
183 
184 /* Codeword lengths (in bits) for the DEFLATE Huffman codes.
185  * A zero length means the corresponding symbol had zero frequency.  */
186 struct deflate_lens {
187 	u8 litlen[DEFLATE_NUM_LITLEN_SYMS];
188 	u8 offset[DEFLATE_NUM_OFFSET_SYMS];
189 };
190 
191 /* Codewords and lengths for the DEFLATE Huffman codes.  */
192 struct deflate_codes {
193 	struct deflate_codewords codewords;
194 	struct deflate_lens lens;
195 };
196 
197 /* Symbol frequency counters for the DEFLATE Huffman codes.  */
198 struct deflate_freqs {
199 	u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
200 	u32 offset[DEFLATE_NUM_OFFSET_SYMS];
201 };
202 
203 #if SUPPORT_NEAR_OPTIMAL_PARSING
204 
205 /* Costs for the near-optimal parsing algorithm.  */
206 struct deflate_costs {
207 
208 	/* The cost to output each possible literal.  */
209 	u32 literal[DEFLATE_NUM_LITERALS];
210 
211 	/* The cost to output each possible match length.  */
212 	u32 length[DEFLATE_MAX_MATCH_LEN + 1];
213 
214 	/* The cost to output a match offset of each possible offset slot.  */
215 	u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
216 };
217 
218 /*
219  * COST_SHIFT is a scaling factor that makes it possible to consider fractional
220  * bit costs.  A token requiring 'n' bits to represent has cost n << COST_SHIFT.
221  *
222  * Note: this is only useful as a statistical trick for when the true costs are
223  * unknown.  In reality, each token in DEFLATE requires a whole number of bits
224  * to output.
225  */
226 #define COST_SHIFT	3
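/*
 * For example (values chosen purely for illustration): with COST_SHIFT == 3, a
 * token that costs exactly 5 bits has cost 5 << COST_SHIFT == 40, while a cost
 * of 36 corresponds to 4.5 bits, which the scaled representation can express
 * even though no real DEFLATE token costs a fractional number of bits.
 */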
227 
228 /*
229  * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
230  * be needed to output a symbol that was unused in the previous optimization
231  * pass.  Assigning a default cost allows the symbol to be used in the next
232  * optimization pass.  However, the cost should be relatively high because the
233  * symbol probably won't be used very many times (if at all).
234  */
235 #define LITERAL_NOSTAT_BITS	13
236 #define LENGTH_NOSTAT_BITS	13
237 #define OFFSET_NOSTAT_BITS	10
238 
239 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
240 
241 /*
242  * Represents a run of literals followed by a match or end-of-block.  This
243  * struct is needed to temporarily store items chosen by the parser, since items
244  * cannot be written until all items for the block have been chosen and the
245  * block's Huffman codes have been computed.
246  */
247 struct deflate_sequence {
248 
249 	/* Bits 0..22: the number of literals in this run.  This may be 0 and
250 	 * can be at most about SOFT_MAX_BLOCK_LENGTH.  The literals are not
251 	 * stored explicitly in this structure; instead, they are read directly
252 	 * from the uncompressed data.
253 	 *
254 	 * Bits 23..31: the length of the match which follows the literals, or 0
255 	 * if this literal run was the last in the block, so there is no match
256 	 * which follows it.  */
257 	u32 litrunlen_and_length;
258 
259 	/* If 'length' doesn't indicate end-of-block, then this is the offset of
260 	 * the match which follows the literals.  */
261 	u16 offset;
262 
263 	/* If 'length' doesn't indicate end-of-block, then this is the offset
264 	 * symbol of the match which follows the literals.  */
265 	u8 offset_symbol;
266 
267 	/* If 'length' doesn't indicate end-of-block, then this is the length
268 	 * slot of the match which follows the literals.  */
269 	u8 length_slot;
270 };
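/*
 * Packing example (arbitrary values, for illustration only): a run of 5
 * literals followed by a match of length 10 would be stored as
 *
 *	seq->litrunlen_and_length = ((u32)10 << 23) | 5;
 *
 * and the writer recovers the two fields with '& 0x7FFFFF' (bits 0..22) and
 * '>> 23' (bits 23..31), as done in deflate_write_sequences().
 */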
271 
272 #if SUPPORT_NEAR_OPTIMAL_PARSING
273 
274 /*
275  * This structure represents a byte position in the input data and a node in the
276  * graph of possible match/literal choices for the current block.
277  *
278  * Logically, each incoming edge to this node is labeled with a literal or a
279  * match that can be taken to reach this position from an earlier position; and
280  * each outgoing edge from this node is labeled with a literal or a match that
281  * can be taken to advance from this position to a later position.
282  *
283  * But these "edges" are actually stored elsewhere (in 'match_cache').  Here we
284  * associate with each node just two pieces of information:
285  *
286  *	'cost_to_end' is the minimum cost to reach the end of the block from
287  *	this position.
288  *
289  *	'item' represents the literal or match that must be chosen from here to
290  *	reach the end of the block with the minimum cost.  Equivalently, this
291  *	can be interpreted as the label of the outgoing edge on the minimum-cost
292  *	path to the "end of block" node from this node.
293  */
294 struct deflate_optimum_node {
295 
296 	u32 cost_to_end;
297 
298 	/*
299 	 * Notes on the match/literal representation used here:
300 	 *
301 	 *	The low bits of 'item' are the length: 1 if this is a literal,
302 	 *	or the match length if this is a match.
303 	 *
304 	 *	The high bits of 'item' are the actual literal byte if this is a
305 	 *	literal, or the match offset if this is a match.
306 	 */
307 #define OPTIMUM_OFFSET_SHIFT 9
308 #define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
309 	u32 item;
310 
311 };
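/*
 * Encoding example (arbitrary values, for illustration only): a match of
 * length 10 at offset 100 is represented as
 *
 *	item = ((u32)100 << OPTIMUM_OFFSET_SHIFT) | 10;
 *
 * while a literal byte 0x41 is represented as
 *
 *	item = ((u32)0x41 << OPTIMUM_OFFSET_SHIFT) | 1;
 *
 * The length can be recovered with 'item & OPTIMUM_LEN_MASK' and the offset
 * (or literal byte) with 'item >> OPTIMUM_OFFSET_SHIFT'.
 */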
312 
313 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
314 
315 /* Block split statistics.  See "Block splitting algorithm" below. */
316 #define NUM_LITERAL_OBSERVATION_TYPES 8
317 #define NUM_MATCH_OBSERVATION_TYPES 2
318 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES)
319 struct block_split_stats {
320 	u32 new_observations[NUM_OBSERVATION_TYPES];
321 	u32 observations[NUM_OBSERVATION_TYPES];
322 	u32 num_new_observations;
323 	u32 num_observations;
324 };
325 
326 /* The main DEFLATE compressor structure  */
327 struct libdeflate_compressor {
328 
329 	/* Pointer to the compress() implementation chosen at allocation time */
330 	size_t (*impl)(struct libdeflate_compressor *,
331 		       const u8 *, size_t, u8 *, size_t);
332 
333 	/* Frequency counters for the current block  */
334 	struct deflate_freqs freqs;
335 
336 	/* Dynamic Huffman codes for the current block  */
337 	struct deflate_codes codes;
338 
339 	/* Static Huffman codes */
340 	struct deflate_codes static_codes;
341 
342 	/* Block split statistics for the currently pending block */
343 	struct block_split_stats split_stats;
344 
345 	/* A table for fast lookups of offset slot by match offset.
346 	 *
347 	 * If the full table is being used, it is a direct mapping from offset
348 	 * to offset slot.
349 	 *
350 	 * If the condensed table is being used, the first 256 entries map
351 	 * directly to the offset slots of offsets 1 through 256.  The next 256
352 	 * entries map to the offset slots for the remaining offsets, stepping
353 	 * through the offsets with a stride of 128.  This relies on the fact
354 	 * that each of the remaining offset slots contains at least 128 offsets
355 	 * and has an offset base that is a multiple of 128.  */
356 #if USE_FULL_OFFSET_SLOT_FAST
357 	u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1];
358 #else
359 	u8 offset_slot_fast[512];
360 #endif
361 
362 	/* The "nice" match length: if a match of this length is found, choose
363 	 * it immediately without further consideration.  */
364 	unsigned nice_match_length;
365 
366 	/* The maximum search depth: consider at most this many potential
367 	 * matches at each position.  */
368 	unsigned max_search_depth;
369 
370 	/* The compression level with which this compressor was created.  */
371 	unsigned compression_level;
372 
373 	/* Temporary space for Huffman code output  */
374 	u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS];
375 	u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
376 	u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS];
377 	unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS];
378 	unsigned num_litlen_syms;
379 	unsigned num_offset_syms;
380 	unsigned num_explicit_lens;
381 	unsigned num_precode_items;
382 
383 	union {
384 		/* Data for greedy or lazy parsing  */
385 		struct {
386 			/* Hash chain matchfinder  */
387 			struct hc_matchfinder hc_mf;
388 
389 			/* The matches and literals that the parser has chosen
390 			 * for the current block.  The required length of this
391 			 * array is limited by the maximum number of matches
392 			 * that can ever be chosen for a single block, plus one
393 			 * for the special entry at the end.  */
394 			struct deflate_sequence sequences[
395 				DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
396 					     DEFLATE_MIN_MATCH_LEN) + 1];
397 		} g; /* (g)reedy */
398 
399 	#if SUPPORT_NEAR_OPTIMAL_PARSING
400 		/* Data for near-optimal parsing  */
401 		struct {
402 
403 			/* Binary tree matchfinder  */
404 			struct bt_matchfinder bt_mf;
405 
406 			/*
407 			 * Cached matches for the current block.  This array
408 			 * contains the matches that were found at each position
409 			 * in the block.  Specifically, for each position, there
410 			 * is a list of matches found at that position, if any,
411 			 * sorted by strictly increasing length.  In addition,
412 			 * following the matches for each position, there is a
413 			 * special 'struct lz_match' whose 'length' member
414 			 * contains the number of matches found at that
415 			 * position, and whose 'offset' member contains the
416 			 * literal at that position.
417 			 *
418 			 * Note: in rare cases, there will be a very high number
419 			 * of matches in the block and this array will overflow.
420 			 * If this happens, we force the end of the current
421 			 * block.  CACHE_LENGTH is the length at which we
422 			 * actually check for overflow.  The extra slots beyond
423 			 * this are enough to absorb the worst case overflow,
424 			 * which occurs if starting at &match_cache[CACHE_LENGTH
425 			 * - 1], we write MAX_MATCHES_PER_POS matches and a
426 			 * match count header, then skip searching for matches
427 			 * at 'DEFLATE_MAX_MATCH_LEN - 1' positions and write
428 			 * the match count header for each.
429 			 */
430 			struct lz_match match_cache[CACHE_LENGTH +
431 						    MAX_MATCHES_PER_POS +
432 						    DEFLATE_MAX_MATCH_LEN - 1];
433 
434 			/*
435 			 * Array of nodes, one per position, for running the
436 			 * minimum-cost path algorithm.
437 			 *
438 			 * This array must be large enough to accommodate the
439 			 * worst-case number of nodes, which occurs if we find a
440 			 * match of length DEFLATE_MAX_MATCH_LEN at position
441 			 * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of
442 			 * length SOFT_MAX_BLOCK_LENGTH - 1 +
443 			 * DEFLATE_MAX_MATCH_LEN.  Add one for the end-of-block
444 			 * node.
445 			 */
446 			struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 +
447 								  DEFLATE_MAX_MATCH_LEN + 1];
448 
449 			/* The current cost model being used.  */
450 			struct deflate_costs costs;
451 
452 			unsigned num_optim_passes;
453 		} n; /* (n)ear-optimal */
454 	#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
455 
456 	} p; /* (p)arser */
457 };
458 
459 /*
460  * The type for the bitbuffer variable, which temporarily holds bits that are
461  * being packed into bytes and written to the output buffer.  For best
462  * performance, this should have size equal to a machine word.
463  */
464 typedef machine_word_t bitbuf_t;
465 #define BITBUF_NBITS	(8 * sizeof(bitbuf_t))
466 
467 /* Can the specified number of bits always be added to 'bitbuf' after any
468  * pending bytes have been flushed?  */
469 #define CAN_BUFFER(n)	((n) <= BITBUF_NBITS - 7)
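/*
 * For example, on a typical 64-bit machine BITBUF_NBITS is 64, so CAN_BUFFER(n)
 * holds for n <= 57: a flush leaves at most 7 bits pending in 'bitbuf', so at
 * least 57 more bits can then always be added without overflow.
 */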
470 
471 /*
472  * Structure to keep track of the current state of sending bits to the
473  * compressed output buffer.
474  */
475 struct deflate_output_bitstream {
476 
477 	/* Bits that haven't yet been written to the output buffer.  */
478 	bitbuf_t bitbuf;
479 
480 	/* Number of bits currently held in @bitbuf.  */
481 	unsigned bitcount;
482 
483 	/* Pointer to the beginning of the output buffer.  */
484 	u8 *begin;
485 
486 	/* Pointer to the position in the output buffer at which the next byte
487 	 * should be written.  */
488 	u8 *next;
489 
490 	/* Pointer just past the end of the output buffer.  */
491 	u8 *end;
492 };
493 
494 /*
495  * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be
496  * present following os->end, in order to not overrun the buffer when generating
497  * output.  When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t)
498  * bytes for put_unaligned_leword().  Otherwise we need only 1 byte.  However,
499  * to make the compression algorithm produce the same result on all CPU
500  * architectures (which is sometimes desirable), we have to unconditionally use
501  * the maximum for any CPU, which is sizeof(bitbuf_t) == 8.
502  */
503 #define OUTPUT_END_PADDING	8
504 
505 /* Initialize the output bitstream.  'size' is assumed to be at least
506  * OUTPUT_END_PADDING.  */
507 static void
508 deflate_init_output(struct deflate_output_bitstream *os,
509 		    void *buffer, size_t size)
510 {
511 	os->bitbuf = 0;
512 	os->bitcount = 0;
513 	os->begin = buffer;
514 	os->next = os->begin;
515 	os->end = os->begin + size - OUTPUT_END_PADDING;
516 }
517 
518 /* Add some bits to the bitbuffer variable of the output bitstream.  The caller
519  * must make sure there is enough room.  */
520 static forceinline void
521 deflate_add_bits(struct deflate_output_bitstream *os,
522 		 const bitbuf_t bits, const unsigned num_bits)
523 {
524 	os->bitbuf |= bits << os->bitcount;
525 	os->bitcount += num_bits;
526 }
527 
528 /* Flush bits from the bitbuffer variable to the output buffer.  */
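/*
 * Worked example of the fast path (illustrative numbers): if 'bitcount' is 27,
 * the whole word is stored with put_unaligned_leword(), 'next' advances by
 * 27 >> 3 == 3 bytes (clamped to the space remaining before 'end'), 'bitbuf'
 * is shifted right by 27 & ~7 == 24 bits, and 'bitcount' becomes 27 & 7 == 3.
 */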
529 static forceinline void
530 deflate_flush_bits(struct deflate_output_bitstream *os)
531 {
532 	if (UNALIGNED_ACCESS_IS_FAST) {
533 		/* Flush a whole word (branchlessly).  */
534 		put_unaligned_leword(os->bitbuf, os->next);
535 		os->bitbuf >>= os->bitcount & ~7;
536 		os->next += MIN(os->end - os->next, os->bitcount >> 3);
537 		os->bitcount &= 7;
538 	} else {
539 		/* Flush a byte at a time.  */
540 		while (os->bitcount >= 8) {
541 			*os->next = os->bitbuf;
542 			if (os->next != os->end)
543 				os->next++;
544 			os->bitcount -= 8;
545 			os->bitbuf >>= 8;
546 		}
547 	}
548 }
549 
550 /* Align the bitstream on a byte boundary. */
551 static forceinline void
552 deflate_align_bitstream(struct deflate_output_bitstream *os)
553 {
554 	os->bitcount += -os->bitcount & 7;
555 	deflate_flush_bits(os);
556 }
557 
558 /*
559  * Flush any remaining bits to the output buffer if needed.  Return the total
560  * number of bytes written to the output buffer, or 0 if an overflow occurred.
561  */
562 static u32
563 deflate_flush_output(struct deflate_output_bitstream *os)
564 {
565 	if (os->next == os->end) /* overflow?  */
566 		return 0;
567 
568 	while ((int)os->bitcount > 0) {
569 		*os->next++ = os->bitbuf;
570 		os->bitcount -= 8;
571 		os->bitbuf >>= 8;
572 	}
573 
574 	return os->next - os->begin;
575 }
576 
577 /* Given the binary tree node A[subtree_idx] whose children already
578  * satisfy the maxheap property, swap the node with its greater child
579  * until it is greater than both its children, so that the maxheap
580  * property is satisfied in the subtree rooted at A[subtree_idx].  */
581 static void
582 heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
583 {
584 	unsigned parent_idx;
585 	unsigned child_idx;
586 	u32 v;
587 
588 	v = A[subtree_idx];
589 	parent_idx = subtree_idx;
590 	while ((child_idx = parent_idx * 2) <= length) {
591 		if (child_idx < length && A[child_idx + 1] > A[child_idx])
592 			child_idx++;
593 		if (v >= A[child_idx])
594 			break;
595 		A[parent_idx] = A[child_idx];
596 		parent_idx = child_idx;
597 	}
598 	A[parent_idx] = v;
599 }
600 
601 /* Rearrange the array 'A' so that it satisfies the maxheap property.
602  * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
603  */
604 static void
605 heapify_array(u32 A[], unsigned length)
606 {
607 	unsigned subtree_idx;
608 
609 	for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
610 		heapify_subtree(A, length, subtree_idx);
611 }
612 
613 /*
614  * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
615  *
616  * Note: name this function heap_sort() instead of heapsort() to avoid colliding
617  * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
618  * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
619  */
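/*
 * Example usage (illustration only): heap_sort(a, 4) on the array
 * a[4] = { 5, 1, 9, 3 } leaves it sorted in increasing order: { 1, 3, 5, 9 }.
 */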
620 static void
621 heap_sort(u32 A[], unsigned length)
622 {
623 	A--; /* Use 1-based indices  */
624 
625 	heapify_array(A, length);
626 
627 	while (length >= 2) {
628 		u32 tmp = A[length];
629 		A[length] = A[1];
630 		A[1] = tmp;
631 		length--;
632 		heapify_subtree(A, length, 1);
633 	}
634 }
635 
636 #define NUM_SYMBOL_BITS 10
637 #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
638 
639 #define GET_NUM_COUNTERS(num_syms)	(((((num_syms) + 3) / 4) + 3) & ~3)
640 /*
641  * Sort the symbols primarily by frequency and secondarily by symbol
642  * value.  Discard symbols with zero frequency and fill in an array with
643  * the remaining symbols, along with their frequencies.  The low
644  * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
645  * value, and the remaining bits will contain the frequency.
646  *
647  * @num_syms
648  *	Number of symbols in the alphabet.
649  *	Can't be greater than (1 << NUM_SYMBOL_BITS).
650  *
651  * @freqs[num_syms]
652  *	The frequency of each symbol.
653  *
654  * @lens[num_syms]
655  *	An array that eventually will hold the length of each codeword.
656  *	This function only fills in the codeword lengths for symbols that
657  *	have zero frequency; such symbols have no codeword per se, so
658  *	their lengths are simply set to 0.
659  *
660  * @symout[num_syms]
661  *	The output array, described above.
662  *
663  * Returns the number of entries in 'symout' that were filled.  This is
664  * the number of symbols that have nonzero frequency.
665  */
666 static unsigned
667 sort_symbols(unsigned num_syms, const u32 freqs[restrict],
668 	     u8 lens[restrict], u32 symout[restrict])
669 {
670 	unsigned sym;
671 	unsigned i;
672 	unsigned num_used_syms;
673 	unsigned num_counters;
674 	unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)];
675 
676 	/* We rely on heapsort, but with an added optimization.  Since
677 	 * it's common for most symbol frequencies to be low, we first do
678 	 * a count sort using a limited number of counters.  High
679 	 * frequencies will be counted in the last counter, and only they
680 	 * will be sorted with heapsort.
681 	 *
682 	 * Note: with more symbols, it is generally beneficial to have more
683 	 * counters.  About 1 counter per 4 symbols seems fast.
684 	 *
685 	 * Note: I also tested radix sort, but even for large symbol
686 	 * counts (> 255) and frequencies bounded at 16 bits (enabling
687 	 * radix sort by just two base-256 digits), it didn't seem any
688 	 * faster than the method implemented here.
689 	 *
690 	 * Note: I tested the optimized quicksort implementation from
691 	 * glibc (with indirection overhead removed), but it was only
692 	 * marginally faster than the simple heapsort implemented here.
693 	 *
694 	 * Tests were done with building the codes for LZX.  Results may
695 	 * vary for different compression algorithms...!  */
696 
697 	num_counters = GET_NUM_COUNTERS(num_syms);
698 
699 	memset(counters, 0, num_counters * sizeof(counters[0]));
700 
701 	/* Count the frequencies.  */
702 	for (sym = 0; sym < num_syms; sym++)
703 		counters[MIN(freqs[sym], num_counters - 1)]++;
704 
705 	/* Make the counters cumulative, ignoring the zero-th, which
706 	 * counted symbols with zero frequency.  As a side effect, this
707 	 * calculates the number of symbols with nonzero frequency.  */
708 	num_used_syms = 0;
709 	for (i = 1; i < num_counters; i++) {
710 		unsigned count = counters[i];
711 		counters[i] = num_used_syms;
712 		num_used_syms += count;
713 	}
714 
715 	/* Sort nonzero-frequency symbols using the counters.  At the
716 	 * same time, set the codeword lengths of zero-frequency symbols
717 	 * to 0.  */
718 	for (sym = 0; sym < num_syms; sym++) {
719 		u32 freq = freqs[sym];
720 		if (freq != 0) {
721 			symout[counters[MIN(freq, num_counters - 1)]++] =
722 				sym | (freq << NUM_SYMBOL_BITS);
723 		} else {
724 			lens[sym] = 0;
725 		}
726 	}
727 
728 	/* Sort the symbols counted in the last counter.  */
729 	heap_sort(symout + counters[num_counters - 2],
730 		  counters[num_counters - 1] - counters[num_counters - 2]);
731 
732 	return num_used_syms;
733 }
734 
735 /*
736  * Build the Huffman tree.
737  *
738  * This is an optimized implementation that
739  *	(a) takes advantage of the frequencies being already sorted;
740  *	(b) only generates non-leaf nodes, since the non-leaf nodes of a
741  *	    Huffman tree are sufficient to generate a canonical code;
742  *	(c) only stores parent pointers, not child pointers;
743  *	(d) produces the nodes in the same memory used for the input
744  *	    frequency information.
745  *
746  * Array 'A', which contains 'sym_count' entries, is used for both input
747  * and output.  For this function, 'sym_count' must be at least 2.
748  *
749  * For input, the array must contain the frequencies of the symbols,
750  * sorted in increasing order.  Specifically, each entry must contain a
751  * frequency left shifted by NUM_SYMBOL_BITS bits.  Any data in the low
752  * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
753  * Although these bits will, in fact, contain the symbols that correspond
754  * to the frequencies, this function is concerned with frequencies only
755  * and keeps the symbols as-is.
756  *
757  * For output, this function will produce the non-leaf nodes of the
758  * Huffman tree.  These nodes will be stored in the first (sym_count - 1)
759  * entries of the array.  Entry A[sym_count - 2] will represent the root
760  * node.  Each other node will contain the zero-based index of its parent
761  * node in 'A', left shifted by NUM_SYMBOL_BITS bits.  The low
762  * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is.  Again,
763  * note that although these low bits will, in fact, contain a symbol
764  * value, this symbol will have *no relationship* with the Huffman tree
765  * node that happens to occupy the same slot.  This is because this
766  * implementation only generates the non-leaf nodes of the tree.
767  */
768 static void
769 build_tree(u32 A[], unsigned sym_count)
770 {
771 	/* Index, in 'A', of next lowest frequency symbol that has not
772 	 * yet been processed.  */
773 	unsigned i = 0;
774 
775 	/* Index, in 'A', of next lowest frequency parentless non-leaf
776 	 * node; or, if equal to 'e', then no such node exists yet.  */
777 	unsigned b = 0;
778 
779 	/* Index, in 'A', of next node to allocate as a non-leaf.  */
780 	unsigned e = 0;
781 
782 	do {
783 		unsigned m, n;
784 		u32 freq_shifted;
785 
786 		/* Choose the two next lowest frequency entries.  */
787 
788 		if (i != sym_count &&
789 		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
790 			m = i++;
791 		else
792 			m = b++;
793 
794 		if (i != sym_count &&
795 		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
796 			n = i++;
797 		else
798 			n = b++;
799 
800 		/* Allocate a non-leaf node and link the entries to it.
801 		 *
802 		 * If we link an entry that we're visiting for the first
803 		 * time (via index 'i'), then we're actually linking a
804 		 * leaf node and it will have no effect, since the leaf
805 		 * will be overwritten with a non-leaf when index 'e'
806 		 * catches up to it.  But it's not any slower to
807 		 * unconditionally set the parent index.
808 		 *
809 		 * We also compute the frequency of the non-leaf node as
810 		 * the sum of its two children's frequencies.  */
811 
812 		freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);
813 
814 		A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
815 		A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
816 		A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
817 		e++;
818 	} while (sym_count - e > 1);
819 		/* When just one entry remains, it is a "leaf" that was
820 		 * linked to some other node.  We ignore it, since the
821 		 * rest of the array contains the non-leaves which we
822 		 * need.  (Note that we're assuming the cases with 0 or 1
823 		 * symbols were handled separately.) */
824 }
825 
826 /*
827  * Given the stripped-down Huffman tree constructed by build_tree(),
828  * determine the number of codewords that should be assigned each
829  * possible length, taking into account the length-limited constraint.
830  *
831  * @A
832  *	The array produced by build_tree(), containing parent index
833  *	information for the non-leaf nodes of the Huffman tree.  Each
834  *	entry in this array is a node; a node's parent always has a
835  *	greater index than that node itself.  This function will
836  *	overwrite the parent index information in this array, so
837  *	essentially it will destroy the tree.  However, the data in the
838  *	low NUM_SYMBOL_BITS of each entry will be preserved.
839  *
840  * @root_idx
841  *	The 0-based index of the root node in 'A', and consequently one
842  *	less than the number of tree node entries in 'A'.  (Or, really 2
843  *	less than the actual length of 'A'.)
844  *
845  * @len_counts
846  *	An array of length ('max_codeword_len' + 1) in which the number of
847  *	codewords having each length <= max_codeword_len will be
848  *	returned.
849  *
850  * @max_codeword_len
851  *	The maximum permissible codeword length.
852  */
853 static void
854 compute_length_counts(u32 A[restrict], unsigned root_idx,
855 		      unsigned len_counts[restrict], unsigned max_codeword_len)
856 {
857 	unsigned len;
858 	int node;
859 
860 	/* The key observations are:
861 	 *
862 	 * (1) We can traverse the non-leaf nodes of the tree, always
863 	 * visiting a parent before its children, by simply iterating
864 	 * through the array in reverse order.  Consequently, we can
865 	 * compute the depth of each node in one pass, overwriting the
866 	 * parent indices with depths.
867 	 *
868 	 * (2) We can initially assume that in the real Huffman tree,
869 	 * both children of the root are leaves.  This corresponds to two
870 	 * codewords of length 1.  Then, whenever we visit a (non-leaf)
871 	 * node during the traversal, we modify this assumption to
872 	 * account for the current node *not* being a leaf, but rather
873 	 * its two children being leaves.  This causes the loss of one
874 	 * codeword for the current depth and the addition of two
875 	 * codewords for the current depth plus one.
876 	 *
877 	 * (3) We can handle the length-limited constraint fairly easily
878 	 * by simply using the largest length available when a depth
879 	 * exceeds max_codeword_len.
880 	 */
881 
882 	for (len = 0; len <= max_codeword_len; len++)
883 		len_counts[len] = 0;
884 	len_counts[1] = 2;
885 
886 	/* Set the root node's depth to 0.  */
887 	A[root_idx] &= SYMBOL_MASK;
888 
889 	for (node = root_idx - 1; node >= 0; node--) {
890 
891 		/* Calculate the depth of this node.  */
892 
893 		unsigned parent = A[node] >> NUM_SYMBOL_BITS;
894 		unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
895 		unsigned depth = parent_depth + 1;
896 		unsigned len = depth;
897 
898 		/* Set the depth of this node so that it is available
899 		 * when its children (if any) are processed.  */
900 
901 		A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
902 
903 		/* If needed, decrease the length to meet the
904 		 * length-limited constraint.  This is not the optimal
905 		 * method for generating length-limited Huffman codes!
906 		 * But it should be good enough.  */
907 		if (len >= max_codeword_len) {
908 			len = max_codeword_len;
909 			do {
910 				len--;
911 			} while (len_counts[len] == 0);
912 		}
913 
914 		/* Account for the fact that we have a non-leaf node at
915 		 * the current depth.  */
916 		len_counts[len]--;
917 		len_counts[len + 1] += 2;
918 	}
919 }
920 
921 /*
922  * Generate the codewords for a canonical Huffman code.
923  *
924  * @A
925  *	The output array for codewords.  In addition, initially this
926  *	array must contain the symbols, sorted primarily by frequency and
927  *	secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
928  *	each entry.
929  *
930  * @len
931  *	Output array for codeword lengths.
932  *
933  * @len_counts
934  *	An array that provides the number of codewords that will have
935  *	each possible length <= max_codeword_len.
936  *
937  * @max_codeword_len
938  *	Maximum length, in bits, of each codeword.
939  *
940  * @num_syms
941  *	Number of symbols in the alphabet, including symbols with zero
942  *	frequency.  This is the length of the 'A' and 'len' arrays.
943  */
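/*
 * Worked example (illustrative values): suppose num_syms == 4 and the assigned
 * lengths end up being lens[] = { 3, 1, 3, 2 } for symbols 0..3.  Then
 * len_counts[1] == 1, len_counts[2] == 1, and len_counts[3] == 2, so
 * next_codewords[1] == 0b0, next_codewords[2] == 0b10, and
 * next_codewords[3] == 0b110.  Assigning codewords in symbol order yields the
 * canonical (not yet bit-reversed) codewords 0b110, 0b0, 0b111, and 0b10 for
 * symbols 0, 1, 2, and 3 respectively.
 */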
944 static void
945 gen_codewords(u32 A[restrict], u8 lens[restrict],
946 	      const unsigned len_counts[restrict],
947 	      unsigned max_codeword_len, unsigned num_syms)
948 {
949 	u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1];
950 	unsigned i;
951 	unsigned len;
952 	unsigned sym;
953 
954 	/* Given the number of codewords that will have each length,
955 	 * assign codeword lengths to symbols.  We do this by assigning
956 	 * the lengths in decreasing order to the symbols sorted
957 	 * primarily by increasing frequency and secondarily by
958 	 * increasing symbol value.  */
959 	for (i = 0, len = max_codeword_len; len >= 1; len--) {
960 		unsigned count = len_counts[len];
961 		while (count--)
962 			lens[A[i++] & SYMBOL_MASK] = len;
963 	}
964 
965 	/* Generate the codewords themselves.  We initialize the
966 	 * 'next_codewords' array to provide the lexicographically first
967 	 * codeword of each length, then assign codewords in symbol
968 	 * order.  This produces a canonical code.  */
969 	next_codewords[0] = 0;
970 	next_codewords[1] = 0;
971 	for (len = 2; len <= max_codeword_len; len++)
972 		next_codewords[len] =
973 			(next_codewords[len - 1] + len_counts[len - 1]) << 1;
974 
975 	for (sym = 0; sym < num_syms; sym++)
976 		A[sym] = next_codewords[lens[sym]]++;
977 }
978 
979 /*
980  * ---------------------------------------------------------------------
981  *			make_canonical_huffman_code()
982  * ---------------------------------------------------------------------
983  *
984  * Given an alphabet and the frequency of each symbol in it, construct a
985  * length-limited canonical Huffman code.
986  *
987  * @num_syms
988  *	The number of symbols in the alphabet.  The symbols are the
989  *	integers in the range [0, num_syms - 1].  This parameter must be
990  *	at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
991  *
992  * @max_codeword_len
993  *	The maximum permissible codeword length.
994  *
995  * @freqs
996  *	An array of @num_syms entries, each of which specifies the
997  *	frequency of the corresponding symbol.  It is valid for some,
998  *	none, or all of the frequencies to be 0.
999  *
1000  * @lens
1001  *	An array of @num_syms entries in which this function will return
1002  *	the length, in bits, of the codeword assigned to each symbol.
1003  *	Symbols with 0 frequency will not have codewords per se, but
1004  *	their entries in this array will be set to 0.  No lengths greater
1005  *	than @max_codeword_len will be assigned.
1006  *
1007  * @codewords
1008  *	An array of @num_syms entries in which this function will return
1009  *	the codeword for each symbol, right-justified and padded on the
1010  *	left with zeroes.  Codewords for symbols with 0 frequency will be
1011  *	undefined.
1012  *
1013  * ---------------------------------------------------------------------
1014  *
1015  * This function builds a length-limited canonical Huffman code.
1016  *
1017  * A length-limited Huffman code contains no codewords longer than some
1018  * specified length, and has exactly (with some algorithms) or
1019  * approximately (with the algorithm used here) the minimum weighted path
1020  * length from the root, given this constraint.
1021  *
1022  * A canonical Huffman code satisfies the properties that a longer
1023  * codeword never lexicographically precedes a shorter codeword, and the
1024  * lexicographic ordering of codewords of the same length is the same as
1025  * the lexicographic ordering of the corresponding symbols.  A canonical
1026  * Huffman code, or more generally a canonical prefix code, can be
1027  * reconstructed from only a list containing the codeword length of each
1028  * symbol.
1029  *
1030  * The classic algorithm to generate a Huffman code creates a node for
1031  * each symbol, then inserts these nodes into a min-heap keyed by symbol
1032  * frequency.  Then, repeatedly, the two lowest-frequency nodes are
1033  * removed from the min-heap and added as the children of a new node
1034  * having frequency equal to the sum of its two children, which is then
1035  * inserted into the min-heap.  When only a single node remains in the
1036  * min-heap, it is the root of the Huffman tree.  The codeword for each
1037  * symbol is determined by the path needed to reach the corresponding
1038  * node from the root.  Descending to the left child appends a 0 bit,
1039  * whereas descending to the right child appends a 1 bit.
1040  *
1041  * The classic algorithm is relatively easy to understand, but it is
1042  * subject to a number of inefficiencies.  In practice, it is fastest to
1043  * first sort the symbols by frequency.  (This itself can be subject to
1044  * an optimization based on the fact that most frequencies tend to be
1045  * low.)  At the same time, we sort secondarily by symbol value, which
1046  * aids the process of generating a canonical code.  Then, during tree
1047  * construction, no heap is necessary because both the leaf nodes and the
1048  * unparented non-leaf nodes can be easily maintained in sorted order.
1049  * Consequently, there can never be more than two possibilities for the
1050  * next-lowest-frequency node.
1051  *
1052  * In addition, because we're generating a canonical code, we actually
1053  * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
1054  * This is because for canonical code generation we don't need to know
1055  * where the symbols are in the tree.  Rather, we only need to know how
1056  * many leaf nodes have each depth (codeword length).  And this
1057  * information can, in fact, be quickly generated from the tree of
1058  * non-leaves only.
1059  *
1060  * Furthermore, we can build this stripped-down Huffman tree directly in
1061  * the array in which the codewords are to be generated, provided that
1062  * these array slots are large enough to hold a symbol and frequency
1063  * value.
1064  *
1065  * Still furthermore, we don't even need to maintain explicit child
1066  * pointers.  We only need the parent pointers, and even those can be
1067  * overwritten in-place with depth information as part of the process of
1068  * extracting codeword lengths from the tree.  So in summary, we do NOT
1069  * need a big structure like:
1070  *
1071  *	struct huffman_tree_node {
1072  *		unsigned int symbol;
1073  *		unsigned int frequency;
1074  *		unsigned int depth;
1075  *		struct huffman_tree_node *left_child;
1076  *		struct huffman_tree_node *right_child;
1077  *	};
1078  *
1079  *
1080  *   ... which often gets used in "naive" implementations of Huffman code
1081  *   generation.
1082  *
1083  * Many of these optimizations are based on the implementation in 7-Zip
1084  * (source file: C/HuffEnc.c), which has been placed in the public domain
1085  * by Igor Pavlov.
1086  */
1087 static void
1088 make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
1089 			    const u32 freqs[restrict],
1090 			    u8 lens[restrict], u32 codewords[restrict])
1091 {
1092 	u32 *A = codewords;
1093 	unsigned num_used_syms;
1094 
1095 	STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);
1096 
1097 	/* We begin by sorting the symbols primarily by frequency and
1098 	 * secondarily by symbol value.  As an optimization, the array
1099 	 * used for this purpose ('A') shares storage with the space in
1100 	 * which we will eventually return the codewords.  */
1101 
1102 	num_used_syms = sort_symbols(num_syms, freqs, lens, A);
1103 
1104 	/* 'num_used_syms' is the number of symbols with nonzero
1105 	 * frequency.  This may be less than @num_syms.  'num_used_syms'
1106 	 * is also the number of entries in 'A' that are valid.  Each
1107 	 * entry consists of a distinct symbol and a nonzero frequency
1108 	 * packed into a 32-bit integer.  */
1109 
1110 	/* Handle special cases where only 0 or 1 symbols were used (had
1111 	 * nonzero frequency).  */
1112 
1113 	if (unlikely(num_used_syms == 0)) {
1114 		/* Code is empty.  sort_symbols() already set all lengths
1115 		 * to 0, so there is nothing more to do.  */
1116 		return;
1117 	}
1118 
1119 	if (unlikely(num_used_syms == 1)) {
1120 		/* Only one symbol was used, so we only need one
1121 		 * codeword.  But two codewords are needed to form the
1122 		 * smallest complete Huffman code, which uses codewords 0
1123 		 * and 1.  Therefore, we choose another symbol to which
1124 		 * to assign a codeword.  We use 0 (if the used symbol is
1125 		 * not 0) or 1 (if the used symbol is 0).  In either
1126 		 * case, the lesser-valued symbol must be assigned
1127 		 * codeword 0 so that the resulting code is canonical.  */
1128 
1129 		unsigned sym = A[0] & SYMBOL_MASK;
1130 		unsigned nonzero_idx = sym ? sym : 1;
1131 
1132 		codewords[0] = 0;
1133 		lens[0] = 1;
1134 		codewords[nonzero_idx] = 1;
1135 		lens[nonzero_idx] = 1;
1136 		return;
1137 	}
1138 
1139 	/* Build a stripped-down version of the Huffman tree, sharing the
1140 	 * array 'A' with the symbol values.  Then extract length counts
1141 	 * from the tree and use them to generate the final codewords.  */
1142 
1143 	build_tree(A, num_used_syms);
1144 
1145 	{
1146 		unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
1147 
1148 		compute_length_counts(A, num_used_syms - 2,
1149 				      len_counts, max_codeword_len);
1150 
1151 		gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
1152 	}
1153 }
1154 
1155 /*
1156  * Clear the Huffman symbol frequency counters.
1157  * This must be called when starting a new DEFLATE block.
1158  */
1159 static void
1160 deflate_reset_symbol_frequencies(struct libdeflate_compressor *c)
1161 {
1162 	memset(&c->freqs, 0, sizeof(c->freqs));
1163 }
1164 
1165 /* Reverse the Huffman codeword 'codeword', which is 'len' bits in length.  */
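/*
 * For example (illustration only): deflate_reverse_codeword(0xB, 4) returns
 * 0xD, i.e. the 4-bit codeword 1011 becomes 1101.  The reversal is needed
 * because DEFLATE packs Huffman codewords starting from their most-significant
 * bit, whereas deflate_add_bits() inserts bits starting from the
 * least-significant end of the bit buffer.
 */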
1166 static u32
1167 deflate_reverse_codeword(u32 codeword, u8 len)
1168 {
1169 	/* The following branchless algorithm is faster than going bit by bit.
1170 	 * Note: since no codewords are longer than 16 bits, we only need to
1171 	 * reverse the low 16 bits of the 'u32'.  */
1172 	STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16);
1173 
1174 	/* Flip adjacent 1-bit fields  */
1175 	codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1);
1176 
1177 	/* Flip adjacent 2-bit fields  */
1178 	codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2);
1179 
1180 	/* Flip adjacent 4-bit fields  */
1181 	codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4);
1182 
1183 	/* Flip adjacent 8-bit fields  */
1184 	codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8);
1185 
1186 	/* Return the high 'len' bits of the bit-reversed 16 bit value.  */
1187 	return codeword >> (16 - len);
1188 }
1189 
1190 /* Make a canonical Huffman code with bit-reversed codewords.  */
1191 static void
1192 deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len,
1193 			  const u32 freqs[], u8 lens[], u32 codewords[])
1194 {
1195 	unsigned sym;
1196 
1197 	make_canonical_huffman_code(num_syms, max_codeword_len,
1198 				    freqs, lens, codewords);
1199 
1200 	for (sym = 0; sym < num_syms; sym++)
1201 		codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]);
1202 }
1203 
1204 /*
1205  * Build the literal/length and offset Huffman codes for a DEFLATE block.
1206  *
1207  * This takes as input the frequency tables for each code and produces as output
1208  * a set of tables that map symbols to codewords and codeword lengths.
1209  */
1210 static void
1211 deflate_make_huffman_codes(const struct deflate_freqs *freqs,
1212 			   struct deflate_codes *codes)
1213 {
1214 	STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN);
1215 	STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN);
1216 
1217 	deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS,
1218 				  MAX_LITLEN_CODEWORD_LEN,
1219 				  freqs->litlen,
1220 				  codes->lens.litlen,
1221 				  codes->codewords.litlen);
1222 
1223 	deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS,
1224 				  MAX_OFFSET_CODEWORD_LEN,
1225 				  freqs->offset,
1226 				  codes->lens.offset,
1227 				  codes->codewords.offset);
1228 }
1229 
1230 /* Initialize c->static_codes.  */
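/*
 * The frequencies below are chosen as 1 << (9 - len), where 'len' is the
 * codeword length that DEFLATE's static Huffman codes assign to each symbol
 * (8 bits for litlen symbols 0-143, 9 for 144-255, 7 for 256-279, 8 for
 * 280-287, and 5 for each of the 32 offset symbols).  Because these weights
 * are exact powers of two whose sum satisfies the Kraft equality, the Huffman
 * construction reproduces exactly those codeword lengths.
 */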
1231 static void
1232 deflate_init_static_codes(struct libdeflate_compressor *c)
1233 {
1234 	unsigned i;
1235 
1236 	for (i = 0; i < 144; i++)
1237 		c->freqs.litlen[i] = 1 << (9 - 8);
1238 	for (; i < 256; i++)
1239 		c->freqs.litlen[i] = 1 << (9 - 9);
1240 	for (; i < 280; i++)
1241 		c->freqs.litlen[i] = 1 << (9 - 7);
1242 	for (; i < 288; i++)
1243 		c->freqs.litlen[i] = 1 << (9 - 8);
1244 
1245 	for (i = 0; i < 32; i++)
1246 		c->freqs.offset[i] = 1 << (5 - 5);
1247 
1248 	deflate_make_huffman_codes(&c->freqs, &c->static_codes);
1249 }
1250 
1251 /* Return the offset slot for the specified match offset.  */
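/*
 * Worked example for the condensed table (illustrative offset, and assuming
 * the table has been initialized as described above): for offset == 300, the
 * lookup index is 256 + ((300 - 1) >> 7) == 258, which holds the slot shared
 * by all offsets in the range 257..384 (offset slot 16, whose base is 257).
 */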
1252 static forceinline unsigned
1253 deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset)
1254 {
1255 #if USE_FULL_OFFSET_SLOT_FAST
1256 	return c->offset_slot_fast[offset];
1257 #else
1258 	if (offset <= 256)
1259 		return c->offset_slot_fast[offset - 1];
1260 	else
1261 		return c->offset_slot_fast[256 + ((offset - 1) >> 7)];
1262 #endif
1263 }
1264 
1265 /* Write the header fields common to all DEFLATE block types.  */
1266 static void
1267 deflate_write_block_header(struct deflate_output_bitstream *os,
1268 			   bool is_final_block, unsigned block_type)
1269 {
1270 	deflate_add_bits(os, is_final_block, 1);
1271 	deflate_add_bits(os, block_type, 2);
1272 	deflate_flush_bits(os);
1273 }
1274 
1275 static unsigned
1276 deflate_compute_precode_items(const u8 lens[restrict],
1277 			      const unsigned num_lens,
1278 			      u32 precode_freqs[restrict],
1279 			      unsigned precode_items[restrict])
1280 {
1281 	unsigned *itemptr;
1282 	unsigned run_start;
1283 	unsigned run_end;
1284 	unsigned extra_bits;
1285 	u8 len;
1286 
1287 	memset(precode_freqs, 0,
1288 	       DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0]));
1289 
1290 	itemptr = precode_items;
1291 	run_start = 0;
1292 	do {
1293 		/* Find the next run of codeword lengths.  */
1294 
1295 		/* len = the length being repeated  */
1296 		len = lens[run_start];
1297 
1298 		/* Extend the run.  */
1299 		run_end = run_start;
1300 		do {
1301 			run_end++;
1302 		} while (run_end != num_lens && len == lens[run_end]);
1303 
1304 		if (len == 0) {
1305 			/* Run of zeroes.  */
1306 
1307 			/* Symbol 18: RLE 11 to 138 zeroes at a time.  */
1308 			while ((run_end - run_start) >= 11) {
1309 				extra_bits = MIN((run_end - run_start) - 11, 0x7F);
1310 				precode_freqs[18]++;
1311 				*itemptr++ = 18 | (extra_bits << 5);
1312 				run_start += 11 + extra_bits;
1313 			}
1314 
1315 			/* Symbol 17: RLE 3 to 10 zeroes at a time.  */
1316 			if ((run_end - run_start) >= 3) {
1317 				extra_bits = MIN((run_end - run_start) - 3, 0x7);
1318 				precode_freqs[17]++;
1319 				*itemptr++ = 17 | (extra_bits << 5);
1320 				run_start += 3 + extra_bits;
1321 			}
1322 		} else {
1323 
1324 			/* A run of nonzero lengths. */
1325 
1326 			/* Symbol 16: RLE 3 to 6 of the previous length.  */
1327 			if ((run_end - run_start) >= 4) {
1328 				precode_freqs[len]++;
1329 				*itemptr++ = len;
1330 				run_start++;
1331 				do {
1332 					extra_bits = MIN((run_end - run_start) - 3, 0x3);
1333 					precode_freqs[16]++;
1334 					*itemptr++ = 16 | (extra_bits << 5);
1335 					run_start += 3 + extra_bits;
1336 				} while ((run_end - run_start) >= 3);
1337 			}
1338 		}
1339 
1340 		/* Output any remaining lengths without RLE.  */
1341 		while (run_start != run_end) {
1342 			precode_freqs[len]++;
1343 			*itemptr++ = len;
1344 			run_start++;
1345 		}
1346 	} while (run_start != num_lens);
1347 
1348 	return itemptr - precode_items;
1349 }
1350 
1351 /*
1352  * Huffman codeword lengths for dynamic Huffman blocks are compressed using a
1353  * separate Huffman code, the "precode", which contains a symbol for each
1354  * possible codeword length in the larger code as well as several special
1355  * symbols to represent repeated codeword lengths (a form of run-length
1356  * encoding).  The precode is itself constructed in canonical form, and its
1357  * codeword lengths are represented literally in 19 3-bit fields that
1358  * immediately precede the compressed codeword lengths of the larger code.
1359  */
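/*
 * For example (illustration only): a run of 25 zero lengths is turned into the
 * single precode item 18 | (14 << 5) by deflate_compute_precode_items(),
 * meaning precode symbol 18 with the 7-bit extra value 14 (11 + 14 = 25
 * zeroes); deflate_write_huffman_header() then writes that symbol's codeword
 * followed by the 7 extra bits.
 */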
1360 
1361 /* Precompute the information needed to output Huffman codes. */
1362 static void
1363 deflate_precompute_huffman_header(struct libdeflate_compressor *c)
1364 {
1365 	/* Compute how many litlen and offset symbols are needed. */
1366 
1367 	for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS;
1368 	     c->num_litlen_syms > 257;
1369 	     c->num_litlen_syms--)
1370 		if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0)
1371 			break;
1372 
1373 	for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS;
1374 	     c->num_offset_syms > 1;
1375 	     c->num_offset_syms--)
1376 		if (c->codes.lens.offset[c->num_offset_syms - 1] != 0)
1377 			break;
1378 
1379 	/* If we're not using the full set of literal/length codeword lengths,
1380 	 * then temporarily move the offset codeword lengths over so that the
1381 	 * literal/length and offset codeword lengths are contiguous. */
1382 
1383 	STATIC_ASSERT(offsetof(struct deflate_lens, offset) ==
1384 		      DEFLATE_NUM_LITLEN_SYMS);
1385 
1386 	if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
1387 		memmove((u8 *)&c->codes.lens + c->num_litlen_syms,
1388 			(u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
1389 			c->num_offset_syms);
1390 	}
1391 
1392 	/* Compute the "items" (RLE / literal tokens and extra bits) with which
1393 	 * the codeword lengths in the larger code will be output. */
1394 	c->num_precode_items =
1395 		deflate_compute_precode_items((u8 *)&c->codes.lens,
1396 					      c->num_litlen_syms +
1397 							c->num_offset_syms,
1398 					      c->precode_freqs,
1399 					      c->precode_items);
1400 
1401 	/* Build the precode. */
1402 	STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN);
1403 	deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS,
1404 				  MAX_PRE_CODEWORD_LEN,
1405 				  c->precode_freqs, c->precode_lens,
1406 				  c->precode_codewords);
1407 
1408 	/* Count how many precode lengths we actually need to output. */
1409 	for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS;
1410 	     c->num_explicit_lens > 4;
1411 	     c->num_explicit_lens--)
1412 		if (c->precode_lens[deflate_precode_lens_permutation[
1413 						c->num_explicit_lens - 1]] != 0)
1414 			break;
1415 
1416 	/* Restore the offset codeword lengths if needed. */
1417 	if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
1418 		memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
1419 			(u8 *)&c->codes.lens + c->num_litlen_syms,
1420 			c->num_offset_syms);
1421 	}
1422 }
1423 
1424 /* Output the Huffman codes. */
1425 static void
1426 deflate_write_huffman_header(struct libdeflate_compressor *c,
1427 			     struct deflate_output_bitstream *os)
1428 {
1429 	unsigned i;
1430 
1431 	deflate_add_bits(os, c->num_litlen_syms - 257, 5);
1432 	deflate_add_bits(os, c->num_offset_syms - 1, 5);
1433 	deflate_add_bits(os, c->num_explicit_lens - 4, 4);
1434 	deflate_flush_bits(os);
1435 
1436 	/* Output the lengths of the codewords in the precode.  */
1437 	for (i = 0; i < c->num_explicit_lens; i++) {
1438 		deflate_add_bits(os, c->precode_lens[
1439 				       deflate_precode_lens_permutation[i]], 3);
1440 		deflate_flush_bits(os);
1441 	}
1442 
1443 	/* Output the encoded lengths of the codewords in the larger code.  */
1444 	for (i = 0; i < c->num_precode_items; i++) {
1445 		unsigned precode_item = c->precode_items[i];
1446 		unsigned precode_sym = precode_item & 0x1F;
1447 		deflate_add_bits(os, c->precode_codewords[precode_sym],
1448 				 c->precode_lens[precode_sym]);
1449 		if (precode_sym >= 16) {
1450 			if (precode_sym == 16)
1451 				deflate_add_bits(os, precode_item >> 5, 2);
1452 			else if (precode_sym == 17)
1453 				deflate_add_bits(os, precode_item >> 5, 3);
1454 			else
1455 				deflate_add_bits(os, precode_item >> 5, 7);
1456 		}
1457 		STATIC_ASSERT(CAN_BUFFER(DEFLATE_MAX_PRE_CODEWORD_LEN + 7));
1458 		deflate_flush_bits(os);
1459 	}
1460 }
1461 
1462 static void
1463 deflate_write_sequences(struct deflate_output_bitstream * restrict os,
1464 			const struct deflate_codes * restrict codes,
1465 			const struct deflate_sequence sequences[restrict],
1466 			const u8 * restrict in_next)
1467 {
1468 	const struct deflate_sequence *seq = sequences;
1469 
1470 	for (;;) {
1471 		u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;
1472 		unsigned length = seq->litrunlen_and_length >> 23;
1473 		unsigned length_slot;
1474 		unsigned litlen_symbol;
1475 		unsigned offset_symbol;
1476 
1477 		if (litrunlen) {
1478 		#if 1
1479 			while (litrunlen >= 4) {
1480 				unsigned lit0 = in_next[0];
1481 				unsigned lit1 = in_next[1];
1482 				unsigned lit2 = in_next[2];
1483 				unsigned lit3 = in_next[3];
1484 
1485 				deflate_add_bits(os, codes->codewords.litlen[lit0],
1486 						 codes->lens.litlen[lit0]);
1487 				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
1488 					deflate_flush_bits(os);
1489 
1490 				deflate_add_bits(os, codes->codewords.litlen[lit1],
1491 						 codes->lens.litlen[lit1]);
1492 				if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN))
1493 					deflate_flush_bits(os);
1494 
1495 				deflate_add_bits(os, codes->codewords.litlen[lit2],
1496 						 codes->lens.litlen[lit2]);
1497 				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
1498 					deflate_flush_bits(os);
1499 
1500 				deflate_add_bits(os, codes->codewords.litlen[lit3],
1501 						 codes->lens.litlen[lit3]);
1502 				deflate_flush_bits(os);
1503 				in_next += 4;
1504 				litrunlen -= 4;
1505 			}
1506 			if (litrunlen-- != 0) {
1507 				deflate_add_bits(os, codes->codewords.litlen[*in_next],
1508 						 codes->lens.litlen[*in_next]);
1509 				if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1510 					deflate_flush_bits(os);
1511 				in_next++;
1512 				if (litrunlen-- != 0) {
1513 					deflate_add_bits(os, codes->codewords.litlen[*in_next],
1514 							 codes->lens.litlen[*in_next]);
1515 					if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1516 						deflate_flush_bits(os);
1517 					in_next++;
1518 					if (litrunlen-- != 0) {
1519 						deflate_add_bits(os, codes->codewords.litlen[*in_next],
1520 								 codes->lens.litlen[*in_next]);
1521 						if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1522 							deflate_flush_bits(os);
1523 						in_next++;
1524 					}
1525 				}
1526 				if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1527 					deflate_flush_bits(os);
1528 			}
1529 		#else
1530 			do {
1531 				unsigned lit = *in_next++;
1532 				deflate_add_bits(os, codes->codewords.litlen[lit],
1533 						 codes->lens.litlen[lit]);
1534 				deflate_flush_bits(os);
1535 			} while (--litrunlen);
1536 		#endif
1537 		}
1538 
1539 		if (length == 0)
1540 			return;
1541 
1542 		in_next += length;
1543 
1544 		length_slot = seq->length_slot;
1545 		litlen_symbol = 257 + length_slot;
1546 
1547 		/* Litlen symbol  */
1548 		deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
1549 				 codes->lens.litlen[litlen_symbol]);
1550 
1551 		/* Extra length bits  */
1552 		STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
1553 					 DEFLATE_MAX_EXTRA_LENGTH_BITS));
1554 		deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
1555 				 deflate_extra_length_bits[length_slot]);
1556 
1557 		if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
1558 				DEFLATE_MAX_EXTRA_LENGTH_BITS +
1559 				MAX_OFFSET_CODEWORD_LEN +
1560 				DEFLATE_MAX_EXTRA_OFFSET_BITS))
1561 			deflate_flush_bits(os);
1562 
1563 		/* Offset symbol  */
1564 		offset_symbol = seq->offset_symbol;
1565 		deflate_add_bits(os, codes->codewords.offset[offset_symbol],
1566 				 codes->lens.offset[offset_symbol]);
1567 
1568 		if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
1569 				DEFLATE_MAX_EXTRA_OFFSET_BITS))
1570 			deflate_flush_bits(os);
1571 
1572 		/* Extra offset bits  */
1573 		deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol],
1574 				 deflate_extra_offset_bits[offset_symbol]);
1575 
1576 		deflate_flush_bits(os);
1577 
1578 		seq++;
1579 	}
1580 }
1581 
1582 #if SUPPORT_NEAR_OPTIMAL_PARSING
1583 /*
1584  * Follow the minimum-cost path in the graph of possible match/literal choices
1585  * for the current block and write out the matches/literals using the specified
1586  * Huffman codes.
1587  *
1588  * Note: this is slightly duplicated with deflate_write_sequences(), the reason
1589  * being that we don't want to waste time translating between intermediate
1590  * match/literal representations.
1591  */
1592 static void
1593 deflate_write_item_list(struct deflate_output_bitstream *os,
1594 			const struct deflate_codes *codes,
1595 			struct libdeflate_compressor *c,
1596 			u32 block_length)
1597 {
1598 	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
1599 	struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length];
1600 	do {
1601 		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
1602 		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1603 		unsigned litlen_symbol;
1604 		unsigned length_slot;
1605 		unsigned offset_slot;
1606 
1607 		if (length == 1) {
1608 			/* Literal  */
1609 			litlen_symbol = offset;
1610 			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
1611 					 codes->lens.litlen[litlen_symbol]);
1612 			deflate_flush_bits(os);
1613 		} else {
1614 			/* Match length  */
1615 			length_slot = deflate_length_slot[length];
1616 			litlen_symbol = 257 + length_slot;
1617 			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
1618 					 codes->lens.litlen[litlen_symbol]);
1619 
1620 			deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
1621 					 deflate_extra_length_bits[length_slot]);
1622 
1623 			if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
1624 					DEFLATE_MAX_EXTRA_LENGTH_BITS +
1625 					MAX_OFFSET_CODEWORD_LEN +
1626 					DEFLATE_MAX_EXTRA_OFFSET_BITS))
1627 				deflate_flush_bits(os);
1628 
1629 
1630 			/* Match offset  */
1631 			offset_slot = deflate_get_offset_slot(c, offset);
1632 			deflate_add_bits(os, codes->codewords.offset[offset_slot],
1633 					 codes->lens.offset[offset_slot]);
1634 
1635 			if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
1636 					DEFLATE_MAX_EXTRA_OFFSET_BITS))
1637 				deflate_flush_bits(os);
1638 
1639 			deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot],
1640 					 deflate_extra_offset_bits[offset_slot]);
1641 
1642 			deflate_flush_bits(os);
1643 		}
1644 		cur_node += length;
1645 	} while (cur_node != end_node);
1646 }
1647 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
1648 
1649 /* Output the end-of-block symbol.  */
1650 static void
1651 deflate_write_end_of_block(struct deflate_output_bitstream *os,
1652 			   const struct deflate_codes *codes)
1653 {
1654 	deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK],
1655 			 codes->lens.litlen[DEFLATE_END_OF_BLOCK]);
1656 	deflate_flush_bits(os);
1657 }
1658 
1659 static void
1660 deflate_write_uncompressed_block(struct deflate_output_bitstream *os,
1661 				 const u8 *data, u16 len,
1662 				 bool is_final_block)
1663 {
1664 	deflate_write_block_header(os, is_final_block,
1665 				   DEFLATE_BLOCKTYPE_UNCOMPRESSED);
1666 	deflate_align_bitstream(os);
1667 
1668 	if (4 + (u32)len >= os->end - os->next) {
1669 		os->next = os->end;
1670 		return;
1671 	}
1672 
1673 	put_unaligned_le16(len, os->next);
1674 	os->next += 2;
1675 	put_unaligned_le16(~len, os->next);
1676 	os->next += 2;
1677 	memcpy(os->next, data, len);
1678 	os->next += len;
1679 }
1680 
1681 static void
1682 deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os,
1683 				  const u8 *data, u32 data_length,
1684 				  bool is_final_block)
1685 {
1686 	do {
1687 		u16 len = MIN(data_length, UINT16_MAX);
1688 
1689 		deflate_write_uncompressed_block(os, data, len,
1690 					is_final_block && len == data_length);
1691 		data += len;
1692 		data_length -= len;
1693 	} while (data_length != 0);
1694 }
1695 
1696 /*
1697  * Choose the best type of block to use (dynamic Huffman, static Huffman, or
1698  * uncompressed), then output it.
1699  */
1700 static void
1701 deflate_flush_block(struct libdeflate_compressor * restrict c,
1702 		    struct deflate_output_bitstream * restrict os,
1703 		    const u8 * restrict block_begin, u32 block_length,
1704 		    bool is_final_block, bool use_item_list)
1705 {
1706 	static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = {
1707 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7,
1708 	};
1709 
1710 	/* Costs are measured in bits */
1711 	u32 dynamic_cost = 0;
1712 	u32 static_cost = 0;
1713 	u32 uncompressed_cost = 0;
1714 	struct deflate_codes *codes;
1715 	int block_type;
1716 	unsigned sym;
1717 
1718 	/* Tally the end-of-block symbol. */
1719 	c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
1720 
1721 	/* Build dynamic Huffman codes. */
1722 	deflate_make_huffman_codes(&c->freqs, &c->codes);
1723 
1724 	/* Account for the cost of sending dynamic Huffman codes. */
1725 	deflate_precompute_huffman_header(c);
1726 	dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens);
1727 	for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
1728 		u32 extra = deflate_extra_precode_bits[sym];
1729 		dynamic_cost += c->precode_freqs[sym] *
1730 				(extra + c->precode_lens[sym]);
1731 	}
1732 
1733 	/* Account for the cost of encoding literals. */
1734 	for (sym = 0; sym < 256; sym++) {
1735 		dynamic_cost += c->freqs.litlen[sym] *
1736 				c->codes.lens.litlen[sym];
1737 	}
1738 	for (sym = 0; sym < 144; sym++)
1739 		static_cost += c->freqs.litlen[sym] * 8;
1740 	for (; sym < 256; sym++)
1741 		static_cost += c->freqs.litlen[sym] * 9;
1742 
1743 	/* Account for the cost of encoding the end-of-block symbol. */
1744 	dynamic_cost += c->codes.lens.litlen[256];
1745 	static_cost += 7;
1746 
1747 	/* Account for the cost of encoding lengths. */
1748 	for (sym = 257; sym < 257 + ARRAY_LEN(deflate_extra_length_bits); sym++) {
1749 		u32 extra = deflate_extra_length_bits[sym - 257];
1750 		dynamic_cost += c->freqs.litlen[sym] *
1751 				(extra + c->codes.lens.litlen[sym]);
1752 		static_cost += c->freqs.litlen[sym] *
1753 				(extra + c->static_codes.lens.litlen[sym]);
1754 	}
1755 
1756 	/* Account for the cost of encoding offsets. */
1757 	for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) {
1758 		u32 extra = deflate_extra_offset_bits[sym];
1759 		dynamic_cost += c->freqs.offset[sym] *
1760 				(extra + c->codes.lens.offset[sym]);
1761 		static_cost += c->freqs.offset[sym] * (extra + 5);
1762 	}
1763 
1764 	/* Compute the cost of using uncompressed blocks. */
1765 	uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 +
1766 			     (40 * (DIV_ROUND_UP(block_length,
1767 						 UINT16_MAX) - 1)) +
1768 			     (8 * block_length);
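
	/*
	 * Illustrative example of the formula above: with os->bitcount == 6
	 * and block_length == 70000, the cost is (-(6 + 3) & 7) = 7 bits of
	 * padding to reach byte alignment after the 3-bit block header, plus
	 * 32 bits for the first LEN/NLEN pair, plus 40 bits for the one
	 * additional block needed because 70000 > 65535, plus 8 * 70000 bits
	 * of literal data.
	 */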
1769 
1770 	/* Choose the cheapest block type. */
1771 	if (dynamic_cost < MIN(static_cost, uncompressed_cost)) {
1772 		block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN;
1773 		codes = &c->codes;
1774 	} else if (static_cost < uncompressed_cost) {
1775 		block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN;
1776 		codes = &c->static_codes;
1777 	} else {
1778 		block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED;
1779 	}
1780 
1781 	/* Now actually output the block. */
1782 
1783 	if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
1784 		/* Note: the length being flushed may exceed the maximum length
1785 		 * of an uncompressed block (65535 bytes).  Therefore, more than
1786 		 * one uncompressed block might be needed. */
1787 		deflate_write_uncompressed_blocks(os, block_begin, block_length,
1788 						  is_final_block);
1789 	} else {
1790 		/* Output the block header. */
1791 		deflate_write_block_header(os, is_final_block, block_type);
1792 
1793 		/* Output the Huffman codes (dynamic Huffman blocks only). */
1794 		if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN)
1795 			deflate_write_huffman_header(c, os);
1796 
1797 		/* Output the literals, matches, and end-of-block symbol. */
1798 	#if SUPPORT_NEAR_OPTIMAL_PARSING
1799 		if (use_item_list)
1800 			deflate_write_item_list(os, codes, c, block_length);
1801 		else
1802 	#endif
1803 			deflate_write_sequences(os, codes, c->p.g.sequences,
1804 						block_begin);
1805 		deflate_write_end_of_block(os, codes);
1806 	}
1807 }
1808 
1809 static forceinline void
1810 deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
1811 		       u32 *litrunlen_p)
1812 {
1813 	c->freqs.litlen[literal]++;
1814 	++*litrunlen_p;
1815 }
1816 
1817 static forceinline void
1818 deflate_choose_match(struct libdeflate_compressor *c,
1819 		     unsigned length, unsigned offset,
1820 		     u32 *litrunlen_p, struct deflate_sequence **next_seq_p)
1821 {
1822 	struct deflate_sequence *seq = *next_seq_p;
1823 	unsigned length_slot = deflate_length_slot[length];
1824 	unsigned offset_slot = deflate_get_offset_slot(c, offset);
1825 
1826 	c->freqs.litlen[257 + length_slot]++;
1827 	c->freqs.offset[offset_slot]++;
1828 
1829 	seq->litrunlen_and_length = ((u32)length << 23) | *litrunlen_p;
1830 	seq->offset = offset;
1831 	seq->length_slot = length_slot;
1832 	seq->offset_symbol = offset_slot;
1833 
1834 	*litrunlen_p = 0;
1835 	*next_seq_p = seq + 1;
1836 }
1837 
1838 static forceinline void
1839 deflate_finish_sequence(struct deflate_sequence *seq, u32 litrunlen)
1840 {
1841 	seq->litrunlen_and_length = litrunlen; /* length = 0 */
1842 }
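
/*
 * Illustrative example of the 'litrunlen_and_length' packing used above: a
 * match of length 10 preceded by 5 pending literals is stored as
 * ((u32)10 << 23) | 5, and deflate_write_sequences() recovers the two fields
 * with (value & 0x7FFFFF) and (value >> 23).  A length of 0, as written by
 * deflate_finish_sequence(), marks the final, match-less sequence of a block.
 */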
1843 
1844 /******************************************************************************/
1845 
1846 /*
1847  * Block splitting algorithm.  The problem is to decide when it is worthwhile to
1848  * start a new block with new Huffman codes.  There is a theoretically optimal
1849  * solution: recursively consider every possible block split, considering the
1850  * exact cost of each block, and choose the minimum cost approach.  But this is
1851  * far too slow.  Instead, as an approximation, we can count symbols and after
1852  * every N symbols, compare the expected distribution of symbols based on the
1853  * previous data with the actual distribution.  If they differ "by enough", then
1854  * start a new block.
1855  *
1856  * As an optimization and heuristic, we don't distinguish between every symbol
1857  * but rather we combine many symbols into a single "observation type".  For
1858  * literals we only look at the high bits and low bits, and for matches we only
1859  * look at whether the match is long or not.  The assumption is that for typical
1860  * "real" data, places that are good block boundaries will tend to be noticeable
1861  * based only on changes in these aggregate frequencies, without looking for
1862  * subtle differences in individual symbols.  For example, a change from ASCII
1863  * bytes to non-ASCII bytes, or from few matches (generally less compressible)
1864  * to many matches (generally more compressible), would be easily noticed based
1865  * on the aggregates.
1866  *
1867  * For determining whether the frequency distributions are "different enough" to
1868  * start a new block, the simple heuristic of splitting when the sum of absolute
1869  * differences exceeds a constant seems to be good enough.  We also add a number
1870  * proportional to the block length so that the algorithm is more likely to end
1871  * long blocks than short blocks.  This reflects the general expectation that it
1872  * will become increasingly beneficial to start a new block as the current
1873  * block grows longer.
1874  *
1875  * Finally, since this is only an approximation, it is not strictly necessary to
1876  * consider the exact symbols that will be used.  With "near-optimal parsing", for example,
1877  * the actual symbols that will be used are unknown until after the block
1878  * boundary is chosen and the block has been optimized.  Since the final choices
1879  * cannot be used, we can use preliminary "greedy" choices instead.
1880  */
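
/*
 * Numerical sketch of the check done in do_end_block_check() below, using
 * made-up values and only two observation types for brevity: suppose the
 * accumulated observations are {1200, 400} (1600 total) and the new
 * observations are {380, 132} (512 total).  Cross-multiplying to avoid
 * division, the expected counts are {1200*512, 400*512} = {614400, 204800}
 * and the actual counts are {380*1600, 132*1600} = {608000, 211200}, giving a
 * total absolute delta of 6400 + 6400 = 12800.  For a 20000-byte block the
 * left-hand side is 12800 + (20000/4096)*1600 = 19200, which is well below
 * the threshold of 200*1600 = 320000, so the distributions are considered
 * similar and the block is not ended yet.
 */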
1881 
1882 /* Initialize the block split statistics when starting a new block. */
1883 static void
1884 init_block_split_stats(struct block_split_stats *stats)
1885 {
1886 	int i;
1887 
1888 	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1889 		stats->new_observations[i] = 0;
1890 		stats->observations[i] = 0;
1891 	}
1892 	stats->num_new_observations = 0;
1893 	stats->num_observations = 0;
1894 }
1895 
1896 /* Literal observation.  Heuristic: use the top 2 bits and the low bit of the
1897  * literal, for 8 possible literal observation types.  */
1898 static forceinline void
1899 observe_literal(struct block_split_stats *stats, u8 lit)
1900 {
1901 	stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
1902 	stats->num_new_observations++;
1903 }
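
/*
 * For example (illustrative): the literal 0x65 ('e') maps to observation type
 * ((0x65 >> 5) & 0x6) | (0x65 & 1) = 2 | 1 = 3, the literal 0x20 (space) maps
 * to type 0, and the literal 0xE9 maps to type 7.
 */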
1904 
1905 /* Match observation.  Heuristic: use one observation type for "short match" and
1906  * one observation type for "long match".  */
1907 static forceinline void
1908 observe_match(struct block_split_stats *stats, unsigned length)
1909 {
1910 	stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
1911 	stats->num_new_observations++;
1912 }
1913 
1914 static bool
1915 do_end_block_check(struct block_split_stats *stats, u32 block_length)
1916 {
1917 	int i;
1918 
1919 	if (stats->num_observations > 0) {
1920 
1921 		/* Note: to avoid slow divisions, we do not divide by
1922 		 * 'num_observations', but rather do all math with the numbers
1923 		 * multiplied by 'num_observations'.  */
1924 		u32 total_delta = 0;
1925 		for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1926 			u32 expected = stats->observations[i] * stats->num_new_observations;
1927 			u32 actual = stats->new_observations[i] * stats->num_observations;
1928 			u32 delta = (actual > expected) ? actual - expected :
1929 							  expected - actual;
1930 			total_delta += delta;
1931 		}
1932 
1933 		/* Ready to end the block? */
1934 		if (total_delta + (block_length / 4096) * stats->num_observations >=
1935 		    NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations)
1936 			return true;
1937 	}
1938 
1939 	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1940 		stats->num_observations += stats->new_observations[i];
1941 		stats->observations[i] += stats->new_observations[i];
1942 		stats->new_observations[i] = 0;
1943 	}
1944 	stats->num_new_observations = 0;
1945 	return false;
1946 }
1947 
1948 static forceinline bool
1949 should_end_block(struct block_split_stats *stats,
1950 		 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
1951 {
1952 	/* Ready to check block split statistics? */
1953 	if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
1954 	    in_next - in_block_begin < MIN_BLOCK_LENGTH ||
1955 	    in_end - in_next < MIN_BLOCK_LENGTH)
1956 		return false;
1957 
1958 	return do_end_block_check(stats, in_next - in_block_begin);
1959 }
1960 
1961 /******************************************************************************/
1962 
1963 /*
1964  * This is the "greedy" DEFLATE compressor. It always chooses the longest match.
1965  */
1966 static size_t
1967 deflate_compress_greedy(struct libdeflate_compressor * restrict c,
1968 			const u8 * restrict in, size_t in_nbytes,
1969 			u8 * restrict out, size_t out_nbytes_avail)
1970 {
1971 	const u8 *in_next = in;
1972 	const u8 *in_end = in_next + in_nbytes;
1973 	struct deflate_output_bitstream os;
1974 	const u8 *in_cur_base = in_next;
1975 	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
1976 	unsigned nice_len = MIN(c->nice_match_length, max_len);
1977 	u32 next_hashes[2] = {0, 0};
1978 
1979 	deflate_init_output(&os, out, out_nbytes_avail);
1980 	hc_matchfinder_init(&c->p.g.hc_mf);
1981 
1982 	do {
1983 		/* Starting a new DEFLATE block.  */
1984 
1985 		const u8 * const in_block_begin = in_next;
1986 		const u8 * const in_max_block_end =
1987 			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
1988 		u32 litrunlen = 0;
1989 		struct deflate_sequence *next_seq = c->p.g.sequences;
1990 
1991 		init_block_split_stats(&c->split_stats);
1992 		deflate_reset_symbol_frequencies(c);
1993 
1994 		do {
1995 			u32 length;
1996 			u32 offset;
1997 
1998 			/* Decrease the maximum and nice match lengths if we're
1999 			 * approaching the end of the input buffer.  */
2000 			if (unlikely(max_len > in_end - in_next)) {
2001 				max_len = in_end - in_next;
2002 				nice_len = MIN(nice_len, max_len);
2003 			}
2004 
2005 			length = hc_matchfinder_longest_match(&c->p.g.hc_mf,
2006 							      &in_cur_base,
2007 							      in_next,
2008 							      DEFLATE_MIN_MATCH_LEN - 1,
2009 							      max_len,
2010 							      nice_len,
2011 							      c->max_search_depth,
2012 							      next_hashes,
2013 							      &offset);
2014 
2015 			if (length >= DEFLATE_MIN_MATCH_LEN) {
2016 				/* Match found.  */
2017 				deflate_choose_match(c, length, offset,
2018 						     &litrunlen, &next_seq);
2019 				observe_match(&c->split_stats, length);
2020 				in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
2021 									&in_cur_base,
2022 									in_next + 1,
2023 									in_end,
2024 									length - 1,
2025 									next_hashes);
2026 			} else {
2027 				/* No match found.  */
2028 				deflate_choose_literal(c, *in_next, &litrunlen);
2029 				observe_literal(&c->split_stats, *in_next);
2030 				in_next++;
2031 			}
2032 
2033 			/* Check if it's time to output another block.  */
2034 		} while (in_next < in_max_block_end &&
2035 			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
2036 
2037 		deflate_finish_sequence(next_seq, litrunlen);
2038 		deflate_flush_block(c, &os, in_block_begin,
2039 				    in_next - in_block_begin,
2040 				    in_next == in_end, false);
2041 	} while (in_next != in_end);
2042 
2043 	return deflate_flush_output(&os);
2044 }
2045 
2046 /*
2047  * This is the "lazy" DEFLATE compressor.  Before choosing a match, it checks to
2048  * see if there's a longer match at the next position.  If yes, it outputs a
2049  * literal and continues to the next position.  If no, it outputs the match.
2050  */
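
/*
 * For example (illustrative): if the current position has only a length-3
 * match available but the next position has a length-20 match, the lazy
 * parser emits the current byte as a literal and takes the length-20 match,
 * whereas the greedy parser above would have committed to the length-3 match.
 */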
2051 static size_t
2052 deflate_compress_lazy(struct libdeflate_compressor * restrict c,
2053 		      const u8 * restrict in, size_t in_nbytes,
2054 		      u8 * restrict out, size_t out_nbytes_avail)
2055 {
2056 	const u8 *in_next = in;
2057 	const u8 *in_end = in_next + in_nbytes;
2058 	struct deflate_output_bitstream os;
2059 	const u8 *in_cur_base = in_next;
2060 	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
2061 	unsigned nice_len = MIN(c->nice_match_length, max_len);
2062 	u32 next_hashes[2] = {0, 0};
2063 
2064 	deflate_init_output(&os, out, out_nbytes_avail);
2065 	hc_matchfinder_init(&c->p.g.hc_mf);
2066 
2067 	do {
2068 		/* Starting a new DEFLATE block.  */
2069 
2070 		const u8 * const in_block_begin = in_next;
2071 		const u8 * const in_max_block_end =
2072 			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
2073 		u32 litrunlen = 0;
2074 		struct deflate_sequence *next_seq = c->p.g.sequences;
2075 
2076 		init_block_split_stats(&c->split_stats);
2077 		deflate_reset_symbol_frequencies(c);
2078 
2079 		do {
2080 			unsigned cur_len;
2081 			unsigned cur_offset;
2082 			unsigned next_len;
2083 			unsigned next_offset;
2084 
2085 			if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
2086 				max_len = in_end - in_next;
2087 				nice_len = MIN(nice_len, max_len);
2088 			}
2089 
2090 			/* Find the longest match at the current position.  */
2091 			cur_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
2092 							       &in_cur_base,
2093 							       in_next,
2094 							       DEFLATE_MIN_MATCH_LEN - 1,
2095 							       max_len,
2096 							       nice_len,
2097 							       c->max_search_depth,
2098 							       next_hashes,
2099 							       &cur_offset);
2100 			in_next += 1;
2101 
2102 			if (cur_len < DEFLATE_MIN_MATCH_LEN) {
2103 				/* No match found.  Choose a literal.  */
2104 				deflate_choose_literal(c, *(in_next - 1), &litrunlen);
2105 				observe_literal(&c->split_stats, *(in_next - 1));
2106 				continue;
2107 			}
2108 
2109 		have_cur_match:
2110 			observe_match(&c->split_stats, cur_len);
2111 
2112 			/* We have a match at the current position.  */
2113 
2114 			/* If the current match is very long, choose it
2115 			 * immediately.  */
2116 			if (cur_len >= nice_len) {
2117 				deflate_choose_match(c, cur_len, cur_offset,
2118 						     &litrunlen, &next_seq);
2119 				in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
2120 									&in_cur_base,
2121 									in_next,
2122 									in_end,
2123 									cur_len - 1,
2124 									next_hashes);
2125 				continue;
2126 			}
2127 
2128 			/*
2129 			 * Try to find a match at the next position.
2130 			 *
2131 			 * Note: since we already have a match at the *current*
2132 			 * position, we use only half the 'max_search_depth'
2133 			 * when checking the *next* position.  This is a useful
2134 			 * trade-off because it's more worthwhile to use a
2135 			 * greater search depth on the initial match.
2136 			 *
2137 			 * Note: it's possible to structure the code such that
2138 			 * there's only one call to longest_match(), which
2139 			 * handles both the "find the initial match" and "try to
2140 			 * find a longer match" cases.  However, it is faster to
2141 			 * have two call sites, with longest_match() inlined at
2142 			 * each.
2143 			 */
2144 			if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
2145 				max_len = in_end - in_next;
2146 				nice_len = MIN(nice_len, max_len);
2147 			}
2148 			next_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
2149 								&in_cur_base,
2150 								in_next,
2151 								cur_len,
2152 								max_len,
2153 								nice_len,
2154 								c->max_search_depth / 2,
2155 								next_hashes,
2156 								&next_offset);
2157 			in_next += 1;
2158 
2159 			if (next_len > cur_len) {
2160 				/* Found a longer match at the next position.
2161 				 * Output a literal.  Then the next match
2162 				 * becomes the current match.  */
2163 				deflate_choose_literal(c, *(in_next - 2), &litrunlen);
2164 				cur_len = next_len;
2165 				cur_offset = next_offset;
2166 				goto have_cur_match;
2167 			}
2168 
2169 			/* No longer match at the next position.
2170 			 * Output the current match.  */
2171 			deflate_choose_match(c, cur_len, cur_offset,
2172 					     &litrunlen, &next_seq);
2173 			in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
2174 								&in_cur_base,
2175 								in_next,
2176 								in_end,
2177 								cur_len - 2,
2178 								next_hashes);
2179 
2180 			/* Check if it's time to output another block.  */
2181 		} while (in_next < in_max_block_end &&
2182 			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
2183 
2184 		deflate_finish_sequence(next_seq, litrunlen);
2185 		deflate_flush_block(c, &os, in_block_begin,
2186 				    in_next - in_block_begin,
2187 				    in_next == in_end, false);
2188 	} while (in_next != in_end);
2189 
2190 	return deflate_flush_output(&os);
2191 }
2192 
2193 #if SUPPORT_NEAR_OPTIMAL_PARSING
2194 
2195 /*
2196  * Follow the minimum-cost path in the graph of possible match/literal choices
2197  * for the current block and compute the frequencies of the Huffman symbols that
2198  * would be needed to output those matches and literals.
2199  */
2200 static void
2201 deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
2202 {
2203 	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
2204 	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
2205 	do {
2206 		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
2207 		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
2208 
2209 		if (length == 1) {
2210 			/* Literal  */
2211 			c->freqs.litlen[offset]++;
2212 		} else {
2213 			/* Match  */
2214 			c->freqs.litlen[257 + deflate_length_slot[length]]++;
2215 			c->freqs.offset[deflate_get_offset_slot(c, offset)]++;
2216 		}
2217 		cur_node += length;
2218 	} while (cur_node != end_node);
2219 }
2220 
2221 /* Set the current cost model from the codeword lengths specified in @lens.  */
2222 static void
2223 deflate_set_costs_from_codes(struct libdeflate_compressor *c,
2224 			     const struct deflate_lens *lens)
2225 {
2226 	unsigned i;
2227 
2228 	/* Literals  */
2229 	for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
2230 		u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS);
2231 		c->p.n.costs.literal[i] = bits << COST_SHIFT;
2232 	}
2233 
2234 	/* Lengths  */
2235 	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
2236 		unsigned length_slot = deflate_length_slot[i];
2237 		unsigned litlen_sym = 257 + length_slot;
2238 		u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
2239 		bits += deflate_extra_length_bits[length_slot];
2240 		c->p.n.costs.length[i] = bits << COST_SHIFT;
2241 	}
2242 
2243 	/* Offset slots  */
2244 	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
2245 		u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS);
2246 		bits += deflate_extra_offset_bits[i];
2247 		c->p.n.costs.offset_slot[i] = bits << COST_SHIFT;
2248 	}
2249 }
2250 
2251 static forceinline u32
2252 deflate_default_literal_cost(unsigned literal)
2253 {
2254 	STATIC_ASSERT(COST_SHIFT == 3);
2255 	/* 66 is 8.25 bits/symbol  */
2256 	return 66;
2257 }
2258 
2259 static forceinline u32
2260 deflate_default_length_slot_cost(unsigned length_slot)
2261 {
2262 	STATIC_ASSERT(COST_SHIFT == 3);
2263 	/* 60 is 7.5 bits/symbol  */
2264 	return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT);
2265 }
2266 
2267 static forceinline u32
2268 deflate_default_offset_slot_cost(unsigned offset_slot)
2269 {
2270 	STATIC_ASSERT(COST_SHIFT == 3);
2271 	/* 39 is 4.875 bits/symbol  */
2272 	return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT);
2273 }
2274 
2275 /*
2276  * Set default symbol costs for the first block's first optimization pass.
2277  *
2278  * It works well to assume that each symbol is equally probable.  This results
2279  * in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 <<
2280  * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding
2281  * alphabet.  However, we intentionally bias the parse towards matches rather
2282  * than literals by using a slightly lower default cost for length symbols than
2283  * for literals.  This often improves the compression ratio slightly.
2284  */
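
/*
 * Concretely (illustrative arithmetic): with COST_SHIFT == 3, costs are in
 * units of 1/8 bit.  A uniform distribution over the ~286 used litlen symbols
 * suggests roughly log2(286) ~= 8.2 bits per literal; the default of
 * 66/8 = 8.25 bits is close to that, while length symbols get a slightly
 * cheaper 60/8 = 7.5 bits and offset slots get 39/8 = 4.875 bits, roughly
 * log2(30) for the 30 used offset slots.
 */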
2285 static void
2286 deflate_set_default_costs(struct libdeflate_compressor *c)
2287 {
2288 	unsigned i;
2289 
2290 	/* Literals  */
2291 	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
2292 		c->p.n.costs.literal[i] = deflate_default_literal_cost(i);
2293 
2294 	/* Lengths  */
2295 	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
2296 		c->p.n.costs.length[i] = deflate_default_length_slot_cost(
2297 						deflate_length_slot[i]);
2298 
2299 	/* Offset slots  */
2300 	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
2301 		c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i);
2302 }
2303 
2304 static forceinline void
2305 deflate_adjust_cost(u32 *cost_p, u32 default_cost)
2306 {
2307 	*cost_p += ((s32)default_cost - (s32)*cost_p) >> 1;
2308 }
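
/*
 * For example (illustrative): a cost previously optimized to 80 units
 * (10 bits) with a default of 66 units (8.25 bits) is adjusted to
 * 80 + (66 - 80)/2 = 73 units, i.e. halfway back toward the default.
 */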
2309 
2310 /*
2311  * Adjust the costs when beginning a new block.
2312  *
2313  * Since the current costs have been optimized for the data, it's undesirable to
2314  * throw them away and start over with the default costs.  At the same time, we
2315  * don't want to bias the parse by assuming that the next block will be similar
2316  * to the current block.  As a compromise, make the costs closer to the
2317  * defaults, but don't simply set them to the defaults.
2318  */
2319 static void
2320 deflate_adjust_costs(struct libdeflate_compressor *c)
2321 {
2322 	unsigned i;
2323 
2324 	/* Literals  */
2325 	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
2326 		deflate_adjust_cost(&c->p.n.costs.literal[i],
2327 				    deflate_default_literal_cost(i));
2328 
2329 	/* Lengths  */
2330 	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
2331 		deflate_adjust_cost(&c->p.n.costs.length[i],
2332 				    deflate_default_length_slot_cost(
2333 						deflate_length_slot[i]));
2334 
2335 	/* Offset slots  */
2336 	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
2337 		deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
2338 				    deflate_default_offset_slot_cost(i));
2339 }
2340 
2341 /*
2342  * Find the minimum-cost path through the graph of possible match/literal
2343  * choices for this block.
2344  *
2345  * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which
2346  * represents the node at the beginning of the block, to
2347  * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of
2348  * the block.  Edge costs are evaluated using the cost model 'c->p.n.costs'.
2349  *
2350  * The algorithm works backwards, starting at the end node and proceeding
2351  * backwards one node at a time.  At each node, the minimum cost to reach the
2352  * end node is computed and the match/literal choice that begins that path is
2353  * saved.
2354  */
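
/*
 * Expressed as a recurrence (illustrative restatement of the code below), for
 * each position 'i' in the block:
 *
 *	cost_to_end[i] = min(cost(literal at i) + cost_to_end[i + 1],
 *			     min over cached matches at i of
 *				cost(offset slot) + cost(length len) +
 *				cost_to_end[i + len])
 *
 * and 'item' records the literal or (offset, length) choice that attains the
 * minimum.
 */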
2355 static void
2356 deflate_find_min_cost_path(struct libdeflate_compressor *c,
2357 			   const u32 block_length,
2358 			   const struct lz_match *cache_ptr)
2359 {
2360 	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
2361 	struct deflate_optimum_node *cur_node = end_node;
2362 
2363 	cur_node->cost_to_end = 0;
2364 	do {
2365 		unsigned num_matches;
2366 		unsigned literal;
2367 		u32 best_cost_to_end;
2368 
2369 		cur_node--;
2370 		cache_ptr--;
2371 
2372 		num_matches = cache_ptr->length;
2373 		literal = cache_ptr->offset;
2374 
2375 		/* It's always possible to choose a literal.  */
2376 		best_cost_to_end = c->p.n.costs.literal[literal] +
2377 				   (cur_node + 1)->cost_to_end;
2378 		cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;
2379 
2380 		/* Also consider matches if there are any.  */
2381 		if (num_matches) {
2382 			const struct lz_match *match;
2383 			unsigned len;
2384 			unsigned offset;
2385 			unsigned offset_slot;
2386 			u32 offset_cost;
2387 			u32 cost_to_end;
2388 
2389 			/*
2390 			 * Consider each length from the minimum
2391 			 * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
2392 			 * match found at this position.  For each length, we
2393 			 * consider only the smallest offset for which that
2394 			 * length is available.  Although this is not guaranteed
2395 			 * to be optimal due to the possibility of a larger
2396 			 * offset costing less than a smaller offset to code,
2397 			 * this is a very useful heuristic.
2398 			 */
2399 			match = cache_ptr - num_matches;
2400 			len = DEFLATE_MIN_MATCH_LEN;
2401 			do {
2402 				offset = match->offset;
2403 				offset_slot = deflate_get_offset_slot(c, offset);
2404 				offset_cost = c->p.n.costs.offset_slot[offset_slot];
2405 				do {
2406 					cost_to_end = offset_cost +
2407 						      c->p.n.costs.length[len] +
2408 						      (cur_node + len)->cost_to_end;
2409 					if (cost_to_end < best_cost_to_end) {
2410 						best_cost_to_end = cost_to_end;
2411 						cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len;
2412 					}
2413 				} while (++len <= match->length);
2414 			} while (++match != cache_ptr);
2415 			cache_ptr -= num_matches;
2416 		}
2417 		cur_node->cost_to_end = best_cost_to_end;
2418 	} while (cur_node != &c->p.n.optimum_nodes[0]);
2419 }
2420 
2421 /*
2422  * Choose the literal/match sequence to use for the current block.  The basic
2423  * algorithm finds a minimum-cost path through the block's graph of
2424  * literal/match choices, given a cost model.  However, the cost of each symbol
2425  * is unknown until the Huffman codes have been built, but at the same time the
2426  * Huffman codes depend on the frequencies of chosen symbols.  Consequently,
2427  * multiple passes must be used to try to approximate an optimal solution.  The
2428  * first pass uses default costs, mixed with the costs from the previous block
2429  * if any.  Later passes use the Huffman codeword lengths from the previous pass
2430  * as the costs.
2431  */
2432 static void
2433 deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
2434 		       const struct lz_match *cache_ptr, bool is_first_block)
2435 {
2436 	unsigned num_passes_remaining = c->p.n.num_optim_passes;
2437 	u32 i;
2438 
2439 	/* Force the block to really end at the desired length, even if some
2440 	 * matches extend beyond it. */
2441 	for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
2442 					ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
2443 		c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
2444 
2445 	/* Set the initial costs. */
2446 	if (is_first_block)
2447 		deflate_set_default_costs(c);
2448 	else
2449 		deflate_adjust_costs(c);
2450 
2451 	for (;;) {
2452 		/* Find the minimum cost path for this pass. */
2453 		deflate_find_min_cost_path(c, block_length, cache_ptr);
2454 
2455 		/* Compute frequencies of the chosen symbols. */
2456 		deflate_reset_symbol_frequencies(c);
2457 		deflate_tally_item_list(c, block_length);
2458 
2459 		if (--num_passes_remaining == 0)
2460 			break;
2461 
2462 		/* At least one optimization pass remains; update the costs. */
2463 		deflate_make_huffman_codes(&c->freqs, &c->codes);
2464 		deflate_set_costs_from_codes(c, &c->codes.lens);
2465 	}
2466 }
2467 
2468 /*
2469  * This is the "near-optimal" DEFLATE compressor.  It computes the optimal
2470  * representation of each DEFLATE block using a minimum-cost path search over
2471  * the graph of possible match/literal choices for that block, assuming a
2472  * certain cost for each Huffman symbol.
2473  *
2474  * For several reasons, the end result is not guaranteed to be optimal:
2475  *
2476  * - Nonoptimal choice of blocks
2477  * - Heuristic limitations on which matches are actually considered
2478  * - Symbol costs are unknown until the symbols have already been chosen
2479  *   (so iterative optimization must be used)
2480  */
2481 static size_t
2482 deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
2483 			      const u8 * restrict in, size_t in_nbytes,
2484 			      u8 * restrict out, size_t out_nbytes_avail)
2485 {
2486 	const u8 *in_next = in;
2487 	const u8 *in_end = in_next + in_nbytes;
2488 	struct deflate_output_bitstream os;
2489 	const u8 *in_cur_base = in_next;
2490 	const u8 *in_next_slide = in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
2491 	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
2492 	unsigned nice_len = MIN(c->nice_match_length, max_len);
2493 	u32 next_hashes[2] = {0, 0};
2494 
2495 	deflate_init_output(&os, out, out_nbytes_avail);
2496 	bt_matchfinder_init(&c->p.n.bt_mf);
2497 
2498 	do {
2499 		/* Starting a new DEFLATE block.  */
2500 
2501 		struct lz_match *cache_ptr = c->p.n.match_cache;
2502 		const u8 * const in_block_begin = in_next;
2503 		const u8 * const in_max_block_end =
2504 			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
2505 		const u8 *next_observation = in_next;
2506 
2507 		init_block_split_stats(&c->split_stats);
2508 
2509 		/*
2510 		 * Find matches until we decide to end the block.  We end the
2511 		 * block if any of the following is true:
2512 		 *
2513 		 * (1) Maximum block length has been reached
2514 		 * (2) Match cache may overflow.
2515 		 * (3) Block split heuristic says to split now.
2516 		 */
2517 		do {
2518 			struct lz_match *matches;
2519 			unsigned best_len;
2520 
2521 			/* Slide the window forward if needed.  */
2522 			if (in_next == in_next_slide) {
2523 				bt_matchfinder_slide_window(&c->p.n.bt_mf);
2524 				in_cur_base = in_next;
2525 				in_next_slide = in_next + MIN(in_end - in_next,
2526 							      MATCHFINDER_WINDOW_SIZE);
2527 			}
2528 
2529 			/* Decrease the maximum and nice match lengths if we're
2530 			 * approaching the end of the input buffer.  */
2531 			if (unlikely(max_len > in_end - in_next)) {
2532 				max_len = in_end - in_next;
2533 				nice_len = MIN(nice_len, max_len);
2534 			}
2535 
2536 			/*
2537 			 * Find matches with the current position using the
2538 			 * binary tree matchfinder and save them in
2539 			 * 'match_cache'.
2540 			 *
2541 			 * Note: the binary tree matchfinder is more suited for
2542 			 * optimal parsing than the hash chain matchfinder.  The
2543 			 * reasons for this include:
2544 			 *
2545 			 * - The binary tree matchfinder can find more matches
2546 			 *   in the same number of steps.
2547 			 * - One of the major advantages of hash chains is that
2548 			 *   skipping positions (not searching for matches at
2549 			 *   them) is faster; however, with optimal parsing we
2550 			 *   search for matches at almost all positions, so this
2551 			 *   advantage of hash chains is negated.
2552 			 */
2553 			matches = cache_ptr;
2554 			best_len = 0;
2555 			if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
2556 				cache_ptr = bt_matchfinder_get_matches(&c->p.n.bt_mf,
2557 								       in_cur_base,
2558 								       in_next - in_cur_base,
2559 								       max_len,
2560 								       nice_len,
2561 								       c->max_search_depth,
2562 								       next_hashes,
2563 								       &best_len,
2564 								       matches);
2565 			}
2566 
2567 			if (in_next >= next_observation) {
2568 				if (best_len >= 4) {
2569 					observe_match(&c->split_stats, best_len);
2570 					next_observation = in_next + best_len;
2571 				} else {
2572 					observe_literal(&c->split_stats, *in_next);
2573 					next_observation = in_next + 1;
2574 				}
2575 			}
2576 
2577 			cache_ptr->length = cache_ptr - matches;
2578 			cache_ptr->offset = *in_next;
2579 			in_next++;
2580 			cache_ptr++;
2581 
2582 			/*
2583 			 * If there was a very long match found, don't cache any
2584 			 * matches for the bytes covered by that match.  This
2585 			 * avoids degenerate behavior when compressing highly
2586 			 * redundant data, where the number of matches can be
2587 			 * very large.
2588 			 *
2589 			 * This heuristic doesn't actually hurt the compression
2590 			 * ratio very much.  If there's a long match, then the
2591 			 * data must be highly compressible, so it doesn't
2592 			 * matter much what we do.
2593 			 */
2594 			if (best_len >= DEFLATE_MIN_MATCH_LEN && best_len >= nice_len) {
2595 				--best_len;
2596 				do {
2597 					if (in_next == in_next_slide) {
2598 						bt_matchfinder_slide_window(&c->p.n.bt_mf);
2599 						in_cur_base = in_next;
2600 						in_next_slide = in_next + MIN(in_end - in_next,
2601 									      MATCHFINDER_WINDOW_SIZE);
2602 					}
2603 					if (unlikely(max_len > in_end - in_next)) {
2604 						max_len = in_end - in_next;
2605 						nice_len = MIN(nice_len, max_len);
2606 					}
2607 					if (max_len >= BT_MATCHFINDER_REQUIRED_NBYTES) {
2608 						bt_matchfinder_skip_position(&c->p.n.bt_mf,
2609 									     in_cur_base,
2610 									     in_next - in_cur_base,
2611 									     nice_len,
2612 									     c->max_search_depth,
2613 									     next_hashes);
2614 					}
2615 					cache_ptr->length = 0;
2616 					cache_ptr->offset = *in_next;
2617 					in_next++;
2618 					cache_ptr++;
2619 				} while (--best_len);
2620 			}
2621 		} while (in_next < in_max_block_end &&
2622 			 cache_ptr < &c->p.n.match_cache[CACHE_LENGTH] &&
2623 			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
2624 
2625 		/* All the matches for this block have been cached.  Now choose
2626 		 * the sequence of items to output and flush the block.  */
2627 		deflate_optimize_block(c, in_next - in_block_begin, cache_ptr,
2628 				       in_block_begin == in);
2629 		deflate_flush_block(c, &os, in_block_begin, in_next - in_block_begin,
2630 				    in_next == in_end, true);
2631 	} while (in_next != in_end);
2632 
2633 	return deflate_flush_output(&os);
2634 }
2635 
2636 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
2637 
2638 /* Initialize c->offset_slot_fast.  */
2639 static void
2640 deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
2641 {
2642 	unsigned offset_slot;
2643 	unsigned offset;
2644 	unsigned offset_end;
2645 
2646 	for (offset_slot = 0;
2647 	     offset_slot < ARRAY_LEN(deflate_offset_slot_base);
2648 	     offset_slot++)
2649 	{
2650 		offset = deflate_offset_slot_base[offset_slot];
2651 	#if USE_FULL_OFFSET_SLOT_FAST
2652 		offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
2653 		do {
2654 			c->offset_slot_fast[offset] = offset_slot;
2655 		} while (++offset != offset_end);
2656 	#else
2657 		if (offset <= 256) {
2658 			offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
2659 			do {
2660 				c->offset_slot_fast[offset - 1] = offset_slot;
2661 			} while (++offset != offset_end);
2662 		} else {
2663 			offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
2664 			do {
2665 				c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot;
2666 			} while ((offset += (1 << 7)) != offset_end);
2667 		}
2668 	#endif
2669 	}
2670 }
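
/*
 * Illustrative example of the condensed map built above, assuming that
 * deflate_get_offset_slot() mirrors this layout: an offset of 300 lies in the
 * slot whose base is 257, and the lookup index 256 + ((300 - 1) >> 7) = 258
 * is exactly the entry the loop above filled for that slot.  When
 * USE_FULL_OFFSET_SLOT_FAST is enabled, the table is simply indexed by the
 * offset itself.
 */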
2671 
2672 LIBDEFLATEAPI struct libdeflate_compressor *
2673 libdeflate_alloc_compressor(int compression_level)
2674 {
2675 	struct libdeflate_compressor *c;
2676 	size_t size;
2677 
2678 #if SUPPORT_NEAR_OPTIMAL_PARSING
2679 	if (compression_level >= 8)
2680 		size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.n);
2681 	else
2682 #endif
2683 		size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.g);
2684 
2685 	c = aligned_malloc(MATCHFINDER_ALIGNMENT, size);
2686 	if (!c)
2687 		return NULL;
2688 
2689 	switch (compression_level) {
2690 	case 1:
2691 		c->impl = deflate_compress_greedy;
2692 		c->max_search_depth = 2;
2693 		c->nice_match_length = 8;
2694 		break;
2695 	case 2:
2696 		c->impl = deflate_compress_greedy;
2697 		c->max_search_depth = 6;
2698 		c->nice_match_length = 10;
2699 		break;
2700 	case 3:
2701 		c->impl = deflate_compress_greedy;
2702 		c->max_search_depth = 12;
2703 		c->nice_match_length = 14;
2704 		break;
2705 	case 4:
2706 		c->impl = deflate_compress_greedy;
2707 		c->max_search_depth = 24;
2708 		c->nice_match_length = 24;
2709 		break;
2710 	case 5:
2711 		c->impl = deflate_compress_lazy;
2712 		c->max_search_depth = 20;
2713 		c->nice_match_length = 30;
2714 		break;
2715 	case 6:
2716 		c->impl = deflate_compress_lazy;
2717 		c->max_search_depth = 40;
2718 		c->nice_match_length = 65;
2719 		break;
2720 	case 7:
2721 		c->impl = deflate_compress_lazy;
2722 		c->max_search_depth = 100;
2723 		c->nice_match_length = 130;
2724 		break;
2725 #if SUPPORT_NEAR_OPTIMAL_PARSING
2726 	case 8:
2727 		c->impl = deflate_compress_near_optimal;
2728 		c->max_search_depth = 12;
2729 		c->nice_match_length = 20;
2730 		c->p.n.num_optim_passes = 1;
2731 		break;
2732 	case 9:
2733 		c->impl = deflate_compress_near_optimal;
2734 		c->max_search_depth = 16;
2735 		c->nice_match_length = 26;
2736 		c->p.n.num_optim_passes = 2;
2737 		break;
2738 	case 10:
2739 		c->impl = deflate_compress_near_optimal;
2740 		c->max_search_depth = 30;
2741 		c->nice_match_length = 50;
2742 		c->p.n.num_optim_passes = 2;
2743 		break;
2744 	case 11:
2745 		c->impl = deflate_compress_near_optimal;
2746 		c->max_search_depth = 60;
2747 		c->nice_match_length = 80;
2748 		c->p.n.num_optim_passes = 3;
2749 		break;
2750 	case 12:
2751 		c->impl = deflate_compress_near_optimal;
2752 		c->max_search_depth = 100;
2753 		c->nice_match_length = 133;
2754 		c->p.n.num_optim_passes = 4;
2755 		break;
2756 #else
2757 	case 8:
2758 		c->impl = deflate_compress_lazy;
2759 		c->max_search_depth = 150;
2760 		c->nice_match_length = 200;
2761 		break;
2762 	case 9:
2763 		c->impl = deflate_compress_lazy;
2764 		c->max_search_depth = 200;
2765 		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
2766 		break;
2767 #endif
2768 	default:
2769 		aligned_free(c);
2770 		return NULL;
2771 	}
2772 
2773 	c->compression_level = compression_level;
2774 
2775 	deflate_init_offset_slot_fast(c);
2776 	deflate_init_static_codes(c);
2777 
2778 	return c;
2779 }
2780 
2781 LIBDEFLATEAPI size_t
2782 libdeflate_deflate_compress(struct libdeflate_compressor *c,
2783 			    const void *in, size_t in_nbytes,
2784 			    void *out, size_t out_nbytes_avail)
2785 {
2786 	if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING))
2787 		return 0;
2788 
2789 	/* For extremely small inputs just use a single uncompressed block. */
2790 	if (unlikely(in_nbytes < 16)) {
2791 		struct deflate_output_bitstream os;
2792 		deflate_init_output(&os, out, out_nbytes_avail);
2793 		if (in_nbytes == 0)
2794 			in = &os; /* Avoid passing NULL to memcpy() */
2795 		deflate_write_uncompressed_block(&os, in, in_nbytes, true);
2796 		return deflate_flush_output(&os);
2797 	}
2798 
2799 	return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail);
2800 }
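
/*
 * Typical usage of the compression entry points in this file (a minimal
 * sketch with caller-provided 'in' and 'in_nbytes'; error handling
 * abbreviated):
 *
 *	struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
 *	if (c != NULL) {
 *		size_t bound = libdeflate_deflate_compress_bound(c, in_nbytes);
 *		void *out = malloc(bound);
 *		size_t out_nbytes = (out != NULL) ?
 *			libdeflate_deflate_compress(c, in, in_nbytes,
 *						    out, bound) : 0;
 *		free(out);
 *		libdeflate_free_compressor(c);
 *	}
 *
 * A return value of 0 from libdeflate_deflate_compress() means the compressed
 * data did not fit in the output buffer.
 */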
2801 
2802 LIBDEFLATEAPI void
2803 libdeflate_free_compressor(struct libdeflate_compressor *c)
2804 {
2805 	aligned_free(c);
2806 }
2807 
2808 unsigned int
2809 deflate_get_compression_level(struct libdeflate_compressor *c)
2810 {
2811 	return c->compression_level;
2812 }
2813 
2814 LIBDEFLATEAPI size_t
2815 libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
2816 				  size_t in_nbytes)
2817 {
2818 	/*
2819 	 * The worst case is all uncompressed blocks where one block has length
2820 	 * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH.
2821 	 * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE,
2822 	 * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN.
2823 	 */
2824 	size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
2825 	return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING;
2826 }
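
/*
 * For example (illustrative): for in_nbytes == 25000, max_num_blocks is
 * DIV_ROUND_UP(25000, 10000) = 3, so the bound is 5*3 + 25000 + 1 +
 * OUTPUT_END_PADDING bytes.
 */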
2827