1 /*
2  * deflate_compress.c - a compressor for DEFLATE
3  *
4  * Originally public domain; changes after 2016-09-07 are copyrighted.
5  *
6  * Copyright 2016 Eric Biggers
7  *
8  * Permission is hereby granted, free of charge, to any person
9  * obtaining a copy of this software and associated documentation
10  * files (the "Software"), to deal in the Software without
11  * restriction, including without limitation the rights to use,
12  * copy, modify, merge, publish, distribute, sublicense, and/or sell
13  * copies of the Software, and to permit persons to whom the
14  * Software is furnished to do so, subject to the following
15  * conditions:
16  *
17  * The above copyright notice and this permission notice shall be
18  * included in all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27  * OTHER DEALINGS IN THE SOFTWARE.
28  */
29 
30 #include "deflate_compress.h"
31 #include "deflate_constants.h"
32 #include "unaligned.h"
33 
34 #include "libdeflate.h"
35 
36 /*
37  * By default, the near-optimal parsing algorithm is enabled at compression
38  * level 8 and above.  The near-optimal parsing algorithm produces a compression
39  * ratio significantly better than the greedy and lazy algorithms implemented
40  * here achieve, and better than zlib achieves at level 9.  However, it is slow.
41  */
42 #define SUPPORT_NEAR_OPTIMAL_PARSING 1
43 
44 /*
45  * Define to 1 to maintain the full map from match offsets to offset slots.
46  * This slightly speeds up translations of match offsets to offset slots, but it
47  * uses 32769 bytes of memory rather than the 512 bytes used by the condensed
48  * map.  The speedup provided by the larger map is most helpful when the
49  * near-optimal parsing algorithm is being used.
50  */
51 #define USE_FULL_OFFSET_SLOT_FAST	SUPPORT_NEAR_OPTIMAL_PARSING
52 
53 /*
54  * DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters
55  * appropriately.
56  */
57 #define MATCHFINDER_WINDOW_ORDER	15
58 
59 #include "hc_matchfinder.h"
60 #if SUPPORT_NEAR_OPTIMAL_PARSING
61 #  include "bt_matchfinder.h"
62 #endif
63 
64 /*
65  * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes,
66  * except if the last block has to be shorter.
67  */
68 #define MIN_BLOCK_LENGTH	10000
69 
70 /*
71  * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but
72  * the final length might be slightly longer due to matches extending beyond
73  * this limit.
74  */
75 #define SOFT_MAX_BLOCK_LENGTH	300000
76 
77 /*
78  * The number of observed matches or literals that represents sufficient data to
79  * decide whether the current block should be terminated or not.
80  */
81 #define NUM_OBSERVATIONS_PER_BLOCK_CHECK       512
82 
83 
84 #if SUPPORT_NEAR_OPTIMAL_PARSING
85 /* Constants specific to the near-optimal parsing algorithm */
86 
87 /*
88  * The maximum number of matches the matchfinder can find at a single position.
89  * Since the matchfinder never finds more than one match for the same length,
90  * presuming one of each possible length is sufficient for an upper bound.
91  * (This says nothing about whether it is worthwhile to consider so many
92  * matches; this is just defining the worst case.)
93  */
94 #  define MAX_MATCHES_PER_POS	(DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
95 
96 /*
97  * The number of lz_match structures in the match cache, excluding the extra
98  * "overflow" entries.  This value should be high enough so that nearly the
99  * time, all matches found in a given block can fit in the match cache.
100  * However, fallback behavior (immediately terminating the block) on cache
101  * overflow is still required.
102  */
103 #  define CACHE_LENGTH      (SOFT_MAX_BLOCK_LENGTH * 5)
104 
105 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
106 
107 /*
108  * These are the compressor-side limits on the codeword lengths for each Huffman
109  * code.  To make outputting bits slightly faster, some of these limits are
110  * lower than the limits defined by the DEFLATE format.  This does not
111  * significantly affect the compression ratio, at least for the block lengths we
112  * use.
113  */
114 #define MAX_LITLEN_CODEWORD_LEN		14
115 #define MAX_OFFSET_CODEWORD_LEN		DEFLATE_MAX_OFFSET_CODEWORD_LEN
116 #define MAX_PRE_CODEWORD_LEN		DEFLATE_MAX_PRE_CODEWORD_LEN
117 
118 /* Table: length slot => length slot base value  */
119 static const unsigned deflate_length_slot_base[] = {
120 	3   , 4   , 5   , 6   , 7   , 8   , 9   , 10  ,
121 	11  , 13  , 15  , 17  , 19  , 23  , 27  , 31  ,
122 	35  , 43  , 51  , 59  , 67  , 83  , 99  , 115 ,
123 	131 , 163 , 195 , 227 , 258 ,
124 };
125 
126 /* Table: length slot => number of extra length bits  */
127 static const u8 deflate_extra_length_bits[] = {
128 	0   , 0   , 0   , 0   , 0   , 0   , 0   , 0 ,
129 	1   , 1   , 1   , 1   , 2   , 2   , 2   , 2 ,
130 	3   , 3   , 3   , 3   , 4   , 4   , 4   , 4 ,
131 	5   , 5   , 5   , 5   , 0   ,
132 };
133 
134 /* Table: offset slot => offset slot base value  */
135 static const unsigned deflate_offset_slot_base[] = {
136 	1    , 2    , 3    , 4     , 5     , 7     , 9     , 13    ,
137 	17   , 25   , 33   , 49    , 65    , 97    , 129   , 193   ,
138 	257  , 385  , 513  , 769   , 1025  , 1537  , 2049  , 3073  ,
139 	4097 , 6145 , 8193 , 12289 , 16385 , 24577 ,
140 };
141 
142 /* Table: offset slot => number of extra offset bits  */
143 static const u8 deflate_extra_offset_bits[] = {
144 	0    , 0    , 0    , 0     , 1     , 1     , 2     , 2     ,
145 	3    , 3    , 4    , 4     , 5     , 5     , 6     , 6     ,
146 	7    , 7    , 8    , 8     , 9     , 9     , 10    , 10    ,
147 	11   , 11   , 12   , 12    , 13    , 13    ,
148 };
149 
150 /* Table: length => length slot  */
151 static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
152 	0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12,
153 	12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16,
154 	16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
155 	18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
156 	20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
157 	21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
158 	22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
159 	23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
160 	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
161 	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
162 	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
163 	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
164 	26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
165 	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
166 	27, 27, 28,
167 };
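
/*
 * Worked example of the tables above: a match of length 50 has
 * deflate_length_slot[50] == 17.  Slot 17 has base value
 * deflate_length_slot_base[17] == 43 and deflate_extra_length_bits[17] == 3,
 * so the length is encoded as litlen symbol 257 + 17 == 274 followed by the
 * extra bits value 50 - 43 == 7.  (Illustration only.)
 */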
168 
169 /* The order in which precode codeword lengths are stored */
170 static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
171 	16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
172 };
173 
174 /* Codewords for the DEFLATE Huffman codes.  */
175 struct deflate_codewords {
176 	u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
177 	u32 offset[DEFLATE_NUM_OFFSET_SYMS];
178 };
179 
180 /* Codeword lengths (in bits) for the DEFLATE Huffman codes.
181  * A zero length means the corresponding symbol had zero frequency.  */
182 struct deflate_lens {
183 	u8 litlen[DEFLATE_NUM_LITLEN_SYMS];
184 	u8 offset[DEFLATE_NUM_OFFSET_SYMS];
185 };
186 
187 /* Codewords and lengths for the DEFLATE Huffman codes.  */
188 struct deflate_codes {
189 	struct deflate_codewords codewords;
190 	struct deflate_lens lens;
191 };
192 
193 /* Symbol frequency counters for the DEFLATE Huffman codes.  */
194 struct deflate_freqs {
195 	u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
196 	u32 offset[DEFLATE_NUM_OFFSET_SYMS];
197 };
198 
199 #if SUPPORT_NEAR_OPTIMAL_PARSING
200 
201 /* Costs for the near-optimal parsing algorithm.  */
202 struct deflate_costs {
203 
204 	/* The cost to output each possible literal.  */
205 	u32 literal[DEFLATE_NUM_LITERALS];
206 
207 	/* The cost to output each possible match length.  */
208 	u32 length[DEFLATE_MAX_MATCH_LEN + 1];
209 
210 	/* The cost to output a match offset of each possible offset slot.  */
211 	u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
212 };
213 
214 /*
215  * COST_SHIFT is a scaling factor that makes it possible to consider fractional
216  * bit costs.  A token requiring 'n' bits to represent has cost n << COST_SHIFT.
217  *
218  * Note: this is only useful as a statistical trick for when the true costs are
219  * unknown.  In reality, each token in DEFLATE requires a whole number of bits
220  * to output.
221  */
222 #define COST_SHIFT	3
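
/*
 * For example, with COST_SHIFT == 3 a symbol whose codeword is 5 bits long
 * has cost 5 << 3 == 40, and a fractional cost of about 4.5 bits can be
 * approximated as 36.  (Illustrative note; the actual cost values are filled
 * in by the cost model code elsewhere.)
 */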
223 
224 /*
225  * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
226  * be needed to output a symbol that was unused in the previous optimization
227  * pass.  Assigning a default cost allows the symbol to be used in the next
228  * optimization pass.  However, the cost should be relatively high because the
229  * symbol probably won't be used very many times (if at all).
230  */
231 #define LITERAL_NOSTAT_BITS	13
232 #define LENGTH_NOSTAT_BITS	13
233 #define OFFSET_NOSTAT_BITS	10
234 
235 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
236 
237 /*
238  * Represents a run of literals followed by a match or end-of-block.  This
239  * struct is needed to temporarily store items chosen by the parser, since items
240  * cannot be written until all items for the block have been chosen and the
241  * block's Huffman codes have been computed.
242  */
243 struct deflate_sequence {
244 
245 	/* Bits 0..22: the number of literals in this run.  This may be 0 and
246 	 * can be at most about SOFT_MAX_BLOCK_LENGTH.  The literals are not
247 	 * stored explicitly in this structure; instead, they are read directly
248 	 * from the uncompressed data.
249 	 *
250 	 * Bits 23..31: the length of the match which follows the literals, or 0
251 	 * if this literal run was the last in the block, so there is no match
252 	 * which follows it.  */
253 	u32 litrunlen_and_length;
254 
255 	/* If 'length' doesn't indicate end-of-block, then this is the offset of
256 	 * the match which follows the literals.  */
257 	u16 offset;
258 
259 	/* If 'length' doesn't indicate end-of-block, then this is the offset
260 	 * symbol of the match which follows the literals.  */
261 	u8 offset_symbol;
262 
263 	/* If 'length' doesn't indicate end-of-block, then this is the length
264 	 * slot of the match which follows the literals.  */
265 	u8 length_slot;
266 };
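
/*
 * Sketch of how 'litrunlen_and_length' is packed and unpacked, matching the
 * field layout documented above (illustration only):
 *
 *	seq->litrunlen_and_length = litrunlen | ((u32)length << 23);
 *
 *	u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;
 *	unsigned length = seq->litrunlen_and_length >> 23;
 */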
267 
268 #if SUPPORT_NEAR_OPTIMAL_PARSING
269 
270 /*
271  * This structure represents a byte position in the input data and a node in the
272  * graph of possible match/literal choices for the current block.
273  *
274  * Logically, each incoming edge to this node is labeled with a literal or a
275  * match that can be taken to reach this position from an earlier position; and
276  * each outgoing edge from this node is labeled with a literal or a match that
277  * can be taken to advance from this position to a later position.
278  *
279  * But these "edges" are actually stored elsewhere (in 'match_cache').  Here we
280  * associate with each node just two pieces of information:
281  *
282  *	'cost_to_end' is the minimum cost to reach the end of the block from
283  *	this position.
284  *
285  *	'item' represents the literal or match that must be chosen from here to
286  *	reach the end of the block with the minimum cost.  Equivalently, this
287  *	can be interpreted as the label of the outgoing edge on the minimum-cost
288  *	path to the "end of block" node from this node.
289  */
290 struct deflate_optimum_node {
291 
292 	u32 cost_to_end;
293 
294 	/*
295 	 * Notes on the match/literal representation used here:
296 	 *
297 	 *	The low bits of 'item' are the length: 1 if this is a literal,
298 	 *	or the match length if this is a match.
299 	 *
300 	 *	The high bits of 'item' are the actual literal byte if this is a
301 	 *	literal, or the match offset if this is a match.
302 	 */
303 #define OPTIMUM_OFFSET_SHIFT 9
304 #define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
305 	u32 item;
306 
307 };
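
/*
 * For example, a match of length 10 at offset 300 would be stored as
 * item == ((u32)300 << OPTIMUM_OFFSET_SHIFT) | 10, and the literal byte 0x41
 * would be stored as item == ((u32)0x41 << OPTIMUM_OFFSET_SHIFT) | 1.  The
 * length is recovered as (item & OPTIMUM_LEN_MASK) and the offset or literal
 * as (item >> OPTIMUM_OFFSET_SHIFT).  (Illustration only.)
 */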
308 
309 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
310 
311 /* Block split statistics.  See "Block splitting algorithm" below. */
312 #define NUM_LITERAL_OBSERVATION_TYPES 8
313 #define NUM_MATCH_OBSERVATION_TYPES 2
314 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES)
315 struct block_split_stats {
316 	u32 new_observations[NUM_OBSERVATION_TYPES];
317 	u32 observations[NUM_OBSERVATION_TYPES];
318 	u32 num_new_observations;
319 	u32 num_observations;
320 };
321 
322 /* The main DEFLATE compressor structure  */
323 struct libdeflate_compressor {
324 
325 	/* Pointer to the compress() implementation chosen at allocation time */
326 	size_t (*impl)(struct libdeflate_compressor *,
327 		       const u8 *, size_t, u8 *, size_t);
328 
329 	/* Frequency counters for the current block  */
330 	struct deflate_freqs freqs;
331 
332 	/* Dynamic Huffman codes for the current block  */
333 	struct deflate_codes codes;
334 
335 	/* Static Huffman codes */
336 	struct deflate_codes static_codes;
337 
338 	/* Block split statistics for the currently pending block */
339 	struct block_split_stats split_stats;
340 
341 	/* A table for fast lookups of offset slot by match offset.
342 	 *
343 	 * If the full table is being used, it is a direct mapping from offset
344 	 * to offset slot.
345 	 *
346 	 * If the condensed table is being used, the first 256 entries map
347 	 * directly to the offset slots of offsets 1 through 256.  The next 256
348 	 * entries map to the offset slots for the remaining offsets, stepping
349 	 * through the offsets with a stride of 128.  This relies on the fact
350 	 * that each of the remaining offset slots contains at least 128 offsets
351 	 * and has an offset base that is a multiple of 128.  */
352 #if USE_FULL_OFFSET_SLOT_FAST
353 	u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1];
354 #else
355 	u8 offset_slot_fast[512];
356 #endif
357 
358 	/* The "nice" match length: if a match of this length is found, choose
359 	 * it immediately without further consideration.  */
360 	unsigned nice_match_length;
361 
362 	/* The maximum search depth: consider at most this many potential
363 	 * matches at each position.  */
364 	unsigned max_search_depth;
365 
366 	/* The compression level with which this compressor was created.  */
367 	unsigned compression_level;
368 
369 	/* Anything smaller than this we won't bother trying to compress.  */
370 	unsigned min_size_to_compress;
371 
372 	/* Temporary space for Huffman code output  */
373 	u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS];
374 	u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
375 	u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS];
376 	unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS];
377 	unsigned num_litlen_syms;
378 	unsigned num_offset_syms;
379 	unsigned num_explicit_lens;
380 	unsigned num_precode_items;
381 
382 	union {
383 		/* Data for greedy or lazy parsing  */
384 		struct {
385 			/* Hash chain matchfinder  */
386 			struct hc_matchfinder hc_mf;
387 
388 			/* The matches and literals that the parser has chosen
389 			 * for the current block.  The required length of this
390 			 * array is limited by the maximum number of matches
391 			 * that can ever be chosen for a single block, plus one
392 			 * for the special entry at the end.  */
393 			struct deflate_sequence sequences[
394 				DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
395 					     DEFLATE_MIN_MATCH_LEN) + 1];
396 		} g; /* (g)reedy */
397 
398 	#if SUPPORT_NEAR_OPTIMAL_PARSING
399 		/* Data for near-optimal parsing  */
400 		struct {
401 
402 			/* Binary tree matchfinder  */
403 			struct bt_matchfinder bt_mf;
404 
405 			/*
406 			 * Cached matches for the current block.  This array
407 			 * contains the matches that were found at each position
408 			 * in the block.  Specifically, for each position, there
409 			 * is a list of matches found at that position, if any,
410 			 * sorted by strictly increasing length.  In addition,
411 			 * following the matches for each position, there is a
412 			 * special 'struct lz_match' whose 'length' member
413 			 * contains the number of matches found at that
414 			 * position, and whose 'offset' member contains the
415 			 * literal at that position.
416 			 *
417 			 * Note: in rare cases, there will be a very high number
418 			 * of matches in the block and this array will overflow.
419 			 * If this happens, we force the end of the current
420 			 * block.  CACHE_LENGTH is the length at which we
421 			 * actually check for overflow.  The extra slots beyond
422 			 * this are enough to absorb the worst case overflow,
423 			 * which occurs if starting at &match_cache[CACHE_LENGTH
424 			 * - 1], we write MAX_MATCHES_PER_POS matches and a
425 			 * match count header, then skip searching for matches
426 			 * at 'DEFLATE_MAX_MATCH_LEN - 1' positions and write
427 			 * the match count header for each.
428 			 */
429 			struct lz_match match_cache[CACHE_LENGTH +
430 						    MAX_MATCHES_PER_POS +
431 						    DEFLATE_MAX_MATCH_LEN - 1];
432 
433 			/*
434 			 * Array of nodes, one per position, for running the
435 			 * minimum-cost path algorithm.
436 			 *
437 			 * This array must be large enough to accommodate the
438 			 * worst-case number of nodes, which occurs if we find a
439 			 * match of length DEFLATE_MAX_MATCH_LEN at position
440 			 * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of
441 			 * length SOFT_MAX_BLOCK_LENGTH - 1 +
442 			 * DEFLATE_MAX_MATCH_LEN.  Add one for the end-of-block
443 			 * node.
444 			 */
445 			struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 +
446 								  DEFLATE_MAX_MATCH_LEN + 1];
447 
448 			/* The current cost model being used.  */
449 			struct deflate_costs costs;
450 
451 			unsigned num_optim_passes;
452 		} n; /* (n)ear-optimal */
453 	#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
454 
455 	} p; /* (p)arser */
456 };
457 
458 /*
459  * The type for the bitbuffer variable, which temporarily holds bits that are
460  * being packed into bytes and written to the output buffer.  For best
461  * performance, this should have size equal to a machine word.
462  */
463 typedef machine_word_t bitbuf_t;
464 #define BITBUF_NBITS	(8 * sizeof(bitbuf_t))
465 
466 /* Can the specified number of bits always be added to 'bitbuf' after any
467  * pending bytes have been flushed?  */
468 #define CAN_BUFFER(n)	((n) <= BITBUF_NBITS - 7)
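
/*
 * For example, on a 64-bit machine CAN_BUFFER(n) is true for n <= 57: after a
 * flush, at most 7 bits can remain in 'bitbuf', so any field of at most
 * BITBUF_NBITS - 7 bits is guaranteed to fit without an intermediate flush.
 */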
469 
470 /*
471  * Structure to keep track of the current state of sending bits to the
472  * compressed output buffer.
473  */
474 struct deflate_output_bitstream {
475 
476 	/* Bits that haven't yet been written to the output buffer.  */
477 	bitbuf_t bitbuf;
478 
479 	/* Number of bits currently held in @bitbuf.  */
480 	unsigned bitcount;
481 
482 	/* Pointer to the beginning of the output buffer.  */
483 	u8 *begin;
484 
485 	/* Pointer to the position in the output buffer at which the next byte
486 	 * should be written.  */
487 	u8 *next;
488 
489 	/* Pointer just past the end of the output buffer.  */
490 	u8 *end;
491 };
492 
493 /*
494  * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be
495  * present following os->end, in order to not overrun the buffer when generating
496  * output.  When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t)
497  * bytes for put_unaligned_leword().  Otherwise we need only 1 byte.  However,
498  * to make the compression algorithm produce the same result on all CPU
499  * architectures (which is sometimes desirable), we have to unconditionally use
500  * the maximum for any CPU, which is sizeof(bitbuf_t) == 8.
501  */
502 #define OUTPUT_END_PADDING	8
503 
504 /* Initialize the output bitstream.  'size' is assumed to be at least
505  * OUTPUT_END_PADDING.  */
506 static void
507 deflate_init_output(struct deflate_output_bitstream *os,
508 		    void *buffer, size_t size)
509 {
510 	os->bitbuf = 0;
511 	os->bitcount = 0;
512 	os->begin = buffer;
513 	os->next = os->begin;
514 	os->end = os->begin + size - OUTPUT_END_PADDING;
515 }
516 
517 /* Add some bits to the bitbuffer variable of the output bitstream.  The caller
518  * must make sure there is enough room.  */
519 static forceinline void
520 deflate_add_bits(struct deflate_output_bitstream *os,
521 		 const bitbuf_t bits, const unsigned num_bits)
522 {
523 	os->bitbuf |= bits << os->bitcount;
524 	os->bitcount += num_bits;
525 }
526 
527 /* Flush bits from the bitbuffer variable to the output buffer.  */
528 static forceinline void
529 deflate_flush_bits(struct deflate_output_bitstream *os)
530 {
531 	if (UNALIGNED_ACCESS_IS_FAST) {
532 		/* Flush a whole word (branchlessly).  */
533 		put_unaligned_leword(os->bitbuf, os->next);
534 		os->bitbuf >>= os->bitcount & ~7;
535 		os->next += MIN(os->end - os->next, os->bitcount >> 3);
536 		os->bitcount &= 7;
537 	} else {
538 		/* Flush a byte at a time.  */
539 		while (os->bitcount >= 8) {
540 			*os->next = os->bitbuf;
541 			if (os->next != os->end)
542 				os->next++;
543 			os->bitcount -= 8;
544 			os->bitbuf >>= 8;
545 		}
546 	}
547 }
548 
549 /* Align the bitstream on a byte boundary. */
550 static forceinline void
551 deflate_align_bitstream(struct deflate_output_bitstream *os)
552 {
553 	os->bitcount += -os->bitcount & 7;
554 	deflate_flush_bits(os);
555 }
556 
557 /*
558  * Flush any remaining bits to the output buffer if needed.  Return the total
559  * number of bytes written to the output buffer, or 0 if an overflow occurred.
560  */
561 static size_t
562 deflate_flush_output(struct deflate_output_bitstream *os)
563 {
564 	if (os->next == os->end) /* overflow?  */
565 		return 0;
566 
567 	while ((int)os->bitcount > 0) {
568 		*os->next++ = os->bitbuf;
569 		os->bitcount -= 8;
570 		os->bitbuf >>= 8;
571 	}
572 
573 	return os->next - os->begin;
574 }
575 
576 /* Given the binary tree node A[subtree_idx] whose children already
577  * satisfy the maxheap property, swap the node with its greater child
578  * until it is greater than both its children, so that the maxheap
579  * property is satisfied in the subtree rooted at A[subtree_idx].  */
580 static void
581 heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
582 {
583 	unsigned parent_idx;
584 	unsigned child_idx;
585 	u32 v;
586 
587 	v = A[subtree_idx];
588 	parent_idx = subtree_idx;
589 	while ((child_idx = parent_idx * 2) <= length) {
590 		if (child_idx < length && A[child_idx + 1] > A[child_idx])
591 			child_idx++;
592 		if (v >= A[child_idx])
593 			break;
594 		A[parent_idx] = A[child_idx];
595 		parent_idx = child_idx;
596 	}
597 	A[parent_idx] = v;
598 }
599 
600 /* Rearrange the array 'A' so that it satisfies the maxheap property.
601  * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
602  */
603 static void
604 heapify_array(u32 A[], unsigned length)
605 {
606 	unsigned subtree_idx;
607 
608 	for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
609 		heapify_subtree(A, length, subtree_idx);
610 }
611 
612 /*
613  * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
614  *
615  * Note: name this function heap_sort() instead of heapsort() to avoid colliding
616  * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
617  * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
618  */
619 static void
620 heap_sort(u32 A[], unsigned length)
621 {
622 	A--; /* Use 1-based indices  */
623 
624 	heapify_array(A, length);
625 
626 	while (length >= 2) {
627 		u32 tmp = A[length];
628 		A[length] = A[1];
629 		A[1] = tmp;
630 		length--;
631 		heapify_subtree(A, length, 1);
632 	}
633 }
634 
635 #define NUM_SYMBOL_BITS 10
636 #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
637 
638 #define GET_NUM_COUNTERS(num_syms)	((((num_syms) + 3) / 4 + 3) & ~3)
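
/*
 * Each entry of the sorting/codeword array packs a symbol and its frequency
 * into a single u32: the symbol occupies the low NUM_SYMBOL_BITS bits and the
 * frequency the remaining high bits.  For example, symbol 65 with frequency
 * 1000 is stored as (1000 << NUM_SYMBOL_BITS) | 65, and the two halves are
 * recovered with (entry & SYMBOL_MASK) and (entry >> NUM_SYMBOL_BITS).
 * (Illustration only; see sort_symbols() below.)
 */
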
639 /*
640  * Sort the symbols primarily by frequency and secondarily by symbol
641  * value.  Discard symbols with zero frequency and fill in an array with
642  * the remaining symbols, along with their frequencies.  The low
643  * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
644  * value, and the remaining bits will contain the frequency.
645  *
646  * @num_syms
647  *	Number of symbols in the alphabet.
648  *	Can't be greater than (1 << NUM_SYMBOL_BITS).
649  *
650  * @freqs[num_syms]
651  *	The frequency of each symbol.
652  *
653  * @lens[num_syms]
654  *	An array that eventually will hold the length of each codeword.
655  *	This function only fills in the codeword lengths of symbols that
656  *	have zero frequency; such lengths are not well defined per se, but
657  *	they are simply set to 0 here.
658  *
659  * @symout[num_syms]
660  *	The output array, described above.
661  *
662  * Returns the number of entries in 'symout' that were filled.  This is
663  * the number of symbols that have nonzero frequency.
664  */
665 static unsigned
666 sort_symbols(unsigned num_syms, const u32 freqs[restrict],
667 	     u8 lens[restrict], u32 symout[restrict])
668 {
669 	unsigned sym;
670 	unsigned i;
671 	unsigned num_used_syms;
672 	unsigned num_counters;
673 	unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)];
674 
675 	/* We rely on heapsort, but with an added optimization.  Since
676 	 * it's common for most symbol frequencies to be low, we first do
677 	 * a count sort using a limited number of counters.  High
678 	 * frequencies will be counted in the last counter, and only they
679 	 * will be sorted with heapsort.
680 	 *
681 	 * Note: with more symbols, it is generally beneficial to have more
682 	 * counters.  About 1 counter per 4 symbols seems fast.
683 	 *
684 	 * Note: I also tested radix sort, but even for large symbol
685 	 * counts (> 255) and frequencies bounded at 16 bits (enabling
686 	 * radix sort by just two base-256 digits), it didn't seem any
687 	 * faster than the method implemented here.
688 	 *
689 	 * Note: I tested the optimized quicksort implementation from
690 	 * glibc (with indirection overhead removed), but it was only
691 	 * marginally faster than the simple heapsort implemented here.
692 	 *
693 	 * Tests were done with building the codes for LZX.  Results may
694 	 * vary for different compression algorithms...!  */
695 
696 	num_counters = GET_NUM_COUNTERS(num_syms);
697 
698 	memset(counters, 0, num_counters * sizeof(counters[0]));
699 
700 	/* Count the frequencies.  */
701 	for (sym = 0; sym < num_syms; sym++)
702 		counters[MIN(freqs[sym], num_counters - 1)]++;
703 
704 	/* Make the counters cumulative, ignoring the zero-th, which
705 	 * counted symbols with zero frequency.  As a side effect, this
706 	 * calculates the number of symbols with nonzero frequency.  */
707 	num_used_syms = 0;
708 	for (i = 1; i < num_counters; i++) {
709 		unsigned count = counters[i];
710 		counters[i] = num_used_syms;
711 		num_used_syms += count;
712 	}
713 
714 	/* Sort nonzero-frequency symbols using the counters.  At the
715 	 * same time, set the codeword lengths of zero-frequency symbols
716 	 * to 0.  */
717 	for (sym = 0; sym < num_syms; sym++) {
718 		u32 freq = freqs[sym];
719 		if (freq != 0) {
720 			symout[counters[MIN(freq, num_counters - 1)]++] =
721 				sym | (freq << NUM_SYMBOL_BITS);
722 		} else {
723 			lens[sym] = 0;
724 		}
725 	}
726 
727 	/* Sort the symbols counted in the last counter.  */
728 	heap_sort(symout + counters[num_counters - 2],
729 		  counters[num_counters - 1] - counters[num_counters - 2]);
730 
731 	return num_used_syms;
732 }
733 
734 /*
735  * Build the Huffman tree.
736  *
737  * This is an optimized implementation that
738  *	(a) takes advantage of the frequencies being already sorted;
739  *	(b) only generates non-leaf nodes, since the non-leaf nodes of a
740  *	    Huffman tree are sufficient to generate a canonical code;
741  *	(c) Only stores parent pointers, not child pointers;
742  *	(d) Produces the nodes in the same memory used for input
743  *	    frequency information.
744  *
745  * Array 'A', which contains 'sym_count' entries, is used for both input
746  * and output.  For this function, 'sym_count' must be at least 2.
747  *
748  * For input, the array must contain the frequencies of the symbols,
749  * sorted in increasing order.  Specifically, each entry must contain a
750  * frequency left shifted by NUM_SYMBOL_BITS bits.  Any data in the low
751  * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
752  * Although these bits will, in fact, contain the symbols that correspond
753  * to the frequencies, this function is concerned with frequencies only
754  * and keeps the symbols as-is.
755  *
756  * For output, this function will produce the non-leaf nodes of the
757  * Huffman tree.  These nodes will be stored in the first (sym_count - 1)
758  * entries of the array.  Entry A[sym_count - 2] will represent the root
759  * node.  Each other node will contain the zero-based index of its parent
760  * node in 'A', left shifted by NUM_SYMBOL_BITS bits.  The low
761  * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is.  Again,
762  * note that although these low bits will, in fact, contain a symbol
763  * value, this symbol will have *no relationship* with the Huffman tree
764  * node that happens to occupy the same slot.  This is because this
765  * implementation only generates the non-leaf nodes of the tree.
766  */
767 static void
768 build_tree(u32 A[], unsigned sym_count)
769 {
770 	/* Index, in 'A', of next lowest frequency symbol that has not
771 	 * yet been processed.  */
772 	unsigned i = 0;
773 
774 	/* Index, in 'A', of next lowest frequency parentless non-leaf
775 	 * node; or, if equal to 'e', then no such node exists yet.  */
776 	unsigned b = 0;
777 
778 	/* Index, in 'A', of next node to allocate as a non-leaf.  */
779 	unsigned e = 0;
780 
781 	do {
782 		unsigned m, n;
783 		u32 freq_shifted;
784 
785 		/* Choose the two next lowest frequency entries.  */
786 
787 		if (i != sym_count &&
788 		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
789 			m = i++;
790 		else
791 			m = b++;
792 
793 		if (i != sym_count &&
794 		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
795 			n = i++;
796 		else
797 			n = b++;
798 
799 		/* Allocate a non-leaf node and link the entries to it.
800 		 *
801 		 * If we link an entry that we're visiting for the first
802 		 * time (via index 'i'), then we're actually linking a
803 		 * leaf node and it will have no effect, since the leaf
804 		 * will be overwritten with a non-leaf when index 'e'
805 		 * catches up to it.  But it's not any slower to
806 		 * unconditionally set the parent index.
807 		 *
808 		 * We also compute the frequency of the non-leaf node as
809 		 * the sum of its two children's frequencies.  */
810 
811 		freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);
812 
813 		A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
814 		A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
815 		A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
816 		e++;
817 	} while (sym_count - e > 1);
818 		/* When just one entry remains, it is a "leaf" that was
819 		 * linked to some other node.  We ignore it, since the
820 		 * rest of the array contains the non-leaves which we
821 		 * need.  (Note that we're assuming the cases with 0 or 1
822 		 * symbols were handled separately.) */
823 }
824 
825 /*
826  * Given the stripped-down Huffman tree constructed by build_tree(),
827  * determine the number of codewords that should be assigned each
828  * possible length, taking into account the length-limited constraint.
829  *
830  * @A
831  *	The array produced by build_tree(), containing parent index
832  *	information for the non-leaf nodes of the Huffman tree.  Each
833  *	entry in this array is a node; a node's parent always has a
834  *	greater index than that node itself.  This function will
835  *	overwrite the parent index information in this array, so
836  *	essentially it will destroy the tree.  However, the data in the
837  *	low NUM_SYMBOL_BITS of each entry will be preserved.
838  *
839  * @root_idx
840  *	The 0-based index of the root node in 'A', and consequently one
841  *	less than the number of tree node entries in 'A'.  (Or, really 2
842  *	less than the actual length of 'A'.)
843  *
844  * @len_counts
845  *	An array of length ('max_codeword_len' + 1) in which the number of
846  *	codewords having each length <= max_codeword_len will be
847  *	returned.
848  *
849  * @max_codeword_len
850  *	The maximum permissible codeword length.
851  */
852 static void
853 compute_length_counts(u32 A[restrict], unsigned root_idx,
854 		      unsigned len_counts[restrict], unsigned max_codeword_len)
855 {
856 	unsigned len;
857 	int node;
858 
859 	/* The key observations are:
860 	 *
861 	 * (1) We can traverse the non-leaf nodes of the tree, always
862 	 * visiting a parent before its children, by simply iterating
863 	 * through the array in reverse order.  Consequently, we can
864 	 * compute the depth of each node in one pass, overwriting the
865 	 * parent indices with depths.
866 	 *
867 	 * (2) We can initially assume that in the real Huffman tree,
868 	 * both children of the root are leaves.  This corresponds to two
869 	 * codewords of length 1.  Then, whenever we visit a (non-leaf)
870 	 * node during the traversal, we modify this assumption to
871 	 * account for the current node *not* being a leaf, but rather
872 	 * its two children being leaves.  This causes the loss of one
873 	 * codeword for the current depth and the addition of two
874 	 * codewords for the current depth plus one.
875 	 *
876 	 * (3) We can handle the length-limited constraint fairly easily
877 	 * by simply using the largest length available when a depth
878 	 * exceeds max_codeword_len.
879 	 */
880 
881 	for (len = 0; len <= max_codeword_len; len++)
882 		len_counts[len] = 0;
883 	len_counts[1] = 2;
884 
885 	/* Set the root node's depth to 0.  */
886 	A[root_idx] &= SYMBOL_MASK;
887 
888 	for (node = root_idx - 1; node >= 0; node--) {
889 
890 		/* Calculate the depth of this node.  */
891 
892 		unsigned parent = A[node] >> NUM_SYMBOL_BITS;
893 		unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
894 		unsigned depth = parent_depth + 1;
895 		unsigned len = depth;
896 
897 		/* Set the depth of this node so that it is available
898 		 * when its children (if any) are processed.  */
899 
900 		A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
901 
902 		/* If needed, decrease the length to meet the
903 		 * length-limited constraint.  This is not the optimal
904 		 * method for generating length-limited Huffman codes!
905 		 * But it should be good enough.  */
906 		if (len >= max_codeword_len) {
907 			len = max_codeword_len;
908 			do {
909 				len--;
910 			} while (len_counts[len] == 0);
911 		}
912 
913 		/* Account for the fact that we have a non-leaf node at
914 		 * the current depth.  */
915 		len_counts[len]--;
916 		len_counts[len + 1] += 2;
917 	}
918 }
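
/*
 * Worked example: for a complete tree with four leaves there are three
 * non-leaf nodes (the root plus its two children).  The counts start as
 * len_counts[1] == 2; visiting each depth-1 non-leaf then does
 * len_counts[1]--, len_counts[2] += 2, ending with len_counts[1] == 0 and
 * len_counts[2] == 4, i.e. four codewords of length 2.  (Illustration only.)
 */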
919 
920 /*
921  * Generate the codewords for a canonical Huffman code.
922  *
923  * @A
924  *	The output array for codewords.  In addition, initially this
925  *	array must contain the symbols, sorted primarily by frequency and
926  *	secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
927  *	each entry.
928  *
929  * @lens
930  *	Output array for codeword lengths.
931  *
932  * @len_counts
933  *	An array that provides the number of codewords that will have
934  *	each possible length <= max_codeword_len.
935  *
936  * @max_codeword_len
937  *	Maximum length, in bits, of each codeword.
938  *
939  * @num_syms
940  *	Number of symbols in the alphabet, including symbols with zero
941  *	frequency.  This is the length of the 'A' and 'len' arrays.
942  */
943 static void
944 gen_codewords(u32 A[restrict], u8 lens[restrict],
945 	      const unsigned len_counts[restrict],
946 	      unsigned max_codeword_len, unsigned num_syms)
947 {
948 	u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1];
949 	unsigned i;
950 	unsigned len;
951 	unsigned sym;
952 
953 	/* Given the number of codewords that will have each length,
954 	 * assign codeword lengths to symbols.  We do this by assigning
955 	 * the lengths in decreasing order to the symbols sorted
956 	 * primarily by increasing frequency and secondarily by
957 	 * increasing symbol value.  */
958 	for (i = 0, len = max_codeword_len; len >= 1; len--) {
959 		unsigned count = len_counts[len];
960 		while (count--)
961 			lens[A[i++] & SYMBOL_MASK] = len;
962 	}
963 
964 	/* Generate the codewords themselves.  We initialize the
965 	 * 'next_codewords' array to provide the lexicographically first
966 	 * codeword of each length, then assign codewords in symbol
967 	 * order.  This produces a canonical code.  */
968 	next_codewords[0] = 0;
969 	next_codewords[1] = 0;
970 	for (len = 2; len <= max_codeword_len; len++)
971 		next_codewords[len] =
972 			(next_codewords[len - 1] + len_counts[len - 1]) << 1;
973 
974 	for (sym = 0; sym < num_syms; sym++)
975 		A[sym] = next_codewords[lens[sym]]++;
976 }
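
/*
 * Worked example: suppose symbols 0..3 were assigned codeword lengths 1, 2,
 * 3 and 3.  Then len_counts[] is {0, 1, 1, 2}, next_codewords[] becomes
 * {0, 0, 0b10, 0b110}, and assigning codewords in symbol order gives
 *
 *	symbol 0: 0     (1 bit)
 *	symbol 1: 10    (2 bits)
 *	symbol 2: 110   (3 bits)
 *	symbol 3: 111   (3 bits)
 *
 * which is the canonical code for those lengths.  (Illustration only.)
 */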
977 
978 /*
979  * ---------------------------------------------------------------------
980  *			make_canonical_huffman_code()
981  * ---------------------------------------------------------------------
982  *
983  * Given an alphabet and the frequency of each symbol in it, construct a
984  * length-limited canonical Huffman code.
985  *
986  * @num_syms
987  *	The number of symbols in the alphabet.  The symbols are the
988  *	integers in the range [0, num_syms - 1].  This parameter must be
989  *	at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
990  *
991  * @max_codeword_len
992  *	The maximum permissible codeword length.
993  *
994  * @freqs
995  *	An array of @num_syms entries, each of which specifies the
996  *	frequency of the corresponding symbol.  It is valid for some,
997  *	none, or all of the frequencies to be 0.
998  *
999  * @lens
1000  *	An array of @num_syms entries in which this function will return
1001  *	the length, in bits, of the codeword assigned to each symbol.
1002  *	Symbols with 0 frequency will not have codewords per se, but
1003  *	their entries in this array will be set to 0.  No lengths greater
1004  *	than @max_codeword_len will be assigned.
1005  *
1006  * @codewords
1007  *	An array of @num_syms entries in which this function will return
1008  *	the codeword for each symbol, right-justified and padded on the
1009  *	left with zeroes.  Codewords for symbols with 0 frequency will be
1010  *	undefined.
1011  *
1012  * ---------------------------------------------------------------------
1013  *
1014  * This function builds a length-limited canonical Huffman code.
1015  *
1016  * A length-limited Huffman code contains no codewords longer than some
1017  * specified length, and has exactly (with some algorithms) or
1018  * approximately (with the algorithm used here) the minimum weighted path
1019  * length from the root, given this constraint.
1020  *
1021  * A canonical Huffman code satisfies the properties that a longer
1022  * codeword never lexicographically precedes a shorter codeword, and the
1023  * lexicographic ordering of codewords of the same length is the same as
1024  * the lexicographic ordering of the corresponding symbols.  A canonical
1025  * Huffman code, or more generally a canonical prefix code, can be
1026  * reconstructed from only a list containing the codeword length of each
1027  * symbol.
1028  *
1029  * The classic algorithm to generate a Huffman code creates a node for
1030  * each symbol, then inserts these nodes into a min-heap keyed by symbol
1031  * frequency.  Then, repeatedly, the two lowest-frequency nodes are
1032  * removed from the min-heap and added as the children of a new node
1033  * having frequency equal to the sum of its two children, which is then
1034  * inserted into the min-heap.  When only a single node remains in the
1035  * min-heap, it is the root of the Huffman tree.  The codeword for each
1036  * symbol is determined by the path needed to reach the corresponding
1037  * node from the root.  Descending to the left child appends a 0 bit,
1038  * whereas descending to the right child appends a 1 bit.
1039  *
1040  * The classic algorithm is relatively easy to understand, but it is
1041  * subject to a number of inefficiencies.  In practice, it is fastest to
1042  * first sort the symbols by frequency.  (This itself can be subject to
1043  * an optimization based on the fact that most frequencies tend to be
1044  * low.)  At the same time, we sort secondarily by symbol value, which
1045  * aids the process of generating a canonical code.  Then, during tree
1046  * construction, no heap is necessary because both the leaf nodes and the
1047  * unparented non-leaf nodes can be easily maintained in sorted order.
1048  * Consequently, there can never be more than two possibilities for the
1049  * next-lowest-frequency node.
1050  *
1051  * In addition, because we're generating a canonical code, we actually
1052  * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
1053  * This is because for canonical code generation we don't need to know
1054  * where the symbols are in the tree.  Rather, we only need to know how
1055  * many leaf nodes have each depth (codeword length).  And this
1056  * information can, in fact, be quickly generated from the tree of
1057  * non-leaves only.
1058  *
1059  * Furthermore, we can build this stripped-down Huffman tree directly in
1060  * the array in which the codewords are to be generated, provided that
1061  * these array slots are large enough to hold a symbol and frequency
1062  * value.
1063  *
1064  * Still furthermore, we don't even need to maintain explicit child
1065  * pointers.  We only need the parent pointers, and even those can be
1066  * overwritten in-place with depth information as part of the process of
1067  * extracting codeword lengths from the tree.  So in summary, we do NOT
1068  * need a big structure like:
1069  *
1070  *	struct huffman_tree_node {
1071  *		unsigned int symbol;
1072  *		unsigned int frequency;
1073  *		unsigned int depth;
1074  *		struct huffman_tree_node *left_child;
1075  *		struct huffman_tree_node *right_child;
1076  *	};
1077  *
1078  *
1079  *   ... which often gets used in "naive" implementations of Huffman code
1080  *   generation.
1081  *
1082  * Many of these optimizations are based on the implementation in 7-Zip
1083  * (source file: C/HuffEnc.c), which has been placed in the public domain
1084  * by Igor Pavlov.
1085  */
1086 static void
1087 make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
1088 			    const u32 freqs[restrict],
1089 			    u8 lens[restrict], u32 codewords[restrict])
1090 {
1091 	u32 *A = codewords;
1092 	unsigned num_used_syms;
1093 
1094 	STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);
1095 
1096 	/* We begin by sorting the symbols primarily by frequency and
1097 	 * secondarily by symbol value.  As an optimization, the array
1098 	 * used for this purpose ('A') shares storage with the space in
1099 	 * which we will eventually return the codewords.  */
1100 
1101 	num_used_syms = sort_symbols(num_syms, freqs, lens, A);
1102 
1103 	/* 'num_used_syms' is the number of symbols with nonzero
1104 	 * frequency.  This may be less than @num_syms.  'num_used_syms'
1105 	 * is also the number of entries in 'A' that are valid.  Each
1106 	 * entry consists of a distinct symbol and a nonzero frequency
1107 	 * packed into a 32-bit integer.  */
1108 
1109 	/* Handle special cases where only 0 or 1 symbols were used (had
1110 	 * nonzero frequency).  */
1111 
1112 	if (unlikely(num_used_syms == 0)) {
1113 		/* Code is empty.  sort_symbols() already set all lengths
1114 		 * to 0, so there is nothing more to do.  */
1115 		return;
1116 	}
1117 
1118 	if (unlikely(num_used_syms == 1)) {
1119 		/* Only one symbol was used, so we only need one
1120 		 * codeword.  But two codewords are needed to form the
1121 		 * smallest complete Huffman code, which uses codewords 0
1122 		 * and 1.  Therefore, we choose another symbol to which
1123 		 * to assign a codeword.  We use 0 (if the used symbol is
1124 		 * not 0) or 1 (if the used symbol is 0).  In either
1125 		 * case, the lesser-valued symbol must be assigned
1126 		 * codeword 0 so that the resulting code is canonical.  */
1127 
1128 		unsigned sym = A[0] & SYMBOL_MASK;
1129 		unsigned nonzero_idx = sym ? sym : 1;
1130 
1131 		codewords[0] = 0;
1132 		lens[0] = 1;
1133 		codewords[nonzero_idx] = 1;
1134 		lens[nonzero_idx] = 1;
1135 		return;
1136 	}
1137 
1138 	/* Build a stripped-down version of the Huffman tree, sharing the
1139 	 * array 'A' with the symbol values.  Then extract length counts
1140 	 * from the tree and use them to generate the final codewords.  */
1141 
1142 	build_tree(A, num_used_syms);
1143 
1144 	{
1145 		unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
1146 
1147 		compute_length_counts(A, num_used_syms - 2,
1148 				      len_counts, max_codeword_len);
1149 
1150 		gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
1151 	}
1152 }
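
/*
 * Illustrative sketch with hypothetical input: for a 6-symbol alphabet with
 * frequencies {45, 13, 12, 16, 9, 5} and max_codeword_len 15, this function
 * should produce the codeword lengths {1, 3, 3, 3, 4, 4} and the canonical
 * codewords 0, 100, 101, 110, 1110, 1111 (in binary, before the bit-reversal
 * applied by deflate_make_huffman_code() below).
 */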
1153 
1154 /*
1155  * Clear the Huffman symbol frequency counters.
1156  * This must be called when starting a new DEFLATE block.
1157  */
1158 static void
1159 deflate_reset_symbol_frequencies(struct libdeflate_compressor *c)
1160 {
1161 	memset(&c->freqs, 0, sizeof(c->freqs));
1162 }
1163 
1164 /* Reverse the Huffman codeword 'codeword', which is 'len' bits in length.  */
1165 static u32
1166 deflate_reverse_codeword(u32 codeword, u8 len)
1167 {
1168 	/* The following branchless algorithm is faster than going bit by bit.
1169 	 * Note: since no codewords are longer than 16 bits, we only need to
1170 	 * reverse the low 16 bits of the 'u32'.  */
1171 	STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16);
1172 
1173 	/* Flip adjacent 1-bit fields  */
1174 	codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1);
1175 
1176 	/* Flip adjacent 2-bit fields  */
1177 	codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2);
1178 
1179 	/* Flip adjacent 4-bit fields  */
1180 	codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4);
1181 
1182 	/* Flip adjacent 8-bit fields  */
1183 	codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8);
1184 
1185 	/* Return the high 'len' bits of the bit-reversed 16 bit value.  */
1186 	return codeword >> (16 - len);
1187 }
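
/*
 * For example, deflate_reverse_codeword(0x3, 3) returns 0x6: the swaps turn
 * 0b0000000000000011 into 0b1100000000000000, and shifting right by 16 - 3
 * leaves the bit-reversed 3-bit codeword 0b110.
 */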
1188 
1189 /* Make a canonical Huffman code with bit-reversed codewords.  */
1190 static void
1191 deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len,
1192 			  const u32 freqs[], u8 lens[], u32 codewords[])
1193 {
1194 	unsigned sym;
1195 
1196 	make_canonical_huffman_code(num_syms, max_codeword_len,
1197 				    freqs, lens, codewords);
1198 
1199 	for (sym = 0; sym < num_syms; sym++)
1200 		codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]);
1201 }
1202 
1203 /*
1204  * Build the literal/length and offset Huffman codes for a DEFLATE block.
1205  *
1206  * This takes as input the frequency tables for each code and produces as output
1207  * a set of tables that map symbols to codewords and codeword lengths.
1208  */
1209 static void
1210 deflate_make_huffman_codes(const struct deflate_freqs *freqs,
1211 			   struct deflate_codes *codes)
1212 {
1213 	STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN);
1214 	STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN);
1215 
1216 	deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS,
1217 				  MAX_LITLEN_CODEWORD_LEN,
1218 				  freqs->litlen,
1219 				  codes->lens.litlen,
1220 				  codes->codewords.litlen);
1221 
1222 	deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS,
1223 				  MAX_OFFSET_CODEWORD_LEN,
1224 				  freqs->offset,
1225 				  codes->lens.offset,
1226 				  codes->codewords.offset);
1227 }
1228 
1229 /* Initialize c->static_codes.  */
1230 static void
1231 deflate_init_static_codes(struct libdeflate_compressor *c)
1232 {
1233 	unsigned i;
1234 
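	/*
	 * Seed each symbol with a frequency of 1 << (9 - len), where 'len' is
	 * the codeword length that the static DEFLATE code assigns to that
	 * symbol (8, 9, 7 or 8 bits for the litlen ranges below, and 5 bits
	 * for every offset symbol).  Because these frequencies are exact
	 * powers of two whose probabilities sum to 1, the generic Huffman
	 * code builder reproduces exactly the static code's lengths, and the
	 * canonical codeword assignment then matches the static code itself.
	 */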
1235 	for (i = 0; i < 144; i++)
1236 		c->freqs.litlen[i] = 1 << (9 - 8);
1237 	for (; i < 256; i++)
1238 		c->freqs.litlen[i] = 1 << (9 - 9);
1239 	for (; i < 280; i++)
1240 		c->freqs.litlen[i] = 1 << (9 - 7);
1241 	for (; i < 288; i++)
1242 		c->freqs.litlen[i] = 1 << (9 - 8);
1243 
1244 	for (i = 0; i < 32; i++)
1245 		c->freqs.offset[i] = 1 << (5 - 5);
1246 
1247 	deflate_make_huffman_codes(&c->freqs, &c->static_codes);
1248 }
1249 
1250 /* Return the offset slot for the specified match offset.  */
1251 static forceinline unsigned
1252 deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset)
1253 {
1254 #if USE_FULL_OFFSET_SLOT_FAST
1255 	return c->offset_slot_fast[offset];
1256 #else
1257 	if (offset <= 256)
1258 		return c->offset_slot_fast[offset - 1];
1259 	else
1260 		return c->offset_slot_fast[256 + ((offset - 1) >> 7)];
1261 #endif
1262 }
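
/*
 * Worked example: offset 1000 belongs to offset slot 19 (base 769, 8 extra
 * bits, covering offsets 769..1024).  With the full table this is simply
 * c->offset_slot_fast[1000].  With the condensed table, since 1000 > 256, the
 * lookup is c->offset_slot_fast[256 + ((1000 - 1) >> 7)] ==
 * c->offset_slot_fast[263], which works because every offset that maps to
 * index 263 (897 through 1024) falls in slot 19.  (Illustration only.)
 */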
1263 
1264 /* Write the header fields common to all DEFLATE block types.  */
1265 static void
1266 deflate_write_block_header(struct deflate_output_bitstream *os,
1267 			   bool is_final_block, unsigned block_type)
1268 {
1269 	deflate_add_bits(os, is_final_block, 1);
1270 	deflate_add_bits(os, block_type, 2);
1271 	deflate_flush_bits(os);
1272 }
1273 
1274 static unsigned
1275 deflate_compute_precode_items(const u8 lens[restrict],
1276 			      const unsigned num_lens,
1277 			      u32 precode_freqs[restrict],
1278 			      unsigned precode_items[restrict])
1279 {
1280 	unsigned *itemptr;
1281 	unsigned run_start;
1282 	unsigned run_end;
1283 	unsigned extra_bits;
1284 	u8 len;
1285 
1286 	memset(precode_freqs, 0,
1287 	       DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0]));
1288 
1289 	itemptr = precode_items;
1290 	run_start = 0;
1291 	do {
1292 		/* Find the next run of codeword lengths.  */
1293 
1294 		/* len = the length being repeated  */
1295 		len = lens[run_start];
1296 
1297 		/* Extend the run.  */
1298 		run_end = run_start;
1299 		do {
1300 			run_end++;
1301 		} while (run_end != num_lens && len == lens[run_end]);
1302 
1303 		if (len == 0) {
1304 			/* Run of zeroes.  */
1305 
1306 			/* Symbol 18: RLE 11 to 138 zeroes at a time.  */
1307 			while ((run_end - run_start) >= 11) {
1308 				extra_bits = MIN((run_end - run_start) - 11, 0x7F);
1309 				precode_freqs[18]++;
1310 				*itemptr++ = 18 | (extra_bits << 5);
1311 				run_start += 11 + extra_bits;
1312 			}
1313 
1314 			/* Symbol 17: RLE 3 to 10 zeroes at a time.  */
1315 			if ((run_end - run_start) >= 3) {
1316 				extra_bits = MIN((run_end - run_start) - 3, 0x7);
1317 				precode_freqs[17]++;
1318 				*itemptr++ = 17 | (extra_bits << 5);
1319 				run_start += 3 + extra_bits;
1320 			}
1321 		} else {
1322 
1323 			/* A run of nonzero lengths. */
1324 
1325 			/* Symbol 16: RLE 3 to 6 of the previous length.  */
1326 			if ((run_end - run_start) >= 4) {
1327 				precode_freqs[len]++;
1328 				*itemptr++ = len;
1329 				run_start++;
1330 				do {
1331 					extra_bits = MIN((run_end - run_start) - 3, 0x3);
1332 					precode_freqs[16]++;
1333 					*itemptr++ = 16 | (extra_bits << 5);
1334 					run_start += 3 + extra_bits;
1335 				} while ((run_end - run_start) >= 3);
1336 			}
1337 		}
1338 
1339 		/* Output any remaining lengths without RLE.  */
1340 		while (run_start != run_end) {
1341 			precode_freqs[len]++;
1342 			*itemptr++ = len;
1343 			run_start++;
1344 		}
1345 	} while (run_start != num_lens);
1346 
1347 	return itemptr - precode_items;
1348 }
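
/*
 * Worked example: the codeword length sequence 4 4 4 4 4 4 0 0 0 0 0 0 0 0 0
 * 0 0 0 (six 4's followed by twelve 0's) becomes three precode items: the
 * literal length "4", then symbol 16 with extra value 2 (repeat the previous
 * length 3 + 2 == 5 more times), then symbol 18 with extra value 1 (a run of
 * 11 + 1 == 12 zeroes).  (Illustration only.)
 */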
1349 
1350 /*
1351  * Huffman codeword lengths for dynamic Huffman blocks are compressed using a
1352  * separate Huffman code, the "precode", which contains a symbol for each
1353  * possible codeword length in the larger code as well as several special
1354  * symbols to represent repeated codeword lengths (a form of run-length
1355  * encoding).  The precode is itself constructed in canonical form, and its
1356  * codeword lengths are represented literally, as up to 19 3-bit fields that
1357  * immediately precede the compressed codeword lengths of the larger code.
1358  */
1359 
1360 /* Precompute the information needed to output Huffman codes. */
1361 static void
1362 deflate_precompute_huffman_header(struct libdeflate_compressor *c)
1363 {
1364 	/* Compute how many litlen and offset symbols are needed. */
1365 
1366 	for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS;
1367 	     c->num_litlen_syms > 257;
1368 	     c->num_litlen_syms--)
1369 		if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0)
1370 			break;
1371 
1372 	for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS;
1373 	     c->num_offset_syms > 1;
1374 	     c->num_offset_syms--)
1375 		if (c->codes.lens.offset[c->num_offset_syms - 1] != 0)
1376 			break;
1377 
1378 	/* If we're not using the full set of literal/length codeword lengths,
1379 	 * then temporarily move the offset codeword lengths over so that the
1380 	 * literal/length and offset codeword lengths are contiguous. */
1381 
1382 	STATIC_ASSERT(offsetof(struct deflate_lens, offset) ==
1383 		      DEFLATE_NUM_LITLEN_SYMS);
1384 
1385 	if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
1386 		memmove((u8 *)&c->codes.lens + c->num_litlen_syms,
1387 			(u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
1388 			c->num_offset_syms);
1389 	}
1390 
1391 	/* Compute the "items" (RLE / literal tokens and extra bits) with which
1392 	 * the codeword lengths in the larger code will be output. */
1393 	c->num_precode_items =
1394 		deflate_compute_precode_items((u8 *)&c->codes.lens,
1395 					      c->num_litlen_syms +
1396 							c->num_offset_syms,
1397 					      c->precode_freqs,
1398 					      c->precode_items);
1399 
1400 	/* Build the precode. */
1401 	STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN);
1402 	deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS,
1403 				  MAX_PRE_CODEWORD_LEN,
1404 				  c->precode_freqs, c->precode_lens,
1405 				  c->precode_codewords);
1406 
1407 	/* Count how many precode lengths we actually need to output. */
1408 	for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS;
1409 	     c->num_explicit_lens > 4;
1410 	     c->num_explicit_lens--)
1411 		if (c->precode_lens[deflate_precode_lens_permutation[
1412 						c->num_explicit_lens - 1]] != 0)
1413 			break;
1414 
1415 	/* Restore the offset codeword lengths if needed. */
1416 	if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
1417 		memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
1418 			(u8 *)&c->codes.lens + c->num_litlen_syms,
1419 			c->num_offset_syms);
1420 	}
1421 }
1422 
1423 /* Output the Huffman codes. */
1424 static void
1425 deflate_write_huffman_header(struct libdeflate_compressor *c,
1426 			     struct deflate_output_bitstream *os)
1427 {
1428 	unsigned i;
1429 
1430 	deflate_add_bits(os, c->num_litlen_syms - 257, 5);
1431 	deflate_add_bits(os, c->num_offset_syms - 1, 5);
1432 	deflate_add_bits(os, c->num_explicit_lens - 4, 4);
1433 	deflate_flush_bits(os);
1434 
1435 	/* Output the lengths of the codewords in the precode.  */
1436 	for (i = 0; i < c->num_explicit_lens; i++) {
1437 		deflate_add_bits(os, c->precode_lens[
1438 				       deflate_precode_lens_permutation[i]], 3);
1439 		deflate_flush_bits(os);
1440 	}
1441 
1442 	/* Output the encoded lengths of the codewords in the larger code.  */
1443 	for (i = 0; i < c->num_precode_items; i++) {
1444 		unsigned precode_item = c->precode_items[i];
1445 		unsigned precode_sym = precode_item & 0x1F;
1446 		deflate_add_bits(os, c->precode_codewords[precode_sym],
1447 				 c->precode_lens[precode_sym]);
1448 		if (precode_sym >= 16) {
1449 			if (precode_sym == 16)
1450 				deflate_add_bits(os, precode_item >> 5, 2);
1451 			else if (precode_sym == 17)
1452 				deflate_add_bits(os, precode_item >> 5, 3);
1453 			else
1454 				deflate_add_bits(os, precode_item >> 5, 7);
1455 		}
1456 		STATIC_ASSERT(CAN_BUFFER(DEFLATE_MAX_PRE_CODEWORD_LEN + 7));
1457 		deflate_flush_bits(os);
1458 	}
1459 }
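
/*
 * For reference, the dynamic Huffman block header written by the two functions
 * above has the layout defined by RFC 1951:
 *
 *	HLIT  (5 bits) = num_litlen_syms - 257
 *	HDIST (5 bits) = num_offset_syms - 1
 *	HCLEN (4 bits) = num_explicit_lens - 4
 *	(HCLEN + 4) 3-bit precode codeword lengths, in the order given by
 *		deflate_precode_lens_permutation[]
 *	the precode-encoded codeword lengths of the litlen and offset codes
 */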
1460 
1461 static void
1462 deflate_write_sequences(struct deflate_output_bitstream * restrict os,
1463 			const struct deflate_codes * restrict codes,
1464 			const struct deflate_sequence sequences[restrict],
1465 			const u8 * restrict in_next)
1466 {
1467 	const struct deflate_sequence *seq = sequences;
1468 
1469 	for (;;) {
1470 		u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;
1471 		unsigned length = seq->litrunlen_and_length >> 23;
1472 		unsigned length_slot;
1473 		unsigned litlen_symbol;
1474 		unsigned offset_symbol;
1475 
1476 		if (litrunlen) {
1477 		#if 1
1478 			while (litrunlen >= 4) {
1479 				unsigned lit0 = in_next[0];
1480 				unsigned lit1 = in_next[1];
1481 				unsigned lit2 = in_next[2];
1482 				unsigned lit3 = in_next[3];
1483 
1484 				deflate_add_bits(os, codes->codewords.litlen[lit0],
1485 						 codes->lens.litlen[lit0]);
1486 				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
1487 					deflate_flush_bits(os);
1488 
1489 				deflate_add_bits(os, codes->codewords.litlen[lit1],
1490 						 codes->lens.litlen[lit1]);
1491 				if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN))
1492 					deflate_flush_bits(os);
1493 
1494 				deflate_add_bits(os, codes->codewords.litlen[lit2],
1495 						 codes->lens.litlen[lit2]);
1496 				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
1497 					deflate_flush_bits(os);
1498 
1499 				deflate_add_bits(os, codes->codewords.litlen[lit3],
1500 						 codes->lens.litlen[lit3]);
1501 				deflate_flush_bits(os);
1502 				in_next += 4;
1503 				litrunlen -= 4;
1504 			}
1505 			if (litrunlen-- != 0) {
1506 				deflate_add_bits(os, codes->codewords.litlen[*in_next],
1507 						 codes->lens.litlen[*in_next]);
1508 				if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1509 					deflate_flush_bits(os);
1510 				in_next++;
1511 				if (litrunlen-- != 0) {
1512 					deflate_add_bits(os, codes->codewords.litlen[*in_next],
1513 							 codes->lens.litlen[*in_next]);
1514 					if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1515 						deflate_flush_bits(os);
1516 					in_next++;
1517 					if (litrunlen-- != 0) {
1518 						deflate_add_bits(os, codes->codewords.litlen[*in_next],
1519 								 codes->lens.litlen[*in_next]);
1520 						if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1521 							deflate_flush_bits(os);
1522 						in_next++;
1523 					}
1524 				}
1525 				if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1526 					deflate_flush_bits(os);
1527 			}
1528 		#else
1529 			do {
1530 				unsigned lit = *in_next++;
1531 				deflate_add_bits(os, codes->codewords.litlen[lit],
1532 						 codes->lens.litlen[lit]);
1533 				deflate_flush_bits(os);
1534 			} while (--litrunlen);
1535 		#endif
1536 		}
1537 
1538 		if (length == 0)
1539 			return;
1540 
1541 		in_next += length;
1542 
1543 		length_slot = seq->length_slot;
1544 		litlen_symbol = 257 + length_slot;
1545 
1546 		/* Litlen symbol  */
1547 		deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
1548 				 codes->lens.litlen[litlen_symbol]);
1549 
1550 		/* Extra length bits  */
1551 		STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
1552 					 DEFLATE_MAX_EXTRA_LENGTH_BITS));
1553 		deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
1554 				 deflate_extra_length_bits[length_slot]);
1555 
1556 		if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
1557 				DEFLATE_MAX_EXTRA_LENGTH_BITS +
1558 				MAX_OFFSET_CODEWORD_LEN +
1559 				DEFLATE_MAX_EXTRA_OFFSET_BITS))
1560 			deflate_flush_bits(os);
1561 
1562 		/* Offset symbol  */
1563 		offset_symbol = seq->offset_symbol;
1564 		deflate_add_bits(os, codes->codewords.offset[offset_symbol],
1565 				 codes->lens.offset[offset_symbol]);
1566 
1567 		if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
1568 				DEFLATE_MAX_EXTRA_OFFSET_BITS))
1569 			deflate_flush_bits(os);
1570 
1571 		/* Extra offset bits  */
1572 		deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol],
1573 				 deflate_extra_offset_bits[offset_symbol]);
1574 
1575 		deflate_flush_bits(os);
1576 
1577 		seq++;
1578 	}
1579 }
1580 
1581 #if SUPPORT_NEAR_OPTIMAL_PARSING
1582 /*
1583  * Follow the minimum-cost path in the graph of possible match/literal choices
1584  * for the current block and write out the matches/literals using the specified
1585  * Huffman codes.
1586  *
1587  * Note: this is slightly duplicated with deflate_write_sequences(), the reason
1588  * being that we don't want to waste time translating between intermediate
1589  * match/literal representations.
1590  */
1591 static void
1592 deflate_write_item_list(struct deflate_output_bitstream *os,
1593 			const struct deflate_codes *codes,
1594 			struct libdeflate_compressor *c,
1595 			u32 block_length)
1596 {
1597 	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
1598 	struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length];
1599 	do {
1600 		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
1601 		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1602 		unsigned litlen_symbol;
1603 		unsigned length_slot;
1604 		unsigned offset_slot;
1605 
1606 		if (length == 1) {
1607 			/* Literal  */
1608 			litlen_symbol = offset;
1609 			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
1610 					 codes->lens.litlen[litlen_symbol]);
1611 			deflate_flush_bits(os);
1612 		} else {
1613 			/* Match length  */
1614 			length_slot = deflate_length_slot[length];
1615 			litlen_symbol = 257 + length_slot;
1616 			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
1617 					 codes->lens.litlen[litlen_symbol]);
1618 
1619 			deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
1620 					 deflate_extra_length_bits[length_slot]);
1621 
1622 			if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
1623 					DEFLATE_MAX_EXTRA_LENGTH_BITS +
1624 					MAX_OFFSET_CODEWORD_LEN +
1625 					DEFLATE_MAX_EXTRA_OFFSET_BITS))
1626 				deflate_flush_bits(os);
1627 
1628 
1629 			/* Match offset  */
1630 			offset_slot = deflate_get_offset_slot(c, offset);
1631 			deflate_add_bits(os, codes->codewords.offset[offset_slot],
1632 					 codes->lens.offset[offset_slot]);
1633 
1634 			if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
1635 					DEFLATE_MAX_EXTRA_OFFSET_BITS))
1636 				deflate_flush_bits(os);
1637 
1638 			deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot],
1639 					 deflate_extra_offset_bits[offset_slot]);
1640 
1641 			deflate_flush_bits(os);
1642 		}
1643 		cur_node += length;
1644 	} while (cur_node != end_node);
1645 }
1646 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
1647 
1648 /* Output the end-of-block symbol.  */
1649 static void
1650 deflate_write_end_of_block(struct deflate_output_bitstream *os,
1651 			   const struct deflate_codes *codes)
1652 {
1653 	deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK],
1654 			 codes->lens.litlen[DEFLATE_END_OF_BLOCK]);
1655 	deflate_flush_bits(os);
1656 }
1657 
1658 static void
1659 deflate_write_uncompressed_block(struct deflate_output_bitstream *os,
1660 				 const u8 *data, u16 len,
1661 				 bool is_final_block)
1662 {
1663 	deflate_write_block_header(os, is_final_block,
1664 				   DEFLATE_BLOCKTYPE_UNCOMPRESSED);
1665 	deflate_align_bitstream(os);
1666 
1667 	if (4 + (u32)len >= os->end - os->next) {
1668 		os->next = os->end;
1669 		return;
1670 	}
1671 
1672 	put_unaligned_le16(len, os->next);
1673 	os->next += 2;
1674 	put_unaligned_le16(~len, os->next);
1675 	os->next += 2;
1676 	memcpy(os->next, data, len);
1677 	os->next += len;
1678 }
1679 
1680 static void
1681 deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os,
1682 				  const u8 *data, size_t data_length,
1683 				  bool is_final_block)
1684 {
1685 	do {
1686 		u16 len = MIN(data_length, UINT16_MAX);
1687 
1688 		deflate_write_uncompressed_block(os, data, len,
1689 					is_final_block && len == data_length);
1690 		data += len;
1691 		data_length -= len;
1692 	} while (data_length != 0);
1693 }
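
/*
 * Example (informational only): a 100000-byte stored payload cannot fit in one
 * uncompressed block because LEN is a 16-bit field, so the loop above emits
 * two blocks of 65535 and 34465 bytes, each with its own ~5 bytes of header,
 * LEN, and NLEN overhead; only the second block gets BFINAL set (and only when
 * is_final_block is true).
 */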
1694 
1695 /*
1696  * Choose the best type of block to use (dynamic Huffman, static Huffman, or
1697  * uncompressed), then output it.
1698  */
1699 static void
1700 deflate_flush_block(struct libdeflate_compressor * restrict c,
1701 		    struct deflate_output_bitstream * restrict os,
1702 		    const u8 * restrict block_begin, u32 block_length,
1703 		    bool is_final_block, bool use_item_list)
1704 {
1705 	static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = {
1706 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7,
1707 	};
1708 
1709 	/* Costs are measured in bits */
1710 	u32 dynamic_cost = 0;
1711 	u32 static_cost = 0;
1712 	u32 uncompressed_cost = 0;
1713 	struct deflate_codes *codes;
1714 	int block_type;
1715 	unsigned sym;
1716 
1717 	/* Tally the end-of-block symbol. */
1718 	c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
1719 
1720 	/* Build dynamic Huffman codes. */
1721 	deflate_make_huffman_codes(&c->freqs, &c->codes);
1722 
1723 	/* Account for the cost of sending dynamic Huffman codes. */
1724 	deflate_precompute_huffman_header(c);
1725 	dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens);
1726 	for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
1727 		u32 extra = deflate_extra_precode_bits[sym];
1728 		dynamic_cost += c->precode_freqs[sym] *
1729 				(extra + c->precode_lens[sym]);
1730 	}
1731 
1732 	/* Account for the cost of encoding literals. */
1733 	for (sym = 0; sym < 256; sym++) {
1734 		dynamic_cost += c->freqs.litlen[sym] *
1735 				c->codes.lens.litlen[sym];
1736 	}
1737 	for (sym = 0; sym < 144; sym++)
1738 		static_cost += c->freqs.litlen[sym] * 8;
1739 	for (; sym < 256; sym++)
1740 		static_cost += c->freqs.litlen[sym] * 9;
1741 
1742 	/* Account for the cost of encoding the end-of-block symbol. */
1743 	dynamic_cost += c->codes.lens.litlen[256];
1744 	static_cost += 7;
1745 
1746 	/* Account for the cost of encoding lengths. */
1747 	for (sym = 257; sym < 257 + ARRAY_LEN(deflate_extra_length_bits); sym++) {
1748 		u32 extra = deflate_extra_length_bits[sym - 257];
1749 		dynamic_cost += c->freqs.litlen[sym] *
1750 				(extra + c->codes.lens.litlen[sym]);
1751 		static_cost += c->freqs.litlen[sym] *
1752 				(extra + c->static_codes.lens.litlen[sym]);
1753 	}
1754 
1755 	/* Account for the cost of encoding offsets. */
1756 	for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) {
1757 		u32 extra = deflate_extra_offset_bits[sym];
1758 		dynamic_cost += c->freqs.offset[sym] *
1759 				(extra + c->codes.lens.offset[sym]);
1760 		static_cost += c->freqs.offset[sym] * (extra + 5);
1761 	}
1762 
1763 	/* Compute the cost of using uncompressed blocks. */
1764 	uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 +
1765 			     (40 * (DIV_ROUND_UP(block_length,
1766 						 UINT16_MAX) - 1)) +
1767 			     (8 * block_length);
1768 
1769 	/* Choose the cheapest block type. */
1770 	if (dynamic_cost < MIN(static_cost, uncompressed_cost)) {
1771 		block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN;
1772 		codes = &c->codes;
1773 	} else if (static_cost < uncompressed_cost) {
1774 		block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN;
1775 		codes = &c->static_codes;
1776 	} else {
1777 		block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED;
1778 	}
1779 
1780 	/* Now actually output the block. */
1781 
1782 	if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
1783 		/* Note: the length being flushed may exceed the maximum length
1784 		 * of an uncompressed block (65535 bytes).  Therefore, more than
1785 		 * one uncompressed block might be needed. */
1786 		deflate_write_uncompressed_blocks(os, block_begin, block_length,
1787 						  is_final_block);
1788 	} else {
1789 		/* Output the block header. */
1790 		deflate_write_block_header(os, is_final_block, block_type);
1791 
1792 		/* Output the Huffman codes (dynamic Huffman blocks only). */
1793 		if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN)
1794 			deflate_write_huffman_header(c, os);
1795 
1796 		/* Output the literals, matches, and end-of-block symbol. */
1797 	#if SUPPORT_NEAR_OPTIMAL_PARSING
1798 		if (use_item_list)
1799 			deflate_write_item_list(os, codes, c, block_length);
1800 		else
1801 	#endif
1802 			deflate_write_sequences(os, codes, c->p.g.sequences,
1803 						block_begin);
1804 		deflate_write_end_of_block(os, codes);
1805 	}
1806 }
1807 
1808 static forceinline void
1809 deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
1810 		       u32 *litrunlen_p)
1811 {
1812 	c->freqs.litlen[literal]++;
1813 	++*litrunlen_p;
1814 }
1815 
1816 static forceinline void
1817 deflate_choose_match(struct libdeflate_compressor *c,
1818 		     unsigned length, unsigned offset,
1819 		     u32 *litrunlen_p, struct deflate_sequence **next_seq_p)
1820 {
1821 	struct deflate_sequence *seq = *next_seq_p;
1822 	unsigned length_slot = deflate_length_slot[length];
1823 	unsigned offset_slot = deflate_get_offset_slot(c, offset);
1824 
1825 	c->freqs.litlen[257 + length_slot]++;
1826 	c->freqs.offset[offset_slot]++;
1827 
1828 	seq->litrunlen_and_length = ((u32)length << 23) | *litrunlen_p;
1829 	seq->offset = offset;
1830 	seq->length_slot = length_slot;
1831 	seq->offset_symbol = offset_slot;
1832 
1833 	*litrunlen_p = 0;
1834 	*next_seq_p = seq + 1;
1835 }
1836 
1837 static forceinline void
1838 deflate_finish_sequence(struct deflate_sequence *seq, u32 litrunlen)
1839 {
1840 	seq->litrunlen_and_length = litrunlen; /* length = 0 */
1841 }
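
/*
 * Example of the 'litrunlen_and_length' packing used by the helpers above
 * (informational only): the low 23 bits hold the number of literals preceding
 * the match and the high bits hold the match length, so a run of 5 literals
 * followed by a length-10 match is stored as ((u32)10 << 23) | 5.  A final
 * sequence of 7 trailing literals with no match is stored as just 7; the zero
 * length field is what terminates deflate_write_sequences().
 */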
1842 
1843 /******************************************************************************/
1844 
1845 /*
1846  * Block splitting algorithm.  The problem is to decide when it is worthwhile to
1847  * start a new block with new Huffman codes.  There is a theoretically optimal
1848  * solution: recursively consider every possible block split, considering the
1849  * exact cost of each block, and choose the minimum cost approach.  But this is
1850  * far too slow.  Instead, as an approximation, we can count symbols and after
1851  * every N symbols, compare the expected distribution of symbols based on the
1852  * previous data with the actual distribution.  If they differ "by enough", then
1853  * start a new block.
1854  *
1855  * As an optimization and heuristic, we don't distinguish between every symbol
1856  * but rather we combine many symbols into a single "observation type".  For
1857  * literals we only look at the high bits and low bits, and for matches we only
1858  * look at whether the match is long or not.  The assumption is that for typical
1859  * "real" data, places that are good block boundaries will tend to be noticeable
1860  * based only on changes in these aggregate frequencies, without looking for
1861  * subtle differences in individual symbols.  For example, a change from ASCII
1862  * bytes to non-ASCII bytes, or from few matches (generally less compressible)
1863  * to many matches (generally more compressible), would be easily noticed based
1864  * on the aggregates.
1865  *
1866  * For determining whether the frequency distributions are "different enough" to
1867  * start a new block, the simple heuristic of splitting when the sum of absolute
1868  * differences exceeds a constant seems to be good enough.  We also add a number
1869  * proportional to the block length so that the algorithm is more likely to end
1870  * long blocks than short blocks.  This reflects the general expectation that it
1871  * will become increasingly beneficial to start a new block as the current
1872  * block grows longer.
1873  *
1874  * Finally, for an approximation, it is not strictly necessary that the exact
1875  * symbols being used are considered.  With "near-optimal parsing", for example,
1876  * the actual symbols that will be used are unknown until after the block
1877  * boundary is chosen and the block has been optimized.  Since the final choices
1878  * cannot be used, we can use preliminary "greedy" choices instead.
1879  */
1880 
1881 /* Initialize the block split statistics when starting a new block. */
1882 static void
1883 init_block_split_stats(struct block_split_stats *stats)
1884 {
1885 	int i;
1886 
1887 	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1888 		stats->new_observations[i] = 0;
1889 		stats->observations[i] = 0;
1890 	}
1891 	stats->num_new_observations = 0;
1892 	stats->num_observations = 0;
1893 }
1894 
1895 /* Literal observation.  Heuristic: use the top 2 bits and low 1 bit of the
1896  * literal, for 8 possible literal observation types.  */
1897 static forceinline void
1898 observe_literal(struct block_split_stats *stats, u8 lit)
1899 {
1900 	stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
1901 	stats->num_new_observations++;
1902 }
1903 
1904 /* Match observation.  Heuristic: use one observation type for "short match" and
1905  * one observation type for "long match".  */
1906 static forceinline void
1907 observe_match(struct block_split_stats *stats, unsigned length)
1908 {
1909 	stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
1910 	stats->num_new_observations++;
1911 }
1912 
1913 static bool
1914 do_end_block_check(struct block_split_stats *stats, u32 block_length)
1915 {
1916 	int i;
1917 
1918 	if (stats->num_observations > 0) {
1919 
1920 		/* Note: to avoid slow divisions, we do not divide by
1921 		 * 'num_observations', but rather do all math with the numbers
1922 		 * multiplied by 'num_observations'.  */
1923 		u32 total_delta = 0;
1924 		for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1925 			u32 expected = stats->observations[i] * stats->num_new_observations;
1926 			u32 actual = stats->new_observations[i] * stats->num_observations;
1927 			u32 delta = (actual > expected) ? actual - expected :
1928 							  expected - actual;
1929 			total_delta += delta;
1930 		}
1931 
1932 		/* Ready to end the block? */
1933 		if (total_delta + (block_length / 4096) * stats->num_observations >=
1934 		    NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations)
1935 			return true;
1936 	}
1937 
1938 	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1939 		stats->num_observations += stats->new_observations[i];
1940 		stats->observations[i] += stats->new_observations[i];
1941 		stats->new_observations[i] = 0;
1942 	}
1943 	stats->num_new_observations = 0;
1944 	return false;
1945 }
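
/*
 * To make the check above concrete (a derivation, not extra logic): when both
 * 'num_observations' and 'num_new_observations' equal
 * NUM_OBSERVATIONS_PER_BLOCK_CHECK (512), the common factor of 512 cancels and
 * the condition reduces to
 *
 *	sum_i |new_observations[i] - observations[i]| + block_length/4096 >= 200
 *
 * i.e. the block is ended once the sum of absolute differences between the old
 * and new histograms of 512 observations reaches roughly 200 counts, with the
 * effective threshold dropping as the block grows longer.
 */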
1946 
1947 static forceinline bool
1948 should_end_block(struct block_split_stats *stats,
1949 		 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
1950 {
1951 	/* Ready to check block split statistics? */
1952 	if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
1953 	    in_next - in_block_begin < MIN_BLOCK_LENGTH ||
1954 	    in_end - in_next < MIN_BLOCK_LENGTH)
1955 		return false;
1956 
1957 	return do_end_block_check(stats, in_next - in_block_begin);
1958 }
1959 
1960 /******************************************************************************/
1961 
1962 /*
1963  * This is the level 0 "compressor".  It always outputs uncompressed blocks.
1964  */
1965 static size_t
1966 deflate_compress_none(struct libdeflate_compressor * restrict c,
1967 		      const u8 * restrict in, size_t in_nbytes,
1968 		      u8 * restrict out, size_t out_nbytes_avail)
1969 {
1970 	struct deflate_output_bitstream os;
1971 
1972 	deflate_init_output(&os, out, out_nbytes_avail);
1973 
1974 	deflate_write_uncompressed_blocks(&os, in, in_nbytes, true);
1975 
1976 	return deflate_flush_output(&os);
1977 }
1978 
1979 /*
1980  * This is the "greedy" DEFLATE compressor. It always chooses the longest match.
1981  */
1982 static size_t
1983 deflate_compress_greedy(struct libdeflate_compressor * restrict c,
1984 			const u8 * restrict in, size_t in_nbytes,
1985 			u8 * restrict out, size_t out_nbytes_avail)
1986 {
1987 	const u8 *in_next = in;
1988 	const u8 *in_end = in_next + in_nbytes;
1989 	struct deflate_output_bitstream os;
1990 	const u8 *in_cur_base = in_next;
1991 	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
1992 	unsigned nice_len = MIN(c->nice_match_length, max_len);
1993 	u32 next_hashes[2] = {0, 0};
1994 
1995 	deflate_init_output(&os, out, out_nbytes_avail);
1996 	hc_matchfinder_init(&c->p.g.hc_mf);
1997 
1998 	do {
1999 		/* Starting a new DEFLATE block.  */
2000 
2001 		const u8 * const in_block_begin = in_next;
2002 		const u8 * const in_max_block_end =
2003 			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
2004 		u32 litrunlen = 0;
2005 		struct deflate_sequence *next_seq = c->p.g.sequences;
2006 
2007 		init_block_split_stats(&c->split_stats);
2008 		deflate_reset_symbol_frequencies(c);
2009 
2010 		do {
2011 			u32 length;
2012 			u32 offset;
2013 
2014 			/* Decrease the maximum and nice match lengths if we're
2015 			 * approaching the end of the input buffer.  */
2016 			if (unlikely(max_len > in_end - in_next)) {
2017 				max_len = in_end - in_next;
2018 				nice_len = MIN(nice_len, max_len);
2019 			}
2020 
2021 			length = hc_matchfinder_longest_match(&c->p.g.hc_mf,
2022 							      &in_cur_base,
2023 							      in_next,
2024 							      DEFLATE_MIN_MATCH_LEN - 1,
2025 							      max_len,
2026 							      nice_len,
2027 							      c->max_search_depth,
2028 							      next_hashes,
2029 							      &offset);
2030 
2031 			if (length >= DEFLATE_MIN_MATCH_LEN) {
2032 				/* Match found.  */
2033 				deflate_choose_match(c, length, offset,
2034 						     &litrunlen, &next_seq);
2035 				observe_match(&c->split_stats, length);
2036 				in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
2037 									&in_cur_base,
2038 									in_next + 1,
2039 									in_end,
2040 									length - 1,
2041 									next_hashes);
2042 			} else {
2043 				/* No match found.  */
2044 				deflate_choose_literal(c, *in_next, &litrunlen);
2045 				observe_literal(&c->split_stats, *in_next);
2046 				in_next++;
2047 			}
2048 
2049 			/* Check if it's time to output another block.  */
2050 		} while (in_next < in_max_block_end &&
2051 			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
2052 
2053 		deflate_finish_sequence(next_seq, litrunlen);
2054 		deflate_flush_block(c, &os, in_block_begin,
2055 				    in_next - in_block_begin,
2056 				    in_next == in_end, false);
2057 	} while (in_next != in_end);
2058 
2059 	return deflate_flush_output(&os);
2060 }
2061 
2062 /*
2063  * This is the "lazy" DEFLATE compressor.  Before choosing a match, it checks to
2064  * see if there's a longer match at the next position.  If yes, it outputs a
2065  * literal and continues to the next position.  If no, it outputs the match.
2066  */
2067 static size_t
2068 deflate_compress_lazy(struct libdeflate_compressor * restrict c,
2069 		      const u8 * restrict in, size_t in_nbytes,
2070 		      u8 * restrict out, size_t out_nbytes_avail)
2071 {
2072 	const u8 *in_next = in;
2073 	const u8 *in_end = in_next + in_nbytes;
2074 	struct deflate_output_bitstream os;
2075 	const u8 *in_cur_base = in_next;
2076 	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
2077 	unsigned nice_len = MIN(c->nice_match_length, max_len);
2078 	u32 next_hashes[2] = {0, 0};
2079 
2080 	deflate_init_output(&os, out, out_nbytes_avail);
2081 	hc_matchfinder_init(&c->p.g.hc_mf);
2082 
2083 	do {
2084 		/* Starting a new DEFLATE block.  */
2085 
2086 		const u8 * const in_block_begin = in_next;
2087 		const u8 * const in_max_block_end =
2088 			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
2089 		u32 litrunlen = 0;
2090 		struct deflate_sequence *next_seq = c->p.g.sequences;
2091 
2092 		init_block_split_stats(&c->split_stats);
2093 		deflate_reset_symbol_frequencies(c);
2094 
2095 		do {
2096 			unsigned cur_len;
2097 			unsigned cur_offset;
2098 			unsigned next_len;
2099 			unsigned next_offset;
2100 
2101 			if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
2102 				max_len = in_end - in_next;
2103 				nice_len = MIN(nice_len, max_len);
2104 			}
2105 
2106 			/* Find the longest match at the current position.  */
2107 			cur_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
2108 							       &in_cur_base,
2109 							       in_next,
2110 							       DEFLATE_MIN_MATCH_LEN - 1,
2111 							       max_len,
2112 							       nice_len,
2113 							       c->max_search_depth,
2114 							       next_hashes,
2115 							       &cur_offset);
2116 			in_next += 1;
2117 
2118 			if (cur_len < DEFLATE_MIN_MATCH_LEN) {
2119 				/* No match found.  Choose a literal.  */
2120 				deflate_choose_literal(c, *(in_next - 1), &litrunlen);
2121 				observe_literal(&c->split_stats, *(in_next - 1));
2122 				continue;
2123 			}
2124 
2125 		have_cur_match:
2126 			observe_match(&c->split_stats, cur_len);
2127 
2128 			/* We have a match at the current position.  */
2129 
2130 			/* If the current match is very long, choose it
2131 			 * immediately.  */
2132 			if (cur_len >= nice_len) {
2133 				deflate_choose_match(c, cur_len, cur_offset,
2134 						     &litrunlen, &next_seq);
2135 				in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
2136 									&in_cur_base,
2137 									in_next,
2138 									in_end,
2139 									cur_len - 1,
2140 									next_hashes);
2141 				continue;
2142 			}
2143 
2144 			/*
2145 			 * Try to find a match at the next position.
2146 			 *
2147 			 * Note: since we already have a match at the *current*
2148 			 * position, we use only half the 'max_search_depth'
2149 			 * when checking the *next* position.  This is a useful
2150 			 * trade-off because it's more worthwhile to use a
2151 			 * greater search depth on the initial match.
2152 			 *
2153 			 * Note: it's possible to structure the code such that
2154 			 * there's only one call to longest_match(), which
2155 			 * handles both the "find the initial match" and "try to
2156 			 * find a longer match" cases.  However, it is faster to
2157 			 * have two call sites, with longest_match() inlined at
2158 			 * each.
2159 			 */
2160 			if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
2161 				max_len = in_end - in_next;
2162 				nice_len = MIN(nice_len, max_len);
2163 			}
2164 			next_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
2165 								&in_cur_base,
2166 								in_next,
2167 								cur_len,
2168 								max_len,
2169 								nice_len,
2170 								c->max_search_depth / 2,
2171 								next_hashes,
2172 								&next_offset);
2173 			in_next += 1;
2174 
2175 			if (next_len > cur_len) {
2176 				/* Found a longer match at the next position.
2177 				 * Output a literal.  Then the next match
2178 				 * becomes the current match.  */
2179 				deflate_choose_literal(c, *(in_next - 2), &litrunlen);
2180 				cur_len = next_len;
2181 				cur_offset = next_offset;
2182 				goto have_cur_match;
2183 			}
2184 
2185 			/* No longer match at the next position.
2186 			 * Output the current match.  */
2187 			deflate_choose_match(c, cur_len, cur_offset,
2188 					     &litrunlen, &next_seq);
2189 			in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
2190 								&in_cur_base,
2191 								in_next,
2192 								in_end,
2193 								cur_len - 2,
2194 								next_hashes);
2195 
2196 			/* Check if it's time to output another block.  */
2197 		} while (in_next < in_max_block_end &&
2198 			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
2199 
2200 		deflate_finish_sequence(next_seq, litrunlen);
2201 		deflate_flush_block(c, &os, in_block_begin,
2202 				    in_next - in_block_begin,
2203 				    in_next == in_end, false);
2204 	} while (in_next != in_end);
2205 
2206 	return deflate_flush_output(&os);
2207 }
2208 
2209 #if SUPPORT_NEAR_OPTIMAL_PARSING
2210 
2211 /*
2212  * Follow the minimum-cost path in the graph of possible match/literal choices
2213  * for the current block and compute the frequencies of the Huffman symbols that
2214  * would be needed to output those matches and literals.
2215  */
2216 static void
2217 deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
2218 {
2219 	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
2220 	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
2221 	do {
2222 		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
2223 		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
2224 
2225 		if (length == 1) {
2226 			/* Literal  */
2227 			c->freqs.litlen[offset]++;
2228 		} else {
2229 			/* Match  */
2230 			c->freqs.litlen[257 + deflate_length_slot[length]]++;
2231 			c->freqs.offset[deflate_get_offset_slot(c, offset)]++;
2232 		}
2233 		cur_node += length;
2234 	} while (cur_node != end_node);
2235 }
2236 
2237 /* Set the current cost model from the codeword lengths specified in @lens.  */
2238 static void
2239 deflate_set_costs_from_codes(struct libdeflate_compressor *c,
2240 			     const struct deflate_lens *lens)
2241 {
2242 	unsigned i;
2243 
2244 	/* Literals  */
2245 	for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
2246 		u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS);
2247 		c->p.n.costs.literal[i] = bits << COST_SHIFT;
2248 	}
2249 
2250 	/* Lengths  */
2251 	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
2252 		unsigned length_slot = deflate_length_slot[i];
2253 		unsigned litlen_sym = 257 + length_slot;
2254 		u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
2255 		bits += deflate_extra_length_bits[length_slot];
2256 		c->p.n.costs.length[i] = bits << COST_SHIFT;
2257 	}
2258 
2259 	/* Offset slots  */
2260 	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
2261 		u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS);
2262 		bits += deflate_extra_offset_bits[i];
2263 		c->p.n.costs.offset_slot[i] = bits << COST_SHIFT;
2264 	}
2265 }
2266 
2267 static forceinline u32
2268 deflate_default_literal_cost(unsigned literal)
2269 {
2270 	STATIC_ASSERT(COST_SHIFT == 3);
2271 	/* 66 is 8.25 bits/symbol  */
2272 	return 66;
2273 }
2274 
2275 static forceinline u32
2276 deflate_default_length_slot_cost(unsigned length_slot)
2277 {
2278 	STATIC_ASSERT(COST_SHIFT == 3);
2279 	/* 60 is 7.5 bits/symbol  */
2280 	return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT);
2281 }
2282 
2283 static forceinline u32
2284 deflate_default_offset_slot_cost(unsigned offset_slot)
2285 {
2286 	STATIC_ASSERT(COST_SHIFT == 3);
2287 	/* 39 is 4.875 bits/symbol  */
2288 	return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT);
2289 }
2290 
2291 /*
2292  * Set default symbol costs for the first block's first optimization pass.
2293  *
2294  * It works well to assume that each symbol is equally probable.  This results
2295  * in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 <<
2296  * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding
2297  * alphabet.  However, we intentionally bias the parse towards matches rather
2298  * than literals by using a slightly lower default cost for length symbols than
2299  * for literals.  This often improves the compression ratio slightly.
2300  */
2301 static void
2302 deflate_set_default_costs(struct libdeflate_compressor *c)
2303 {
2304 	unsigned i;
2305 
2306 	/* Literals  */
2307 	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
2308 		c->p.n.costs.literal[i] = deflate_default_literal_cost(i);
2309 
2310 	/* Lengths  */
2311 	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
2312 		c->p.n.costs.length[i] = deflate_default_length_slot_cost(
2313 						deflate_length_slot[i]);
2314 
2315 	/* Offset slots  */
2316 	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
2317 		c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i);
2318 }
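
/*
 * Worked numbers for the defaults above (derivation only): with COST_SHIFT == 3
 * one cost unit is 1/8 bit.  A uniform code over the 286 used litlen symbols
 * would cost -log2(1/286) ~= 8.16 bits ~= 65.3 units per symbol, and a uniform
 * code over the 30 offset symbols would cost -log2(1/30) ~= 4.91 bits ~= 39.2
 * units, matching the constant 39 used above.  Charging literals 66 units
 * (8.25 bits) but length slots only 60 units (7.5 bits) is what biases the
 * initial parse slightly towards matches.
 */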
2319 
2320 static forceinline void
2321 deflate_adjust_cost(u32 *cost_p, u32 default_cost)
2322 {
2323 	*cost_p += ((s32)default_cost - (s32)*cost_p) >> 1;
2324 }
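
/* For example, a literal whose cost had drifted to 96 units (12 bits) is
 * pulled halfway back towards a default of 66 units (8.25 bits):
 * 96 + ((66 - 96) >> 1) = 81 units, or about 10.1 bits. */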
2325 
2326 /*
2327  * Adjust the costs when beginning a new block.
2328  *
2329  * Since the current costs have been optimized for the data, it's undesirable to
2330  * throw them away and start over with the default costs.  At the same time, we
2331  * don't want to bias the parse by assuming that the next block will be similar
2332  * to the current block.  As a compromise, make the costs closer to the
2333  * defaults, but don't simply set them to the defaults.
2334  */
2335 static void
2336 deflate_adjust_costs(struct libdeflate_compressor *c)
2337 {
2338 	unsigned i;
2339 
2340 	/* Literals  */
2341 	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
2342 		deflate_adjust_cost(&c->p.n.costs.literal[i],
2343 				    deflate_default_literal_cost(i));
2344 
2345 	/* Lengths  */
2346 	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
2347 		deflate_adjust_cost(&c->p.n.costs.length[i],
2348 				    deflate_default_length_slot_cost(
2349 						deflate_length_slot[i]));
2350 
2351 	/* Offset slots  */
2352 	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
2353 		deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
2354 				    deflate_default_offset_slot_cost(i));
2355 }
2356 
2357 /*
2358  * Find the minimum-cost path through the graph of possible match/literal
2359  * choices for this block.
2360  *
2361  * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which
2362  * represents the node at the beginning of the block, to
2363  * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of
2364  * the block.  Edge costs are evaluated using the cost model 'c->p.n.costs'.
2365  *
2366  * The algorithm works backwards, starting at the end node and proceeding
2367  * backwards one node at a time.  At each node, the minimum cost to reach the
2368  * end node is computed and the match/literal choice that begins that path is
2369  * saved.
2370  */
2371 static void
2372 deflate_find_min_cost_path(struct libdeflate_compressor *c,
2373 			   const u32 block_length,
2374 			   const struct lz_match *cache_ptr)
2375 {
2376 	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
2377 	struct deflate_optimum_node *cur_node = end_node;
2378 
2379 	cur_node->cost_to_end = 0;
2380 	do {
2381 		unsigned num_matches;
2382 		unsigned literal;
2383 		u32 best_cost_to_end;
2384 
2385 		cur_node--;
2386 		cache_ptr--;
2387 
2388 		num_matches = cache_ptr->length;
2389 		literal = cache_ptr->offset;
2390 
2391 		/* It's always possible to choose a literal.  */
2392 		best_cost_to_end = c->p.n.costs.literal[literal] +
2393 				   (cur_node + 1)->cost_to_end;
2394 		cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;
2395 
2396 		/* Also consider matches if there are any.  */
2397 		if (num_matches) {
2398 			const struct lz_match *match;
2399 			unsigned len;
2400 			unsigned offset;
2401 			unsigned offset_slot;
2402 			u32 offset_cost;
2403 			u32 cost_to_end;
2404 
2405 			/*
2406 			 * Consider each length from the minimum
2407 			 * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
2408 			 * match found at this position.  For each length, we
2409 			 * consider only the smallest offset for which that
2410 			 * length is available.  Although this is not guaranteed
2411 			 * to be optimal due to the possibility of a larger
2412 			 * offset costing less than a smaller offset to code,
2413 			 * this is a very useful heuristic.
2414 			 */
2415 			match = cache_ptr - num_matches;
2416 			len = DEFLATE_MIN_MATCH_LEN;
2417 			do {
2418 				offset = match->offset;
2419 				offset_slot = deflate_get_offset_slot(c, offset);
2420 				offset_cost = c->p.n.costs.offset_slot[offset_slot];
2421 				do {
2422 					cost_to_end = offset_cost +
2423 						      c->p.n.costs.length[len] +
2424 						      (cur_node + len)->cost_to_end;
2425 					if (cost_to_end < best_cost_to_end) {
2426 						best_cost_to_end = cost_to_end;
2427 						cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len;
2428 					}
2429 				} while (++len <= match->length);
2430 			} while (++match != cache_ptr);
2431 			cache_ptr -= num_matches;
2432 		}
2433 		cur_node->cost_to_end = best_cost_to_end;
2434 	} while (cur_node != &c->p.n.optimum_nodes[0]);
2435 }
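
/*
 * In equation form, the recurrence computed above is (with b[i] denoting the
 * byte at position i and costs taken from c->p.n.costs):
 *
 *	cost_to_end[block_length] = 0
 *	cost_to_end[i] = min( literal[b[i]] + cost_to_end[i + 1],
 *			      min over match lengths 'len' available at i, each
 *			      paired with the smallest offset providing it, of
 *				length[len] + offset_slot[slot(offset)] +
 *					cost_to_end[i + len] )
 *
 * and 'item' records whichever choice attains the minimum at each node.
 */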
2436 
2437 /*
2438  * Choose the literal/match sequence to use for the current block.  The basic
2439  * algorithm finds a minimum-cost path through the block's graph of
2440  * literal/match choices, given a cost model.  However, the cost of each symbol
2441  * is unknown until the Huffman codes have been built, but at the same time the
2442  * Huffman codes depend on the frequencies of chosen symbols.  Consequently,
2443  * multiple passes must be used to try to approximate an optimal solution.  The
2444  * first pass uses default costs, mixed with the costs from the previous block
2445  * if any.  Later passes use the Huffman codeword lengths from the previous pass
2446  * as the costs.
2447  */
2448 static void
2449 deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
2450 		       const struct lz_match *cache_ptr, bool is_first_block)
2451 {
2452 	unsigned num_passes_remaining = c->p.n.num_optim_passes;
2453 	u32 i;
2454 
2455 	/* Force the block to really end at the desired length, even if some
2456 	 * matches extend beyond it. */
2457 	for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
2458 					ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
2459 		c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
2460 
2461 	/* Set the initial costs. */
2462 	if (is_first_block)
2463 		deflate_set_default_costs(c);
2464 	else
2465 		deflate_adjust_costs(c);
2466 
2467 	for (;;) {
2468 		/* Find the minimum cost path for this pass. */
2469 		deflate_find_min_cost_path(c, block_length, cache_ptr);
2470 
2471 		/* Compute frequencies of the chosen symbols. */
2472 		deflate_reset_symbol_frequencies(c);
2473 		deflate_tally_item_list(c, block_length);
2474 
2475 		if (--num_passes_remaining == 0)
2476 			break;
2477 
2478 		/* At least one optimization pass remains; update the costs. */
2479 		deflate_make_huffman_codes(&c->freqs, &c->codes);
2480 		deflate_set_costs_from_codes(c, &c->codes.lens);
2481 	}
2482 }
2483 
2484 /*
2485  * This is the "near-optimal" DEFLATE compressor.  It computes the optimal
2486  * representation of each DEFLATE block using a minimum-cost path search over
2487  * the graph of possible match/literal choices for that block, assuming a
2488  * certain cost for each Huffman symbol.
2489  *
2490  * For several reasons, the end result is not guaranteed to be optimal:
2491  *
2492  * - Nonoptimal choice of blocks
2493  * - Heuristic limitations on which matches are actually considered
2494  * - Symbol costs are unknown until the symbols have already been chosen
2495  *   (so iterative optimization must be used)
2496  */
2497 static size_t
2498 deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
2499 			      const u8 * restrict in, size_t in_nbytes,
2500 			      u8 * restrict out, size_t out_nbytes_avail)
2501 {
2502 	const u8 *in_next = in;
2503 	const u8 *in_end = in_next + in_nbytes;
2504 	struct deflate_output_bitstream os;
2505 	const u8 *in_cur_base = in_next;
2506 	const u8 *in_next_slide = in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
2507 	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
2508 	unsigned nice_len = MIN(c->nice_match_length, max_len);
2509 	u32 next_hashes[2] = {0, 0};
2510 
2511 	deflate_init_output(&os, out, out_nbytes_avail);
2512 	bt_matchfinder_init(&c->p.n.bt_mf);
2513 
2514 	do {
2515 		/* Starting a new DEFLATE block.  */
2516 
2517 		struct lz_match *cache_ptr = c->p.n.match_cache;
2518 		const u8 * const in_block_begin = in_next;
2519 		const u8 * const in_max_block_end =
2520 			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
2521 		const u8 *next_observation = in_next;
2522 
2523 		init_block_split_stats(&c->split_stats);
2524 
2525 		/*
2526 		 * Find matches until we decide to end the block.  We end the
2527 		 * block if any of the following is true:
2528 		 *
2529 		 * (1) Maximum block length has been reached
2530 		 * (2) The match cache may overflow.
2531 		 * (3) Block split heuristic says to split now.
2532 		 */
2533 		do {
2534 			struct lz_match *matches;
2535 			unsigned best_len;
2536 
2537 			/* Slide the window forward if needed.  */
2538 			if (in_next == in_next_slide) {
2539 				bt_matchfinder_slide_window(&c->p.n.bt_mf);
2540 				in_cur_base = in_next;
2541 				in_next_slide = in_next + MIN(in_end - in_next,
2542 							      MATCHFINDER_WINDOW_SIZE);
2543 			}
2544 
2545 			/* Decrease the maximum and nice match lengths if we're
2546 			 * approaching the end of the input buffer.  */
2547 			if (unlikely(max_len > in_end - in_next)) {
2548 				max_len = in_end - in_next;
2549 				nice_len = MIN(nice_len, max_len);
2550 			}
2551 
2552 			/*
2553 			 * Find matches with the current position using the
2554 			 * binary tree matchfinder and save them in
2555 			 * 'match_cache'.
2556 			 *
2557 			 * Note: the binary tree matchfinder is more suited for
2558 			 * optimal parsing than the hash chain matchfinder.  The
2559 			 * reasons for this include:
2560 			 *
2561 			 * - The binary tree matchfinder can find more matches
2562 			 *   in the same number of steps.
2563 			 * - One of the major advantages of hash chains is that
2564 			 *   skipping positions (not searching for matches at
2565 			 *   them) is faster; however, with optimal parsing we
2566 			 *   search for matches at almost all positions, so this
2567 			 *   advantage of hash chains is negated.
2568 			 */
2569 			matches = cache_ptr;
2570 			best_len = 0;
2571 			if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
2572 				cache_ptr = bt_matchfinder_get_matches(&c->p.n.bt_mf,
2573 								       in_cur_base,
2574 								       in_next - in_cur_base,
2575 								       max_len,
2576 								       nice_len,
2577 								       c->max_search_depth,
2578 								       next_hashes,
2579 								       &best_len,
2580 								       matches);
2581 			}
2582 
2583 			if (in_next >= next_observation) {
2584 				if (best_len >= 4) {
2585 					observe_match(&c->split_stats, best_len);
2586 					next_observation = in_next + best_len;
2587 				} else {
2588 					observe_literal(&c->split_stats, *in_next);
2589 					next_observation = in_next + 1;
2590 				}
2591 			}
2592 
2593 			cache_ptr->length = cache_ptr - matches;
2594 			cache_ptr->offset = *in_next;
2595 			in_next++;
2596 			cache_ptr++;
2597 
2598 			/*
2599 			 * If there was a very long match found, don't cache any
2600 			 * matches for the bytes covered by that match.  This
2601 			 * avoids degenerate behavior when compressing highly
2602 			 * redundant data, where the number of matches can be
2603 			 * very large.
2604 			 *
2605 			 * This heuristic doesn't actually hurt the compression
2606 			 * ratio very much.  If there's a long match, then the
2607 			 * data must be highly compressible, so it doesn't
2608 			 * matter much what we do.
2609 			 */
2610 			if (best_len >= DEFLATE_MIN_MATCH_LEN && best_len >= nice_len) {
2611 				--best_len;
2612 				do {
2613 					if (in_next == in_next_slide) {
2614 						bt_matchfinder_slide_window(&c->p.n.bt_mf);
2615 						in_cur_base = in_next;
2616 						in_next_slide = in_next + MIN(in_end - in_next,
2617 									      MATCHFINDER_WINDOW_SIZE);
2618 					}
2619 					if (unlikely(max_len > in_end - in_next)) {
2620 						max_len = in_end - in_next;
2621 						nice_len = MIN(nice_len, max_len);
2622 					}
2623 					if (max_len >= BT_MATCHFINDER_REQUIRED_NBYTES) {
2624 						bt_matchfinder_skip_position(&c->p.n.bt_mf,
2625 									     in_cur_base,
2626 									     in_next - in_cur_base,
2627 									     nice_len,
2628 									     c->max_search_depth,
2629 									     next_hashes);
2630 					}
2631 					cache_ptr->length = 0;
2632 					cache_ptr->offset = *in_next;
2633 					in_next++;
2634 					cache_ptr++;
2635 				} while (--best_len);
2636 			}
2637 		} while (in_next < in_max_block_end &&
2638 			 cache_ptr < &c->p.n.match_cache[CACHE_LENGTH] &&
2639 			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
2640 
2641 		/* All the matches for this block have been cached.  Now choose
2642 		 * the sequence of items to output and flush the block.  */
2643 		deflate_optimize_block(c, in_next - in_block_begin, cache_ptr,
2644 				       in_block_begin == in);
2645 		deflate_flush_block(c, &os, in_block_begin, in_next - in_block_begin,
2646 				    in_next == in_end, true);
2647 	} while (in_next != in_end);
2648 
2649 	return deflate_flush_output(&os);
2650 }
2651 
2652 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
2653 
2654 /* Initialize c->offset_slot_fast.  */
2655 static void
2656 deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
2657 {
2658 	unsigned offset_slot;
2659 	unsigned offset;
2660 	unsigned offset_end;
2661 
2662 	for (offset_slot = 0;
2663 	     offset_slot < ARRAY_LEN(deflate_offset_slot_base);
2664 	     offset_slot++)
2665 	{
2666 		offset = deflate_offset_slot_base[offset_slot];
2667 	#if USE_FULL_OFFSET_SLOT_FAST
2668 		offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
2669 		do {
2670 			c->offset_slot_fast[offset] = offset_slot;
2671 		} while (++offset != offset_end);
2672 	#else
2673 		if (offset <= 256) {
2674 			offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
2675 			do {
2676 				c->offset_slot_fast[offset - 1] = offset_slot;
2677 			} while (++offset != offset_end);
2678 		} else {
2679 			offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
2680 			do {
2681 				c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot;
2682 			} while ((offset += (1 << 7)) != offset_end);
2683 		}
2684 	#endif
2685 	}
2686 }
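
/*
 * Example of the condensed (USE_FULL_OFFSET_SLOT_FAST == 0) mapping built
 * above, assuming the usual lookup of indexing with 'offset - 1' for offsets
 * <= 256 and with '256 + ((offset - 1) >> 7)' for larger offsets: the offset
 * slot with base 257 has 7 extra bits and covers offsets 257..384, and every
 * offset in that range maps to the same table entry, e.g. offset 300 gives
 * index 256 + ((300 - 1) >> 7) = 258.  This works because (offset - 1) >> 7 is
 * constant within each large-offset slot's range.
 */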
2687 
2688 LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI
2689 libdeflate_alloc_compressor(int compression_level)
2690 {
2691 	struct libdeflate_compressor *c;
2692 	size_t size = offsetof(struct libdeflate_compressor, p);
2693 
2694 	if (compression_level < 0 || compression_level > 12)
2695 		return NULL;
2696 
2697 #if SUPPORT_NEAR_OPTIMAL_PARSING
2698 	if (compression_level >= 8)
2699 		size += sizeof(c->p.n);
2700 	else if (compression_level >= 1)
2701 		size += sizeof(c->p.g);
2702 #else
2703 	if (compression_level >= 1)
2704 		size += sizeof(c->p.g);
2705 #endif
2706 
2707 	c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size);
2708 	if (!c)
2709 		return NULL;
2710 
2711 	c->compression_level = compression_level;
2712 
2713 	/*
2714 	 * The higher the compression level, the more we should bother trying to
2715 	 * compress very small inputs.
2716 	 */
2717 	c->min_size_to_compress = 56 - (compression_level * 4);
2718 
2719 	switch (compression_level) {
2720 	case 0:
2721 		c->impl = deflate_compress_none;
2722 		break;
2723 	case 1:
2724 		c->impl = deflate_compress_greedy;
2725 		c->max_search_depth = 2;
2726 		c->nice_match_length = 8;
2727 		break;
2728 	case 2:
2729 		c->impl = deflate_compress_greedy;
2730 		c->max_search_depth = 6;
2731 		c->nice_match_length = 10;
2732 		break;
2733 	case 3:
2734 		c->impl = deflate_compress_greedy;
2735 		c->max_search_depth = 12;
2736 		c->nice_match_length = 14;
2737 		break;
2738 	case 4:
2739 		c->impl = deflate_compress_greedy;
2740 		c->max_search_depth = 24;
2741 		c->nice_match_length = 24;
2742 		break;
2743 	case 5:
2744 		c->impl = deflate_compress_lazy;
2745 		c->max_search_depth = 20;
2746 		c->nice_match_length = 30;
2747 		break;
2748 	case 6:
2749 		c->impl = deflate_compress_lazy;
2750 		c->max_search_depth = 40;
2751 		c->nice_match_length = 65;
2752 		break;
2753 	case 7:
2754 		c->impl = deflate_compress_lazy;
2755 		c->max_search_depth = 100;
2756 		c->nice_match_length = 130;
2757 		break;
2758 #if SUPPORT_NEAR_OPTIMAL_PARSING
2759 	case 8:
2760 		c->impl = deflate_compress_near_optimal;
2761 		c->max_search_depth = 12;
2762 		c->nice_match_length = 20;
2763 		c->p.n.num_optim_passes = 1;
2764 		break;
2765 	case 9:
2766 		c->impl = deflate_compress_near_optimal;
2767 		c->max_search_depth = 16;
2768 		c->nice_match_length = 26;
2769 		c->p.n.num_optim_passes = 2;
2770 		break;
2771 	case 10:
2772 		c->impl = deflate_compress_near_optimal;
2773 		c->max_search_depth = 30;
2774 		c->nice_match_length = 50;
2775 		c->p.n.num_optim_passes = 2;
2776 		break;
2777 	case 11:
2778 		c->impl = deflate_compress_near_optimal;
2779 		c->max_search_depth = 60;
2780 		c->nice_match_length = 80;
2781 		c->p.n.num_optim_passes = 3;
2782 		break;
2783 	default:
2784 		c->impl = deflate_compress_near_optimal;
2785 		c->max_search_depth = 100;
2786 		c->nice_match_length = 133;
2787 		c->p.n.num_optim_passes = 4;
2788 		break;
2789 #else
2790 	case 8:
2791 		c->impl = deflate_compress_lazy;
2792 		c->max_search_depth = 150;
2793 		c->nice_match_length = 200;
2794 		break;
2795 	default:
2796 		c->impl = deflate_compress_lazy;
2797 		c->max_search_depth = 200;
2798 		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
2799 		break;
2800 #endif
2801 	}
2802 
2803 	deflate_init_offset_slot_fast(c);
2804 	deflate_init_static_codes(c);
2805 
2806 	return c;
2807 }
2808 
2809 LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
2810 libdeflate_deflate_compress(struct libdeflate_compressor *c,
2811 			    const void *in, size_t in_nbytes,
2812 			    void *out, size_t out_nbytes_avail)
2813 {
2814 	if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING))
2815 		return 0;
2816 
2817 	/* For extremely small inputs just use a single uncompressed block. */
2818 	if (unlikely(in_nbytes < c->min_size_to_compress)) {
2819 		struct deflate_output_bitstream os;
2820 		deflate_init_output(&os, out, out_nbytes_avail);
2821 		if (in_nbytes == 0)
2822 			in = &os; /* Avoid passing NULL to memcpy() */
2823 		deflate_write_uncompressed_block(&os, in, in_nbytes, true);
2824 		return deflate_flush_output(&os);
2825 	}
2826 
2827 	return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail);
2828 }
2829 
2830 LIBDEFLATEEXPORT void LIBDEFLATEAPI
2831 libdeflate_free_compressor(struct libdeflate_compressor *c)
2832 {
2833 	libdeflate_aligned_free(c);
2834 }
2835 
2836 unsigned int
2837 deflate_get_compression_level(struct libdeflate_compressor *c)
2838 {
2839 	return c->compression_level;
2840 }
2841 
2842 LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
2843 libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
2844 				  size_t in_nbytes)
2845 {
2846 	/*
2847 	 * The worst case is all uncompressed blocks where one block has length
2848 	 * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH.
2849 	 * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE,
2850 	 * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN.
2851 	 */
2852 	size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
2853 	return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING;
2854 }
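
#if 0
/*
 * Sketch of typical usage of the public API implemented in this file
 * (illustration only, with minimal error handling; not part of the library):
 */
#include <stdlib.h>

#include "libdeflate.h"

static size_t
compress_buffer_example(const void *in, size_t in_nbytes, void **out_ret)
{
	struct libdeflate_compressor *c;
	size_t bound, out_nbytes;
	void *out;

	c = libdeflate_alloc_compressor(6);	/* compression levels 0..12 */
	if (c == NULL)
		return 0;

	/* Allocate the worst-case output size so that compression cannot fail
	 * for lack of output space. */
	bound = libdeflate_deflate_compress_bound(c, in_nbytes);
	out = malloc(bound);
	if (out == NULL) {
		libdeflate_free_compressor(c);
		return 0;
	}

	/* Returns the compressed size in bytes, or 0 if the output did not
	 * fit in 'bound' bytes (which cannot happen here). */
	out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes, out, bound);

	libdeflate_free_compressor(c);
	*out_ret = out;
	return out_nbytes;
}
#endif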
2855