1 /*
2 * deflate_compress.c - a compressor for DEFLATE
3 *
4 * Originally public domain; changes after 2016-09-07 are copyrighted.
5 *
6 * Copyright 2016 Eric Biggers
7 *
8 * Permission is hereby granted, free of charge, to any person
9 * obtaining a copy of this software and associated documentation
10 * files (the "Software"), to deal in the Software without
11 * restriction, including without limitation the rights to use,
12 * copy, modify, merge, publish, distribute, sublicense, and/or sell
13 * copies of the Software, and to permit persons to whom the
14 * Software is furnished to do so, subject to the following
15 * conditions:
16 *
17 * The above copyright notice and this permission notice shall be
18 * included in all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 * OTHER DEALINGS IN THE SOFTWARE.
28 */
29
30 #include "deflate_compress.h"
31 #include "deflate_constants.h"
32 #include "unaligned.h"
33
34 #include "libdeflate.h"
35
36 /*
37 * By default, the near-optimal parsing algorithm is enabled at compression
38 * level 8 and above. The near-optimal parsing algorithm produces a compression
39 * ratio significantly better than the greedy and lazy algorithms implemented
40 * here, and also the algorithm used by zlib at level 9. However, it is slow.
41 */
42 #define SUPPORT_NEAR_OPTIMAL_PARSING 1
43
44 /*
45 * Define to 1 to maintain the full map from match offsets to offset slots.
46 * This slightly speeds up translations of match offsets to offset slots, but it
47 * uses 32769 bytes of memory rather than the 512 bytes used by the condensed
48 * map. The speedup provided by the larger map is most helpful when the
49 * near-optimal parsing algorithm is being used.
50 */
51 #define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING
52
53 /*
54 * DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters
55 * appropriately.
56 */
57 #define MATCHFINDER_WINDOW_ORDER 15
58
59 #include "hc_matchfinder.h"
60 #if SUPPORT_NEAR_OPTIMAL_PARSING
61 # include "bt_matchfinder.h"
62 #endif
63
64 /*
65 * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes,
66 * except if the last block has to be shorter.
67 */
68 #define MIN_BLOCK_LENGTH 10000
69
70 /*
71 * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but
72 * the final length might be slightly longer due to matches extending beyond
73 * this limit.
74 */
75 #define SOFT_MAX_BLOCK_LENGTH 300000
76
77 /*
78 * The number of observed matches or literals that represents sufficient data to
79 * decide whether the current block should be terminated or not.
80 */
81 #define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512
82
83
84 #if SUPPORT_NEAR_OPTIMAL_PARSING
85 /* Constants specific to the near-optimal parsing algorithm */
86
87 /*
88 * The maximum number of matches the matchfinder can find at a single position.
89 * Since the matchfinder never finds more than one match for the same length,
90 * presuming one of each possible length is sufficient for an upper bound.
91 * (This says nothing about whether it is worthwhile to consider so many
92 * matches; this is just defining the worst case.)
93 */
94 # define MAX_MATCHES_PER_POS (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
95
96 /*
97 * The number of lz_match structures in the match cache, excluding the extra
98 * "overflow" entries. This value should be high enough so that nearly the
99 * time, all matches found in a given block can fit in the match cache.
100 * However, fallback behavior (immediately terminating the block) on cache
101 * overflow is still required.
102 */
103 # define CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5)
104
105 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
106
107 /*
108 * These are the compressor-side limits on the codeword lengths for each Huffman
109 * code. To make outputting bits slightly faster, some of these limits are
110 * lower than the limits defined by the DEFLATE format. This does not
111 * significantly affect the compression ratio, at least for the block lengths we
112 * use.
113 */
114 #define MAX_LITLEN_CODEWORD_LEN 14
115 #define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN
116 #define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN
117
/* Table: length slot => length slot base value
 * (the smallest match length belonging to each length slot) */
static const unsigned deflate_length_slot_base[] = {
	3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 ,
	11 , 13 , 15 , 17 , 19 , 23 , 27 , 31 ,
	35 , 43 , 51 , 59 , 67 , 83 , 99 , 115 ,
	131 , 163 , 195 , 227 , 258 ,
};

/* Table: length slot => number of extra length bits
 * (bits sent verbatim after the length symbol to select the exact length) */
static const u8 deflate_extra_length_bits[] = {
	0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
	1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 ,
	3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 ,
	5 , 5 , 5 , 5 , 0 ,
};

/* Table: offset slot => offset slot base value
 * (the smallest match offset belonging to each offset slot) */
static const unsigned deflate_offset_slot_base[] = {
	1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 ,
	17 , 25 , 33 , 49 , 65 , 97 , 129 , 193 ,
	257 , 385 , 513 , 769 , 1025 , 1537 , 2049 , 3073 ,
	4097 , 6145 , 8193 , 12289 , 16385 , 24577 ,
};

/* Table: offset slot => number of extra offset bits
 * (bits sent verbatim after the offset symbol to select the exact offset) */
static const u8 deflate_extra_offset_bits[] = {
	0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 ,
	3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 ,
	7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 ,
	11 , 11 , 12 , 12 , 13 , 13 ,
};

/* Table: length => length slot
 * (direct lookup for all lengths 0..DEFLATE_MAX_MATCH_LEN; entries below
 * the minimum match length of 3 are unused) */
static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
	0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12,
	12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16,
	16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
	18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
	20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
	21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
	22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
	23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
	27, 27, 28,
};

/* The order in which precode codeword lengths are stored
 * in the compressed block header */
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
	16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
};
173
/* Codewords for the DEFLATE Huffman codes: one entry per literal/length
 * symbol and one per offset symbol. */
struct deflate_codewords {
	u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
	u32 offset[DEFLATE_NUM_OFFSET_SYMS];
};

/* Codeword lengths (in bits) for the DEFLATE Huffman codes.
 * A zero length means the corresponding symbol had zero frequency. */
struct deflate_lens {
	u8 litlen[DEFLATE_NUM_LITLEN_SYMS];
	u8 offset[DEFLATE_NUM_OFFSET_SYMS];
};

/* Codewords and lengths for the DEFLATE Huffman codes, kept together since
 * they are always built and used as a pair. */
struct deflate_codes {
	struct deflate_codewords codewords;
	struct deflate_lens lens;
};

/* Symbol frequency counters for the DEFLATE Huffman codes, from which the
 * dynamic codes for a block are derived. */
struct deflate_freqs {
	u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
	u32 offset[DEFLATE_NUM_OFFSET_SYMS];
};
198
#if SUPPORT_NEAR_OPTIMAL_PARSING

/* Costs for the near-optimal parsing algorithm, in scaled fractional bits
 * (see COST_SHIFT below). */
struct deflate_costs {

	/* The cost to output each possible literal. */
	u32 literal[DEFLATE_NUM_LITERALS];

	/* The cost to output each possible match length. */
	u32 length[DEFLATE_MAX_MATCH_LEN + 1];

	/* The cost to output a match offset of each possible offset slot. */
	u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
};

/*
 * COST_SHIFT is a scaling factor that makes it possible to consider fractional
 * bit costs. A token requiring 'n' bits to represent has cost n << COST_SHIFT.
 *
 * Note: this is only useful as a statistical trick for when the true costs are
 * unknown. In reality, each token in DEFLATE requires a whole number of bits
 * to output.
 */
#define COST_SHIFT 3

/*
 * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
 * be needed to output a symbol that was unused in the previous optimization
 * pass. Assigning a default cost allows the symbol to be used in the next
 * optimization pass. However, the cost should be relatively high because the
 * symbol probably won't be used very many times (if at all).
 */
#define LITERAL_NOSTAT_BITS 13
#define LENGTH_NOSTAT_BITS 13
#define OFFSET_NOSTAT_BITS 10

#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
236
237 /*
238 * Represents a run of literals followed by a match or end-of-block. This
239 * struct is needed to temporarily store items chosen by the parser, since items
240 * cannot be written until all items for the block have been chosen and the
241 * block's Huffman codes have been computed.
242 */
243 struct deflate_sequence {
244
245 /* Bits 0..22: the number of literals in this run. This may be 0 and
246 * can be at most about SOFT_MAX_BLOCK_LENGTH. The literals are not
247 * stored explicitly in this structure; instead, they are read directly
248 * from the uncompressed data.
249 *
250 * Bits 23..31: the length of the match which follows the literals, or 0
251 * if this literal run was the last in the block, so there is no match
252 * which follows it. */
253 u32 litrunlen_and_length;
254
255 /* If 'length' doesn't indicate end-of-block, then this is the offset of
256 * the match which follows the literals. */
257 u16 offset;
258
259 /* If 'length' doesn't indicate end-of-block, then this is the offset
260 * symbol of the match which follows the literals. */
261 u8 offset_symbol;
262
263 /* If 'length' doesn't indicate end-of-block, then this is the length
264 * slot of the match which follows the literals. */
265 u8 length_slot;
266 };
267
#if SUPPORT_NEAR_OPTIMAL_PARSING

/*
 * This structure represents a byte position in the input data and a node in the
 * graph of possible match/literal choices for the current block.
 *
 * Logically, each incoming edge to this node is labeled with a literal or a
 * match that can be taken to reach this position from an earlier position; and
 * each outgoing edge from this node is labeled with a literal or a match that
 * can be taken to advance from this position to a later position.
 *
 * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we
 * associate with each node just two pieces of information:
 *
 *	'cost_to_end' is the minimum cost to reach the end of the block from
 *	this position.
 *
 *	'item' represents the literal or match that must be chosen from here to
 *	reach the end of the block with the minimum cost. Equivalently, this
 *	can be interpreted as the label of the outgoing edge on the minimum-cost
 *	path to the "end of block" node from this node.
 */
struct deflate_optimum_node {

	u32 cost_to_end;

	/*
	 * Notes on the match/literal representation used here:
	 *
	 *	The low bits of 'item' are the length: 1 if this is a literal,
	 *	or the match length if this is a match.
	 *
	 *	The high bits of 'item' (those at and above
	 *	OPTIMUM_OFFSET_SHIFT) are the actual literal byte if this is a
	 *	literal, or the match offset if this is a match.
	 */
#define OPTIMUM_OFFSET_SHIFT 9
#define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
	u32 item;

};

#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
310
/* Block split statistics. See "Block splitting algorithm" below.
 * Observations are bucketed into NUM_OBSERVATION_TYPES types: 8 for literals
 * and 2 for matches. */
#define NUM_LITERAL_OBSERVATION_TYPES 8
#define NUM_MATCH_OBSERVATION_TYPES 2
#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES)
struct block_split_stats {
	/* Per-type tallies since the last block-split check; 'observations'
	 * holds the running per-type totals.  (The merge/decision logic is
	 * defined later in the file.) */
	u32 new_observations[NUM_OBSERVATION_TYPES];
	u32 observations[NUM_OBSERVATION_TYPES];
	u32 num_new_observations;
	u32 num_observations;
};
321
/* The main DEFLATE compressor structure */
struct libdeflate_compressor {

	/* Pointer to the compress() implementation chosen at allocation time,
	 * based on the requested compression level */
	size_t (*impl)(struct libdeflate_compressor *,
		       const u8 *, size_t, u8 *, size_t);

	/* Frequency counters for the current block */
	struct deflate_freqs freqs;

	/* Dynamic Huffman codes for the current block */
	struct deflate_codes codes;

	/* Static Huffman codes */
	struct deflate_codes static_codes;

	/* Block split statistics for the currently pending block */
	struct block_split_stats split_stats;

	/* A table for fast lookups of offset slot by match offset.
	 *
	 * If the full table is being used, it is a direct mapping from offset
	 * to offset slot.
	 *
	 * If the condensed table is being used, the first 256 entries map
	 * directly to the offset slots of offsets 1 through 256. The next 256
	 * entries map to the offset slots for the remaining offsets, stepping
	 * through the offsets with a stride of 128. This relies on the fact
	 * that each of the remaining offset slots contains at least 128 offsets
	 * and has an offset base that is a multiple of 128. */
#if USE_FULL_OFFSET_SLOT_FAST
	u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1];
#else
	u8 offset_slot_fast[512];
#endif

	/* The "nice" match length: if a match of this length is found, choose
	 * it immediately without further consideration. */
	unsigned nice_match_length;

	/* The maximum search depth: consider at most this many potential
	 * matches at each position. */
	unsigned max_search_depth;

	/* The compression level with which this compressor was created. */
	unsigned compression_level;

	/* Anything smaller than this we won't bother trying to compress. */
	unsigned min_size_to_compress;

	/* Temporary space for Huffman code output */
	u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS];
	u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
	u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS];
	unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS];
	unsigned num_litlen_syms;
	unsigned num_offset_syms;
	unsigned num_explicit_lens;
	unsigned num_precode_items;

	/* Parser state.  Only one parser type is active per compressor, so the
	 * greedy/lazy and near-optimal data share storage via a union. */
	union {
		/* Data for greedy or lazy parsing */
		struct {
			/* Hash chain matchfinder */
			struct hc_matchfinder hc_mf;

			/* The matches and literals that the parser has chosen
			 * for the current block. The required length of this
			 * array is limited by the maximum number of matches
			 * that can ever be chosen for a single block, plus one
			 * for the special entry at the end. */
			struct deflate_sequence sequences[
				DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
					     DEFLATE_MIN_MATCH_LEN) + 1];
		} g; /* (g)reedy */

#if SUPPORT_NEAR_OPTIMAL_PARSING
		/* Data for near-optimal parsing */
		struct {

			/* Binary tree matchfinder */
			struct bt_matchfinder bt_mf;

			/*
			 * Cached matches for the current block. This array
			 * contains the matches that were found at each position
			 * in the block. Specifically, for each position, there
			 * is a list of matches found at that position, if any,
			 * sorted by strictly increasing length. In addition,
			 * following the matches for each position, there is a
			 * special 'struct lz_match' whose 'length' member
			 * contains the number of matches found at that
			 * position, and whose 'offset' member contains the
			 * literal at that position.
			 *
			 * Note: in rare cases, there will be a very high number
			 * of matches in the block and this array will overflow.
			 * If this happens, we force the end of the current
			 * block. CACHE_LENGTH is the length at which we
			 * actually check for overflow. The extra slots beyond
			 * this are enough to absorb the worst case overflow,
			 * which occurs if starting at &match_cache[CACHE_LENGTH
			 * - 1], we write MAX_MATCHES_PER_POS matches and a
			 * match count header, then skip searching for matches
			 * at 'DEFLATE_MAX_MATCH_LEN - 1' positions and write
			 * the match count header for each.
			 */
			struct lz_match match_cache[CACHE_LENGTH +
						    MAX_MATCHES_PER_POS +
						    DEFLATE_MAX_MATCH_LEN - 1];

			/*
			 * Array of nodes, one per position, for running the
			 * minimum-cost path algorithm.
			 *
			 * This array must be large enough to accommodate the
			 * worst-case number of nodes, which occurs if we find a
			 * match of length DEFLATE_MAX_MATCH_LEN at position
			 * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of
			 * length SOFT_MAX_BLOCK_LENGTH - 1 +
			 * DEFLATE_MAX_MATCH_LEN. Add one for the end-of-block
			 * node.
			 */
			struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 +
								  DEFLATE_MAX_MATCH_LEN + 1];

			/* The current cost model being used. */
			struct deflate_costs costs;

			/* Number of optimization passes per block */
			unsigned num_optim_passes;
		} n; /* (n)ear-optimal */
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

	} p; /* (p)arser */
};
457
458 /*
459 * The type for the bitbuffer variable, which temporarily holds bits that are
460 * being packed into bytes and written to the output buffer. For best
461 * performance, this should have size equal to a machine word.
462 */
463 typedef machine_word_t bitbuf_t;
464 #define BITBUF_NBITS (8 * sizeof(bitbuf_t))
465
466 /* Can the specified number of bits always be added to 'bitbuf' after any
467 * pending bytes have been flushed? */
468 #define CAN_BUFFER(n) ((n) <= BITBUF_NBITS - 7)
469
470 /*
471 * Structure to keep track of the current state of sending bits to the
472 * compressed output buffer.
473 */
474 struct deflate_output_bitstream {
475
476 /* Bits that haven't yet been written to the output buffer. */
477 bitbuf_t bitbuf;
478
479 /* Number of bits currently held in @bitbuf. */
480 unsigned bitcount;
481
482 /* Pointer to the beginning of the output buffer. */
483 u8 *begin;
484
485 /* Pointer to the position in the output buffer at which the next byte
486 * should be written. */
487 u8 *next;
488
489 /* Pointer just past the end of the output buffer. */
490 u8 *end;
491 };
492
493 /*
494 * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be
495 * present following os->end, in order to not overrun the buffer when generating
496 * output. When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t)
497 * bytes for put_unaligned_leword(). Otherwise we need only 1 byte. However,
498 * to make the compression algorithm produce the same result on all CPU
499 * architectures (which is sometimes desirable), we have to unconditionally use
500 * the maximum for any CPU, which is sizeof(bitbuf_t) == 8.
501 */
502 #define OUTPUT_END_PADDING 8
503
504 /* Initialize the output bitstream. 'size' is assumed to be at least
505 * OUTPUT_END_PADDING. */
506 static void
deflate_init_output(struct deflate_output_bitstream * os,void * buffer,size_t size)507 deflate_init_output(struct deflate_output_bitstream *os,
508 void *buffer, size_t size)
509 {
510 os->bitbuf = 0;
511 os->bitcount = 0;
512 os->begin = buffer;
513 os->next = os->begin;
514 os->end = os->begin + size - OUTPUT_END_PADDING;
515 }
516
517 /* Add some bits to the bitbuffer variable of the output bitstream. The caller
518 * must make sure there is enough room. */
519 static forceinline void
deflate_add_bits(struct deflate_output_bitstream * os,const bitbuf_t bits,const unsigned num_bits)520 deflate_add_bits(struct deflate_output_bitstream *os,
521 const bitbuf_t bits, const unsigned num_bits)
522 {
523 os->bitbuf |= bits << os->bitcount;
524 os->bitcount += num_bits;
525 }
526
527 /* Flush bits from the bitbuffer variable to the output buffer. */
528 static forceinline void
deflate_flush_bits(struct deflate_output_bitstream * os)529 deflate_flush_bits(struct deflate_output_bitstream *os)
530 {
531 if (UNALIGNED_ACCESS_IS_FAST) {
532 /* Flush a whole word (branchlessly). */
533 put_unaligned_leword(os->bitbuf, os->next);
534 os->bitbuf >>= os->bitcount & ~7;
535 os->next += MIN(os->end - os->next, os->bitcount >> 3);
536 os->bitcount &= 7;
537 } else {
538 /* Flush a byte at a time. */
539 while (os->bitcount >= 8) {
540 *os->next = os->bitbuf;
541 if (os->next != os->end)
542 os->next++;
543 os->bitcount -= 8;
544 os->bitbuf >>= 8;
545 }
546 }
547 }
548
549 /* Align the bitstream on a byte boundary. */
550 static forceinline void
deflate_align_bitstream(struct deflate_output_bitstream * os)551 deflate_align_bitstream(struct deflate_output_bitstream *os)
552 {
553 os->bitcount += -os->bitcount & 7;
554 deflate_flush_bits(os);
555 }
556
557 /*
558 * Flush any remaining bits to the output buffer if needed. Return the total
559 * number of bytes written to the output buffer, or 0 if an overflow occurred.
560 */
561 static size_t
deflate_flush_output(struct deflate_output_bitstream * os)562 deflate_flush_output(struct deflate_output_bitstream *os)
563 {
564 if (os->next == os->end) /* overflow? */
565 return 0;
566
567 while ((int)os->bitcount > 0) {
568 *os->next++ = os->bitbuf;
569 os->bitcount -= 8;
570 os->bitbuf >>= 8;
571 }
572
573 return os->next - os->begin;
574 }
575
576 /* Given the binary tree node A[subtree_idx] whose children already
577 * satisfy the maxheap property, swap the node with its greater child
578 * until it is greater than both its children, so that the maxheap
579 * property is satisfied in the subtree rooted at A[subtree_idx]. */
580 static void
heapify_subtree(u32 A[],unsigned length,unsigned subtree_idx)581 heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
582 {
583 unsigned parent_idx;
584 unsigned child_idx;
585 u32 v;
586
587 v = A[subtree_idx];
588 parent_idx = subtree_idx;
589 while ((child_idx = parent_idx * 2) <= length) {
590 if (child_idx < length && A[child_idx + 1] > A[child_idx])
591 child_idx++;
592 if (v >= A[child_idx])
593 break;
594 A[parent_idx] = A[child_idx];
595 parent_idx = child_idx;
596 }
597 A[parent_idx] = v;
598 }
599
600 /* Rearrange the array 'A' so that it satisfies the maxheap property.
601 * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
602 */
603 static void
heapify_array(u32 A[],unsigned length)604 heapify_array(u32 A[], unsigned length)
605 {
606 unsigned subtree_idx;
607
608 for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
609 heapify_subtree(A, length, subtree_idx);
610 }
611
612 /*
613 * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
614 *
615 * Note: name this function heap_sort() instead of heapsort() to avoid colliding
616 * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
617 * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
618 */
619 static void
heap_sort(u32 A[],unsigned length)620 heap_sort(u32 A[], unsigned length)
621 {
622 A--; /* Use 1-based indices */
623
624 heapify_array(A, length);
625
626 while (length >= 2) {
627 u32 tmp = A[length];
628 A[length] = A[1];
629 A[1] = tmp;
630 length--;
631 heapify_subtree(A, length, 1);
632 }
633 }
634
635 #define NUM_SYMBOL_BITS 10
636 #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
637
638 #define GET_NUM_COUNTERS(num_syms) ((((num_syms) + 3 / 4) + 3) & ~3)
639 /*
640 * Sort the symbols primarily by frequency and secondarily by symbol
641 * value. Discard symbols with zero frequency and fill in an array with
642 * the remaining symbols, along with their frequencies. The low
643 * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
644 * value, and the remaining bits will contain the frequency.
645 *
646 * @num_syms
647 * Number of symbols in the alphabet.
648 * Can't be greater than (1 << NUM_SYMBOL_BITS).
649 *
650 * @freqs[num_syms]
651 * The frequency of each symbol.
652 *
653 * @lens[num_syms]
654 * An array that eventually will hold the length of each codeword.
655 * This function only fills in the codeword lengths for symbols that
656 * have zero frequency, which are not well defined per se but will
657 * be set to 0.
658 *
659 * @symout[num_syms]
660 * The output array, described above.
661 *
662 * Returns the number of entries in 'symout' that were filled. This is
663 * the number of symbols that have nonzero frequency.
664 */
665 static unsigned
sort_symbols(unsigned num_syms,const u32 freqs[restrict],u8 lens[restrict],u32 symout[restrict])666 sort_symbols(unsigned num_syms, const u32 freqs[restrict],
667 u8 lens[restrict], u32 symout[restrict])
668 {
669 unsigned sym;
670 unsigned i;
671 unsigned num_used_syms;
672 unsigned num_counters;
673 unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)];
674
675 /* We rely on heapsort, but with an added optimization. Since
676 * it's common for most symbol frequencies to be low, we first do
677 * a count sort using a limited number of counters. High
678 * frequencies will be counted in the last counter, and only they
679 * will be sorted with heapsort.
680 *
681 * Note: with more symbols, it is generally beneficial to have more
682 * counters. About 1 counter per 4 symbols seems fast.
683 *
684 * Note: I also tested radix sort, but even for large symbol
685 * counts (> 255) and frequencies bounded at 16 bits (enabling
686 * radix sort by just two base-256 digits), it didn't seem any
687 * faster than the method implemented here.
688 *
689 * Note: I tested the optimized quicksort implementation from
690 * glibc (with indirection overhead removed), but it was only
691 * marginally faster than the simple heapsort implemented here.
692 *
693 * Tests were done with building the codes for LZX. Results may
694 * vary for different compression algorithms...! */
695
696 num_counters = GET_NUM_COUNTERS(num_syms);
697
698 memset(counters, 0, num_counters * sizeof(counters[0]));
699
700 /* Count the frequencies. */
701 for (sym = 0; sym < num_syms; sym++)
702 counters[MIN(freqs[sym], num_counters - 1)]++;
703
704 /* Make the counters cumulative, ignoring the zero-th, which
705 * counted symbols with zero frequency. As a side effect, this
706 * calculates the number of symbols with nonzero frequency. */
707 num_used_syms = 0;
708 for (i = 1; i < num_counters; i++) {
709 unsigned count = counters[i];
710 counters[i] = num_used_syms;
711 num_used_syms += count;
712 }
713
714 /* Sort nonzero-frequency symbols using the counters. At the
715 * same time, set the codeword lengths of zero-frequency symbols
716 * to 0. */
717 for (sym = 0; sym < num_syms; sym++) {
718 u32 freq = freqs[sym];
719 if (freq != 0) {
720 symout[counters[MIN(freq, num_counters - 1)]++] =
721 sym | (freq << NUM_SYMBOL_BITS);
722 } else {
723 lens[sym] = 0;
724 }
725 }
726
727 /* Sort the symbols counted in the last counter. */
728 heap_sort(symout + counters[num_counters - 2],
729 counters[num_counters - 1] - counters[num_counters - 2]);
730
731 return num_used_syms;
732 }
733
734 /*
735 * Build the Huffman tree.
736 *
737 * This is an optimized implementation that
738 * (a) takes advantage of the frequencies being already sorted;
739 * (b) only generates non-leaf nodes, since the non-leaf nodes of a
740 * Huffman tree are sufficient to generate a canonical code;
741 * (c) Only stores parent pointers, not child pointers;
742 * (d) Produces the nodes in the same memory used for input
743 * frequency information.
744 *
745 * Array 'A', which contains 'sym_count' entries, is used for both input
746 * and output. For this function, 'sym_count' must be at least 2.
747 *
748 * For input, the array must contain the frequencies of the symbols,
749 * sorted in increasing order. Specifically, each entry must contain a
750 * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low
751 * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
752 * Although these bits will, in fact, contain the symbols that correspond
753 * to the frequencies, this function is concerned with frequencies only
754 * and keeps the symbols as-is.
755 *
756 * For output, this function will produce the non-leaf nodes of the
757 * Huffman tree. These nodes will be stored in the first (sym_count - 1)
758 * entries of the array. Entry A[sym_count - 2] will represent the root
759 * node. Each other node will contain the zero-based index of its parent
760 * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low
761 * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again,
762 * note that although these low bits will, in fact, contain a symbol
763 * value, this symbol will have *no relationship* with the Huffman tree
764 * node that happens to occupy the same slot. This is because this
765 * implementation only generates the non-leaf nodes of the tree.
766 */
static void
build_tree(u32 A[], unsigned sym_count)
{
	/* Index, in 'A', of next lowest frequency symbol that has not
	 * yet been processed. */
	unsigned i = 0;

	/* Index, in 'A', of next lowest frequency parentless non-leaf
	 * node; or, if equal to 'e', then no such node exists yet. */
	unsigned b = 0;

	/* Index, in 'A', of next node to allocate as a non-leaf. */
	unsigned e = 0;

	do {
		unsigned m, n;
		u32 freq_shifted;

		/* Choose the two next lowest frequency entries.  (On a tie,
		 * the '<=' prefers the unprocessed entry at 'i' over the
		 * parentless non-leaf at 'b'.) */

		if (i != sym_count &&
		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
			m = i++;
		else
			m = b++;

		if (i != sym_count &&
		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
			n = i++;
		else
			n = b++;

		/* Allocate a non-leaf node and link the entries to it.
		 *
		 * If we link an entry that we're visiting for the first
		 * time (via index 'i'), then we're actually linking a
		 * leaf node and it will have no effect, since the leaf
		 * will be overwritten with a non-leaf when index 'e'
		 * catches up to it. But it's not any slower to
		 * unconditionally set the parent index.
		 *
		 * We also compute the frequency of the non-leaf node as
		 * the sum of its two children's frequencies.  Only the
		 * high (frequency) bits are summed; the low
		 * NUM_SYMBOL_BITS of each entry are preserved as-is. */

		freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);

		A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
		A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
		A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
		e++;
	} while (sym_count - e > 1);
		/* When just one entry remains, it is a "leaf" that was
		 * linked to some other node. We ignore it, since the
		 * rest of the array contains the non-leaves which we
		 * need. (Note that we're assuming the cases with 0 or 1
		 * symbols were handled separately.) */
}
824
825 /*
826 * Given the stripped-down Huffman tree constructed by build_tree(),
827 * determine the number of codewords that should be assigned each
828 * possible length, taking into account the length-limited constraint.
829 *
830 * @A
831 * The array produced by build_tree(), containing parent index
832 * information for the non-leaf nodes of the Huffman tree. Each
833 * entry in this array is a node; a node's parent always has a
834 * greater index than that node itself. This function will
835 * overwrite the parent index information in this array, so
836 * essentially it will destroy the tree. However, the data in the
837 * low NUM_SYMBOL_BITS of each entry will be preserved.
838 *
839 * @root_idx
840 * The 0-based index of the root node in 'A', and consequently one
841 * less than the number of tree node entries in 'A'. (Or, really 2
842 * less than the actual length of 'A'.)
843 *
844 * @len_counts
845 * An array of length ('max_codeword_len' + 1) in which the number of
846 * codewords having each length <= max_codeword_len will be
847 * returned.
848 *
849 * @max_codeword_len
850 * The maximum permissible codeword length.
851 */
static void
compute_length_counts(u32 A[restrict], unsigned root_idx,
		      unsigned len_counts[restrict], unsigned max_codeword_len)
{
	unsigned len;
	int node;

	/* The key observations are:
	 *
	 * (1) We can traverse the non-leaf nodes of the tree, always
	 * visiting a parent before its children, by simply iterating
	 * through the array in reverse order.  Consequently, we can
	 * compute the depth of each node in one pass, overwriting the
	 * parent indices with depths.
	 *
	 * (2) We can initially assume that in the real Huffman tree,
	 * both children of the root are leaves.  This corresponds to two
	 * codewords of length 1.  Then, whenever we visit a (non-leaf)
	 * node during the traversal, we modify this assumption to
	 * account for the current node *not* being a leaf, but rather
	 * its two children being leaves.  This causes the loss of one
	 * codeword for the current depth and the addition of two
	 * codewords for the current depth plus one.
	 *
	 * (3) We can handle the length-limited constraint fairly easily
	 * by simply using the largest length available when a depth
	 * exceeds max_codeword_len.
	 */

	/* Start with no codewords of any length, except the two length-1
	 * codewords implied by assumption (2) above. */
	for (len = 0; len <= max_codeword_len; len++)
		len_counts[len] = 0;
	len_counts[1] = 2;

	/* Set the root node's depth to 0. */
	A[root_idx] &= SYMBOL_MASK;

	for (node = root_idx - 1; node >= 0; node--) {

		/* Calculate the depth of this node.  The parent's depth is
		 * already available because parents have greater indices than
		 * their children and we iterate in reverse. */

		unsigned parent = A[node] >> NUM_SYMBOL_BITS;
		unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
		unsigned depth = parent_depth + 1;
		unsigned len = depth;

		/* Set the depth of this node so that it is available
		 * when its children (if any) are processed. */

		A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);

		/* If needed, decrease the length to meet the
		 * length-limited constraint.  This is not the optimal
		 * method for generating length-limited Huffman codes!
		 * But it should be good enough. */
		if (len >= max_codeword_len) {
			len = max_codeword_len;
			do {
				len--;
			} while (len_counts[len] == 0);
		}

		/* Account for the fact that we have a non-leaf node at
		 * the current depth: one codeword lost at 'len', two
		 * gained at 'len + 1'.  (After the clamp above,
		 * len < max_codeword_len, so 'len + 1' stays in bounds.) */
		len_counts[len]--;
		len_counts[len + 1] += 2;
	}
}
919
920 /*
921 * Generate the codewords for a canonical Huffman code.
922 *
923 * @A
924 * The output array for codewords. In addition, initially this
925 * array must contain the symbols, sorted primarily by frequency and
926 * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
927 * each entry.
928 *
929 * @len
930 * Output array for codeword lengths.
931 *
932 * @len_counts
933 * An array that provides the number of codewords that will have
934 * each possible length <= max_codeword_len.
935 *
936 * @max_codeword_len
937 * Maximum length, in bits, of each codeword.
938 *
939 * @num_syms
940 * Number of symbols in the alphabet, including symbols with zero
941 * frequency. This is the length of the 'A' and 'len' arrays.
942 */
943 static void
gen_codewords(u32 A[restrict],u8 lens[restrict],const unsigned len_counts[restrict],unsigned max_codeword_len,unsigned num_syms)944 gen_codewords(u32 A[restrict], u8 lens[restrict],
945 const unsigned len_counts[restrict],
946 unsigned max_codeword_len, unsigned num_syms)
947 {
948 u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1];
949 unsigned i;
950 unsigned len;
951 unsigned sym;
952
953 /* Given the number of codewords that will have each length,
954 * assign codeword lengths to symbols. We do this by assigning
955 * the lengths in decreasing order to the symbols sorted
956 * primarily by increasing frequency and secondarily by
957 * increasing symbol value. */
958 for (i = 0, len = max_codeword_len; len >= 1; len--) {
959 unsigned count = len_counts[len];
960 while (count--)
961 lens[A[i++] & SYMBOL_MASK] = len;
962 }
963
964 /* Generate the codewords themselves. We initialize the
965 * 'next_codewords' array to provide the lexicographically first
966 * codeword of each length, then assign codewords in symbol
967 * order. This produces a canonical code. */
968 next_codewords[0] = 0;
969 next_codewords[1] = 0;
970 for (len = 2; len <= max_codeword_len; len++)
971 next_codewords[len] =
972 (next_codewords[len - 1] + len_counts[len - 1]) << 1;
973
974 for (sym = 0; sym < num_syms; sym++)
975 A[sym] = next_codewords[lens[sym]]++;
976 }
977
978 /*
979 * ---------------------------------------------------------------------
980 * make_canonical_huffman_code()
981 * ---------------------------------------------------------------------
982 *
983 * Given an alphabet and the frequency of each symbol in it, construct a
984 * length-limited canonical Huffman code.
985 *
986 * @num_syms
987 * The number of symbols in the alphabet. The symbols are the
988 * integers in the range [0, num_syms - 1]. This parameter must be
989 * at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
990 *
991 * @max_codeword_len
992 * The maximum permissible codeword length.
993 *
994 * @freqs
995 * An array of @num_syms entries, each of which specifies the
996 * frequency of the corresponding symbol. It is valid for some,
997 * none, or all of the frequencies to be 0.
998 *
999 * @lens
1000 * An array of @num_syms entries in which this function will return
1001 * the length, in bits, of the codeword assigned to each symbol.
1002 * Symbols with 0 frequency will not have codewords per se, but
1003 * their entries in this array will be set to 0. No lengths greater
1004 * than @max_codeword_len will be assigned.
1005 *
1006 * @codewords
1007 * An array of @num_syms entries in which this function will return
1008 * the codeword for each symbol, right-justified and padded on the
1009 * left with zeroes. Codewords for symbols with 0 frequency will be
1010 * undefined.
1011 *
1012 * ---------------------------------------------------------------------
1013 *
1014 * This function builds a length-limited canonical Huffman code.
1015 *
1016 * A length-limited Huffman code contains no codewords longer than some
1017 * specified length, and has exactly (with some algorithms) or
1018 * approximately (with the algorithm used here) the minimum weighted path
1019 * length from the root, given this constraint.
1020 *
1021 * A canonical Huffman code satisfies the properties that a longer
1022 * codeword never lexicographically precedes a shorter codeword, and the
1023 * lexicographic ordering of codewords of the same length is the same as
1024 * the lexicographic ordering of the corresponding symbols. A canonical
1025 * Huffman code, or more generally a canonical prefix code, can be
1026 * reconstructed from only a list containing the codeword length of each
1027 * symbol.
1028 *
1029 * The classic algorithm to generate a Huffman code creates a node for
1030 * each symbol, then inserts these nodes into a min-heap keyed by symbol
1031 * frequency. Then, repeatedly, the two lowest-frequency nodes are
1032 * removed from the min-heap and added as the children of a new node
1033 * having frequency equal to the sum of its two children, which is then
1034 * inserted into the min-heap. When only a single node remains in the
1035 * min-heap, it is the root of the Huffman tree. The codeword for each
1036 * symbol is determined by the path needed to reach the corresponding
1037 * node from the root. Descending to the left child appends a 0 bit,
1038 * whereas descending to the right child appends a 1 bit.
1039 *
1040 * The classic algorithm is relatively easy to understand, but it is
1041 * subject to a number of inefficiencies. In practice, it is fastest to
1042 * first sort the symbols by frequency. (This itself can be subject to
1043 * an optimization based on the fact that most frequencies tend to be
1044 * low.) At the same time, we sort secondarily by symbol value, which
1045 * aids the process of generating a canonical code. Then, during tree
1046 * construction, no heap is necessary because both the leaf nodes and the
1047 * unparented non-leaf nodes can be easily maintained in sorted order.
1048 * Consequently, there can never be more than two possibilities for the
1049 * next-lowest-frequency node.
1050 *
1051 * In addition, because we're generating a canonical code, we actually
1052 * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
1053 * This is because for canonical code generation we don't need to know
1054 * where the symbols are in the tree. Rather, we only need to know how
1055 * many leaf nodes have each depth (codeword length). And this
1056 * information can, in fact, be quickly generated from the tree of
1057 * non-leaves only.
1058 *
1059 * Furthermore, we can build this stripped-down Huffman tree directly in
1060 * the array in which the codewords are to be generated, provided that
1061 * these array slots are large enough to hold a symbol and frequency
1062 * value.
1063 *
1064 * Still furthermore, we don't even need to maintain explicit child
1065 * pointers. We only need the parent pointers, and even those can be
1066 * overwritten in-place with depth information as part of the process of
1067 * extracting codeword lengths from the tree. So in summary, we do NOT
1068 * need a big structure like:
1069 *
1070 * struct huffman_tree_node {
1071 * unsigned int symbol;
1072 * unsigned int frequency;
1073 * unsigned int depth;
1074 * struct huffman_tree_node *left_child;
1075 * struct huffman_tree_node *right_child;
1076 * };
1077 *
1078 *
1079 * ... which often gets used in "naive" implementations of Huffman code
1080 * generation.
1081 *
1082 * Many of these optimizations are based on the implementation in 7-Zip
1083 * (source file: C/HuffEnc.c), which has been placed in the public domain
1084 * by Igor Pavlov.
1085 */
static void
make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
			    const u32 freqs[restrict],
			    u8 lens[restrict], u32 codewords[restrict])
{
	u32 *A = codewords;
	unsigned num_used_syms;

	STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);

	/* We begin by sorting the symbols primarily by frequency and
	 * secondarily by symbol value.  As an optimization, the array
	 * used for this purpose ('A') shares storage with the space in
	 * which we will eventually return the codewords. */

	num_used_syms = sort_symbols(num_syms, freqs, lens, A);

	/* 'num_used_syms' is the number of symbols with nonzero
	 * frequency.  This may be less than @num_syms.  'num_used_syms'
	 * is also the number of entries in 'A' that are valid.  Each
	 * entry consists of a distinct symbol and a nonzero frequency
	 * packed into a 32-bit integer. */

	/* Handle special cases where only 0 or 1 symbols were used (had
	 * nonzero frequency). */

	if (unlikely(num_used_syms == 0)) {
		/* Code is empty.  sort_symbols() already set all lengths
		 * to 0, so there is nothing more to do. */
		return;
	}

	if (unlikely(num_used_syms == 1)) {
		/* Only one symbol was used, so we only need one
		 * codeword.  But two codewords are needed to form the
		 * smallest complete Huffman code, which uses codewords 0
		 * and 1.  Therefore, we choose another symbol to which
		 * to assign a codeword.  We use 0 (if the used symbol is
		 * not 0) or 1 (if the used symbol is 0).  In either
		 * case, the lesser-valued symbol must be assigned
		 * codeword 0 so that the resulting code is canonical. */

		unsigned sym = A[0] & SYMBOL_MASK;
		unsigned nonzero_idx = sym ? sym : 1;

		codewords[0] = 0;
		lens[0] = 1;
		codewords[nonzero_idx] = 1;
		lens[nonzero_idx] = 1;
		return;
	}

	/* Build a stripped-down version of the Huffman tree, sharing the
	 * array 'A' with the symbol values.  Then extract length counts
	 * from the tree and use them to generate the final codewords. */

	build_tree(A, num_used_syms);

	{
		unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];

		/* The root of a tree built from N leaves sits at index
		 * N - 2, since such a tree has N - 1 non-leaf nodes. */
		compute_length_counts(A, num_used_syms - 2,
				      len_counts, max_codeword_len);

		gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
	}
}
1153
1154 /*
1155 * Clear the Huffman symbol frequency counters.
1156 * This must be called when starting a new DEFLATE block.
1157 */
1158 static void
deflate_reset_symbol_frequencies(struct libdeflate_compressor * c)1159 deflate_reset_symbol_frequencies(struct libdeflate_compressor *c)
1160 {
1161 memset(&c->freqs, 0, sizeof(c->freqs));
1162 }
1163
1164 /* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */
1165 static u32
deflate_reverse_codeword(u32 codeword,u8 len)1166 deflate_reverse_codeword(u32 codeword, u8 len)
1167 {
1168 /* The following branchless algorithm is faster than going bit by bit.
1169 * Note: since no codewords are longer than 16 bits, we only need to
1170 * reverse the low 16 bits of the 'u32'. */
1171 STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16);
1172
1173 /* Flip adjacent 1-bit fields */
1174 codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1);
1175
1176 /* Flip adjacent 2-bit fields */
1177 codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2);
1178
1179 /* Flip adjacent 4-bit fields */
1180 codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4);
1181
1182 /* Flip adjacent 8-bit fields */
1183 codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8);
1184
1185 /* Return the high 'len' bits of the bit-reversed 16 bit value. */
1186 return codeword >> (16 - len);
1187 }
1188
1189 /* Make a canonical Huffman code with bit-reversed codewords. */
1190 static void
deflate_make_huffman_code(unsigned num_syms,unsigned max_codeword_len,const u32 freqs[],u8 lens[],u32 codewords[])1191 deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len,
1192 const u32 freqs[], u8 lens[], u32 codewords[])
1193 {
1194 unsigned sym;
1195
1196 make_canonical_huffman_code(num_syms, max_codeword_len,
1197 freqs, lens, codewords);
1198
1199 for (sym = 0; sym < num_syms; sym++)
1200 codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]);
1201 }
1202
1203 /*
1204 * Build the literal/length and offset Huffman codes for a DEFLATE block.
1205 *
1206 * This takes as input the frequency tables for each code and produces as output
1207 * a set of tables that map symbols to codewords and codeword lengths.
1208 */
1209 static void
deflate_make_huffman_codes(const struct deflate_freqs * freqs,struct deflate_codes * codes)1210 deflate_make_huffman_codes(const struct deflate_freqs *freqs,
1211 struct deflate_codes *codes)
1212 {
1213 STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN);
1214 STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN);
1215
1216 deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS,
1217 MAX_LITLEN_CODEWORD_LEN,
1218 freqs->litlen,
1219 codes->lens.litlen,
1220 codes->codewords.litlen);
1221
1222 deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS,
1223 MAX_OFFSET_CODEWORD_LEN,
1224 freqs->offset,
1225 codes->lens.offset,
1226 codes->codewords.offset);
1227 }
1228
1229 /* Initialize c->static_codes. */
1230 static void
deflate_init_static_codes(struct libdeflate_compressor * c)1231 deflate_init_static_codes(struct libdeflate_compressor *c)
1232 {
1233 unsigned i;
1234
1235 for (i = 0; i < 144; i++)
1236 c->freqs.litlen[i] = 1 << (9 - 8);
1237 for (; i < 256; i++)
1238 c->freqs.litlen[i] = 1 << (9 - 9);
1239 for (; i < 280; i++)
1240 c->freqs.litlen[i] = 1 << (9 - 7);
1241 for (; i < 288; i++)
1242 c->freqs.litlen[i] = 1 << (9 - 8);
1243
1244 for (i = 0; i < 32; i++)
1245 c->freqs.offset[i] = 1 << (5 - 5);
1246
1247 deflate_make_huffman_codes(&c->freqs, &c->static_codes);
1248 }
1249
1250 /* Return the offset slot for the specified match offset. */
static forceinline unsigned
deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset)
{
#if USE_FULL_OFFSET_SLOT_FAST
	/* Full map: one table entry per possible match offset. */
	return c->offset_slot_fast[offset];
#else
	/* Condensed map: offsets <= 256 are looked up directly, while larger
	 * offsets are grouped 128 at a time via (offset - 1) >> 7.
	 * NOTE(review): correctness depends on the offset_slot_fast[]
	 * initialization elsewhere in this file matching this indexing
	 * scheme -- confirm against that code. */
	if (offset <= 256)
		return c->offset_slot_fast[offset - 1];
	else
		return c->offset_slot_fast[256 + ((offset - 1) >> 7)];
#endif
}
1263
1264 /* Write the header fields common to all DEFLATE block types. */
static void
deflate_write_block_header(struct deflate_output_bitstream *os,
			   bool is_final_block, unsigned block_type)
{
	/* BFINAL: 1 bit */
	deflate_add_bits(os, is_final_block, 1);
	/* BTYPE: 2 bits */
	deflate_add_bits(os, block_type, 2);
	deflate_flush_bits(os);
}
1273
/*
 * Convert the array of codeword lengths 'lens' into a sequence of "precode
 * items", tallying precode symbol frequencies along the way.  Each item packs
 * a precode symbol (0-18) into its low 5 bits and, for the RLE symbols 16-18,
 * the extra-bits value into the bits above.  Returns the number of items
 * written to 'precode_items'.
 */
static unsigned
deflate_compute_precode_items(const u8 lens[restrict],
			      const unsigned num_lens,
			      u32 precode_freqs[restrict],
			      unsigned precode_items[restrict])
{
	unsigned *itemptr;
	unsigned run_start;
	unsigned run_end;
	unsigned extra_bits;
	u8 len;

	memset(precode_freqs, 0,
	       DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0]));

	itemptr = precode_items;
	run_start = 0;
	do {
		/* Find the next run of codeword lengths. */

		/* len = the length being repeated */
		len = lens[run_start];

		/* Extend the run. */
		run_end = run_start;
		do {
			run_end++;
		} while (run_end != num_lens && len == lens[run_end]);

		if (len == 0) {
			/* Run of zeroes. */

			/* Symbol 18: RLE 11 to 138 zeroes at a time. */
			while ((run_end - run_start) >= 11) {
				extra_bits = MIN((run_end - run_start) - 11, 0x7F);
				precode_freqs[18]++;
				*itemptr++ = 18 | (extra_bits << 5);
				run_start += 11 + extra_bits;
			}

			/* Symbol 17: RLE 3 to 10 zeroes at a time. */
			if ((run_end - run_start) >= 3) {
				extra_bits = MIN((run_end - run_start) - 3, 0x7);
				precode_freqs[17]++;
				*itemptr++ = 17 | (extra_bits << 5);
				run_start += 3 + extra_bits;
			}
		} else {

			/* A run of nonzero lengths. */

			/* Symbol 16: RLE 3 to 6 of the previous length.
			 * The run must be at least 4 long: one explicit
			 * length to establish "the previous length", plus
			 * at least 3 repeats. */
			if ((run_end - run_start) >= 4) {
				precode_freqs[len]++;
				*itemptr++ = len;
				run_start++;
				do {
					extra_bits = MIN((run_end - run_start) - 3, 0x3);
					precode_freqs[16]++;
					*itemptr++ = 16 | (extra_bits << 5);
					run_start += 3 + extra_bits;
				} while ((run_end - run_start) >= 3);
			}
		}

		/* Output any remaining lengths without RLE. */
		while (run_start != run_end) {
			precode_freqs[len]++;
			*itemptr++ = len;
			run_start++;
		}
	} while (run_start != num_lens);

	return itemptr - precode_items;
}
1349
1350 /*
1351 * Huffman codeword lengths for dynamic Huffman blocks are compressed using a
1352 * separate Huffman code, the "precode", which contains a symbol for each
1353 * possible codeword length in the larger code as well as several special
1354 * symbols to represent repeated codeword lengths (a form of run-length
1355 * encoding). The precode is itself constructed in canonical form, and its
1356 * codeword lengths are represented literally in 19 3-bit fields that
1357 * immediately precede the compressed codeword lengths of the larger code.
1358 */
1359
1360 /* Precompute the information needed to output Huffman codes. */
static void
deflate_precompute_huffman_header(struct libdeflate_compressor *c)
{
	/* Compute how many litlen and offset symbols are needed, by trimming
	 * trailing zero-length codewords (but keeping at least 257 litlen and
	 * 1 offset symbol, the minimums DEFLATE's header can express). */

	for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS;
	     c->num_litlen_syms > 257;
	     c->num_litlen_syms--)
		if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0)
			break;

	for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS;
	     c->num_offset_syms > 1;
	     c->num_offset_syms--)
		if (c->codes.lens.offset[c->num_offset_syms - 1] != 0)
			break;

	/* If we're not using the full set of literal/length codeword lengths,
	 * then temporarily move the offset codeword lengths over so that the
	 * literal/length and offset codeword lengths are contiguous.  (The
	 * precode encodes both arrays as one stream of lengths.) */

	STATIC_ASSERT(offsetof(struct deflate_lens, offset) ==
		      DEFLATE_NUM_LITLEN_SYMS);

	if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
		memmove((u8 *)&c->codes.lens + c->num_litlen_syms,
			(u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
			c->num_offset_syms);
	}

	/* Compute the "items" (RLE / literal tokens and extra bits) with which
	 * the codeword lengths in the larger code will be output. */
	c->num_precode_items =
		deflate_compute_precode_items((u8 *)&c->codes.lens,
					      c->num_litlen_syms +
							c->num_offset_syms,
					      c->precode_freqs,
					      c->precode_items);

	/* Build the precode. */
	STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN);
	deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS,
				  MAX_PRE_CODEWORD_LEN,
				  c->precode_freqs, c->precode_lens,
				  c->precode_codewords);

	/* Count how many precode lengths we actually need to output, trimming
	 * trailing zeros in the standard permuted order (minimum 4, per the
	 * HCLEN field's encoding). */
	for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS;
	     c->num_explicit_lens > 4;
	     c->num_explicit_lens--)
		if (c->precode_lens[deflate_precode_lens_permutation[
						c->num_explicit_lens - 1]] != 0)
			break;

	/* Restore the offset codeword lengths if needed. */
	if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
		memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
			(u8 *)&c->codes.lens + c->num_litlen_syms,
			c->num_offset_syms);
	}
}
1422
1423 /* Output the Huffman codes. */
1424 static void
deflate_write_huffman_header(struct libdeflate_compressor * c,struct deflate_output_bitstream * os)1425 deflate_write_huffman_header(struct libdeflate_compressor *c,
1426 struct deflate_output_bitstream *os)
1427 {
1428 unsigned i;
1429
1430 deflate_add_bits(os, c->num_litlen_syms - 257, 5);
1431 deflate_add_bits(os, c->num_offset_syms - 1, 5);
1432 deflate_add_bits(os, c->num_explicit_lens - 4, 4);
1433 deflate_flush_bits(os);
1434
1435 /* Output the lengths of the codewords in the precode. */
1436 for (i = 0; i < c->num_explicit_lens; i++) {
1437 deflate_add_bits(os, c->precode_lens[
1438 deflate_precode_lens_permutation[i]], 3);
1439 deflate_flush_bits(os);
1440 }
1441
1442 /* Output the encoded lengths of the codewords in the larger code. */
1443 for (i = 0; i < c->num_precode_items; i++) {
1444 unsigned precode_item = c->precode_items[i];
1445 unsigned precode_sym = precode_item & 0x1F;
1446 deflate_add_bits(os, c->precode_codewords[precode_sym],
1447 c->precode_lens[precode_sym]);
1448 if (precode_sym >= 16) {
1449 if (precode_sym == 16)
1450 deflate_add_bits(os, precode_item >> 5, 2);
1451 else if (precode_sym == 17)
1452 deflate_add_bits(os, precode_item >> 5, 3);
1453 else
1454 deflate_add_bits(os, precode_item >> 5, 7);
1455 }
1456 STATIC_ASSERT(CAN_BUFFER(DEFLATE_MAX_PRE_CODEWORD_LEN + 7));
1457 deflate_flush_bits(os);
1458 }
1459 }
1460
/*
 * Output the matches and literals of a block, as recorded by the greedy/lazy
 * parsers in an array of 'struct deflate_sequence'.  Each sequence is a run
 * of literals (length in the low 23 bits of 'litrunlen_and_length') followed
 * by one match (length in the high 9 bits); a sequence whose match length is
 * 0 terminates the list.  'in_next' points to the block's raw input data,
 * from which literal bytes are read.
 */
static void
deflate_write_sequences(struct deflate_output_bitstream * restrict os,
			const struct deflate_codes * restrict codes,
			const struct deflate_sequence sequences[restrict],
			const u8 * restrict in_next)
{
	const struct deflate_sequence *seq = sequences;

	for (;;) {
		u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;
		unsigned length = seq->litrunlen_and_length >> 23;
		unsigned length_slot;
		unsigned litlen_symbol;
		unsigned offset_symbol;

		if (litrunlen) {
#if 1
			/* Unrolled x4: the CAN_BUFFER() checks let us elide
			 * flushes when the bitbuffer is wide enough to hold
			 * several codewords, flushing only when needed. */
			while (litrunlen >= 4) {
				unsigned lit0 = in_next[0];
				unsigned lit1 = in_next[1];
				unsigned lit2 = in_next[2];
				unsigned lit3 = in_next[3];

				deflate_add_bits(os, codes->codewords.litlen[lit0],
						 codes->lens.litlen[lit0]);
				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit1],
						 codes->lens.litlen[lit1]);
				if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit2],
						 codes->lens.litlen[lit2]);
				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit3],
						 codes->lens.litlen[lit3]);
				deflate_flush_bits(os);
				in_next += 4;
				litrunlen -= 4;
			}
			/* Up to 3 remaining literals. */
			if (litrunlen-- != 0) {
				deflate_add_bits(os, codes->codewords.litlen[*in_next],
						 codes->lens.litlen[*in_next]);
				if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);
				in_next++;
				if (litrunlen-- != 0) {
					deflate_add_bits(os, codes->codewords.litlen[*in_next],
							 codes->lens.litlen[*in_next]);
					if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
						deflate_flush_bits(os);
					in_next++;
					if (litrunlen-- != 0) {
						deflate_add_bits(os, codes->codewords.litlen[*in_next],
								 codes->lens.litlen[*in_next]);
						if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
							deflate_flush_bits(os);
						in_next++;
					}
				}
				if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);
			}
#else
			/* Simple one-literal-at-a-time reference version. */
			do {
				unsigned lit = *in_next++;
				deflate_add_bits(os, codes->codewords.litlen[lit],
						 codes->lens.litlen[lit]);
				deflate_flush_bits(os);
			} while (--litrunlen);
#endif
		}

		/* A zero match length marks the end of the sequence list. */
		if (length == 0)
			return;

		in_next += length;

		length_slot = seq->length_slot;
		litlen_symbol = 257 + length_slot;

		/* Litlen symbol */
		deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
				 codes->lens.litlen[litlen_symbol]);

		/* Extra length bits */
		STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
					 DEFLATE_MAX_EXTRA_LENGTH_BITS));
		deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
				 deflate_extra_length_bits[length_slot]);

		if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_LENGTH_BITS +
				MAX_OFFSET_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_OFFSET_BITS))
			deflate_flush_bits(os);

		/* Offset symbol */
		offset_symbol = seq->offset_symbol;
		deflate_add_bits(os, codes->codewords.offset[offset_symbol],
				 codes->lens.offset[offset_symbol]);

		if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_OFFSET_BITS))
			deflate_flush_bits(os);

		/* Extra offset bits */
		deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol],
				 deflate_extra_offset_bits[offset_symbol]);

		deflate_flush_bits(os);

		seq++;
	}
}
1580
1581 #if SUPPORT_NEAR_OPTIMAL_PARSING
1582 /*
1583 * Follow the minimum-cost path in the graph of possible match/literal choices
1584 * for the current block and write out the matches/literals using the specified
1585 * Huffman codes.
1586 *
1587 * Note: this is slightly duplicated with deflate_write_sequences(), the reason
1588 * being that we don't want to waste time translating between intermediate
1589 * match/literal representations.
1590 */
static void
deflate_write_item_list(struct deflate_output_bitstream *os,
			const struct deflate_codes *codes,
			struct libdeflate_compressor *c,
			u32 block_length)
{
	/* Walk the minimum-cost path: each node's 'item' packs the chosen
	 * length (low bits) and offset (high bits); advancing by 'length'
	 * follows the path to the next choice. */
	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
	struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length];
	do {
		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
		unsigned litlen_symbol;
		unsigned length_slot;
		unsigned offset_slot;

		if (length == 1) {
			/* Literal: the 'offset' field holds the literal byte,
			 * which is also its litlen symbol. */
			litlen_symbol = offset;
			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
					 codes->lens.litlen[litlen_symbol]);
			deflate_flush_bits(os);
		} else {
			/* Match length */
			length_slot = deflate_length_slot[length];
			litlen_symbol = 257 + length_slot;
			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
					 codes->lens.litlen[litlen_symbol]);

			deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
					 deflate_extra_length_bits[length_slot]);

			/* Flush only when the bitbuffer is too narrow to also
			 * hold the upcoming offset symbol and extra bits. */
			if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_LENGTH_BITS +
					MAX_OFFSET_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_OFFSET_BITS))
				deflate_flush_bits(os);


			/* Match offset */
			offset_slot = deflate_get_offset_slot(c, offset);
			deflate_add_bits(os, codes->codewords.offset[offset_slot],
					 codes->lens.offset[offset_slot]);

			if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_OFFSET_BITS))
				deflate_flush_bits(os);

			deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot],
					 deflate_extra_offset_bits[offset_slot]);

			deflate_flush_bits(os);
		}
		cur_node += length;
	} while (cur_node != end_node);
}
1646 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
1647
1648 /* Output the end-of-block symbol. */
1649 static void
deflate_write_end_of_block(struct deflate_output_bitstream * os,const struct deflate_codes * codes)1650 deflate_write_end_of_block(struct deflate_output_bitstream *os,
1651 const struct deflate_codes *codes)
1652 {
1653 deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK],
1654 codes->lens.litlen[DEFLATE_END_OF_BLOCK]);
1655 deflate_flush_bits(os);
1656 }
1657
1658 static void
deflate_write_uncompressed_block(struct deflate_output_bitstream * os,const u8 * data,u16 len,bool is_final_block)1659 deflate_write_uncompressed_block(struct deflate_output_bitstream *os,
1660 const u8 *data, u16 len,
1661 bool is_final_block)
1662 {
1663 deflate_write_block_header(os, is_final_block,
1664 DEFLATE_BLOCKTYPE_UNCOMPRESSED);
1665 deflate_align_bitstream(os);
1666
1667 if (4 + (u32)len >= os->end - os->next) {
1668 os->next = os->end;
1669 return;
1670 }
1671
1672 put_unaligned_le16(len, os->next);
1673 os->next += 2;
1674 put_unaligned_le16(~len, os->next);
1675 os->next += 2;
1676 memcpy(os->next, data, len);
1677 os->next += len;
1678 }
1679
1680 static void
deflate_write_uncompressed_blocks(struct deflate_output_bitstream * os,const u8 * data,size_t data_length,bool is_final_block)1681 deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os,
1682 const u8 *data, size_t data_length,
1683 bool is_final_block)
1684 {
1685 do {
1686 u16 len = MIN(data_length, UINT16_MAX);
1687
1688 deflate_write_uncompressed_block(os, data, len,
1689 is_final_block && len == data_length);
1690 data += len;
1691 data_length -= len;
1692 } while (data_length != 0);
1693 }
1694
/*
 * Choose the best type of block to use (dynamic Huffman, static Huffman, or
 * uncompressed), then output it.
 *
 * @block_begin/@block_length identify the raw input the block covers (needed
 * for literal runs and for uncompressed output).  @use_item_list selects the
 * near-optimal parser's item list over the greedy/lazy sequence list.
 */
static void
deflate_flush_block(struct libdeflate_compressor * restrict c,
		    struct deflate_output_bitstream * restrict os,
		    const u8 * restrict block_begin, u32 block_length,
		    bool is_final_block, bool use_item_list)
{
	/* Number of extra bits following each precode symbol; only the
	 * run-length symbols 16, 17, and 18 carry extra bits (2, 3, 7). */
	static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = {
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7,
	};

	/* Costs are measured in bits */
	u32 dynamic_cost = 0;
	u32 static_cost = 0;
	u32 uncompressed_cost = 0;
	struct deflate_codes *codes;
	int block_type;
	unsigned sym;

	/* Tally the end-of-block symbol. */
	c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;

	/* Build dynamic Huffman codes. */
	deflate_make_huffman_codes(&c->freqs, &c->codes);

	/* Account for the cost of sending dynamic Huffman codes:
	 * 5 bits HLIT + 5 bits HDIST + 4 bits HCLEN + 3 bits per explicit
	 * precode length, then the run-length-encoded code lengths. */
	deflate_precompute_huffman_header(c);
	dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens);
	for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
		u32 extra = deflate_extra_precode_bits[sym];
		dynamic_cost += c->precode_freqs[sym] *
				(extra + c->precode_lens[sym]);
	}

	/* Account for the cost of encoding literals.  In the static code,
	 * literals 0-143 use 8 bits and literals 144-255 use 9 bits. */
	for (sym = 0; sym < 256; sym++) {
		dynamic_cost += c->freqs.litlen[sym] *
				c->codes.lens.litlen[sym];
	}
	for (sym = 0; sym < 144; sym++)
		static_cost += c->freqs.litlen[sym] * 8;
	for (; sym < 256; sym++)
		static_cost += c->freqs.litlen[sym] * 9;

	/* Account for the cost of encoding the end-of-block symbol
	 * (7 bits in the static code). */
	dynamic_cost += c->codes.lens.litlen[256];
	static_cost += 7;

	/* Account for the cost of encoding lengths (codeword + extra bits). */
	for (sym = 257; sym < 257 + ARRAY_LEN(deflate_extra_length_bits); sym++) {
		u32 extra = deflate_extra_length_bits[sym - 257];
		dynamic_cost += c->freqs.litlen[sym] *
				(extra + c->codes.lens.litlen[sym]);
		static_cost += c->freqs.litlen[sym] *
				(extra + c->static_codes.lens.litlen[sym]);
	}

	/* Account for the cost of encoding offsets.  All static offset
	 * codewords are 5 bits. */
	for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) {
		u32 extra = deflate_extra_offset_bits[sym];
		dynamic_cost += c->freqs.offset[sym] *
				(extra + c->codes.lens.offset[sym]);
		static_cost += c->freqs.offset[sym] * (extra + 5);
	}

	/* Compute the cost of using uncompressed blocks: padding to a byte
	 * boundary after the 3-bit block header, 32 bits of LEN/NLEN, an
	 * extra 40-bit header per additional 65535-byte chunk, plus the raw
	 * data itself. */
	uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 +
			     (40 * (DIV_ROUND_UP(block_length,
						 UINT16_MAX) - 1)) +
			     (8 * block_length);

	/* Choose the cheapest block type. */
	if (dynamic_cost < MIN(static_cost, uncompressed_cost)) {
		block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN;
		codes = &c->codes;
	} else if (static_cost < uncompressed_cost) {
		block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN;
		codes = &c->static_codes;
	} else {
		block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED;
	}

	/* Now actually output the block. */

	if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
		/* Note: the length being flushed may exceed the maximum length
		 * of an uncompressed block (65535 bytes).  Therefore, more than
		 * one uncompressed block might be needed. */
		deflate_write_uncompressed_blocks(os, block_begin, block_length,
						  is_final_block);
	} else {
		/* Output the block header. */
		deflate_write_block_header(os, is_final_block, block_type);

		/* Output the Huffman codes (dynamic Huffman blocks only). */
		if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN)
			deflate_write_huffman_header(c, os);

		/* Output the literals, matches, and end-of-block symbol. */
#if SUPPORT_NEAR_OPTIMAL_PARSING
		if (use_item_list)
			deflate_write_item_list(os, codes, c, block_length);
		else
#endif
			deflate_write_sequences(os, codes, c->p.g.sequences,
						block_begin);
		deflate_write_end_of_block(os, codes);
	}
}
1807
1808 static forceinline void
deflate_choose_literal(struct libdeflate_compressor * c,unsigned literal,u32 * litrunlen_p)1809 deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
1810 u32 *litrunlen_p)
1811 {
1812 c->freqs.litlen[literal]++;
1813 ++*litrunlen_p;
1814 }
1815
1816 static forceinline void
deflate_choose_match(struct libdeflate_compressor * c,unsigned length,unsigned offset,u32 * litrunlen_p,struct deflate_sequence ** next_seq_p)1817 deflate_choose_match(struct libdeflate_compressor *c,
1818 unsigned length, unsigned offset,
1819 u32 *litrunlen_p, struct deflate_sequence **next_seq_p)
1820 {
1821 struct deflate_sequence *seq = *next_seq_p;
1822 unsigned length_slot = deflate_length_slot[length];
1823 unsigned offset_slot = deflate_get_offset_slot(c, offset);
1824
1825 c->freqs.litlen[257 + length_slot]++;
1826 c->freqs.offset[offset_slot]++;
1827
1828 seq->litrunlen_and_length = ((u32)length << 23) | *litrunlen_p;
1829 seq->offset = offset;
1830 seq->length_slot = length_slot;
1831 seq->offset_symbol = offset_slot;
1832
1833 *litrunlen_p = 0;
1834 *next_seq_p = seq + 1;
1835 }
1836
/*
 * Terminate the sequence list for a block: write a final sequence whose match
 * length (the high 9 bits of 'litrunlen_and_length') is 0, carrying only the
 * length of the block's trailing literal run.
 */
static forceinline void
deflate_finish_sequence(struct deflate_sequence *seq, u32 litrunlen)
{
	seq->litrunlen_and_length = litrunlen; /* length = 0 */
}
1842
1843 /******************************************************************************/
1844
1845 /*
1846 * Block splitting algorithm. The problem is to decide when it is worthwhile to
1847 * start a new block with new Huffman codes. There is a theoretically optimal
1848 * solution: recursively consider every possible block split, considering the
1849 * exact cost of each block, and choose the minimum cost approach. But this is
1850 * far too slow. Instead, as an approximation, we can count symbols and after
1851 * every N symbols, compare the expected distribution of symbols based on the
1852 * previous data with the actual distribution. If they differ "by enough", then
1853 * start a new block.
1854 *
1855 * As an optimization and heuristic, we don't distinguish between every symbol
1856 * but rather we combine many symbols into a single "observation type". For
1857 * literals we only look at the high bits and low bits, and for matches we only
1858 * look at whether the match is long or not. The assumption is that for typical
1859 * "real" data, places that are good block boundaries will tend to be noticeable
1860 * based only on changes in these aggregate frequencies, without looking for
1861 * subtle differences in individual symbols. For example, a change from ASCII
1862 * bytes to non-ASCII bytes, or from few matches (generally less compressible)
1863 * to many matches (generally more compressible), would be easily noticed based
1864 * on the aggregates.
1865 *
1866 * For determining whether the frequency distributions are "different enough" to
1867  * start a new block, the simple heuristic of splitting when the sum of absolute
1868 * differences exceeds a constant seems to be good enough. We also add a number
1869 * proportional to the block length so that the algorithm is more likely to end
1870 * long blocks than short blocks. This reflects the general expectation that it
1871 * will become increasingly beneficial to start a new block as the current
1872 * block grows longer.
1873 *
1874 * Finally, for an approximation, it is not strictly necessary that the exact
1875 * symbols being used are considered. With "near-optimal parsing", for example,
1876 * the actual symbols that will be used are unknown until after the block
1877 * boundary is chosen and the block has been optimized. Since the final choices
1878 * cannot be used, we can use preliminary "greedy" choices instead.
1879 */
1880
1881 /* Initialize the block split statistics when starting a new block. */
1882 static void
init_block_split_stats(struct block_split_stats * stats)1883 init_block_split_stats(struct block_split_stats *stats)
1884 {
1885 int i;
1886
1887 for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1888 stats->new_observations[i] = 0;
1889 stats->observations[i] = 0;
1890 }
1891 stats->num_new_observations = 0;
1892 stats->num_observations = 0;
1893 }
1894
1895 /* Literal observation. Heuristic: use the top 2 bits and low 1 bits of the
1896 * literal, for 8 possible literal observation types. */
1897 static forceinline void
observe_literal(struct block_split_stats * stats,u8 lit)1898 observe_literal(struct block_split_stats *stats, u8 lit)
1899 {
1900 stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
1901 stats->num_new_observations++;
1902 }
1903
1904 /* Match observation. Heuristic: use one observation type for "short match" and
1905 * one observation type for "long match". */
1906 static forceinline void
observe_match(struct block_split_stats * stats,unsigned length)1907 observe_match(struct block_split_stats *stats, unsigned length)
1908 {
1909 stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
1910 stats->num_new_observations++;
1911 }
1912
/*
 * Decide whether to end the current block.  Compares the symbol distribution
 * of the newest batch of observations against the running distribution for
 * the rest of the block; returns true if they differ by enough.  Otherwise,
 * folds the new observations into the running totals and returns false.
 */
static bool
do_end_block_check(struct block_split_stats *stats, u32 block_length)
{
	int i;

	if (stats->num_observations > 0) {

		/* Note: to avoid slow divisions, we do not divide by
		 * 'num_observations', but rather do all math with the numbers
		 * multiplied by 'num_observations'. */
		u32 total_delta = 0;
		for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
			u32 expected = stats->observations[i] * stats->num_new_observations;
			u32 actual = stats->new_observations[i] * stats->num_observations;
			u32 delta = (actual > expected) ? actual - expected :
							  expected - actual;
			total_delta += delta;
		}

		/* Ready to end the block?  The second term grows with block
		 * length, making long blocks more likely to be ended. */
		if (total_delta + (block_length / 4096) * stats->num_observations >=
		    NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations)
			return true;
	}

	/* Not ending the block: merge the new observations into the running
	 * totals and reset the "new" counters. */
	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		stats->num_observations += stats->new_observations[i];
		stats->observations[i] += stats->new_observations[i];
		stats->new_observations[i] = 0;
	}
	stats->num_new_observations = 0;
	return false;
}
1946
1947 static forceinline bool
should_end_block(struct block_split_stats * stats,const u8 * in_block_begin,const u8 * in_next,const u8 * in_end)1948 should_end_block(struct block_split_stats *stats,
1949 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
1950 {
1951 /* Ready to check block split statistics? */
1952 if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
1953 in_next - in_block_begin < MIN_BLOCK_LENGTH ||
1954 in_end - in_next < MIN_BLOCK_LENGTH)
1955 return false;
1956
1957 return do_end_block_check(stats, in_next - in_block_begin);
1958 }
1959
1960 /******************************************************************************/
1961
1962 /*
1963 * This is the level 0 "compressor". It always outputs uncompressed blocks.
1964 */
1965 static size_t
deflate_compress_none(struct libdeflate_compressor * restrict c,const u8 * restrict in,size_t in_nbytes,u8 * restrict out,size_t out_nbytes_avail)1966 deflate_compress_none(struct libdeflate_compressor * restrict c,
1967 const u8 * restrict in, size_t in_nbytes,
1968 u8 * restrict out, size_t out_nbytes_avail)
1969 {
1970 struct deflate_output_bitstream os;
1971
1972 deflate_init_output(&os, out, out_nbytes_avail);
1973
1974 deflate_write_uncompressed_blocks(&os, in, in_nbytes, true);
1975
1976 return deflate_flush_output(&os);
1977 }
1978
/*
 * This is the "greedy" DEFLATE compressor.  It always chooses the longest
 * match found at each position, with no lookahead.
 */
static size_t
deflate_compress_greedy(struct libdeflate_compressor * restrict c,
			const u8 * restrict in, size_t in_nbytes,
			u8 * restrict out, size_t out_nbytes_avail)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};

	deflate_init_output(&os, out, out_nbytes_avail);
	hc_matchfinder_init(&c->p.g.hc_mf);

	do {
		/* Starting a new DEFLATE block. */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
		u32 litrunlen = 0;
		struct deflate_sequence *next_seq = c->p.g.sequences;

		init_block_split_stats(&c->split_stats);
		deflate_reset_symbol_frequencies(c);

		do {
			u32 length;
			u32 offset;

			/* Decrease the maximum and nice match lengths if we're
			 * approaching the end of the input buffer. */
			if (unlikely(max_len > in_end - in_next)) {
				max_len = in_end - in_next;
				nice_len = MIN(nice_len, max_len);
			}

			/* Find the longest match at the current position. */
			length = hc_matchfinder_longest_match(&c->p.g.hc_mf,
							      &in_cur_base,
							      in_next,
							      DEFLATE_MIN_MATCH_LEN - 1,
							      max_len,
							      nice_len,
							      c->max_search_depth,
							      next_hashes,
							      &offset);

			if (length >= DEFLATE_MIN_MATCH_LEN) {
				/* Match found.  Record it, then advance past
				 * the matched bytes (which are inserted into
				 * the matchfinder but not searched). */
				deflate_choose_match(c, length, offset,
						     &litrunlen, &next_seq);
				observe_match(&c->split_stats, length);
				in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
									&in_cur_base,
									in_next + 1,
									in_end,
									length - 1,
									next_hashes);
			} else {
				/* No match found.  Output a literal. */
				deflate_choose_literal(c, *in_next, &litrunlen);
				observe_literal(&c->split_stats, *in_next);
				in_next++;
			}

			/* Check if it's time to output another block. */
		} while (in_next < in_max_block_end &&
			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));

		deflate_finish_sequence(next_seq, litrunlen);
		deflate_flush_block(c, &os, in_block_begin,
				    in_next - in_block_begin,
				    in_next == in_end, false);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}
2061
/*
 * This is the "lazy" DEFLATE compressor.  Before choosing a match, it checks to
 * see if there's a longer match at the next position.  If yes, it outputs a
 * literal and continues to the next position.  If no, it outputs the match.
 */
static size_t
deflate_compress_lazy(struct libdeflate_compressor * restrict c,
		      const u8 * restrict in, size_t in_nbytes,
		      u8 * restrict out, size_t out_nbytes_avail)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};

	deflate_init_output(&os, out, out_nbytes_avail);
	hc_matchfinder_init(&c->p.g.hc_mf);

	do {
		/* Starting a new DEFLATE block. */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
		u32 litrunlen = 0;
		struct deflate_sequence *next_seq = c->p.g.sequences;

		init_block_split_stats(&c->split_stats);
		deflate_reset_symbol_frequencies(c);

		do {
			unsigned cur_len;
			unsigned cur_offset;
			unsigned next_len;
			unsigned next_offset;

			/* Decrease the maximum and nice match lengths if we're
			 * approaching the end of the input buffer. */
			if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
				max_len = in_end - in_next;
				nice_len = MIN(nice_len, max_len);
			}

			/* Find the longest match at the current position. */
			cur_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
							       &in_cur_base,
							       in_next,
							       DEFLATE_MIN_MATCH_LEN - 1,
							       max_len,
							       nice_len,
							       c->max_search_depth,
							       next_hashes,
							       &cur_offset);
			in_next += 1;

			if (cur_len < DEFLATE_MIN_MATCH_LEN) {
				/* No match found.  Choose a literal. */
				deflate_choose_literal(c, *(in_next - 1), &litrunlen);
				observe_literal(&c->split_stats, *(in_next - 1));
				continue;
			}

		have_cur_match:
			observe_match(&c->split_stats, cur_len);

			/* We have a match at the current position. */

			/* If the current match is very long, choose it
			 * immediately. */
			if (cur_len >= nice_len) {
				deflate_choose_match(c, cur_len, cur_offset,
						     &litrunlen, &next_seq);
				in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
									&in_cur_base,
									in_next,
									in_end,
									cur_len - 1,
									next_hashes);
				continue;
			}

			/*
			 * Try to find a match at the next position.
			 *
			 * Note: since we already have a match at the *current*
			 * position, we use only half the 'max_search_depth'
			 * when checking the *next* position.  This is a useful
			 * trade-off because it's more worthwhile to use a
			 * greater search depth on the initial match.
			 *
			 * Note: it's possible to structure the code such that
			 * there's only one call to longest_match(), which
			 * handles both the "find the initial match" and "try to
			 * find a longer match" cases.  However, it is faster to
			 * have two call sites, with longest_match() inlined at
			 * each.
			 */
			if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
				max_len = in_end - in_next;
				nice_len = MIN(nice_len, max_len);
			}
			next_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
								&in_cur_base,
								in_next,
								cur_len,
								max_len,
								nice_len,
								c->max_search_depth / 2,
								next_hashes,
								&next_offset);
			in_next += 1;

			if (next_len > cur_len) {
				/* Found a longer match at the next position.
				 * Output a literal.  Then the next match
				 * becomes the current match. */
				deflate_choose_literal(c, *(in_next - 2), &litrunlen);
				cur_len = next_len;
				cur_offset = next_offset;
				goto have_cur_match;
			}

			/* No longer match at the next position.  Output the
			 * current match.  Two of its positions were already
			 * consumed by the two searches, hence 'cur_len - 2'. */
			deflate_choose_match(c, cur_len, cur_offset,
					     &litrunlen, &next_seq);
			in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
								&in_cur_base,
								in_next,
								in_end,
								cur_len - 2,
								next_hashes);

			/* Check if it's time to output another block. */
		} while (in_next < in_max_block_end &&
			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));

		deflate_finish_sequence(next_seq, litrunlen);
		deflate_flush_block(c, &os, in_block_begin,
				    in_next - in_block_begin,
				    in_next == in_end, false);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}
2208
2209 #if SUPPORT_NEAR_OPTIMAL_PARSING
2210
2211 /*
2212 * Follow the minimum-cost path in the graph of possible match/literal choices
2213 * for the current block and compute the frequencies of the Huffman symbols that
2214 * would be needed to output those matches and literals.
2215 */
2216 static void
deflate_tally_item_list(struct libdeflate_compressor * c,u32 block_length)2217 deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
2218 {
2219 struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
2220 struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
2221 do {
2222 unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
2223 unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
2224
2225 if (length == 1) {
2226 /* Literal */
2227 c->freqs.litlen[offset]++;
2228 } else {
2229 /* Match */
2230 c->freqs.litlen[257 + deflate_length_slot[length]]++;
2231 c->freqs.offset[deflate_get_offset_slot(c, offset)]++;
2232 }
2233 cur_node += length;
2234 } while (cur_node != end_node);
2235 }
2236
2237 /* Set the current cost model from the codeword lengths specified in @lens. */
2238 static void
deflate_set_costs_from_codes(struct libdeflate_compressor * c,const struct deflate_lens * lens)2239 deflate_set_costs_from_codes(struct libdeflate_compressor *c,
2240 const struct deflate_lens *lens)
2241 {
2242 unsigned i;
2243
2244 /* Literals */
2245 for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
2246 u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS);
2247 c->p.n.costs.literal[i] = bits << COST_SHIFT;
2248 }
2249
2250 /* Lengths */
2251 for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
2252 unsigned length_slot = deflate_length_slot[i];
2253 unsigned litlen_sym = 257 + length_slot;
2254 u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
2255 bits += deflate_extra_length_bits[length_slot];
2256 c->p.n.costs.length[i] = bits << COST_SHIFT;
2257 }
2258
2259 /* Offset slots */
2260 for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
2261 u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS);
2262 bits += deflate_extra_offset_bits[i];
2263 c->p.n.costs.offset_slot[i] = bits << COST_SHIFT;
2264 }
2265 }
2266
/* Default cost of a literal, in units of 1/(1 << COST_SHIFT) bits.  With
 * COST_SHIFT == 3, the value 66 corresponds to 8.25 bits/symbol — slightly
 * above the uniform-distribution cost, to bias the parse towards matches. */
static forceinline u32
deflate_default_literal_cost(unsigned literal)
{
	STATIC_ASSERT(COST_SHIFT == 3);
	/* 66 is 8.25 bits/symbol */
	return 66;
}
2274
/* Default cost of a length slot: a flat 7.5 bits/symbol plus the slot's
 * extra bits, scaled by COST_SHIFT. */
static forceinline u32
deflate_default_length_slot_cost(unsigned length_slot)
{
	STATIC_ASSERT(COST_SHIFT == 3);
	/* 60 is 7.5 bits/symbol */
	return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT);
}
2282
/* Default cost of an offset slot: a flat 4.875 bits/symbol plus the slot's
 * extra bits, scaled by COST_SHIFT. */
static forceinline u32
deflate_default_offset_slot_cost(unsigned offset_slot)
{
	STATIC_ASSERT(COST_SHIFT == 3);
	/* 39 is 4.875 bits/symbol */
	return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT);
}
2290
2291 /*
2292 * Set default symbol costs for the first block's first optimization pass.
2293 *
2294 * It works well to assume that each symbol is equally probable. This results
2295 * in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 <<
2296 * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding
2297 * alphabet. However, we intentionally bias the parse towards matches rather
2298 * than literals by using a slightly lower default cost for length symbols than
2299 * for literals. This often improves the compression ratio slightly.
2300 */
2301 static void
deflate_set_default_costs(struct libdeflate_compressor * c)2302 deflate_set_default_costs(struct libdeflate_compressor *c)
2303 {
2304 unsigned i;
2305
2306 /* Literals */
2307 for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
2308 c->p.n.costs.literal[i] = deflate_default_literal_cost(i);
2309
2310 /* Lengths */
2311 for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
2312 c->p.n.costs.length[i] = deflate_default_length_slot_cost(
2313 deflate_length_slot[i]);
2314
2315 /* Offset slots */
2316 for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
2317 c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i);
2318 }
2319
2320 static forceinline void
deflate_adjust_cost(u32 * cost_p,u32 default_cost)2321 deflate_adjust_cost(u32 *cost_p, u32 default_cost)
2322 {
2323 *cost_p += ((s32)default_cost - (s32)*cost_p) >> 1;
2324 }
2325
2326 /*
2327 * Adjust the costs when beginning a new block.
2328 *
2329 * Since the current costs have been optimized for the data, it's undesirable to
2330 * throw them away and start over with the default costs. At the same time, we
2331 * don't want to bias the parse by assuming that the next block will be similar
2332 * to the current block. As a compromise, make the costs closer to the
2333 * defaults, but don't simply set them to the defaults.
2334 */
2335 static void
deflate_adjust_costs(struct libdeflate_compressor * c)2336 deflate_adjust_costs(struct libdeflate_compressor *c)
2337 {
2338 unsigned i;
2339
2340 /* Literals */
2341 for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
2342 deflate_adjust_cost(&c->p.n.costs.literal[i],
2343 deflate_default_literal_cost(i));
2344
2345 /* Lengths */
2346 for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
2347 deflate_adjust_cost(&c->p.n.costs.length[i],
2348 deflate_default_length_slot_cost(
2349 deflate_length_slot[i]));
2350
2351 /* Offset slots */
2352 for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
2353 deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
2354 deflate_default_offset_slot_cost(i));
2355 }
2356
/*
 * Find the minimum-cost path through the graph of possible match/literal
 * choices for this block.
 *
 * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which
 * represents the node at the beginning of the block, to
 * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of
 * the block.  Edge costs are evaluated using the cost model 'c->p.n.costs'.
 *
 * The algorithm works backwards, starting at the end node and proceeding
 * backwards one node at a time.  At each node, the minimum cost to reach the
 * end node is computed and the match/literal choice that begins that path is
 * saved.
 *
 * @cache_ptr points just past the match cache entries for this block; the
 * cache is consumed backwards in step with the nodes.
 */
static void
deflate_find_min_cost_path(struct libdeflate_compressor *c,
			   const u32 block_length,
			   const struct lz_match *cache_ptr)
{
	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
	struct deflate_optimum_node *cur_node = end_node;

	cur_node->cost_to_end = 0;
	do {
		unsigned num_matches;
		unsigned literal;
		u32 best_cost_to_end;

		cur_node--;
		cache_ptr--;

		/* The trailing cache entry for each position holds the match
		 * count in 'length' and the literal byte in 'offset'. */
		num_matches = cache_ptr->length;
		literal = cache_ptr->offset;

		/* It's always possible to choose a literal. */
		best_cost_to_end = c->p.n.costs.literal[literal] +
				   (cur_node + 1)->cost_to_end;
		cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;

		/* Also consider matches if there are any. */
		if (num_matches) {
			const struct lz_match *match;
			unsigned len;
			unsigned offset;
			unsigned offset_slot;
			u32 offset_cost;
			u32 cost_to_end;

			/*
			 * Consider each length from the minimum
			 * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
			 * match found at this position.  For each length, we
			 * consider only the smallest offset for which that
			 * length is available.  Although this is not guaranteed
			 * to be optimal due to the possibility of a larger
			 * offset costing less than a smaller offset to code,
			 * this is a very useful heuristic.
			 */
			match = cache_ptr - num_matches;
			len = DEFLATE_MIN_MATCH_LEN;
			do {
				offset = match->offset;
				offset_slot = deflate_get_offset_slot(c, offset);
				offset_cost = c->p.n.costs.offset_slot[offset_slot];
				do {
					cost_to_end = offset_cost +
						      c->p.n.costs.length[len] +
						      (cur_node + len)->cost_to_end;
					if (cost_to_end < best_cost_to_end) {
						best_cost_to_end = cost_to_end;
						cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len;
					}
				} while (++len <= match->length);
			} while (++match != cache_ptr);
			cache_ptr -= num_matches;
		}
		cur_node->cost_to_end = best_cost_to_end;
	} while (cur_node != &c->p.n.optimum_nodes[0]);
}
2436
/*
 * Choose the literal/match sequence to use for the current block.  The basic
 * algorithm finds a minimum-cost path through the block's graph of
 * literal/match choices, given a cost model.  However, the cost of each symbol
 * is unknown until the Huffman codes have been built, but at the same time the
 * Huffman codes depend on the frequencies of chosen symbols.  Consequently,
 * multiple passes must be used to try to approximate an optimal solution.  The
 * first pass uses default costs, mixed with the costs from the previous block
 * if any.  Later passes use the Huffman codeword lengths from the previous pass
 * as the costs.
 */
static void
deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
		       const struct lz_match *cache_ptr, bool is_first_block)
{
	unsigned num_passes_remaining = c->p.n.num_optim_passes;
	u32 i;

	/* Force the block to really end at the desired length, even if some
	 * matches extend beyond it, by giving the out-of-range nodes a
	 * prohibitively large cost-to-end. */
	for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
					ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
		c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;

	/* Set the initial costs. */
	if (is_first_block)
		deflate_set_default_costs(c);
	else
		deflate_adjust_costs(c);

	for (;;) {
		/* Find the minimum cost path for this pass. */
		deflate_find_min_cost_path(c, block_length, cache_ptr);

		/* Compute frequencies of the chosen symbols. */
		deflate_reset_symbol_frequencies(c);
		deflate_tally_item_list(c, block_length);

		if (--num_passes_remaining == 0)
			break;

		/* At least one optimization pass remains; update the costs. */
		deflate_make_huffman_codes(&c->freqs, &c->codes);
		deflate_set_costs_from_codes(c, &c->codes.lens);
	}
}
2483
2484 /*
2485 * This is the "near-optimal" DEFLATE compressor. It computes the optimal
2486 * representation of each DEFLATE block using a minimum-cost path search over
2487 * the graph of possible match/literal choices for that block, assuming a
2488 * certain cost for each Huffman symbol.
2489 *
2490 * For several reasons, the end result is not guaranteed to be optimal:
2491 *
2492 * - Nonoptimal choice of blocks
2493 * - Heuristic limitations on which matches are actually considered
2494 * - Symbol costs are unknown until the symbols have already been chosen
2495 * (so iterative optimization must be used)
2496 */
2497 static size_t
deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,const u8 * restrict in,size_t in_nbytes,u8 * restrict out,size_t out_nbytes_avail)2498 deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
2499 const u8 * restrict in, size_t in_nbytes,
2500 u8 * restrict out, size_t out_nbytes_avail)
2501 {
2502 const u8 *in_next = in;
2503 const u8 *in_end = in_next + in_nbytes;
2504 struct deflate_output_bitstream os;
2505 const u8 *in_cur_base = in_next;
2506 const u8 *in_next_slide = in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
2507 unsigned max_len = DEFLATE_MAX_MATCH_LEN;
2508 unsigned nice_len = MIN(c->nice_match_length, max_len);
2509 u32 next_hashes[2] = {0, 0};
2510
2511 deflate_init_output(&os, out, out_nbytes_avail);
2512 bt_matchfinder_init(&c->p.n.bt_mf);
2513
2514 do {
2515 /* Starting a new DEFLATE block. */
2516
2517 struct lz_match *cache_ptr = c->p.n.match_cache;
2518 const u8 * const in_block_begin = in_next;
2519 const u8 * const in_max_block_end =
2520 in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
2521 const u8 *next_observation = in_next;
2522
2523 init_block_split_stats(&c->split_stats);
2524
2525 /*
2526 * Find matches until we decide to end the block. We end the
2527 * block if any of the following is true:
2528 *
2529 * (1) Maximum block length has been reached
2530 * (2) Match catch may overflow.
2531 * (3) Block split heuristic says to split now.
2532 */
2533 do {
2534 struct lz_match *matches;
2535 unsigned best_len;
2536
2537 /* Slide the window forward if needed. */
2538 if (in_next == in_next_slide) {
2539 bt_matchfinder_slide_window(&c->p.n.bt_mf);
2540 in_cur_base = in_next;
2541 in_next_slide = in_next + MIN(in_end - in_next,
2542 MATCHFINDER_WINDOW_SIZE);
2543 }
2544
2545 /* Decrease the maximum and nice match lengths if we're
2546 * approaching the end of the input buffer. */
2547 if (unlikely(max_len > in_end - in_next)) {
2548 max_len = in_end - in_next;
2549 nice_len = MIN(nice_len, max_len);
2550 }
2551
2552 /*
2553 * Find matches with the current position using the
2554 * binary tree matchfinder and save them in
2555 * 'match_cache'.
2556 *
2557 * Note: the binary tree matchfinder is more suited for
2558 * optimal parsing than the hash chain matchfinder. The
2559 * reasons for this include:
2560 *
2561 * - The binary tree matchfinder can find more matches
2562 * in the same number of steps.
2563 * - One of the major advantages of hash chains is that
2564 * skipping positions (not searching for matches at
2565 * them) is faster; however, with optimal parsing we
2566 * search for matches at almost all positions, so this
2567 * advantage of hash chains is negated.
2568 */
2569 matches = cache_ptr;
2570 best_len = 0;
2571 if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
2572 cache_ptr = bt_matchfinder_get_matches(&c->p.n.bt_mf,
2573 in_cur_base,
2574 in_next - in_cur_base,
2575 max_len,
2576 nice_len,
2577 c->max_search_depth,
2578 next_hashes,
2579 &best_len,
2580 matches);
2581 }
2582
2583 if (in_next >= next_observation) {
2584 if (best_len >= 4) {
2585 observe_match(&c->split_stats, best_len);
2586 next_observation = in_next + best_len;
2587 } else {
2588 observe_literal(&c->split_stats, *in_next);
2589 next_observation = in_next + 1;
2590 }
2591 }
2592
2593 cache_ptr->length = cache_ptr - matches;
2594 cache_ptr->offset = *in_next;
2595 in_next++;
2596 cache_ptr++;
2597
2598 /*
2599 * If there was a very long match found, don't cache any
2600 * matches for the bytes covered by that match. This
2601 * avoids degenerate behavior when compressing highly
2602 * redundant data, where the number of matches can be
2603 * very large.
2604 *
2605 * This heuristic doesn't actually hurt the compression
2606 * ratio very much. If there's a long match, then the
2607 * data must be highly compressible, so it doesn't
2608 * matter much what we do.
2609 */
2610 if (best_len >= DEFLATE_MIN_MATCH_LEN && best_len >= nice_len) {
2611 --best_len;
2612 do {
2613 if (in_next == in_next_slide) {
2614 bt_matchfinder_slide_window(&c->p.n.bt_mf);
2615 in_cur_base = in_next;
2616 in_next_slide = in_next + MIN(in_end - in_next,
2617 MATCHFINDER_WINDOW_SIZE);
2618 }
2619 if (unlikely(max_len > in_end - in_next)) {
2620 max_len = in_end - in_next;
2621 nice_len = MIN(nice_len, max_len);
2622 }
2623 if (max_len >= BT_MATCHFINDER_REQUIRED_NBYTES) {
2624 bt_matchfinder_skip_position(&c->p.n.bt_mf,
2625 in_cur_base,
2626 in_next - in_cur_base,
2627 nice_len,
2628 c->max_search_depth,
2629 next_hashes);
2630 }
2631 cache_ptr->length = 0;
2632 cache_ptr->offset = *in_next;
2633 in_next++;
2634 cache_ptr++;
2635 } while (--best_len);
2636 }
2637 } while (in_next < in_max_block_end &&
2638 cache_ptr < &c->p.n.match_cache[CACHE_LENGTH] &&
2639 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
2640
2641 /* All the matches for this block have been cached. Now choose
2642 * the sequence of items to output and flush the block. */
2643 deflate_optimize_block(c, in_next - in_block_begin, cache_ptr,
2644 in_block_begin == in);
2645 deflate_flush_block(c, &os, in_block_begin, in_next - in_block_begin,
2646 in_next == in_end, true);
2647 } while (in_next != in_end);
2648
2649 return deflate_flush_output(&os);
2650 }
2651
2652 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
2653
2654 /* Initialize c->offset_slot_fast. */
2655 static void
deflate_init_offset_slot_fast(struct libdeflate_compressor * c)2656 deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
2657 {
2658 unsigned offset_slot;
2659 unsigned offset;
2660 unsigned offset_end;
2661
2662 for (offset_slot = 0;
2663 offset_slot < ARRAY_LEN(deflate_offset_slot_base);
2664 offset_slot++)
2665 {
2666 offset = deflate_offset_slot_base[offset_slot];
2667 #if USE_FULL_OFFSET_SLOT_FAST
2668 offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
2669 do {
2670 c->offset_slot_fast[offset] = offset_slot;
2671 } while (++offset != offset_end);
2672 #else
2673 if (offset <= 256) {
2674 offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
2675 do {
2676 c->offset_slot_fast[offset - 1] = offset_slot;
2677 } while (++offset != offset_end);
2678 } else {
2679 offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
2680 do {
2681 c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot;
2682 } while ((offset += (1 << 7)) != offset_end);
2683 }
2684 #endif
2685 }
2686 }
2687
2688 LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI
libdeflate_alloc_compressor(int compression_level)2689 libdeflate_alloc_compressor(int compression_level)
2690 {
2691 struct libdeflate_compressor *c;
2692 size_t size = offsetof(struct libdeflate_compressor, p);
2693
2694 if (compression_level < 0 || compression_level > 12)
2695 return NULL;
2696
2697 #if SUPPORT_NEAR_OPTIMAL_PARSING
2698 if (compression_level >= 8)
2699 size += sizeof(c->p.n);
2700 else if (compression_level >= 1)
2701 size += sizeof(c->p.g);
2702 #else
2703 if (compression_level >= 1)
2704 size += sizeof(c->p.g);
2705 #endif
2706
2707 c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size);
2708 if (!c)
2709 return NULL;
2710
2711 c->compression_level = compression_level;
2712
2713 /*
2714 * The higher the compression level, the more we should bother trying to
2715 * compress very small inputs.
2716 */
2717 c->min_size_to_compress = 56 - (compression_level * 4);
2718
2719 switch (compression_level) {
2720 case 0:
2721 c->impl = deflate_compress_none;
2722 break;
2723 case 1:
2724 c->impl = deflate_compress_greedy;
2725 c->max_search_depth = 2;
2726 c->nice_match_length = 8;
2727 break;
2728 case 2:
2729 c->impl = deflate_compress_greedy;
2730 c->max_search_depth = 6;
2731 c->nice_match_length = 10;
2732 break;
2733 case 3:
2734 c->impl = deflate_compress_greedy;
2735 c->max_search_depth = 12;
2736 c->nice_match_length = 14;
2737 break;
2738 case 4:
2739 c->impl = deflate_compress_greedy;
2740 c->max_search_depth = 24;
2741 c->nice_match_length = 24;
2742 break;
2743 case 5:
2744 c->impl = deflate_compress_lazy;
2745 c->max_search_depth = 20;
2746 c->nice_match_length = 30;
2747 break;
2748 case 6:
2749 c->impl = deflate_compress_lazy;
2750 c->max_search_depth = 40;
2751 c->nice_match_length = 65;
2752 break;
2753 case 7:
2754 c->impl = deflate_compress_lazy;
2755 c->max_search_depth = 100;
2756 c->nice_match_length = 130;
2757 break;
2758 #if SUPPORT_NEAR_OPTIMAL_PARSING
2759 case 8:
2760 c->impl = deflate_compress_near_optimal;
2761 c->max_search_depth = 12;
2762 c->nice_match_length = 20;
2763 c->p.n.num_optim_passes = 1;
2764 break;
2765 case 9:
2766 c->impl = deflate_compress_near_optimal;
2767 c->max_search_depth = 16;
2768 c->nice_match_length = 26;
2769 c->p.n.num_optim_passes = 2;
2770 break;
2771 case 10:
2772 c->impl = deflate_compress_near_optimal;
2773 c->max_search_depth = 30;
2774 c->nice_match_length = 50;
2775 c->p.n.num_optim_passes = 2;
2776 break;
2777 case 11:
2778 c->impl = deflate_compress_near_optimal;
2779 c->max_search_depth = 60;
2780 c->nice_match_length = 80;
2781 c->p.n.num_optim_passes = 3;
2782 break;
2783 default:
2784 c->impl = deflate_compress_near_optimal;
2785 c->max_search_depth = 100;
2786 c->nice_match_length = 133;
2787 c->p.n.num_optim_passes = 4;
2788 break;
2789 #else
2790 case 8:
2791 c->impl = deflate_compress_lazy;
2792 c->max_search_depth = 150;
2793 c->nice_match_length = 200;
2794 break;
2795 default:
2796 c->impl = deflate_compress_lazy;
2797 c->max_search_depth = 200;
2798 c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
2799 break;
2800 #endif
2801 }
2802
2803 deflate_init_offset_slot_fast(c);
2804 deflate_init_static_codes(c);
2805
2806 return c;
2807 }
2808
2809 LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_deflate_compress(struct libdeflate_compressor * c,const void * in,size_t in_nbytes,void * out,size_t out_nbytes_avail)2810 libdeflate_deflate_compress(struct libdeflate_compressor *c,
2811 const void *in, size_t in_nbytes,
2812 void *out, size_t out_nbytes_avail)
2813 {
2814 if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING))
2815 return 0;
2816
2817 /* For extremely small inputs just use a single uncompressed block. */
2818 if (unlikely(in_nbytes < c->min_size_to_compress)) {
2819 struct deflate_output_bitstream os;
2820 deflate_init_output(&os, out, out_nbytes_avail);
2821 if (in_nbytes == 0)
2822 in = &os; /* Avoid passing NULL to memcpy() */
2823 deflate_write_uncompressed_block(&os, in, in_nbytes, true);
2824 return deflate_flush_output(&os);
2825 }
2826
2827 return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail);
2828 }
2829
/*
 * Free a compressor allocated with libdeflate_alloc_compressor().  The memory
 * was obtained via libdeflate_aligned_malloc(), so it must be released with
 * libdeflate_aligned_free().
 * NOTE(review): presumably safe to call with c == NULL if
 * libdeflate_aligned_free() tolerates NULL -- confirm against its definition.
 */
LIBDEFLATEEXPORT void LIBDEFLATEAPI
libdeflate_free_compressor(struct libdeflate_compressor *c)
{
	libdeflate_aligned_free(c);
}
2835
/*
 * Return the compression level (0-12) with which the compressor 'c' was
 * allocated (stored by libdeflate_alloc_compressor()).
 */
unsigned int
deflate_get_compression_level(struct libdeflate_compressor *c)
{
	return c->compression_level;
}
2841
2842 LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_deflate_compress_bound(struct libdeflate_compressor * c,size_t in_nbytes)2843 libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
2844 size_t in_nbytes)
2845 {
2846 /*
2847 * The worst case is all uncompressed blocks where one block has length
2848 * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH.
2849 * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE,
2850 * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN.
2851 */
2852 size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
2853 return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING;
2854 }
2855