1 /*
2 * deflate_compress.c - a compressor for DEFLATE
3 *
4 * Originally public domain; changes after 2016-09-07 are copyrighted.
5 *
6 * Copyright 2016 Eric Biggers
7 *
8 * Permission is hereby granted, free of charge, to any person
9 * obtaining a copy of this software and associated documentation
10 * files (the "Software"), to deal in the Software without
11 * restriction, including without limitation the rights to use,
12 * copy, modify, merge, publish, distribute, sublicense, and/or sell
13 * copies of the Software, and to permit persons to whom the
14 * Software is furnished to do so, subject to the following
15 * conditions:
16 *
17 * The above copyright notice and this permission notice shall be
18 * included in all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 * OTHER DEALINGS IN THE SOFTWARE.
28 */
29
30 #include <stdlib.h>
31 #include <string.h>
32
33 #include "aligned_malloc.h"
34 #include "deflate_compress.h"
35 #include "deflate_constants.h"
36 #include "unaligned.h"
37
38 #include "libdeflate.h"
39
40 /*
41 * By default, the near-optimal parsing algorithm is enabled at compression
42 * level 8 and above. The near-optimal parsing algorithm produces a compression
43 * ratio significantly better than the greedy and lazy algorithms implemented
44 * here, and also the algorithm used by zlib at level 9. However, it is slow.
45 */
46 #define SUPPORT_NEAR_OPTIMAL_PARSING 1
47
48 /*
49 * Define to 1 to maintain the full map from match offsets to offset slots.
50 * This slightly speeds up translations of match offsets to offset slots, but it
51 * uses 32769 bytes of memory rather than the 512 bytes used by the condensed
52 * map. The speedup provided by the larger map is most helpful when the
53 * near-optimal parsing algorithm is being used.
54 */
55 #define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING
56
57 /*
58 * DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters
59 * appropriately.
60 */
61 #define MATCHFINDER_WINDOW_ORDER 15
62
63 #include "hc_matchfinder.h"
64 #if SUPPORT_NEAR_OPTIMAL_PARSING
65 # include "bt_matchfinder.h"
66 #endif
67
68 /*
69 * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes,
70 * except if the last block has to be shorter.
71 */
72 #define MIN_BLOCK_LENGTH 10000
73
74 /*
75 * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but
76 * the final length might be slightly longer due to matches extending beyond
77 * this limit.
78 */
79 #define SOFT_MAX_BLOCK_LENGTH 300000
80
81 /*
82 * The number of observed matches or literals that represents sufficient data to
83 * decide whether the current block should be terminated or not.
84 */
85 #define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512
86
87
88 #if SUPPORT_NEAR_OPTIMAL_PARSING
89 /* Constants specific to the near-optimal parsing algorithm */
90
91 /*
92 * The maximum number of matches the matchfinder can find at a single position.
93 * Since the matchfinder never finds more than one match for the same length,
94 * presuming one of each possible length is sufficient for an upper bound.
95 * (This says nothing about whether it is worthwhile to consider so many
96 * matches; this is just defining the worst case.)
97 */
98 # define MAX_MATCHES_PER_POS (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
99
100 /*
101 * The number of lz_match structures in the match cache, excluding the extra
 * "overflow" entries.  This value should be high enough so that nearly all the
103 * time, all matches found in a given block can fit in the match cache.
104 * However, fallback behavior (immediately terminating the block) on cache
105 * overflow is still required.
106 */
107 # define CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5)
108
109 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
110
111 /*
112 * These are the compressor-side limits on the codeword lengths for each Huffman
113 * code. To make outputting bits slightly faster, some of these limits are
114 * lower than the limits defined by the DEFLATE format. This does not
115 * significantly affect the compression ratio, at least for the block lengths we
116 * use.
117 */
118 #define MAX_LITLEN_CODEWORD_LEN 14
119 #define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN
120 #define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN
121
/* Table: length slot => smallest match length belonging to that slot */
static const unsigned deflate_length_slot_base[] = {
	3,   4,   5,   6,   7,   8,   9,   10,
	11,  13,  15,  17,  19,  23,  27,  31,
	35,  43,  51,  59,  67,  83,  99,  115,
	131, 163, 195, 227, 258,
};
129
130 /* Table: length slot => number of extra length bits */
131 static const u8 deflate_extra_length_bits[] = {
132 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
133 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 ,
134 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 ,
135 5 , 5 , 5 , 5 , 0 ,
136 };
137
/* Table: offset slot => smallest match offset belonging to that slot */
static const unsigned deflate_offset_slot_base[] = {
	1,    2,    3,    4,     5,     7,     9,     13,
	17,   25,   33,   49,    65,    97,    129,   193,
	257,  385,  513,  769,   1025,  1537,  2049,  3073,
	4097, 6145, 8193, 12289, 16385, 24577,
};
145
146 /* Table: offset slot => number of extra offset bits */
147 static const u8 deflate_extra_offset_bits[] = {
148 0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 ,
149 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 ,
150 7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 ,
151 11 , 11 , 12 , 12 , 13 , 13 ,
152 };
153
/* Table: match length => length slot.  The array is indexed directly by the
 * match length (3..258); entries 0..2 are unused since DEFLATE's minimum
 * match length is 3. */
static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
	0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12,
	12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16,
	16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
	18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
	20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
	21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
	22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
	23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
	27, 27, 28,
};
172
173 /* The order in which precode codeword lengths are stored */
174 static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
175 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
176 };
177
178 /* Codewords for the DEFLATE Huffman codes. */
179 struct deflate_codewords {
180 u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
181 u32 offset[DEFLATE_NUM_OFFSET_SYMS];
182 };
183
184 /* Codeword lengths (in bits) for the DEFLATE Huffman codes.
185 * A zero length means the corresponding symbol had zero frequency. */
186 struct deflate_lens {
187 u8 litlen[DEFLATE_NUM_LITLEN_SYMS];
188 u8 offset[DEFLATE_NUM_OFFSET_SYMS];
189 };
190
191 /* Codewords and lengths for the DEFLATE Huffman codes. */
192 struct deflate_codes {
193 struct deflate_codewords codewords;
194 struct deflate_lens lens;
195 };
196
197 /* Symbol frequency counters for the DEFLATE Huffman codes. */
198 struct deflate_freqs {
199 u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
200 u32 offset[DEFLATE_NUM_OFFSET_SYMS];
201 };
202
203 #if SUPPORT_NEAR_OPTIMAL_PARSING
204
205 /* Costs for the near-optimal parsing algorithm. */
206 struct deflate_costs {
207
208 /* The cost to output each possible literal. */
209 u32 literal[DEFLATE_NUM_LITERALS];
210
211 /* The cost to output each possible match length. */
212 u32 length[DEFLATE_MAX_MATCH_LEN + 1];
213
214 /* The cost to output a match offset of each possible offset slot. */
215 u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
216 };
217
218 /*
219 * COST_SHIFT is a scaling factor that makes it possible to consider fractional
220 * bit costs. A token requiring 'n' bits to represent has cost n << COST_SHIFT.
221 *
222 * Note: this is only useful as a statistical trick for when the true costs are
223 * unknown. In reality, each token in DEFLATE requires a whole number of bits
224 * to output.
225 */
226 #define COST_SHIFT 3
227
228 /*
229 * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
230 * be needed to output a symbol that was unused in the previous optimization
231 * pass. Assigning a default cost allows the symbol to be used in the next
232 * optimization pass. However, the cost should be relatively high because the
233 * symbol probably won't be used very many times (if at all).
234 */
235 #define LITERAL_NOSTAT_BITS 13
236 #define LENGTH_NOSTAT_BITS 13
237 #define OFFSET_NOSTAT_BITS 10
238
239 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
240
241 /*
242 * Represents a run of literals followed by a match or end-of-block. This
243 * struct is needed to temporarily store items chosen by the parser, since items
244 * cannot be written until all items for the block have been chosen and the
245 * block's Huffman codes have been computed.
246 */
247 struct deflate_sequence {
248
249 /* Bits 0..22: the number of literals in this run. This may be 0 and
250 * can be at most about SOFT_MAX_BLOCK_LENGTH. The literals are not
251 * stored explicitly in this structure; instead, they are read directly
252 * from the uncompressed data.
253 *
254 * Bits 23..31: the length of the match which follows the literals, or 0
255 * if this literal run was the last in the block, so there is no match
256 * which follows it. */
257 u32 litrunlen_and_length;
258
259 /* If 'length' doesn't indicate end-of-block, then this is the offset of
260 * the match which follows the literals. */
261 u16 offset;
262
263 /* If 'length' doesn't indicate end-of-block, then this is the offset
264 * symbol of the match which follows the literals. */
265 u8 offset_symbol;
266
267 /* If 'length' doesn't indicate end-of-block, then this is the length
268 * slot of the match which follows the literals. */
269 u8 length_slot;
270 };
271
272 #if SUPPORT_NEAR_OPTIMAL_PARSING
273
274 /*
275 * This structure represents a byte position in the input data and a node in the
276 * graph of possible match/literal choices for the current block.
277 *
278 * Logically, each incoming edge to this node is labeled with a literal or a
279 * match that can be taken to reach this position from an earlier position; and
280 * each outgoing edge from this node is labeled with a literal or a match that
281 * can be taken to advance from this position to a later position.
282 *
283 * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we
284 * associate with each node just two pieces of information:
285 *
286 * 'cost_to_end' is the minimum cost to reach the end of the block from
287 * this position.
288 *
289 * 'item' represents the literal or match that must be chosen from here to
290 * reach the end of the block with the minimum cost. Equivalently, this
291 * can be interpreted as the label of the outgoing edge on the minimum-cost
292 * path to the "end of block" node from this node.
293 */
294 struct deflate_optimum_node {
295
296 u32 cost_to_end;
297
298 /*
299 * Notes on the match/literal representation used here:
300 *
301 * The low bits of 'item' are the length: 1 if this is a literal,
302 * or the match length if this is a match.
303 *
304 * The high bits of 'item' are the actual literal byte if this is a
305 * literal, or the match offset if this is a match.
306 */
307 #define OPTIMUM_OFFSET_SHIFT 9
308 #define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
309 u32 item;
310
311 };
312
313 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
314
315 /* Block split statistics. See "Block splitting algorithm" below. */
316 #define NUM_LITERAL_OBSERVATION_TYPES 8
317 #define NUM_MATCH_OBSERVATION_TYPES 2
318 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES)
319 struct block_split_stats {
320 u32 new_observations[NUM_OBSERVATION_TYPES];
321 u32 observations[NUM_OBSERVATION_TYPES];
322 u32 num_new_observations;
323 u32 num_observations;
324 };
325
/* The main DEFLATE compressor structure */
struct libdeflate_compressor {

	/* Pointer to the compress() implementation chosen at allocation time,
	 * based on the requested compression level */
	size_t (*impl)(struct libdeflate_compressor *,
		       const u8 *, size_t, u8 *, size_t);

	/* Symbol frequency counters for the current block */
	struct deflate_freqs freqs;

	/* Dynamic Huffman codes for the current block */
	struct deflate_codes codes;

	/* Static Huffman codes, as defined by the DEFLATE format */
	struct deflate_codes static_codes;

	/* Block split statistics for the currently pending block */
	struct block_split_stats split_stats;

	/* A table for fast lookups of offset slot by match offset.
	 *
	 * If the full table is being used, it is a direct mapping from offset
	 * to offset slot.
	 *
	 * If the condensed table is being used, the first 256 entries map
	 * directly to the offset slots of offsets 1 through 256.  The next 256
	 * entries map to the offset slots for the remaining offsets, stepping
	 * through the offsets with a stride of 128.  This relies on the fact
	 * that each of the remaining offset slots contains at least 128 offsets
	 * and has an offset base that is a multiple of 128. */
#if USE_FULL_OFFSET_SLOT_FAST
	u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1];
#else
	u8 offset_slot_fast[512];
#endif

	/* The "nice" match length: if a match of this length is found, choose
	 * it immediately without further consideration. */
	unsigned nice_match_length;

	/* The maximum search depth: consider at most this many potential
	 * matches at each position. */
	unsigned max_search_depth;

	/* The compression level with which this compressor was created */
	unsigned compression_level;

	/* Temporary space for Huffman code output (precode = the code used to
	 * transmit the other codeword lengths) */
	u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS];
	u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
	u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS];
	unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS];
	unsigned num_litlen_syms;
	unsigned num_offset_syms;
	unsigned num_explicit_lens;
	unsigned num_precode_items;

	/* Parser-specific state; only one member is live, depending on which
	 * impl was selected */
	union {
		/* Data for greedy or lazy parsing */
		struct {
			/* Hash chain matchfinder */
			struct hc_matchfinder hc_mf;

			/* The matches and literals that the parser has chosen
			 * for the current block.  The required length of this
			 * array is limited by the maximum number of matches
			 * that can ever be chosen for a single block, plus one
			 * for the special entry at the end. */
			struct deflate_sequence sequences[
				DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
					     DEFLATE_MIN_MATCH_LEN) + 1];
		} g; /* (g)reedy */

#if SUPPORT_NEAR_OPTIMAL_PARSING
		/* Data for near-optimal parsing */
		struct {

			/* Binary tree matchfinder */
			struct bt_matchfinder bt_mf;

			/*
			 * Cached matches for the current block.  This array
			 * contains the matches that were found at each position
			 * in the block.  Specifically, for each position, there
			 * is a list of matches found at that position, if any,
			 * sorted by strictly increasing length.  In addition,
			 * following the matches for each position, there is a
			 * special 'struct lz_match' whose 'length' member
			 * contains the number of matches found at that
			 * position, and whose 'offset' member contains the
			 * literal at that position.
			 *
			 * Note: in rare cases, there will be a very high number
			 * of matches in the block and this array will overflow.
			 * If this happens, we force the end of the current
			 * block.  CACHE_LENGTH is the length at which we
			 * actually check for overflow.  The extra slots beyond
			 * this are enough to absorb the worst case overflow,
			 * which occurs if starting at &match_cache[CACHE_LENGTH
			 * - 1], we write MAX_MATCHES_PER_POS matches and a
			 * match count header, then skip searching for matches
			 * at 'DEFLATE_MAX_MATCH_LEN - 1' positions and write
			 * the match count header for each.
			 */
			struct lz_match match_cache[CACHE_LENGTH +
						    MAX_MATCHES_PER_POS +
						    DEFLATE_MAX_MATCH_LEN - 1];

			/*
			 * Array of nodes, one per position, for running the
			 * minimum-cost path algorithm.
			 *
			 * This array must be large enough to accommodate the
			 * worst-case number of nodes, which occurs if we find a
			 * match of length DEFLATE_MAX_MATCH_LEN at position
			 * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of
			 * length SOFT_MAX_BLOCK_LENGTH - 1 +
			 * DEFLATE_MAX_MATCH_LEN.  Add one for the end-of-block
			 * node.
			 */
			struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 +
								  DEFLATE_MAX_MATCH_LEN + 1];

			/* The current cost model being used */
			struct deflate_costs costs;

			/* Number of optimization passes to run per block */
			unsigned num_optim_passes;
		} n; /* (n)ear-optimal */
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

	} p; /* (p)arser */
};
458
459 /*
460 * The type for the bitbuffer variable, which temporarily holds bits that are
461 * being packed into bytes and written to the output buffer. For best
462 * performance, this should have size equal to a machine word.
463 */
464 typedef machine_word_t bitbuf_t;
465 #define BITBUF_NBITS (8 * sizeof(bitbuf_t))
466
467 /* Can the specified number of bits always be added to 'bitbuf' after any
468 * pending bytes have been flushed? */
469 #define CAN_BUFFER(n) ((n) <= BITBUF_NBITS - 7)
470
471 /*
472 * Structure to keep track of the current state of sending bits to the
473 * compressed output buffer.
474 */
475 struct deflate_output_bitstream {
476
477 /* Bits that haven't yet been written to the output buffer. */
478 bitbuf_t bitbuf;
479
480 /* Number of bits currently held in @bitbuf. */
481 unsigned bitcount;
482
483 /* Pointer to the beginning of the output buffer. */
484 u8 *begin;
485
486 /* Pointer to the position in the output buffer at which the next byte
487 * should be written. */
488 u8 *next;
489
490 /* Pointer just past the end of the output buffer. */
491 u8 *end;
492 };
493
494 /*
495 * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be
496 * present following os->end, in order to not overrun the buffer when generating
497 * output. When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t)
498 * bytes for put_unaligned_leword(). Otherwise we need only 1 byte. However,
499 * to make the compression algorithm produce the same result on all CPU
500 * architectures (which is sometimes desirable), we have to unconditionally use
501 * the maximum for any CPU, which is sizeof(bitbuf_t) == 8.
502 */
503 #define OUTPUT_END_PADDING 8
504
505 /* Initialize the output bitstream. 'size' is assumed to be at least
506 * OUTPUT_END_PADDING. */
507 static void
deflate_init_output(struct deflate_output_bitstream * os,void * buffer,size_t size)508 deflate_init_output(struct deflate_output_bitstream *os,
509 void *buffer, size_t size)
510 {
511 os->bitbuf = 0;
512 os->bitcount = 0;
513 os->begin = buffer;
514 os->next = os->begin;
515 os->end = os->begin + size - OUTPUT_END_PADDING;
516 }
517
/* Add some bits to the bitbuffer variable of the output bitstream.  The caller
 * must make sure there is enough room (see CAN_BUFFER()); the bits are only
 * accumulated here, not written out -- see deflate_flush_bits(). */
static forceinline void
deflate_add_bits(struct deflate_output_bitstream *os,
		 const bitbuf_t bits, const unsigned num_bits)
{
	/* New bits go above the bits already pending in the buffer. */
	os->bitbuf |= bits << os->bitcount;
	os->bitcount += num_bits;
}
527
/* Flush complete bytes from the bitbuffer variable to the output buffer. */
static forceinline void
deflate_flush_bits(struct deflate_output_bitstream *os)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		/* Flush a whole word (branchlessly).  The output buffer's
		 * OUTPUT_END_PADDING makes the full-word store safe even when
		 * fewer bytes are actually needed. */
		put_unaligned_leword(os->bitbuf, os->next);
		/* Discard the flushed bits: 'bitcount & ~7' is the number of
		 * complete bytes, expressed in bits. */
		os->bitbuf >>= os->bitcount & ~7;
		/* Advance by the number of complete bytes, but never past
		 * 'end'; on overflow this clamps so that next == end. */
		os->next += MIN(os->end - os->next, os->bitcount >> 3);
		os->bitcount &= 7;
	} else {
		/* Flush a byte at a time. */
		while (os->bitcount >= 8) {
			*os->next = os->bitbuf;
			/* Stop advancing at 'end' so overflow is detectable
			 * later via next == end. */
			if (os->next != os->end)
				os->next++;
			os->bitcount -= 8;
			os->bitbuf >>= 8;
		}
	}
}
549
550 /* Align the bitstream on a byte boundary. */
551 static forceinline void
deflate_align_bitstream(struct deflate_output_bitstream * os)552 deflate_align_bitstream(struct deflate_output_bitstream *os)
553 {
554 os->bitcount += -os->bitcount & 7;
555 deflate_flush_bits(os);
556 }
557
558 /*
559 * Flush any remaining bits to the output buffer if needed. Return the total
560 * number of bytes written to the output buffer, or 0 if an overflow occurred.
561 */
562 static u32
deflate_flush_output(struct deflate_output_bitstream * os)563 deflate_flush_output(struct deflate_output_bitstream *os)
564 {
565 if (os->next == os->end) /* overflow? */
566 return 0;
567
568 while ((int)os->bitcount > 0) {
569 *os->next++ = os->bitbuf;
570 os->bitcount -= 8;
571 os->bitbuf >>= 8;
572 }
573
574 return os->next - os->begin;
575 }
576
577 /* Given the binary tree node A[subtree_idx] whose children already
578 * satisfy the maxheap property, swap the node with its greater child
579 * until it is greater than both its children, so that the maxheap
580 * property is satisfied in the subtree rooted at A[subtree_idx]. */
581 static void
heapify_subtree(u32 A[],unsigned length,unsigned subtree_idx)582 heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
583 {
584 unsigned parent_idx;
585 unsigned child_idx;
586 u32 v;
587
588 v = A[subtree_idx];
589 parent_idx = subtree_idx;
590 while ((child_idx = parent_idx * 2) <= length) {
591 if (child_idx < length && A[child_idx + 1] > A[child_idx])
592 child_idx++;
593 if (v >= A[child_idx])
594 break;
595 A[parent_idx] = A[child_idx];
596 parent_idx = child_idx;
597 }
598 A[parent_idx] = v;
599 }
600
601 /* Rearrange the array 'A' so that it satisfies the maxheap property.
602 * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
603 */
604 static void
heapify_array(u32 A[],unsigned length)605 heapify_array(u32 A[], unsigned length)
606 {
607 unsigned subtree_idx;
608
609 for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
610 heapify_subtree(A, length, subtree_idx);
611 }
612
613 /*
614 * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
615 *
616 * Note: name this function heap_sort() instead of heapsort() to avoid colliding
617 * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
618 * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
619 */
620 static void
heap_sort(u32 A[],unsigned length)621 heap_sort(u32 A[], unsigned length)
622 {
623 A--; /* Use 1-based indices */
624
625 heapify_array(A, length);
626
627 while (length >= 2) {
628 u32 tmp = A[length];
629 A[length] = A[1];
630 A[1] = tmp;
631 length--;
632 heapify_subtree(A, length, 1);
633 }
634 }
635
636 #define NUM_SYMBOL_BITS 10
637 #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
638
639 #define GET_NUM_COUNTERS(num_syms) ((((num_syms) + 3 / 4) + 3) & ~3)
640 /*
641 * Sort the symbols primarily by frequency and secondarily by symbol
642 * value. Discard symbols with zero frequency and fill in an array with
643 * the remaining symbols, along with their frequencies. The low
644 * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
645 * value, and the remaining bits will contain the frequency.
646 *
647 * @num_syms
648 * Number of symbols in the alphabet.
649 * Can't be greater than (1 << NUM_SYMBOL_BITS).
650 *
651 * @freqs[num_syms]
652 * The frequency of each symbol.
653 *
654 * @lens[num_syms]
655 * An array that eventually will hold the length of each codeword.
656 * This function only fills in the codeword lengths for symbols that
657 * have zero frequency, which are not well defined per se but will
658 * be set to 0.
659 *
660 * @symout[num_syms]
661 * The output array, described above.
662 *
663 * Returns the number of entries in 'symout' that were filled. This is
664 * the number of symbols that have nonzero frequency.
665 */
666 static unsigned
sort_symbols(unsigned num_syms,const u32 freqs[restrict],u8 lens[restrict],u32 symout[restrict])667 sort_symbols(unsigned num_syms, const u32 freqs[restrict],
668 u8 lens[restrict], u32 symout[restrict])
669 {
670 unsigned sym;
671 unsigned i;
672 unsigned num_used_syms;
673 unsigned num_counters;
674 unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)];
675
676 /* We rely on heapsort, but with an added optimization. Since
677 * it's common for most symbol frequencies to be low, we first do
678 * a count sort using a limited number of counters. High
679 * frequencies will be counted in the last counter, and only they
680 * will be sorted with heapsort.
681 *
682 * Note: with more symbols, it is generally beneficial to have more
683 * counters. About 1 counter per 4 symbols seems fast.
684 *
685 * Note: I also tested radix sort, but even for large symbol
686 * counts (> 255) and frequencies bounded at 16 bits (enabling
687 * radix sort by just two base-256 digits), it didn't seem any
688 * faster than the method implemented here.
689 *
690 * Note: I tested the optimized quicksort implementation from
691 * glibc (with indirection overhead removed), but it was only
692 * marginally faster than the simple heapsort implemented here.
693 *
694 * Tests were done with building the codes for LZX. Results may
695 * vary for different compression algorithms...! */
696
697 num_counters = GET_NUM_COUNTERS(num_syms);
698
699 memset(counters, 0, num_counters * sizeof(counters[0]));
700
701 /* Count the frequencies. */
702 for (sym = 0; sym < num_syms; sym++)
703 counters[MIN(freqs[sym], num_counters - 1)]++;
704
705 /* Make the counters cumulative, ignoring the zero-th, which
706 * counted symbols with zero frequency. As a side effect, this
707 * calculates the number of symbols with nonzero frequency. */
708 num_used_syms = 0;
709 for (i = 1; i < num_counters; i++) {
710 unsigned count = counters[i];
711 counters[i] = num_used_syms;
712 num_used_syms += count;
713 }
714
715 /* Sort nonzero-frequency symbols using the counters. At the
716 * same time, set the codeword lengths of zero-frequency symbols
717 * to 0. */
718 for (sym = 0; sym < num_syms; sym++) {
719 u32 freq = freqs[sym];
720 if (freq != 0) {
721 symout[counters[MIN(freq, num_counters - 1)]++] =
722 sym | (freq << NUM_SYMBOL_BITS);
723 } else {
724 lens[sym] = 0;
725 }
726 }
727
728 /* Sort the symbols counted in the last counter. */
729 heap_sort(symout + counters[num_counters - 2],
730 counters[num_counters - 1] - counters[num_counters - 2]);
731
732 return num_used_syms;
733 }
734
/*
 * Build the Huffman tree.
 *
 * This is an optimized implementation that
 *	(a) takes advantage of the frequencies being already sorted;
 *	(b) only generates non-leaf nodes, since the non-leaf nodes of a
 *	    Huffman tree are sufficient to generate a canonical code;
 *	(c) Only stores parent pointers, not child pointers;
 *	(d) Produces the nodes in the same memory used for input
 *	    frequency information.
 *
 * Array 'A', which contains 'sym_count' entries, is used for both input
 * and output.  For this function, 'sym_count' must be at least 2.
 *
 * For input, the array must contain the frequencies of the symbols,
 * sorted in increasing order.  Specifically, each entry must contain a
 * frequency left shifted by NUM_SYMBOL_BITS bits.  Any data in the low
 * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
 * Although these bits will, in fact, contain the symbols that correspond
 * to the frequencies, this function is concerned with frequencies only
 * and keeps the symbols as-is.
 *
 * For output, this function will produce the non-leaf nodes of the
 * Huffman tree.  These nodes will be stored in the first (sym_count - 1)
 * entries of the array.  Entry A[sym_count - 2] will represent the root
 * node.  Each other node will contain the zero-based index of its parent
 * node in 'A', left shifted by NUM_SYMBOL_BITS bits.  The low
 * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is.  Again,
 * note that although these low bits will, in fact, contain a symbol
 * value, this symbol will have *no relationship* with the Huffman tree
 * node that happens to occupy the same slot.  This is because this
 * implementation only generates the non-leaf nodes of the tree.
 */
static void
build_tree(u32 A[], unsigned sym_count)
{
	/* Index, in 'A', of next lowest frequency symbol that has not
	 * yet been processed. */
	unsigned i = 0;

	/* Index, in 'A', of next lowest frequency parentless non-leaf
	 * node; or, if equal to 'e', then no such node exists yet. */
	unsigned b = 0;

	/* Index, in 'A', of next node to allocate as a non-leaf. */
	unsigned e = 0;

	/* Standard Huffman construction: repeatedly merge the two lowest
	 * frequency entries, leaf or internal, until one tree remains.
	 * Because both the input symbols and the produced internal nodes
	 * appear in increasing frequency order, two advancing indices
	 * ('i' and 'b') replace a priority queue. */
	do {
		unsigned m, n;
		u32 freq_shifted;

		/* Choose the two next lowest frequency entries. */

		if (i != sym_count &&
		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
			m = i++;
		else
			m = b++;

		if (i != sym_count &&
		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
			n = i++;
		else
			n = b++;

		/* Allocate a non-leaf node and link the entries to it.
		 *
		 * If we link an entry that we're visiting for the first
		 * time (via index 'i'), then we're actually linking a
		 * leaf node and it will have no effect, since the leaf
		 * will be overwritten with a non-leaf when index 'e'
		 * catches up to it.  But it's not any slower to
		 * unconditionally set the parent index.
		 *
		 * We also compute the frequency of the non-leaf node as
		 * the sum of its two children's frequencies. */

		freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);

		A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
		A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
		A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
		e++;
	} while (sym_count - e > 1);
		/* When just one entry remains, it is a "leaf" that was
		 * linked to some other node.  We ignore it, since the
		 * rest of the array contains the non-leaves which we
		 * need.  (Note that we're assuming the cases with 0 or 1
		 * symbols were handled separately.) */
}
825
826 /*
827 * Given the stripped-down Huffman tree constructed by build_tree(),
828 * determine the number of codewords that should be assigned each
829 * possible length, taking into account the length-limited constraint.
830 *
831 * @A
832 * The array produced by build_tree(), containing parent index
833 * information for the non-leaf nodes of the Huffman tree. Each
834 * entry in this array is a node; a node's parent always has a
835 * greater index than that node itself. This function will
836 * overwrite the parent index information in this array, so
837 * essentially it will destroy the tree. However, the data in the
838 * low NUM_SYMBOL_BITS of each entry will be preserved.
839 *
840 * @root_idx
841 * The 0-based index of the root node in 'A', and consequently one
842 * less than the number of tree node entries in 'A'. (Or, really 2
843 * less than the actual length of 'A'.)
844 *
845 * @len_counts
846 * An array of length ('max_codeword_len' + 1) in which the number of
847 * codewords having each length <= max_codeword_len will be
848 * returned.
849 *
850 * @max_codeword_len
851 * The maximum permissible codeword length.
852 */
853 static void
compute_length_counts(u32 A[restrict],unsigned root_idx,unsigned len_counts[restrict],unsigned max_codeword_len)854 compute_length_counts(u32 A[restrict], unsigned root_idx,
855 unsigned len_counts[restrict], unsigned max_codeword_len)
856 {
857 unsigned len;
858 int node;
859
860 /* The key observations are:
861 *
862 * (1) We can traverse the non-leaf nodes of the tree, always
863 * visiting a parent before its children, by simply iterating
864 * through the array in reverse order. Consequently, we can
865 * compute the depth of each node in one pass, overwriting the
866 * parent indices with depths.
867 *
868 * (2) We can initially assume that in the real Huffman tree,
869 * both children of the root are leaves. This corresponds to two
870 * codewords of length 1. Then, whenever we visit a (non-leaf)
871 * node during the traversal, we modify this assumption to
872 * account for the current node *not* being a leaf, but rather
873 * its two children being leaves. This causes the loss of one
874 * codeword for the current depth and the addition of two
875 * codewords for the current depth plus one.
876 *
877 * (3) We can handle the length-limited constraint fairly easily
878 * by simply using the largest length available when a depth
879 * exceeds max_codeword_len.
880 */
881
882 for (len = 0; len <= max_codeword_len; len++)
883 len_counts[len] = 0;
884 len_counts[1] = 2;
885
886 /* Set the root node's depth to 0. */
887 A[root_idx] &= SYMBOL_MASK;
888
889 for (node = root_idx - 1; node >= 0; node--) {
890
891 /* Calculate the depth of this node. */
892
893 unsigned parent = A[node] >> NUM_SYMBOL_BITS;
894 unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
895 unsigned depth = parent_depth + 1;
896 unsigned len = depth;
897
898 /* Set the depth of this node so that it is available
899 * when its children (if any) are processed. */
900
901 A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
902
903 /* If needed, decrease the length to meet the
904 * length-limited constraint. This is not the optimal
905 * method for generating length-limited Huffman codes!
906 * But it should be good enough. */
907 if (len >= max_codeword_len) {
908 len = max_codeword_len;
909 do {
910 len--;
911 } while (len_counts[len] == 0);
912 }
913
914 /* Account for the fact that we have a non-leaf node at
915 * the current depth. */
916 len_counts[len]--;
917 len_counts[len + 1] += 2;
918 }
919 }
920
921 /*
922 * Generate the codewords for a canonical Huffman code.
923 *
924 * @A
925 * The output array for codewords. In addition, initially this
926 * array must contain the symbols, sorted primarily by frequency and
927 * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
928 * each entry.
929 *
930 * @len
931 * Output array for codeword lengths.
932 *
933 * @len_counts
934 * An array that provides the number of codewords that will have
935 * each possible length <= max_codeword_len.
936 *
937 * @max_codeword_len
938 * Maximum length, in bits, of each codeword.
939 *
940 * @num_syms
941 * Number of symbols in the alphabet, including symbols with zero
942 * frequency. This is the length of the 'A' and 'len' arrays.
943 */
944 static void
gen_codewords(u32 A[restrict],u8 lens[restrict],const unsigned len_counts[restrict],unsigned max_codeword_len,unsigned num_syms)945 gen_codewords(u32 A[restrict], u8 lens[restrict],
946 const unsigned len_counts[restrict],
947 unsigned max_codeword_len, unsigned num_syms)
948 {
949 u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1];
950 unsigned i;
951 unsigned len;
952 unsigned sym;
953
954 /* Given the number of codewords that will have each length,
955 * assign codeword lengths to symbols. We do this by assigning
956 * the lengths in decreasing order to the symbols sorted
957 * primarily by increasing frequency and secondarily by
958 * increasing symbol value. */
959 for (i = 0, len = max_codeword_len; len >= 1; len--) {
960 unsigned count = len_counts[len];
961 while (count--)
962 lens[A[i++] & SYMBOL_MASK] = len;
963 }
964
965 /* Generate the codewords themselves. We initialize the
966 * 'next_codewords' array to provide the lexicographically first
967 * codeword of each length, then assign codewords in symbol
968 * order. This produces a canonical code. */
969 next_codewords[0] = 0;
970 next_codewords[1] = 0;
971 for (len = 2; len <= max_codeword_len; len++)
972 next_codewords[len] =
973 (next_codewords[len - 1] + len_counts[len - 1]) << 1;
974
975 for (sym = 0; sym < num_syms; sym++)
976 A[sym] = next_codewords[lens[sym]]++;
977 }
978
979 /*
980 * ---------------------------------------------------------------------
981 * make_canonical_huffman_code()
982 * ---------------------------------------------------------------------
983 *
984 * Given an alphabet and the frequency of each symbol in it, construct a
985 * length-limited canonical Huffman code.
986 *
987 * @num_syms
988 * The number of symbols in the alphabet. The symbols are the
989 * integers in the range [0, num_syms - 1]. This parameter must be
990 * at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
991 *
992 * @max_codeword_len
993 * The maximum permissible codeword length.
994 *
995 * @freqs
996 * An array of @num_syms entries, each of which specifies the
997 * frequency of the corresponding symbol. It is valid for some,
998 * none, or all of the frequencies to be 0.
999 *
1000 * @lens
1001 * An array of @num_syms entries in which this function will return
1002 * the length, in bits, of the codeword assigned to each symbol.
1003 * Symbols with 0 frequency will not have codewords per se, but
1004 * their entries in this array will be set to 0. No lengths greater
1005 * than @max_codeword_len will be assigned.
1006 *
1007 * @codewords
1008 * An array of @num_syms entries in which this function will return
1009 * the codeword for each symbol, right-justified and padded on the
1010 * left with zeroes. Codewords for symbols with 0 frequency will be
1011 * undefined.
1012 *
1013 * ---------------------------------------------------------------------
1014 *
1015 * This function builds a length-limited canonical Huffman code.
1016 *
1017 * A length-limited Huffman code contains no codewords longer than some
1018 * specified length, and has exactly (with some algorithms) or
1019 * approximately (with the algorithm used here) the minimum weighted path
1020 * length from the root, given this constraint.
1021 *
1022 * A canonical Huffman code satisfies the properties that a longer
1023 * codeword never lexicographically precedes a shorter codeword, and the
1024 * lexicographic ordering of codewords of the same length is the same as
1025 * the lexicographic ordering of the corresponding symbols. A canonical
1026 * Huffman code, or more generally a canonical prefix code, can be
1027 * reconstructed from only a list containing the codeword length of each
1028 * symbol.
1029 *
1030 * The classic algorithm to generate a Huffman code creates a node for
1031 * each symbol, then inserts these nodes into a min-heap keyed by symbol
1032 * frequency. Then, repeatedly, the two lowest-frequency nodes are
1033 * removed from the min-heap and added as the children of a new node
1034 * having frequency equal to the sum of its two children, which is then
1035 * inserted into the min-heap. When only a single node remains in the
1036 * min-heap, it is the root of the Huffman tree. The codeword for each
1037 * symbol is determined by the path needed to reach the corresponding
1038 * node from the root. Descending to the left child appends a 0 bit,
1039 * whereas descending to the right child appends a 1 bit.
1040 *
1041 * The classic algorithm is relatively easy to understand, but it is
1042 * subject to a number of inefficiencies. In practice, it is fastest to
1043 * first sort the symbols by frequency. (This itself can be subject to
1044 * an optimization based on the fact that most frequencies tend to be
1045 * low.) At the same time, we sort secondarily by symbol value, which
1046 * aids the process of generating a canonical code. Then, during tree
1047 * construction, no heap is necessary because both the leaf nodes and the
1048 * unparented non-leaf nodes can be easily maintained in sorted order.
1049 * Consequently, there can never be more than two possibilities for the
1050 * next-lowest-frequency node.
1051 *
1052 * In addition, because we're generating a canonical code, we actually
1053 * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
1054 * This is because for canonical code generation we don't need to know
1055 * where the symbols are in the tree. Rather, we only need to know how
1056 * many leaf nodes have each depth (codeword length). And this
1057 * information can, in fact, be quickly generated from the tree of
1058 * non-leaves only.
1059 *
1060 * Furthermore, we can build this stripped-down Huffman tree directly in
1061 * the array in which the codewords are to be generated, provided that
1062 * these array slots are large enough to hold a symbol and frequency
1063 * value.
1064 *
1065 * Still furthermore, we don't even need to maintain explicit child
1066 * pointers. We only need the parent pointers, and even those can be
1067 * overwritten in-place with depth information as part of the process of
1068 * extracting codeword lengths from the tree. So in summary, we do NOT
1069 * need a big structure like:
1070 *
1071 * struct huffman_tree_node {
1072 * unsigned int symbol;
1073 * unsigned int frequency;
1074 * unsigned int depth;
1075 * struct huffman_tree_node *left_child;
1076 * struct huffman_tree_node *right_child;
1077 * };
1078 *
1079 *
1080 * ... which often gets used in "naive" implementations of Huffman code
1081 * generation.
1082 *
1083 * Many of these optimizations are based on the implementation in 7-Zip
1084 * (source file: C/HuffEnc.c), which has been placed in the public domain
1085 * by Igor Pavlov.
1086 */
1087 static void
make_canonical_huffman_code(unsigned num_syms,unsigned max_codeword_len,const u32 freqs[restrict],u8 lens[restrict],u32 codewords[restrict])1088 make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
1089 const u32 freqs[restrict],
1090 u8 lens[restrict], u32 codewords[restrict])
1091 {
1092 u32 *A = codewords;
1093 unsigned num_used_syms;
1094
1095 STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);
1096
1097 /* We begin by sorting the symbols primarily by frequency and
1098 * secondarily by symbol value. As an optimization, the array
1099 * used for this purpose ('A') shares storage with the space in
1100 * which we will eventually return the codewords. */
1101
1102 num_used_syms = sort_symbols(num_syms, freqs, lens, A);
1103
1104 /* 'num_used_syms' is the number of symbols with nonzero
1105 * frequency. This may be less than @num_syms. 'num_used_syms'
1106 * is also the number of entries in 'A' that are valid. Each
1107 * entry consists of a distinct symbol and a nonzero frequency
1108 * packed into a 32-bit integer. */
1109
1110 /* Handle special cases where only 0 or 1 symbols were used (had
1111 * nonzero frequency). */
1112
1113 if (unlikely(num_used_syms == 0)) {
1114 /* Code is empty. sort_symbols() already set all lengths
1115 * to 0, so there is nothing more to do. */
1116 return;
1117 }
1118
1119 if (unlikely(num_used_syms == 1)) {
1120 /* Only one symbol was used, so we only need one
1121 * codeword. But two codewords are needed to form the
1122 * smallest complete Huffman code, which uses codewords 0
1123 * and 1. Therefore, we choose another symbol to which
1124 * to assign a codeword. We use 0 (if the used symbol is
1125 * not 0) or 1 (if the used symbol is 0). In either
1126 * case, the lesser-valued symbol must be assigned
1127 * codeword 0 so that the resulting code is canonical. */
1128
1129 unsigned sym = A[0] & SYMBOL_MASK;
1130 unsigned nonzero_idx = sym ? sym : 1;
1131
1132 codewords[0] = 0;
1133 lens[0] = 1;
1134 codewords[nonzero_idx] = 1;
1135 lens[nonzero_idx] = 1;
1136 return;
1137 }
1138
1139 /* Build a stripped-down version of the Huffman tree, sharing the
1140 * array 'A' with the symbol values. Then extract length counts
1141 * from the tree and use them to generate the final codewords. */
1142
1143 build_tree(A, num_used_syms);
1144
1145 {
1146 unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
1147
1148 compute_length_counts(A, num_used_syms - 2,
1149 len_counts, max_codeword_len);
1150
1151 gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
1152 }
1153 }
1154
1155 /*
1156 * Clear the Huffman symbol frequency counters.
1157 * This must be called when starting a new DEFLATE block.
1158 */
1159 static void
deflate_reset_symbol_frequencies(struct libdeflate_compressor * c)1160 deflate_reset_symbol_frequencies(struct libdeflate_compressor *c)
1161 {
1162 memset(&c->freqs, 0, sizeof(c->freqs));
1163 }
1164
1165 /* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */
1166 static u32
deflate_reverse_codeword(u32 codeword,u8 len)1167 deflate_reverse_codeword(u32 codeword, u8 len)
1168 {
1169 /* The following branchless algorithm is faster than going bit by bit.
1170 * Note: since no codewords are longer than 16 bits, we only need to
1171 * reverse the low 16 bits of the 'u32'. */
1172 STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16);
1173
1174 /* Flip adjacent 1-bit fields */
1175 codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1);
1176
1177 /* Flip adjacent 2-bit fields */
1178 codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2);
1179
1180 /* Flip adjacent 4-bit fields */
1181 codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4);
1182
1183 /* Flip adjacent 8-bit fields */
1184 codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8);
1185
1186 /* Return the high 'len' bits of the bit-reversed 16 bit value. */
1187 return codeword >> (16 - len);
1188 }
1189
1190 /* Make a canonical Huffman code with bit-reversed codewords. */
1191 static void
deflate_make_huffman_code(unsigned num_syms,unsigned max_codeword_len,const u32 freqs[],u8 lens[],u32 codewords[])1192 deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len,
1193 const u32 freqs[], u8 lens[], u32 codewords[])
1194 {
1195 unsigned sym;
1196
1197 make_canonical_huffman_code(num_syms, max_codeword_len,
1198 freqs, lens, codewords);
1199
1200 for (sym = 0; sym < num_syms; sym++)
1201 codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]);
1202 }
1203
1204 /*
1205 * Build the literal/length and offset Huffman codes for a DEFLATE block.
1206 *
1207 * This takes as input the frequency tables for each code and produces as output
1208 * a set of tables that map symbols to codewords and codeword lengths.
1209 */
1210 static void
deflate_make_huffman_codes(const struct deflate_freqs * freqs,struct deflate_codes * codes)1211 deflate_make_huffman_codes(const struct deflate_freqs *freqs,
1212 struct deflate_codes *codes)
1213 {
1214 STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN);
1215 STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN);
1216
1217 deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS,
1218 MAX_LITLEN_CODEWORD_LEN,
1219 freqs->litlen,
1220 codes->lens.litlen,
1221 codes->codewords.litlen);
1222
1223 deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS,
1224 MAX_OFFSET_CODEWORD_LEN,
1225 freqs->offset,
1226 codes->lens.offset,
1227 codes->codewords.offset);
1228 }
1229
1230 /* Initialize c->static_codes. */
1231 static void
deflate_init_static_codes(struct libdeflate_compressor * c)1232 deflate_init_static_codes(struct libdeflate_compressor *c)
1233 {
1234 unsigned i;
1235
1236 for (i = 0; i < 144; i++)
1237 c->freqs.litlen[i] = 1 << (9 - 8);
1238 for (; i < 256; i++)
1239 c->freqs.litlen[i] = 1 << (9 - 9);
1240 for (; i < 280; i++)
1241 c->freqs.litlen[i] = 1 << (9 - 7);
1242 for (; i < 288; i++)
1243 c->freqs.litlen[i] = 1 << (9 - 8);
1244
1245 for (i = 0; i < 32; i++)
1246 c->freqs.offset[i] = 1 << (5 - 5);
1247
1248 deflate_make_huffman_codes(&c->freqs, &c->static_codes);
1249 }
1250
1251 /* Return the offset slot for the specified match offset. */
1252 static forceinline unsigned
deflate_get_offset_slot(struct libdeflate_compressor * c,unsigned offset)1253 deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset)
1254 {
1255 #if USE_FULL_OFFSET_SLOT_FAST
1256 return c->offset_slot_fast[offset];
1257 #else
1258 if (offset <= 256)
1259 return c->offset_slot_fast[offset - 1];
1260 else
1261 return c->offset_slot_fast[256 + ((offset - 1) >> 7)];
1262 #endif
1263 }
1264
/*
 * Emit the header bits shared by every DEFLATE block type:
 * BFINAL (1 bit), then BTYPE (2 bits).
 */
static void
deflate_write_block_header(struct deflate_output_bitstream *os,
			   bool is_final_block, unsigned block_type)
{
	deflate_add_bits(os, is_final_block, 1);	/* BFINAL */
	deflate_add_bits(os, block_type, 2);		/* BTYPE */
	deflate_flush_bits(os);
}
1274
/*
 * Convert the 'num_lens' codeword lengths in 'lens' into a sequence of
 * precode "items", performing DEFLATE's run-length encoding of codeword
 * lengths.  Each item packs a precode symbol (0-18) into its low 5 bits;
 * for the repeat symbols 16, 17, and 18, the repeat-count extra bits are
 * stored starting at bit 5.  This also tallies, in 'precode_freqs', how
 * many times each precode symbol is used.
 *
 * Returns the number of items written to 'precode_items'.
 */
static unsigned
deflate_compute_precode_items(const u8 lens[restrict],
			      const unsigned num_lens,
			      u32 precode_freqs[restrict],
			      unsigned precode_items[restrict])
{
	unsigned *itemptr;
	unsigned run_start;
	unsigned run_end;
	unsigned extra_bits;
	u8 len;

	memset(precode_freqs, 0,
	       DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0]));

	itemptr = precode_items;
	run_start = 0;
	do {
		/* Find the next run of codeword lengths. */

		/* len = the length being repeated */
		len = lens[run_start];

		/* Extend the run as far as the lengths stay equal. */
		run_end = run_start;
		do {
			run_end++;
		} while (run_end != num_lens && len == lens[run_end]);

		if (len == 0) {
			/* Run of zeroes. */

			/* Symbol 18: RLE 11 to 138 zeroes at a time.
			 * (extra_bits is biased by 11, hence the 0x7F cap.) */
			while ((run_end - run_start) >= 11) {
				extra_bits = MIN((run_end - run_start) - 11, 0x7F);
				precode_freqs[18]++;
				*itemptr++ = 18 | (extra_bits << 5);
				run_start += 11 + extra_bits;
			}

			/* Symbol 17: RLE 3 to 10 zeroes at a time. */
			if ((run_end - run_start) >= 3) {
				extra_bits = MIN((run_end - run_start) - 3, 0x7);
				precode_freqs[17]++;
				*itemptr++ = 17 | (extra_bits << 5);
				run_start += 3 + extra_bits;
			}
		} else {

			/* A run of nonzero lengths. */

			/* Symbol 16: RLE 3 to 6 of the previous length.
			 * The run must be at least 4 long, because the first
			 * length has to be output literally before symbol 16
			 * can refer back to it. */
			if ((run_end - run_start) >= 4) {
				precode_freqs[len]++;
				*itemptr++ = len;
				run_start++;
				do {
					extra_bits = MIN((run_end - run_start) - 3, 0x3);
					precode_freqs[16]++;
					*itemptr++ = 16 | (extra_bits << 5);
					run_start += 3 + extra_bits;
				} while ((run_end - run_start) >= 3);
			}
		}

		/* Output any remaining lengths without RLE. */
		while (run_start != run_end) {
			precode_freqs[len]++;
			*itemptr++ = len;
			run_start++;
		}
	} while (run_start != num_lens);

	return itemptr - precode_items;
}
1350
1351 /*
1352 * Huffman codeword lengths for dynamic Huffman blocks are compressed using a
1353 * separate Huffman code, the "precode", which contains a symbol for each
1354 * possible codeword length in the larger code as well as several special
1355 * symbols to represent repeated codeword lengths (a form of run-length
1356 * encoding). The precode is itself constructed in canonical form, and its
1357 * codeword lengths are represented literally in 19 3-bit fields that
1358 * immediately precede the compressed codeword lengths of the larger code.
1359 */
1360
/*
 * Precompute the information needed to output the Huffman codes of a
 * dynamic Huffman block: the number of litlen/offset symbols actually
 * transmitted, the precode items, the precode itself, and the number of
 * explicit precode lengths.  Results are stored in 'c'.
 */
static void
deflate_precompute_huffman_header(struct libdeflate_compressor *c)
{
	/* Compute how many litlen and offset symbols are needed.  Trailing
	 * zero-length entries need not be transmitted, but at least 257
	 * litlen symbols and 1 offset symbol must be (the header fields
	 * encode the counts relative to those minimums). */

	for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS;
	     c->num_litlen_syms > 257;
	     c->num_litlen_syms--)
		if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0)
			break;

	for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS;
	     c->num_offset_syms > 1;
	     c->num_offset_syms--)
		if (c->codes.lens.offset[c->num_offset_syms - 1] != 0)
			break;

	/* If we're not using the full set of literal/length codeword lengths,
	 * then temporarily move the offset codeword lengths over so that the
	 * literal/length and offset codeword lengths are contiguous.
	 * (The precode encodes both sets as one stream of lengths.) */

	STATIC_ASSERT(offsetof(struct deflate_lens, offset) ==
		      DEFLATE_NUM_LITLEN_SYMS);

	if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
		memmove((u8 *)&c->codes.lens + c->num_litlen_syms,
			(u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
			c->num_offset_syms);
	}

	/* Compute the "items" (RLE / literal tokens and extra bits) with which
	 * the codeword lengths in the larger code will be output. */
	c->num_precode_items =
		deflate_compute_precode_items((u8 *)&c->codes.lens,
					      c->num_litlen_syms +
					      c->num_offset_syms,
					      c->precode_freqs,
					      c->precode_items);

	/* Build the precode. */
	STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN);
	deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS,
				  MAX_PRE_CODEWORD_LEN,
				  c->precode_freqs, c->precode_lens,
				  c->precode_codewords);

	/* Count how many precode lengths we actually need to output;
	 * trailing zeroes (in the permuted order) are dropped, but at
	 * least 4 lengths are always transmitted. */
	for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS;
	     c->num_explicit_lens > 4;
	     c->num_explicit_lens--)
		if (c->precode_lens[deflate_precode_lens_permutation[
				c->num_explicit_lens - 1]] != 0)
			break;

	/* Restore the offset codeword lengths if needed. */
	if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
		memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
			(u8 *)&c->codes.lens + c->num_litlen_syms,
			c->num_offset_syms);
	}
}
1423
1424 /* Output the Huffman codes. */
1425 static void
deflate_write_huffman_header(struct libdeflate_compressor * c,struct deflate_output_bitstream * os)1426 deflate_write_huffman_header(struct libdeflate_compressor *c,
1427 struct deflate_output_bitstream *os)
1428 {
1429 unsigned i;
1430
1431 deflate_add_bits(os, c->num_litlen_syms - 257, 5);
1432 deflate_add_bits(os, c->num_offset_syms - 1, 5);
1433 deflate_add_bits(os, c->num_explicit_lens - 4, 4);
1434 deflate_flush_bits(os);
1435
1436 /* Output the lengths of the codewords in the precode. */
1437 for (i = 0; i < c->num_explicit_lens; i++) {
1438 deflate_add_bits(os, c->precode_lens[
1439 deflate_precode_lens_permutation[i]], 3);
1440 deflate_flush_bits(os);
1441 }
1442
1443 /* Output the encoded lengths of the codewords in the larger code. */
1444 for (i = 0; i < c->num_precode_items; i++) {
1445 unsigned precode_item = c->precode_items[i];
1446 unsigned precode_sym = precode_item & 0x1F;
1447 deflate_add_bits(os, c->precode_codewords[precode_sym],
1448 c->precode_lens[precode_sym]);
1449 if (precode_sym >= 16) {
1450 if (precode_sym == 16)
1451 deflate_add_bits(os, precode_item >> 5, 2);
1452 else if (precode_sym == 17)
1453 deflate_add_bits(os, precode_item >> 5, 3);
1454 else
1455 deflate_add_bits(os, precode_item >> 5, 7);
1456 }
1457 STATIC_ASSERT(CAN_BUFFER(DEFLATE_MAX_PRE_CODEWORD_LEN + 7));
1458 deflate_flush_bits(os);
1459 }
1460 }
1461
/*
 * Write the literal runs and matches described by the 'sequences' array to
 * the output bitstream, using the given Huffman codes.  Literal bytes are
 * read from 'in_next'.  Each sequence's 'litrunlen_and_length' field packs
 * the number of literals preceding the match into the low 23 bits and the
 * match length into the high bits; a match length of 0 terminates the list.
 */
static void
deflate_write_sequences(struct deflate_output_bitstream * restrict os,
			const struct deflate_codes * restrict codes,
			const struct deflate_sequence sequences[restrict],
			const u8 * restrict in_next)
{
	const struct deflate_sequence *seq = sequences;

	for (;;) {
		u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;
		unsigned length = seq->litrunlen_and_length >> 23;
		unsigned length_slot;
		unsigned litlen_symbol;
		unsigned offset_symbol;

		if (litrunlen) {
#if 1
			/* Unrolled fast path: output 4 literals per
			 * iteration, flushing only as often as the bit
			 * buffer size (checked via CAN_BUFFER) requires. */
			while (litrunlen >= 4) {
				unsigned lit0 = in_next[0];
				unsigned lit1 = in_next[1];
				unsigned lit2 = in_next[2];
				unsigned lit3 = in_next[3];

				deflate_add_bits(os, codes->codewords.litlen[lit0],
						 codes->lens.litlen[lit0]);
				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit1],
						 codes->lens.litlen[lit1]);
				if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit2],
						 codes->lens.litlen[lit2]);
				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit3],
						 codes->lens.litlen[lit3]);
				deflate_flush_bits(os);
				in_next += 4;
				litrunlen -= 4;
			}
			/* Output the remaining 0 to 3 literals. */
			if (litrunlen-- != 0) {
				deflate_add_bits(os, codes->codewords.litlen[*in_next],
						 codes->lens.litlen[*in_next]);
				if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);
				in_next++;
				if (litrunlen-- != 0) {
					deflate_add_bits(os, codes->codewords.litlen[*in_next],
							 codes->lens.litlen[*in_next]);
					if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
						deflate_flush_bits(os);
					in_next++;
					if (litrunlen-- != 0) {
						deflate_add_bits(os, codes->codewords.litlen[*in_next],
								 codes->lens.litlen[*in_next]);
						if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
							deflate_flush_bits(os);
						in_next++;
					}
				}
				if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);
			}
#else
			/* Simple reference path: one literal at a time. */
			do {
				unsigned lit = *in_next++;
				deflate_add_bits(os, codes->codewords.litlen[lit],
						 codes->lens.litlen[lit]);
				deflate_flush_bits(os);
			} while (--litrunlen);
#endif
		}

		/* A zero match length marks the end of the sequence list. */
		if (length == 0)
			return;

		in_next += length;

		length_slot = seq->length_slot;
		litlen_symbol = 257 + length_slot;

		/* Litlen symbol */
		deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
				 codes->lens.litlen[litlen_symbol]);

		/* Extra length bits */
		STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
					 DEFLATE_MAX_EXTRA_LENGTH_BITS));
		deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
				 deflate_extra_length_bits[length_slot]);

		if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_LENGTH_BITS +
				MAX_OFFSET_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_OFFSET_BITS))
			deflate_flush_bits(os);

		/* Offset symbol */
		offset_symbol = seq->offset_symbol;
		deflate_add_bits(os, codes->codewords.offset[offset_symbol],
				 codes->lens.offset[offset_symbol]);

		if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_OFFSET_BITS))
			deflate_flush_bits(os);

		/* Extra offset bits */
		deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol],
				 deflate_extra_offset_bits[offset_symbol]);

		deflate_flush_bits(os);

		seq++;
	}
}
1581
1582 #if SUPPORT_NEAR_OPTIMAL_PARSING
1583 /*
1584 * Follow the minimum-cost path in the graph of possible match/literal choices
1585 * for the current block and write out the matches/literals using the specified
1586 * Huffman codes.
1587 *
1588 * Note: this is slightly duplicated with deflate_write_sequences(), the reason
1589 * being that we don't want to waste time translating between intermediate
1590 * match/literal representations.
1591 */
static void
deflate_write_item_list(struct deflate_output_bitstream *os,
			const struct deflate_codes *codes,
			struct libdeflate_compressor *c,
			u32 block_length)
{
	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
	struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length];
	do {
		/* Each node's 'item' packs the item length into its low bits
		 * and, in the high bits, either the match offset (for a
		 * match) or the literal byte's value (for a literal, which
		 * has length 1). */
		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
		unsigned litlen_symbol;
		unsigned length_slot;
		unsigned offset_slot;

		if (length == 1) {
			/* Literal */
			litlen_symbol = offset;
			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
					 codes->lens.litlen[litlen_symbol]);
			deflate_flush_bits(os);
		} else {
			/* Match length */
			length_slot = deflate_length_slot[length];
			litlen_symbol = 257 + length_slot;
			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
					 codes->lens.litlen[litlen_symbol]);

			deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
					 deflate_extra_length_bits[length_slot]);

			/* Flush only if the bit buffer can't hold the match's
			 * remaining fields as well (compile-time decision). */
			if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_LENGTH_BITS +
					MAX_OFFSET_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_OFFSET_BITS))
				deflate_flush_bits(os);


			/* Match offset */
			offset_slot = deflate_get_offset_slot(c, offset);
			deflate_add_bits(os, codes->codewords.offset[offset_slot],
					 codes->lens.offset[offset_slot]);

			if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_OFFSET_BITS))
				deflate_flush_bits(os);

			deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot],
					 deflate_extra_offset_bits[offset_slot]);

			deflate_flush_bits(os);
		}
		/* Advance along the chosen path by this item's length. */
		cur_node += length;
	} while (cur_node != end_node);
}
1647 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
1648
1649 /* Output the end-of-block symbol. */
1650 static void
deflate_write_end_of_block(struct deflate_output_bitstream * os,const struct deflate_codes * codes)1651 deflate_write_end_of_block(struct deflate_output_bitstream *os,
1652 const struct deflate_codes *codes)
1653 {
1654 deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK],
1655 codes->lens.litlen[DEFLATE_END_OF_BLOCK]);
1656 deflate_flush_bits(os);
1657 }
1658
1659 static void
deflate_write_uncompressed_block(struct deflate_output_bitstream * os,const u8 * data,u16 len,bool is_final_block)1660 deflate_write_uncompressed_block(struct deflate_output_bitstream *os,
1661 const u8 *data, u16 len,
1662 bool is_final_block)
1663 {
1664 deflate_write_block_header(os, is_final_block,
1665 DEFLATE_BLOCKTYPE_UNCOMPRESSED);
1666 deflate_align_bitstream(os);
1667
1668 if (4 + (u32)len >= os->end - os->next) {
1669 os->next = os->end;
1670 return;
1671 }
1672
1673 put_unaligned_le16(len, os->next);
1674 os->next += 2;
1675 put_unaligned_le16(~len, os->next);
1676 os->next += 2;
1677 memcpy(os->next, data, len);
1678 os->next += len;
1679 }
1680
1681 static void
deflate_write_uncompressed_blocks(struct deflate_output_bitstream * os,const u8 * data,u32 data_length,bool is_final_block)1682 deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os,
1683 const u8 *data, u32 data_length,
1684 bool is_final_block)
1685 {
1686 do {
1687 u16 len = MIN(data_length, UINT16_MAX);
1688
1689 deflate_write_uncompressed_block(os, data, len,
1690 is_final_block && len == data_length);
1691 data += len;
1692 data_length -= len;
1693 } while (data_length != 0);
1694 }
1695
/*
 * Choose the best type of block to use (dynamic Huffman, static Huffman, or
 * uncompressed) for the data in [block_begin, block_begin + block_length),
 * then output it.  The choice is made by computing the exact bit cost of each
 * of the three encodings from the symbol frequencies tallied while the
 * block's literals and matches were chosen.
 *
 * @c               compressor state (tallied frequencies, codes, sequences)
 * @os              output bitstream
 * @block_begin     start of the block's raw input bytes
 * @block_length    number of input bytes spanned by the block
 * @is_final_block  whether to set the BFINAL bit in the block header
 * @use_item_list   near-optimal parsing only: emit from the optimum-node item
 *                  list rather than the greedy/lazy sequence list
 */
static void
deflate_flush_block(struct libdeflate_compressor * restrict c,
		    struct deflate_output_bitstream * restrict os,
		    const u8 * restrict block_begin, u32 block_length,
		    bool is_final_block, bool use_item_list)
{
	/* Extra bits following each precode symbol; only the repeat codes
	 * (symbols 16, 17, 18) have any. */
	static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = {
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7,
	};

	/* Costs are measured in bits */
	u32 dynamic_cost = 0;
	u32 static_cost = 0;
	u32 uncompressed_cost = 0;
	struct deflate_codes *codes;
	int block_type;
	unsigned sym;

	/* Tally the end-of-block symbol. */
	c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;

	/* Build dynamic Huffman codes. */
	deflate_make_huffman_codes(&c->freqs, &c->codes);

	/* Account for the cost of sending dynamic Huffman codes: the 5+5+4
	 * bits of HLIT/HDIST/HCLEN, 3 bits per explicit precode length, then
	 * the precode-encoded codeword lengths with their extra bits. */
	deflate_precompute_huffman_header(c);
	dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens);
	for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
		u32 extra = deflate_extra_precode_bits[sym];
		dynamic_cost += c->precode_freqs[sym] *
				(extra + c->precode_lens[sym]);
	}

	/* Account for the cost of encoding literals. */
	for (sym = 0; sym < 256; sym++) {
		dynamic_cost += c->freqs.litlen[sym] *
				c->codes.lens.litlen[sym];
	}
	/* In the static code, literals 0-143 cost 8 bits; 144-255 cost 9. */
	for (sym = 0; sym < 144; sym++)
		static_cost += c->freqs.litlen[sym] * 8;
	for (; sym < 256; sym++)
		static_cost += c->freqs.litlen[sym] * 9;

	/* Account for the cost of encoding the end-of-block symbol. */
	dynamic_cost += c->codes.lens.litlen[256];
	static_cost += 7;

	/* Account for the cost of encoding lengths. */
	for (sym = 257; sym < 257 + ARRAY_LEN(deflate_extra_length_bits); sym++) {
		u32 extra = deflate_extra_length_bits[sym - 257];
		dynamic_cost += c->freqs.litlen[sym] *
				(extra + c->codes.lens.litlen[sym]);
		static_cost += c->freqs.litlen[sym] *
				(extra + c->static_codes.lens.litlen[sym]);
	}

	/* Account for the cost of encoding offsets.  Every static offset
	 * codeword is 5 bits. */
	for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) {
		u32 extra = deflate_extra_offset_bits[sym];
		dynamic_cost += c->freqs.offset[sym] *
				(extra + c->codes.lens.offset[sym]);
		static_cost += c->freqs.offset[sym] * (extra + 5);
	}

	/* Compute the cost of using uncompressed blocks: padding to a byte
	 * boundary after the 3-bit header, 32 bits of LEN/NLEN, 40 more
	 * header bits per extra 65535-byte chunk, plus the raw data. */
	uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 +
			     (40 * (DIV_ROUND_UP(block_length,
						 UINT16_MAX) - 1)) +
			     (8 * block_length);

	/* Choose the cheapest block type. */
	if (dynamic_cost < MIN(static_cost, uncompressed_cost)) {
		block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN;
		codes = &c->codes;
	} else if (static_cost < uncompressed_cost) {
		block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN;
		codes = &c->static_codes;
	} else {
		block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED;
	}

	/* Now actually output the block. */

	if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
		/* Note: the length being flushed may exceed the maximum length
		 * of an uncompressed block (65535 bytes).  Therefore, more than
		 * one uncompressed block might be needed. */
		deflate_write_uncompressed_blocks(os, block_begin, block_length,
						  is_final_block);
	} else {
		/* Output the block header. */
		deflate_write_block_header(os, is_final_block, block_type);

		/* Output the Huffman codes (dynamic Huffman blocks only). */
		if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN)
			deflate_write_huffman_header(c, os);

		/* Output the literals, matches, and end-of-block symbol. */
#if SUPPORT_NEAR_OPTIMAL_PARSING
		if (use_item_list)
			deflate_write_item_list(os, codes, c, block_length);
		else
#endif
		deflate_write_sequences(os, codes, c->p.g.sequences,
					block_begin);
		deflate_write_end_of_block(os, codes);
	}
}
1808
1809 static forceinline void
deflate_choose_literal(struct libdeflate_compressor * c,unsigned literal,u32 * litrunlen_p)1810 deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
1811 u32 *litrunlen_p)
1812 {
1813 c->freqs.litlen[literal]++;
1814 ++*litrunlen_p;
1815 }
1816
1817 static forceinline void
deflate_choose_match(struct libdeflate_compressor * c,unsigned length,unsigned offset,u32 * litrunlen_p,struct deflate_sequence ** next_seq_p)1818 deflate_choose_match(struct libdeflate_compressor *c,
1819 unsigned length, unsigned offset,
1820 u32 *litrunlen_p, struct deflate_sequence **next_seq_p)
1821 {
1822 struct deflate_sequence *seq = *next_seq_p;
1823 unsigned length_slot = deflate_length_slot[length];
1824 unsigned offset_slot = deflate_get_offset_slot(c, offset);
1825
1826 c->freqs.litlen[257 + length_slot]++;
1827 c->freqs.offset[offset_slot]++;
1828
1829 seq->litrunlen_and_length = ((u32)length << 23) | *litrunlen_p;
1830 seq->offset = offset;
1831 seq->length_slot = length_slot;
1832 seq->offset_symbol = offset_slot;
1833
1834 *litrunlen_p = 0;
1835 *next_seq_p = seq + 1;
1836 }
1837
1838 static forceinline void
deflate_finish_sequence(struct deflate_sequence * seq,u32 litrunlen)1839 deflate_finish_sequence(struct deflate_sequence *seq, u32 litrunlen)
1840 {
1841 seq->litrunlen_and_length = litrunlen; /* length = 0 */
1842 }
1843
1844 /******************************************************************************/
1845
1846 /*
1847 * Block splitting algorithm. The problem is to decide when it is worthwhile to
1848 * start a new block with new Huffman codes. There is a theoretically optimal
1849 * solution: recursively consider every possible block split, considering the
1850 * exact cost of each block, and choose the minimum cost approach. But this is
1851 * far too slow. Instead, as an approximation, we can count symbols and after
1852 * every N symbols, compare the expected distribution of symbols based on the
1853 * previous data with the actual distribution. If they differ "by enough", then
1854 * start a new block.
1855 *
1856 * As an optimization and heuristic, we don't distinguish between every symbol
1857 * but rather we combine many symbols into a single "observation type". For
1858 * literals we only look at the high bits and low bits, and for matches we only
1859 * look at whether the match is long or not. The assumption is that for typical
 * "real" data, places that are good block boundaries will tend to be noticeable
1861 * based only on changes in these aggregate frequencies, without looking for
1862 * subtle differences in individual symbols. For example, a change from ASCII
1863 * bytes to non-ASCII bytes, or from few matches (generally less compressible)
1864 * to many matches (generally more compressible), would be easily noticed based
1865 * on the aggregates.
1866 *
1867 * For determining whether the frequency distributions are "different enough" to
 * start a new block, the simple heuristic of splitting when the sum of absolute
1869 * differences exceeds a constant seems to be good enough. We also add a number
1870 * proportional to the block length so that the algorithm is more likely to end
1871 * long blocks than short blocks. This reflects the general expectation that it
1872 * will become increasingly beneficial to start a new block as the current
1873 * block grows longer.
1874 *
1875 * Finally, for an approximation, it is not strictly necessary that the exact
1876 * symbols being used are considered. With "near-optimal parsing", for example,
1877 * the actual symbols that will be used are unknown until after the block
1878 * boundary is chosen and the block has been optimized. Since the final choices
1879 * cannot be used, we can use preliminary "greedy" choices instead.
1880 */
1881
1882 /* Initialize the block split statistics when starting a new block. */
1883 static void
init_block_split_stats(struct block_split_stats * stats)1884 init_block_split_stats(struct block_split_stats *stats)
1885 {
1886 int i;
1887
1888 for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1889 stats->new_observations[i] = 0;
1890 stats->observations[i] = 0;
1891 }
1892 stats->num_new_observations = 0;
1893 stats->num_observations = 0;
1894 }
1895
1896 /* Literal observation. Heuristic: use the top 2 bits and low 1 bits of the
1897 * literal, for 8 possible literal observation types. */
1898 static forceinline void
observe_literal(struct block_split_stats * stats,u8 lit)1899 observe_literal(struct block_split_stats *stats, u8 lit)
1900 {
1901 stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
1902 stats->num_new_observations++;
1903 }
1904
1905 /* Match observation. Heuristic: use one observation type for "short match" and
1906 * one observation type for "long match". */
1907 static forceinline void
observe_match(struct block_split_stats * stats,unsigned length)1908 observe_match(struct block_split_stats *stats, unsigned length)
1909 {
1910 stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
1911 stats->num_new_observations++;
1912 }
1913
1914 static bool
do_end_block_check(struct block_split_stats * stats,u32 block_length)1915 do_end_block_check(struct block_split_stats *stats, u32 block_length)
1916 {
1917 int i;
1918
1919 if (stats->num_observations > 0) {
1920
1921 /* Note: to avoid slow divisions, we do not divide by
1922 * 'num_observations', but rather do all math with the numbers
1923 * multiplied by 'num_observations'. */
1924 u32 total_delta = 0;
1925 for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1926 u32 expected = stats->observations[i] * stats->num_new_observations;
1927 u32 actual = stats->new_observations[i] * stats->num_observations;
1928 u32 delta = (actual > expected) ? actual - expected :
1929 expected - actual;
1930 total_delta += delta;
1931 }
1932
1933 /* Ready to end the block? */
1934 if (total_delta + (block_length / 4096) * stats->num_observations >=
1935 NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations)
1936 return true;
1937 }
1938
1939 for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1940 stats->num_observations += stats->new_observations[i];
1941 stats->observations[i] += stats->new_observations[i];
1942 stats->new_observations[i] = 0;
1943 }
1944 stats->num_new_observations = 0;
1945 return false;
1946 }
1947
1948 static forceinline bool
should_end_block(struct block_split_stats * stats,const u8 * in_block_begin,const u8 * in_next,const u8 * in_end)1949 should_end_block(struct block_split_stats *stats,
1950 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
1951 {
1952 /* Ready to check block split statistics? */
1953 if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
1954 in_next - in_block_begin < MIN_BLOCK_LENGTH ||
1955 in_end - in_next < MIN_BLOCK_LENGTH)
1956 return false;
1957
1958 return do_end_block_check(stats, in_next - in_block_begin);
1959 }
1960
1961 /******************************************************************************/
1962
/*
 * This is the "greedy" DEFLATE compressor.  At each position it always
 * chooses the longest match found, or a literal when no match of at least
 * DEFLATE_MIN_MATCH_LEN exists.  Returns the compressed size in bytes (via
 * deflate_flush_output), or 0 on output overflow.
 */
static size_t
deflate_compress_greedy(struct libdeflate_compressor * restrict c,
			const u8 * restrict in, size_t in_nbytes,
			u8 * restrict out, size_t out_nbytes_avail)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;	/* window base used by the matchfinder */
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};

	deflate_init_output(&os, out, out_nbytes_avail);
	hc_matchfinder_init(&c->p.g.hc_mf);

	do {
		/* Starting a new DEFLATE block. */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
		u32 litrunlen = 0;
		struct deflate_sequence *next_seq = c->p.g.sequences;

		init_block_split_stats(&c->split_stats);
		deflate_reset_symbol_frequencies(c);

		do {
			u32 length;
			u32 offset;

			/* Decrease the maximum and nice match lengths if we're
			 * approaching the end of the input buffer. */
			if (unlikely(max_len > in_end - in_next)) {
				max_len = in_end - in_next;
				nice_len = MIN(nice_len, max_len);
			}

			/* Search for the longest match at this position. */
			length = hc_matchfinder_longest_match(&c->p.g.hc_mf,
							      &in_cur_base,
							      in_next,
							      DEFLATE_MIN_MATCH_LEN - 1,
							      max_len,
							      nice_len,
							      c->max_search_depth,
							      next_hashes,
							      &offset);

			if (length >= DEFLATE_MIN_MATCH_LEN) {
				/* Match found.  Record it, then advance the
				 * matchfinder past the matched bytes. */
				deflate_choose_match(c, length, offset,
						     &litrunlen, &next_seq);
				observe_match(&c->split_stats, length);
				in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
									&in_cur_base,
									in_next + 1,
									in_end,
									length - 1,
									next_hashes);
			} else {
				/* No match found.  Record a literal. */
				deflate_choose_literal(c, *in_next, &litrunlen);
				observe_literal(&c->split_stats, *in_next);
				in_next++;
			}

			/* Check if it's time to output another block. */
		} while (in_next < in_max_block_end &&
			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));

		deflate_finish_sequence(next_seq, litrunlen);
		deflate_flush_block(c, &os, in_block_begin,
				    in_next - in_block_begin,
				    in_next == in_end, false);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}
2045
/*
 * This is the "lazy" DEFLATE compressor.  Before choosing a match, it checks
 * to see if there's a longer match at the next position.  If yes, it outputs
 * a literal and continues to the next position.  If no, it outputs the match.
 * Returns the compressed size in bytes, or 0 on output overflow.
 */
static size_t
deflate_compress_lazy(struct libdeflate_compressor * restrict c,
		      const u8 * restrict in, size_t in_nbytes,
		      u8 * restrict out, size_t out_nbytes_avail)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;	/* window base used by the matchfinder */
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};

	deflate_init_output(&os, out, out_nbytes_avail);
	hc_matchfinder_init(&c->p.g.hc_mf);

	do {
		/* Starting a new DEFLATE block. */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
		u32 litrunlen = 0;
		struct deflate_sequence *next_seq = c->p.g.sequences;

		init_block_split_stats(&c->split_stats);
		deflate_reset_symbol_frequencies(c);

		do {
			unsigned cur_len;
			unsigned cur_offset;
			unsigned next_len;
			unsigned next_offset;

			/* Decrease the maximum and nice match lengths if we're
			 * approaching the end of the input buffer. */
			if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
				max_len = in_end - in_next;
				nice_len = MIN(nice_len, max_len);
			}

			/* Find the longest match at the current position. */
			cur_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
							       &in_cur_base,
							       in_next,
							       DEFLATE_MIN_MATCH_LEN - 1,
							       max_len,
							       nice_len,
							       c->max_search_depth,
							       next_hashes,
							       &cur_offset);
			in_next += 1;

			if (cur_len < DEFLATE_MIN_MATCH_LEN) {
				/* No match found.  Choose a literal.  Note
				 * that 'in_next' was already advanced. */
				deflate_choose_literal(c, *(in_next - 1), &litrunlen);
				observe_literal(&c->split_stats, *(in_next - 1));
				continue;
			}

		have_cur_match:
			observe_match(&c->split_stats, cur_len);

			/* We have a match at the current position. */

			/* If the current match is very long, choose it
			 * immediately. */
			if (cur_len >= nice_len) {
				deflate_choose_match(c, cur_len, cur_offset,
						     &litrunlen, &next_seq);
				in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
									&in_cur_base,
									in_next,
									in_end,
									cur_len - 1,
									next_hashes);
				continue;
			}

			/*
			 * Try to find a match at the next position.
			 *
			 * Note: since we already have a match at the *current*
			 * position, we use only half the 'max_search_depth'
			 * when checking the *next* position.  This is a useful
			 * trade-off because it's more worthwhile to use a
			 * greater search depth on the initial match.
			 *
			 * Note: it's possible to structure the code such that
			 * there's only one call to longest_match(), which
			 * handles both the "find the initial match" and "try to
			 * find a longer match" cases.  However, it is faster to
			 * have two call sites, with longest_match() inlined at
			 * each.
			 */
			if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
				max_len = in_end - in_next;
				nice_len = MIN(nice_len, max_len);
			}
			next_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
								&in_cur_base,
								in_next,
								cur_len,
								max_len,
								nice_len,
								c->max_search_depth / 2,
								next_hashes,
								&next_offset);
			in_next += 1;

			if (next_len > cur_len) {
				/* Found a longer match at the next position.
				 * Output a literal.  Then the next match
				 * becomes the current match. */
				deflate_choose_literal(c, *(in_next - 2), &litrunlen);
				cur_len = next_len;
				cur_offset = next_offset;
				goto have_cur_match;
			}

			/* No longer match at the next position.  Output the
			 * current match.  'in_next' is already 2 bytes past
			 * the match start, hence 'cur_len - 2'. */
			deflate_choose_match(c, cur_len, cur_offset,
					     &litrunlen, &next_seq);
			in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
								&in_cur_base,
								in_next,
								in_end,
								cur_len - 2,
								next_hashes);

			/* Check if it's time to output another block. */
		} while (in_next < in_max_block_end &&
			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));

		deflate_finish_sequence(next_seq, litrunlen);
		deflate_flush_block(c, &os, in_block_begin,
				    in_next - in_block_begin,
				    in_next == in_end, false);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}
2192
2193 #if SUPPORT_NEAR_OPTIMAL_PARSING
2194
2195 /*
2196 * Follow the minimum-cost path in the graph of possible match/literal choices
2197 * for the current block and compute the frequencies of the Huffman symbols that
2198 * would be needed to output those matches and literals.
2199 */
2200 static void
deflate_tally_item_list(struct libdeflate_compressor * c,u32 block_length)2201 deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
2202 {
2203 struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
2204 struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
2205 do {
2206 unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
2207 unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
2208
2209 if (length == 1) {
2210 /* Literal */
2211 c->freqs.litlen[offset]++;
2212 } else {
2213 /* Match */
2214 c->freqs.litlen[257 + deflate_length_slot[length]]++;
2215 c->freqs.offset[deflate_get_offset_slot(c, offset)]++;
2216 }
2217 cur_node += length;
2218 } while (cur_node != end_node);
2219 }
2220
2221 /* Set the current cost model from the codeword lengths specified in @lens. */
2222 static void
deflate_set_costs_from_codes(struct libdeflate_compressor * c,const struct deflate_lens * lens)2223 deflate_set_costs_from_codes(struct libdeflate_compressor *c,
2224 const struct deflate_lens *lens)
2225 {
2226 unsigned i;
2227
2228 /* Literals */
2229 for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
2230 u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS);
2231 c->p.n.costs.literal[i] = bits << COST_SHIFT;
2232 }
2233
2234 /* Lengths */
2235 for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
2236 unsigned length_slot = deflate_length_slot[i];
2237 unsigned litlen_sym = 257 + length_slot;
2238 u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
2239 bits += deflate_extra_length_bits[length_slot];
2240 c->p.n.costs.length[i] = bits << COST_SHIFT;
2241 }
2242
2243 /* Offset slots */
2244 for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
2245 u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS);
2246 bits += deflate_extra_offset_bits[i];
2247 c->p.n.costs.offset_slot[i] = bits << COST_SHIFT;
2248 }
2249 }
2250
2251 static forceinline u32
deflate_default_literal_cost(unsigned literal)2252 deflate_default_literal_cost(unsigned literal)
2253 {
2254 STATIC_ASSERT(COST_SHIFT == 3);
2255 /* 66 is 8.25 bits/symbol */
2256 return 66;
2257 }
2258
2259 static forceinline u32
deflate_default_length_slot_cost(unsigned length_slot)2260 deflate_default_length_slot_cost(unsigned length_slot)
2261 {
2262 STATIC_ASSERT(COST_SHIFT == 3);
2263 /* 60 is 7.5 bits/symbol */
2264 return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT);
2265 }
2266
2267 static forceinline u32
deflate_default_offset_slot_cost(unsigned offset_slot)2268 deflate_default_offset_slot_cost(unsigned offset_slot)
2269 {
2270 STATIC_ASSERT(COST_SHIFT == 3);
2271 /* 39 is 4.875 bits/symbol */
2272 return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT);
2273 }
2274
2275 /*
2276 * Set default symbol costs for the first block's first optimization pass.
2277 *
2278 * It works well to assume that each symbol is equally probable. This results
2279 * in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 <<
2280 * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding
2281 * alphabet. However, we intentionally bias the parse towards matches rather
2282 * than literals by using a slightly lower default cost for length symbols than
2283 * for literals. This often improves the compression ratio slightly.
2284 */
2285 static void
deflate_set_default_costs(struct libdeflate_compressor * c)2286 deflate_set_default_costs(struct libdeflate_compressor *c)
2287 {
2288 unsigned i;
2289
2290 /* Literals */
2291 for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
2292 c->p.n.costs.literal[i] = deflate_default_literal_cost(i);
2293
2294 /* Lengths */
2295 for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
2296 c->p.n.costs.length[i] = deflate_default_length_slot_cost(
2297 deflate_length_slot[i]);
2298
2299 /* Offset slots */
2300 for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
2301 c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i);
2302 }
2303
2304 static forceinline void
deflate_adjust_cost(u32 * cost_p,u32 default_cost)2305 deflate_adjust_cost(u32 *cost_p, u32 default_cost)
2306 {
2307 *cost_p += ((s32)default_cost - (s32)*cost_p) >> 1;
2308 }
2309
2310 /*
2311 * Adjust the costs when beginning a new block.
2312 *
2313 * Since the current costs have been optimized for the data, it's undesirable to
2314 * throw them away and start over with the default costs. At the same time, we
2315 * don't want to bias the parse by assuming that the next block will be similar
2316 * to the current block. As a compromise, make the costs closer to the
2317 * defaults, but don't simply set them to the defaults.
2318 */
2319 static void
deflate_adjust_costs(struct libdeflate_compressor * c)2320 deflate_adjust_costs(struct libdeflate_compressor *c)
2321 {
2322 unsigned i;
2323
2324 /* Literals */
2325 for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
2326 deflate_adjust_cost(&c->p.n.costs.literal[i],
2327 deflate_default_literal_cost(i));
2328
2329 /* Lengths */
2330 for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
2331 deflate_adjust_cost(&c->p.n.costs.length[i],
2332 deflate_default_length_slot_cost(
2333 deflate_length_slot[i]));
2334
2335 /* Offset slots */
2336 for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
2337 deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
2338 deflate_default_offset_slot_cost(i));
2339 }
2340
/*
 * Find the minimum-cost path through the graph of possible match/literal
 * choices for this block.
 *
 * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which
 * represents the node at the beginning of the block, to
 * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of
 * the block.  Edge costs are evaluated using the cost model 'c->p.n.costs'.
 *
 * The algorithm works backwards, starting at the end node and proceeding
 * backwards one node at a time.  At each node, the minimum cost to reach the
 * end node is computed and the match/literal choice that begins that path is
 * saved.
 *
 * 'cache_ptr' points one past the match-cache entries for the block; the
 * cache is consumed backwards in step with the nodes.  Each position's
 * trailing cache entry stores the number of matches found there ('length')
 * and the literal byte at that position ('offset').
 */
static void
deflate_find_min_cost_path(struct libdeflate_compressor *c,
			   const u32 block_length,
			   const struct lz_match *cache_ptr)
{
	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
	struct deflate_optimum_node *cur_node = end_node;

	cur_node->cost_to_end = 0;
	do {
		unsigned num_matches;
		unsigned literal;
		u32 best_cost_to_end;

		cur_node--;
		cache_ptr--;

		num_matches = cache_ptr->length;
		literal = cache_ptr->offset;

		/* It's always possible to choose a literal. */
		best_cost_to_end = c->p.n.costs.literal[literal] +
				   (cur_node + 1)->cost_to_end;
		cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;

		/* Also consider matches if there are any. */
		if (num_matches) {
			const struct lz_match *match;
			unsigned len;
			unsigned offset;
			unsigned offset_slot;
			u32 offset_cost;
			u32 cost_to_end;

			/*
			 * Consider each length from the minimum
			 * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
			 * match found at this position.  For each length, we
			 * consider only the smallest offset for which that
			 * length is available.  Although this is not guaranteed
			 * to be optimal due to the possibility of a larger
			 * offset costing less than a smaller offset to code,
			 * this is a very useful heuristic.
			 */
			match = cache_ptr - num_matches;
			len = DEFLATE_MIN_MATCH_LEN;
			do {
				offset = match->offset;
				offset_slot = deflate_get_offset_slot(c, offset);
				offset_cost = c->p.n.costs.offset_slot[offset_slot];
				do {
					cost_to_end = offset_cost +
						      c->p.n.costs.length[len] +
						      (cur_node + len)->cost_to_end;
					if (cost_to_end < best_cost_to_end) {
						best_cost_to_end = cost_to_end;
						cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len;
					}
				} while (++len <= match->length);
			} while (++match != cache_ptr);
			cache_ptr -= num_matches;
		}
		cur_node->cost_to_end = best_cost_to_end;
	} while (cur_node != &c->p.n.optimum_nodes[0]);
}
2420
2421 /*
2422 * Choose the literal/match sequence to use for the current block. The basic
2423 * algorithm finds a minimum-cost path through the block's graph of
2424 * literal/match choices, given a cost model. However, the cost of each symbol
2425 * is unknown until the Huffman codes have been built, but at the same time the
2426 * Huffman codes depend on the frequencies of chosen symbols. Consequently,
2427 * multiple passes must be used to try to approximate an optimal solution. The
2428 * first pass uses default costs, mixed with the costs from the previous block
2429 * if any. Later passes use the Huffman codeword lengths from the previous pass
2430 * as the costs.
2431 */
2432 static void
deflate_optimize_block(struct libdeflate_compressor * c,u32 block_length,const struct lz_match * cache_ptr,bool is_first_block)2433 deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
2434 const struct lz_match *cache_ptr, bool is_first_block)
2435 {
2436 unsigned num_passes_remaining = c->p.n.num_optim_passes;
2437 u32 i;
2438
2439 /* Force the block to really end at the desired length, even if some
2440 * matches extend beyond it. */
2441 for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
2442 ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
2443 c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
2444
2445 /* Set the initial costs. */
2446 if (is_first_block)
2447 deflate_set_default_costs(c);
2448 else
2449 deflate_adjust_costs(c);
2450
2451 for (;;) {
2452 /* Find the minimum cost path for this pass. */
2453 deflate_find_min_cost_path(c, block_length, cache_ptr);
2454
2455 /* Compute frequencies of the chosen symbols. */
2456 deflate_reset_symbol_frequencies(c);
2457 deflate_tally_item_list(c, block_length);
2458
2459 if (--num_passes_remaining == 0)
2460 break;
2461
2462 /* At least one optimization pass remains; update the costs. */
2463 deflate_make_huffman_codes(&c->freqs, &c->codes);
2464 deflate_set_costs_from_codes(c, &c->codes.lens);
2465 }
2466 }
2467
2468 /*
2469 * This is the "near-optimal" DEFLATE compressor. It computes the optimal
2470 * representation of each DEFLATE block using a minimum-cost path search over
2471 * the graph of possible match/literal choices for that block, assuming a
2472 * certain cost for each Huffman symbol.
2473 *
2474 * For several reasons, the end result is not guaranteed to be optimal:
2475 *
2476 * - Nonoptimal choice of blocks
2477 * - Heuristic limitations on which matches are actually considered
2478 * - Symbol costs are unknown until the symbols have already been chosen
2479 * (so iterative optimization must be used)
2480 */
2481 static size_t
deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,const u8 * restrict in,size_t in_nbytes,u8 * restrict out,size_t out_nbytes_avail)2482 deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
2483 const u8 * restrict in, size_t in_nbytes,
2484 u8 * restrict out, size_t out_nbytes_avail)
2485 {
2486 const u8 *in_next = in;
2487 const u8 *in_end = in_next + in_nbytes;
2488 struct deflate_output_bitstream os;
2489 const u8 *in_cur_base = in_next;
2490 const u8 *in_next_slide = in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
2491 unsigned max_len = DEFLATE_MAX_MATCH_LEN;
2492 unsigned nice_len = MIN(c->nice_match_length, max_len);
2493 u32 next_hashes[2] = {0, 0};
2494
2495 deflate_init_output(&os, out, out_nbytes_avail);
2496 bt_matchfinder_init(&c->p.n.bt_mf);
2497
2498 do {
2499 /* Starting a new DEFLATE block. */
2500
2501 struct lz_match *cache_ptr = c->p.n.match_cache;
2502 const u8 * const in_block_begin = in_next;
2503 const u8 * const in_max_block_end =
2504 in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
2505 const u8 *next_observation = in_next;
2506
2507 init_block_split_stats(&c->split_stats);
2508
2509 /*
2510 * Find matches until we decide to end the block. We end the
2511 * block if any of the following is true:
2512 *
2513 * (1) Maximum block length has been reached
		 * (2) Match cache may overflow.
2515 * (3) Block split heuristic says to split now.
2516 */
2517 do {
2518 struct lz_match *matches;
2519 unsigned best_len;
2520
2521 /* Slide the window forward if needed. */
2522 if (in_next == in_next_slide) {
2523 bt_matchfinder_slide_window(&c->p.n.bt_mf);
2524 in_cur_base = in_next;
2525 in_next_slide = in_next + MIN(in_end - in_next,
2526 MATCHFINDER_WINDOW_SIZE);
2527 }
2528
2529 /* Decrease the maximum and nice match lengths if we're
2530 * approaching the end of the input buffer. */
2531 if (unlikely(max_len > in_end - in_next)) {
2532 max_len = in_end - in_next;
2533 nice_len = MIN(nice_len, max_len);
2534 }
2535
2536 /*
2537 * Find matches with the current position using the
2538 * binary tree matchfinder and save them in
2539 * 'match_cache'.
2540 *
2541 * Note: the binary tree matchfinder is more suited for
2542 * optimal parsing than the hash chain matchfinder. The
2543 * reasons for this include:
2544 *
2545 * - The binary tree matchfinder can find more matches
2546 * in the same number of steps.
2547 * - One of the major advantages of hash chains is that
2548 * skipping positions (not searching for matches at
2549 * them) is faster; however, with optimal parsing we
2550 * search for matches at almost all positions, so this
2551 * advantage of hash chains is negated.
2552 */
2553 matches = cache_ptr;
2554 best_len = 0;
2555 if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
2556 cache_ptr = bt_matchfinder_get_matches(&c->p.n.bt_mf,
2557 in_cur_base,
2558 in_next - in_cur_base,
2559 max_len,
2560 nice_len,
2561 c->max_search_depth,
2562 next_hashes,
2563 &best_len,
2564 matches);
2565 }
2566
2567 if (in_next >= next_observation) {
2568 if (best_len >= 4) {
2569 observe_match(&c->split_stats, best_len);
2570 next_observation = in_next + best_len;
2571 } else {
2572 observe_literal(&c->split_stats, *in_next);
2573 next_observation = in_next + 1;
2574 }
2575 }
2576
2577 cache_ptr->length = cache_ptr - matches;
2578 cache_ptr->offset = *in_next;
2579 in_next++;
2580 cache_ptr++;
2581
2582 /*
2583 * If there was a very long match found, don't cache any
2584 * matches for the bytes covered by that match. This
2585 * avoids degenerate behavior when compressing highly
2586 * redundant data, where the number of matches can be
2587 * very large.
2588 *
2589 * This heuristic doesn't actually hurt the compression
2590 * ratio very much. If there's a long match, then the
2591 * data must be highly compressible, so it doesn't
2592 * matter much what we do.
2593 */
2594 if (best_len >= DEFLATE_MIN_MATCH_LEN && best_len >= nice_len) {
2595 --best_len;
2596 do {
2597 if (in_next == in_next_slide) {
2598 bt_matchfinder_slide_window(&c->p.n.bt_mf);
2599 in_cur_base = in_next;
2600 in_next_slide = in_next + MIN(in_end - in_next,
2601 MATCHFINDER_WINDOW_SIZE);
2602 }
2603 if (unlikely(max_len > in_end - in_next)) {
2604 max_len = in_end - in_next;
2605 nice_len = MIN(nice_len, max_len);
2606 }
2607 if (max_len >= BT_MATCHFINDER_REQUIRED_NBYTES) {
2608 bt_matchfinder_skip_position(&c->p.n.bt_mf,
2609 in_cur_base,
2610 in_next - in_cur_base,
2611 nice_len,
2612 c->max_search_depth,
2613 next_hashes);
2614 }
2615 cache_ptr->length = 0;
2616 cache_ptr->offset = *in_next;
2617 in_next++;
2618 cache_ptr++;
2619 } while (--best_len);
2620 }
2621 } while (in_next < in_max_block_end &&
2622 cache_ptr < &c->p.n.match_cache[CACHE_LENGTH] &&
2623 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
2624
2625 /* All the matches for this block have been cached. Now choose
2626 * the sequence of items to output and flush the block. */
2627 deflate_optimize_block(c, in_next - in_block_begin, cache_ptr,
2628 in_block_begin == in);
2629 deflate_flush_block(c, &os, in_block_begin, in_next - in_block_begin,
2630 in_next == in_end, true);
2631 } while (in_next != in_end);
2632
2633 return deflate_flush_output(&os);
2634 }
2635
2636 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
2637
2638 /* Initialize c->offset_slot_fast. */
2639 static void
deflate_init_offset_slot_fast(struct libdeflate_compressor * c)2640 deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
2641 {
2642 unsigned offset_slot;
2643 unsigned offset;
2644 unsigned offset_end;
2645
2646 for (offset_slot = 0;
2647 offset_slot < ARRAY_LEN(deflate_offset_slot_base);
2648 offset_slot++)
2649 {
2650 offset = deflate_offset_slot_base[offset_slot];
2651 #if USE_FULL_OFFSET_SLOT_FAST
2652 offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
2653 do {
2654 c->offset_slot_fast[offset] = offset_slot;
2655 } while (++offset != offset_end);
2656 #else
2657 if (offset <= 256) {
2658 offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
2659 do {
2660 c->offset_slot_fast[offset - 1] = offset_slot;
2661 } while (++offset != offset_end);
2662 } else {
2663 offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
2664 do {
2665 c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot;
2666 } while ((offset += (1 << 7)) != offset_end);
2667 }
2668 #endif
2669 }
2670 }
2671
2672 LIBDEFLATEAPI struct libdeflate_compressor *
libdeflate_alloc_compressor(int compression_level)2673 libdeflate_alloc_compressor(int compression_level)
2674 {
2675 struct libdeflate_compressor *c;
2676 size_t size;
2677
2678 #if SUPPORT_NEAR_OPTIMAL_PARSING
2679 if (compression_level >= 8)
2680 size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.n);
2681 else
2682 #endif
2683 size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.g);
2684
2685 c = aligned_malloc(MATCHFINDER_ALIGNMENT, size);
2686 if (!c)
2687 return NULL;
2688
2689 switch (compression_level) {
2690 case 1:
2691 c->impl = deflate_compress_greedy;
2692 c->max_search_depth = 2;
2693 c->nice_match_length = 8;
2694 break;
2695 case 2:
2696 c->impl = deflate_compress_greedy;
2697 c->max_search_depth = 6;
2698 c->nice_match_length = 10;
2699 break;
2700 case 3:
2701 c->impl = deflate_compress_greedy;
2702 c->max_search_depth = 12;
2703 c->nice_match_length = 14;
2704 break;
2705 case 4:
2706 c->impl = deflate_compress_greedy;
2707 c->max_search_depth = 24;
2708 c->nice_match_length = 24;
2709 break;
2710 case 5:
2711 c->impl = deflate_compress_lazy;
2712 c->max_search_depth = 20;
2713 c->nice_match_length = 30;
2714 break;
2715 case 6:
2716 c->impl = deflate_compress_lazy;
2717 c->max_search_depth = 40;
2718 c->nice_match_length = 65;
2719 break;
2720 case 7:
2721 c->impl = deflate_compress_lazy;
2722 c->max_search_depth = 100;
2723 c->nice_match_length = 130;
2724 break;
2725 #if SUPPORT_NEAR_OPTIMAL_PARSING
2726 case 8:
2727 c->impl = deflate_compress_near_optimal;
2728 c->max_search_depth = 12;
2729 c->nice_match_length = 20;
2730 c->p.n.num_optim_passes = 1;
2731 break;
2732 case 9:
2733 c->impl = deflate_compress_near_optimal;
2734 c->max_search_depth = 16;
2735 c->nice_match_length = 26;
2736 c->p.n.num_optim_passes = 2;
2737 break;
2738 case 10:
2739 c->impl = deflate_compress_near_optimal;
2740 c->max_search_depth = 30;
2741 c->nice_match_length = 50;
2742 c->p.n.num_optim_passes = 2;
2743 break;
2744 case 11:
2745 c->impl = deflate_compress_near_optimal;
2746 c->max_search_depth = 60;
2747 c->nice_match_length = 80;
2748 c->p.n.num_optim_passes = 3;
2749 break;
2750 case 12:
2751 c->impl = deflate_compress_near_optimal;
2752 c->max_search_depth = 100;
2753 c->nice_match_length = 133;
2754 c->p.n.num_optim_passes = 4;
2755 break;
2756 #else
2757 case 8:
2758 c->impl = deflate_compress_lazy;
2759 c->max_search_depth = 150;
2760 c->nice_match_length = 200;
2761 break;
2762 case 9:
2763 c->impl = deflate_compress_lazy;
2764 c->max_search_depth = 200;
2765 c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
2766 break;
2767 #endif
2768 default:
2769 aligned_free(c);
2770 return NULL;
2771 }
2772
2773 c->compression_level = compression_level;
2774
2775 deflate_init_offset_slot_fast(c);
2776 deflate_init_static_codes(c);
2777
2778 return c;
2779 }
2780
2781 LIBDEFLATEAPI size_t
libdeflate_deflate_compress(struct libdeflate_compressor * c,const void * in,size_t in_nbytes,void * out,size_t out_nbytes_avail)2782 libdeflate_deflate_compress(struct libdeflate_compressor *c,
2783 const void *in, size_t in_nbytes,
2784 void *out, size_t out_nbytes_avail)
2785 {
2786 if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING))
2787 return 0;
2788
2789 /* For extremely small inputs just use a single uncompressed block. */
2790 if (unlikely(in_nbytes < 16)) {
2791 struct deflate_output_bitstream os;
2792 deflate_init_output(&os, out, out_nbytes_avail);
2793 if (in_nbytes == 0)
2794 in = &os; /* Avoid passing NULL to memcpy() */
2795 deflate_write_uncompressed_block(&os, in, in_nbytes, true);
2796 return deflate_flush_output(&os);
2797 }
2798
2799 return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail);
2800 }
2801
/* Free a compressor allocated by libdeflate_alloc_compressor(). */
LIBDEFLATEAPI void
libdeflate_free_compressor(struct libdeflate_compressor *c)
{
	/* Must pair with aligned_malloc() used at allocation time. */
	aligned_free(c);
}
2807
/* Return the compression level this compressor was allocated with. */
unsigned int
deflate_get_compression_level(struct libdeflate_compressor *c)
{
	return c->compression_level;
}
2813
2814 LIBDEFLATEAPI size_t
libdeflate_deflate_compress_bound(struct libdeflate_compressor * c,size_t in_nbytes)2815 libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
2816 size_t in_nbytes)
2817 {
2818 /*
2819 * The worst case is all uncompressed blocks where one block has length
2820 * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH.
2821 * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE,
2822 * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN.
2823 */
2824 size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
2825 return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING;
2826 }
2827