1 /*-
2 * Copyright (c) 2014-2018 MongoDB, Inc.
3 * Copyright (c) 2008-2014 WiredTiger, Inc.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 4. Neither the name MongoDB or the name WiredTiger
15 * may be used to endorse or promote products derived from this software
16 * without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY MONGODB INC. ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 #include "wt_internal.h"
32
33 #define __HUFFMAN_DETAIL 0 /* Set to 1 for debugging output. */
34
35 /* Length of header in compressed message, in bits. */
36 #define WT_HUFFMAN_HEADER 3
37
38 /*
39 * Maximum allowed length of Huffman code words, which otherwise can range up
40 * to (#symbols - 1) bits long. Lower value to use less memory for tables,
41 * higher value for better compression. Max value = 16 (or 32-7=25 or 64-7=57
42 * if adjust data types). FYI, JPEG uses 16. A side effect of limiting max
43 * code length is that the worst case compression (a message of the least
44 * frequent symbols) is shorter.
45 */
46 #define MAX_CODE_LENGTH 16
47
48 typedef struct __wt_freqtree_node {
49 /*
50 * Data structure representing a node of the huffman tree. It holds a
51 * 64-bit weight and pointers to the left and right child nodes. The
52 * node either has two child nodes or none.
53 */
54 uint8_t symbol; /* only used in leaf nodes */
55 uint64_t weight;
56 struct __wt_freqtree_node *left; /* bit 0 */
57 struct __wt_freqtree_node *right; /* bit 1 */
58 } WT_FREQTREE_NODE;
59
60 typedef struct __wt_huffman_code {
61 uint16_t pattern; /* requirement: length of field's type
62 * in bits >= MAX_CODE_LENGTH.
63 */
64 uint8_t length;
65 } WT_HUFFMAN_CODE;
66
67 typedef struct __wt_huffman_obj {
68 /*
69 * Data structure here defines specific instance of the encoder/decoder.
70 */
71 u_int numSymbols; /* Symbols: UINT16_MAX or UINT8_MAX */
72
73 uint16_t max_depth, min_depth; /* Tree max/min depths */
74
75 /*
76 * use: codes[symbol] = struct with pattern and length.
77 * Used in encoding and decoding.
78 * memory: codes[0-to-(number of symbols - 1)]
79 */
80 WT_HUFFMAN_CODE *codes;
81
82 /*
83 * use: code2symbol[Huffman_code] = symbol.
84 * Used in decoding.
85 * memory: code2symbol[1 << max_code_length]
86 */
87 uint8_t *code2symbol;
88 } WT_HUFFMAN_OBJ;
89
90 /*
91 * Queue element data structure.
92 *
93 * Consists of a pointer to a huffman tree node, and a pointer to the next
94 * element in the queue.
95 */
96 typedef struct node_queue_elem {
97 WT_FREQTREE_NODE *node;
98 struct node_queue_elem *next;
99 } NODE_QUEUE_ELEM;
100
101 /*
102 * Queue of huffman tree nodes.
103 *
104 * Contains a pointer to the beginning and the end of the queue, which is
105 * implemented as a linked list.
106 */
107 typedef struct node_queue {
108 NODE_QUEUE_ELEM *first;
109 NODE_QUEUE_ELEM *last;
110 } NODE_QUEUE;
111
112 /*
113 * Internal data structure used to preserve the symbol when rearranging the
114 * frequency array.
115 */
116 typedef struct __indexed_byte {
117 uint32_t symbol; /* not uint8_t: match external data structure */
118 uint32_t frequency;
119 } INDEXED_SYMBOL;
120
121 static int WT_CDECL indexed_freq_compare(const void *, const void *);
122 static int WT_CDECL indexed_symbol_compare(const void *, const void *);
123 static void make_table(
124 WT_SESSION_IMPL *, uint8_t *, uint16_t, WT_HUFFMAN_CODE *, u_int);
125 static void node_queue_close(WT_SESSION_IMPL *, NODE_QUEUE *);
126 static void node_queue_dequeue(
127 WT_SESSION_IMPL *, NODE_QUEUE *, WT_FREQTREE_NODE **);
128 static int node_queue_enqueue(
129 WT_SESSION_IMPL *, NODE_QUEUE *, WT_FREQTREE_NODE *);
130 static uint32_t profile_tree(
131 WT_FREQTREE_NODE *, uint16_t, uint16_t *, uint16_t *);
132 static void recursive_free_node(WT_SESSION_IMPL *, WT_FREQTREE_NODE *);
133 static void set_codes(WT_FREQTREE_NODE *, WT_HUFFMAN_CODE *, uint16_t, uint8_t);
134
135 #define node_queue_is_empty(queue) \
136 ((queue) == NULL || (queue)->first == NULL)
137
138 /*
139 * indexed_symbol_compare --
140 * Qsort comparator to order the table by symbol, lowest to highest.
141 */
142 static int WT_CDECL
indexed_symbol_compare(const void * a,const void * b)143 indexed_symbol_compare(const void *a, const void *b)
144 {
145 return (((INDEXED_SYMBOL *)a)->symbol >
146 ((INDEXED_SYMBOL *)b)->symbol ? 1 :
147 (((INDEXED_SYMBOL *)a)->symbol <
148 ((INDEXED_SYMBOL *)b)->symbol ? -1 : 0));
149 }
150
151 /*
152 * indexed_freq_compare --
153 * Qsort comparator to order the table by frequency (the most frequent
154 * symbols will be at the end of the array).
155 */
156 static int WT_CDECL
indexed_freq_compare(const void * a,const void * b)157 indexed_freq_compare(const void *a, const void *b)
158 {
159 return (((INDEXED_SYMBOL *)a)->frequency >
160 ((INDEXED_SYMBOL *)b)->frequency ? 1 :
161 (((INDEXED_SYMBOL *)a)->frequency <
162 ((INDEXED_SYMBOL *)b)->frequency ? -1 : 0));
163 }
164
165 /*
166 * profile_tree --
167 * Traverses tree to determine #leaves under each node, max depth, min
168 * depth of leaf.
169 */
170 static uint32_t
profile_tree(WT_FREQTREE_NODE * node,uint16_t len,uint16_t * max_depth,uint16_t * min_depth)171 profile_tree(WT_FREQTREE_NODE *node,
172 uint16_t len, uint16_t *max_depth, uint16_t *min_depth)
173 {
174 uint32_t leaf_cnt;
175
176 if (node->left == NULL && node->right == NULL) { /* leaf */
177 leaf_cnt = 1;
178 if (*max_depth < len)
179 *max_depth = len;
180 if (*min_depth > len)
181 *min_depth = len;
182 } else {
183 /*
184 * internal node -- way tree constructed internal always has
185 * left and right children
186 */
187 leaf_cnt =
188 profile_tree(node->left, len + 1, max_depth, min_depth) +
189 profile_tree(node->right, len + 1, max_depth, min_depth);
190 }
191 node->weight = leaf_cnt; /* abuse weight field */
192 return (leaf_cnt);
193 }
194
195 /*
196 * set_codes --
197 * Computes Huffman code for each symbol in tree.
198 *
199 * Method is standard way in the literature, except that limits maximum code
200 * length. A known max code length is important for limiting memory use by
201 * the tables and for knowing how large data types need to be such as the field
202 * that holds the code pattern.
203 */
204 static void
set_codes(WT_FREQTREE_NODE * node,WT_HUFFMAN_CODE * codes,uint16_t pattern,uint8_t len)205 set_codes(WT_FREQTREE_NODE *node,
206 WT_HUFFMAN_CODE *codes, uint16_t pattern, uint8_t len)
207 {
208 WT_HUFFMAN_CODE *code;
209 uint16_t patternleft, patternright, half;
210 uint8_t remaining;
211
212 if (node->left == NULL && node->right == NULL) {
213 code = &codes[node->symbol];
214 code->pattern = pattern;
215 code->length = len;
216 #if __HUFFMAN_DETAIL
217 printf("%" PRIx16 ": code %" PRIx16 ", len %" PRIu8 "\n",
218 node->symbol, pattern, len);
219 #endif
220 } else {
221 /*
222 * Check each subtree individually to see if can afford to split
223 * up bits into possibly shorter codes, or if need to employ all
224 * remaining bits up to MAX_CODE_LENGTH to consecutively number
225 * leaves.
226 */
227 remaining = MAX_CODE_LENGTH - len;
228 /*
229 * If not already in "low-bit mode", but need to be, open up
230 * lower-order bits for consecutive numbering.
231 */
232 if (len < MAX_CODE_LENGTH &&
233 ((half = (uint16_t)(1 << (remaining - 1))) <
234 node->left->weight || half < node->right->weight)) {
235 pattern = (uint16_t)(pattern << remaining);
236 len = MAX_CODE_LENGTH;
237 }
238
239 if (len < MAX_CODE_LENGTH) {
240 patternleft = (uint16_t)((pattern << 1) | 0);
241 patternright = (uint16_t)((pattern << 1) | 1);
242 len++;
243 } else { /* "low bit mode" */
244 patternleft = pattern;
245 patternright = (uint16_t)(pattern + node->left->weight);
246 /* len unchanged */
247 }
248
249 set_codes(node->left, codes, patternleft, len);
250 set_codes(node->right, codes, patternright, len);
251 }
252 }
253
254 /*
255 * make_table --
256 * Computes Huffman table used for subsequent lookups in encoding and
257 * decoding. With the table, encoding from a symbol to Huffman code and
258 * decoding from a code to a symbol are simple array lookups.
259 */
260 static void
make_table(WT_SESSION_IMPL * session,uint8_t * code2symbol,uint16_t max_depth,WT_HUFFMAN_CODE * codes,u_int symcnt)261 make_table(WT_SESSION_IMPL *session, uint8_t *code2symbol,
262 uint16_t max_depth, WT_HUFFMAN_CODE *codes, u_int symcnt)
263 {
264 uint32_t j, c1, c2; /* Exceeds uint16_t bounds at loop boundary. */
265 uint16_t c, i;
266 uint8_t len, shift;
267
268 /* Zero out, for assertion below. */
269 for (j = 0, c2 = (1U << max_depth); j < c2; j++)
270 code2symbol[j] = 0;
271
272 /*
273 * Here's the magic: flood all bit patterns for lower-order bits to
274 * point to same symbol.
275 */
276 for (i = 0; i < symcnt; i++) {
277 if ((len = codes[i].length) == 0)
278 continue;
279
280 /*
281 * The size of the array index should be enough to hold largest
282 * index into symbol table. Pre-existing symbols were packed
283 * 0-255, so 8 bits is enough. Don't want to make it larger
284 * than necessary, we allocate (2 ^ max-code-length) of them.
285 */
286 c = codes[i].pattern;
287 shift = (uint8_t)(max_depth - len);
288 c1 = (uint32_t)c << shift;
289 c2 = (uint32_t)(c + 1) << shift;
290 for (j = c1; j < c2; j++) {
291 WT_ASSERT(session, code2symbol[j] == 0);
292 code2symbol[j] = (uint8_t)i;
293 }
294 }
295 }
296
297 /*
298 * recursive_free_node --
299 * Recursively free the huffman frequency tree's nodes.
300 */
301 static void
recursive_free_node(WT_SESSION_IMPL * session,WT_FREQTREE_NODE * node)302 recursive_free_node(WT_SESSION_IMPL *session, WT_FREQTREE_NODE *node)
303 {
304 if (node != NULL) {
305 recursive_free_node(session, node->left);
306 recursive_free_node(session, node->right);
307 __wt_free(session, node);
308 }
309 }
310
311 /*
312 * __wt_huffman_open --
313 * Take a frequency table and return a pointer to a descriptor object.
314 */
315 int
__wt_huffman_open(WT_SESSION_IMPL * session,void * symbol_frequency_array,u_int symcnt,u_int numbytes,void * retp)316 __wt_huffman_open(WT_SESSION_IMPL *session,
317 void *symbol_frequency_array, u_int symcnt, u_int numbytes, void *retp)
318 {
319 INDEXED_SYMBOL *indexed_freqs, *sym;
320 NODE_QUEUE *combined_nodes, *leaves;
321 WT_DECL_RET;
322 WT_FREQTREE_NODE *node, *node2, **refnode, *tempnode;
323 WT_HUFFMAN_OBJ *huffman;
324 uint64_t w1, w2;
325 uint16_t i;
326
327 indexed_freqs = NULL;
328 combined_nodes = leaves = NULL;
329 node = node2 = tempnode = NULL;
330
331 WT_RET(__wt_calloc_one(session, &huffman));
332
333 /*
334 * The frequency table is 4B pairs of symbol and frequency. The symbol
335 * is either 1 or 2 bytes and the frequency ranges from 1 to UINT32_MAX
336 * (a frequency of 0 means the value is never expected to appear in the
337 * input). Validate the symbols are within range.
338 */
339 if (numbytes != 1 && numbytes != 2)
340 WT_ERR_MSG(session, EINVAL,
341 "illegal number of symbol bytes specified for a huffman "
342 "table");
343
344 if (symcnt == 0)
345 WT_ERR_MSG(session, EINVAL,
346 "illegal number of symbols specified for a huffman table");
347
348 huffman->numSymbols = numbytes == 2 ? UINT16_MAX : UINT8_MAX;
349
350 /*
351 * Order the array by symbol and check for invalid symbols and
352 * duplicates.
353 */
354 sym = symbol_frequency_array;
355 __wt_qsort(sym, symcnt, sizeof(INDEXED_SYMBOL), indexed_symbol_compare);
356 for (i = 0; i < symcnt; ++i) {
357 if (i > 0 && sym[i].symbol == sym[i - 1].symbol)
358 WT_ERR_MSG(session, EINVAL,
359 "duplicate symbol %" PRIu32 " (%#" PRIx32 ") "
360 "specified in a huffman table",
361 sym[i].symbol, sym[i].symbol);
362 if (sym[i].symbol > huffman->numSymbols)
363 WT_ERR_MSG(session, EINVAL,
364 "out-of-range symbol %" PRIu32 " (%#" PRIx32 ") "
365 "specified in a huffman table",
366 sym[i].symbol, sym[i].symbol);
367 }
368
369 /*
370 * Massage frequencies.
371 */
372 WT_ERR(__wt_calloc_def(session, 256, &indexed_freqs));
373
374 /*
375 * Minimum of frequency==1 so everybody gets a Huffman code, in case
376 * data evolves and we need to represent this value.
377 */
378 for (i = 0; i < 256; i++) {
379 sym = &indexed_freqs[i];
380 sym->symbol = i;
381 sym->frequency = 1;
382 }
383 /*
384 * Avoid large tables by splitting UTF-16 frequencies into high byte
385 * and low byte.
386 */
387 for (i = 0; i < symcnt; i++) {
388 sym = &((INDEXED_SYMBOL *)symbol_frequency_array)[i];
389 indexed_freqs[sym->symbol & 0xff].frequency += sym->frequency;
390 if (numbytes == 2)
391 indexed_freqs[(sym->symbol >> 8) & 0xff].frequency +=
392 sym->frequency;
393 }
394 huffman->numSymbols = symcnt = 256;
395
396 /*
397 * The array must be sorted by frequency to be able to use a linear time
398 * construction algorithm.
399 */
400 __wt_qsort((void *)indexed_freqs,
401 symcnt, sizeof(INDEXED_SYMBOL), indexed_freq_compare);
402
403 /* We need two node queues to build the tree. */
404 WT_ERR(__wt_calloc_one(session, &leaves));
405 WT_ERR(__wt_calloc_one(session, &combined_nodes));
406
407 /*
408 * Adding the leaves to the queue.
409 *
410 * Discard symbols with a frequency of 0; this assumes these symbols
411 * never occur in the source stream, and the purpose is to reduce the
412 * huffman tree's size.
413 */
414 for (i = 0; i < symcnt; ++i)
415 if (indexed_freqs[i].frequency > 0) {
416 WT_ERR(__wt_calloc_one(session, &tempnode));
417 tempnode->symbol = (uint8_t)indexed_freqs[i].symbol;
418 tempnode->weight = indexed_freqs[i].frequency;
419 WT_ERR(node_queue_enqueue(session, leaves, tempnode));
420 tempnode = NULL;
421 }
422
423 while (!node_queue_is_empty(leaves) ||
424 !node_queue_is_empty(combined_nodes)) {
425 /*
426 * We have to get the node with the smaller weight, examining
427 * both queues' first element. We are collecting pairs of these
428 * items, by alternating between node and node2:
429 */
430 refnode = !node ? &node : &node2;
431
432 /*
433 * To decide which queue must be used, we get the weights of
434 * the first items from both:
435 */
436 w1 = node_queue_is_empty(leaves) ?
437 UINT64_MAX : leaves->first->node->weight;
438 w2 = node_queue_is_empty(combined_nodes) ?
439 UINT64_MAX : combined_nodes->first->node->weight;
440
441 /*
442 * Based on the two weights we finally can dequeue the smaller
443 * element and place it to the alternating target node pointer:
444 */
445 if (w1 < w2)
446 node_queue_dequeue(session, leaves, refnode);
447 else
448 node_queue_dequeue(session, combined_nodes, refnode);
449
450 /*
451 * In every second run, we have both node and node2 initialized.
452 */
453 if (node != NULL && node2 != NULL) {
454 WT_ERR(__wt_calloc_one(session, &tempnode));
455
456 /* The new weight is the sum of the two weights. */
457 tempnode->weight = node->weight + node2->weight;
458 tempnode->left = node;
459 tempnode->right = node2;
460
461 /* Enqueue it to the combined nodes queue */
462 WT_ERR(node_queue_enqueue(
463 session, combined_nodes, tempnode));
464 tempnode = NULL;
465
466 /* Reset the state pointers */
467 node = node2 = NULL;
468 }
469 }
470
471 /*
472 * The remaining node is in the node variable, this is the root of the
473 * tree. Calculate how many bytes it takes to hold numSymbols bytes
474 * bits.
475 */
476 huffman->max_depth = 0;
477 huffman->min_depth = MAX_CODE_LENGTH;
478 (void)profile_tree(node, 0, &huffman->max_depth, &huffman->min_depth);
479 if (huffman->max_depth > MAX_CODE_LENGTH)
480 huffman->max_depth = MAX_CODE_LENGTH;
481
482 WT_ERR(__wt_calloc_def(session, huffman->numSymbols, &huffman->codes));
483 set_codes(node, huffman->codes, 0, 0);
484
485 WT_ERR(__wt_calloc_def(
486 session, (size_t)1U << huffman->max_depth, &huffman->code2symbol));
487 make_table(session, huffman->code2symbol,
488 huffman->max_depth, huffman->codes, huffman->numSymbols);
489
490 #if __HUFFMAN_DETAIL
491 {
492 uint8_t symbol;
493 uint32_t weighted_length;
494
495 printf("leaf depth %" PRIu16 "..%" PRIu16
496 ", memory use: codes %u# * %" WT_SIZET_FMT
497 "B + code2symbol %u# * %" WT_SIZET_FMT "B\n",
498 huffman->min_depth, huffman->max_depth,
499 huffman->numSymbols, sizeof(WT_HUFFMAN_CODE),
500 1U << huffman->max_depth, sizeof(uint16_t));
501
502 /*
503 * measure quality of computed Huffman codes, for different max bit
504 * lengths (say, 16 vs 24 vs 32)
505 */
506 weighted_length = 0;
507 for (i = 0; i < symcnt; i++) {
508 symbol = indexed_freqs[i].symbol;
509 weighted_length +=
510 indexed_freqs[i].frequency * huffman->codes[symbol].length;
511 printf(
512 "\t%" PRIu16 "->%" PRIu16 ". %" PRIu32 " * %" PRIu8 "\n",
513 i, symbol,
514 indexed_freqs[i].frequency, huffman->codes[symbol].length);
515 }
516 printf("weighted length of all codes (the smaller the better): "
517 "%" PRIu32 "\n", weighted_length);
518 }
519 #endif
520
521 *(void **)retp = huffman;
522
523 err: __wt_free(session, indexed_freqs);
524 if (leaves != NULL)
525 node_queue_close(session, leaves);
526 if (combined_nodes != NULL)
527 node_queue_close(session, combined_nodes);
528 if (node != NULL)
529 recursive_free_node(session, node);
530 if (node2 != NULL)
531 recursive_free_node(session, node2);
532 __wt_free(session, tempnode);
533 if (ret != 0)
534 __wt_huffman_close(session, huffman);
535 return (ret);
536 }
537
538 /*
539 * __wt_huffman_close --
540 * Discard a Huffman descriptor object.
541 */
542 void
__wt_huffman_close(WT_SESSION_IMPL * session,void * huffman_arg)543 __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg)
544 {
545 WT_HUFFMAN_OBJ *huffman;
546
547 huffman = huffman_arg;
548
549 __wt_free(session, huffman->code2symbol);
550 __wt_free(session, huffman->codes);
551 __wt_free(session, huffman);
552 }
553
554 #if __HUFFMAN_DETAIL
555 /*
556 * __wt_print_huffman_code --
557 * Prints a symbol's Huffman code.
558 */
559 void
__wt_print_huffman_code(void * huffman_arg,uint16_t symbol)560 __wt_print_huffman_code(void *huffman_arg, uint16_t symbol)
561 {
562 WT_HUFFMAN_CODE code;
563 WT_HUFFMAN_OBJ *huffman;
564
565 huffman = huffman_arg;
566
567 if (symbol >= huffman->numSymbols)
568 printf("symbol %" PRIu16 " out of range\n", symbol);
569 else {
570 code = huffman->codes[symbol];
571 if (code.length == 0)
572 printf(
573 "symbol %" PRIu16 " not defined -- 0 frequency\n",
574 symbol);
575 else
576 /* should print code as binary */
577 printf(
578 "%" PRIu16 " -> code pattern "
579 "%" PRIx16 ", length %" PRIu8 "\n",
580 symbol, code.pattern, code.length);
581 }
582 }
583 #endif
584
585 /*
586 * __wt_huffman_encode --
587 * Take a byte string, encode it into the target.
588 *
589 * Translation from symbol to Huffman code is a simple array lookup.
590 *
591 * WT_HUFFMAN_OBJ contains an array called 'codes' with one WT_HUFFMAN_CODE per
592 * symbol. Then, given a symbol:
593 * pattern = codes[symbol].pattern;
594 * length = codes[symbol].length;
595 *
596 * To encode byte-string, we iterate over the input symbols. For each symbol,
597 * look it up via table, shift bits onto a shift register (an int long enough
598 * to hold the longest code word + up to 7 bits remaining from the previous),
599 * then drain out full bytes. Finally, at the end flush remaining bits
600 * and write header bits.
601 */
602 int
__wt_huffman_encode(WT_SESSION_IMPL * session,void * huffman_arg,const uint8_t * from_arg,size_t from_len,WT_ITEM * to_buf)603 __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg,
604 const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf)
605 {
606 WT_DECL_RET;
607 WT_HUFFMAN_CODE code;
608 WT_HUFFMAN_OBJ *huffman;
609 WT_ITEM *tmp;
610 size_t max_len, outlen, bytes;
611 uint64_t bitpos;
612 const uint8_t *from;
613 uint8_t len, *out, padding_info, symbol;
614
615 /*
616 * Shift register to accumulate bits from input.
617 * Should be >= (MAX_CODE_LENGTH + 7), but also efficient to shift bits
618 * and preferably in a machine register.
619 */
620 uint32_t bits;
621
622 /* Count of bits in shift register ('bits' above). */
623 uint8_t valid;
624
625 huffman = huffman_arg;
626 from = from_arg;
627 tmp = NULL;
628
629 /*
630 * We don't want to find all of our callers and ensure they don't pass
631 * 0-length byte strings, but there's no reason to do any work.
632 */
633 if (from_len == 0) {
634 to_buf->size = 0;
635 return (0);
636 }
637
638 /*
639 * Compute the largest compressed output size, which is if all symbols
640 * are least frequent and so have largest Huffman codes, and compressed
641 * output may be larger than the input size. This way we don't have to
642 * worry about resizing the buffer during compression. Use the shared
643 * system buffer while compressing, then allocate a new buffer of the
644 * right size and copy the result into it.
645 */
646 max_len = (WT_HUFFMAN_HEADER +
647 from_len * huffman->max_depth + 7 /* round up to full byte */) / 8;
648 WT_ERR(__wt_scr_alloc(session, max_len, &tmp));
649
650 /*
651 * Leave the first 3 bits of the encoded value empty, it holds the
652 * number of bits actually used in the last byte of the encoded value.
653 */
654 bits = 0;
655 bitpos = WT_HUFFMAN_HEADER;
656 valid = WT_HUFFMAN_HEADER;
657 out = tmp->mem;
658 for (bytes = 0; bytes < from_len; bytes++) {
659 WT_ASSERT(session, WT_PTR_IN_RANGE(from, from_arg, from_len));
660
661 symbol = *from++;
662
663 /* Translate symbol into Huffman code and stuff into buffer. */
664 code = huffman->codes[symbol];
665 len = code.length;
666 bits = (bits << len) | code.pattern;
667 valid += len;
668 bitpos += len;
669 while (valid >= 8) {
670 WT_ASSERT(session,
671 WT_PTR_IN_RANGE(out, tmp->mem, tmp->memsize));
672 *out++ = (uint8_t)(bits >> (valid - 8));
673 valid -= 8;
674 }
675 }
676 if (valid > 0) { /* Flush shift register. */
677 WT_ASSERT(session,
678 WT_PTR_IN_RANGE(out, tmp->mem, tmp->memsize));
679 *out = (uint8_t)(bits << (8 - valid));
680 }
681
682 /*
683 * At this point, bitpos is the total number of used bits (including
684 * the 3 bits at the beginning of the buffer, which we'll set now to
685 * the number of bits used in the last byte). Note if the number of
686 * bits used in the last byte is 8, we set the 3 bits to 0, in other
687 * words, the first 3 bits of the encoded value are the number of bits
688 * used in the last byte, unless they're 0, in which case there are 8
689 * bits used in the last byte.
690 */
691 padding_info = (uint8_t)((bitpos % 8) << (8 - WT_HUFFMAN_HEADER));
692 ((uint8_t *)tmp->mem)[0] |= padding_info;
693
694 /* Copy result of exact known size into caller's buffer. */
695 outlen = (uint32_t)((bitpos + 7) / 8);
696 WT_ERR(__wt_buf_initsize(session, to_buf, outlen));
697 memcpy(to_buf->mem, tmp->mem, outlen);
698
699 #if __HUFFMAN_DETAIL
700 printf("encode: worst case %" PRIu32 " bytes -> actual %" PRIu32 "\n",
701 max_len, outlen);
702 #endif
703
704 err: __wt_scr_free(session, &tmp);
705 return (ret);
706
707 }
708
709 /*
710 * __wt_huffman_decode --
711 * Take a byte string, decode it into the target.
712 *
713 * Translation from Huffman code to symbol is a simple array lookup.
714 *
715 * WT_HUFFMAN_OBJ contains an array called 'code2symbol' indexed by code word
716 * and whose value is the corresponding symbol.
717 * From the symbol, we index into the 'codes' array to get the code length.
718 *
719 * When decoding a message, we don't know where the boundaries are between
720 * codes. The trick is that we collect enough bits for the longest code word,
721 * and construct the table such that for codes with fewer bits we flood the
722 * table with all of the bit patterns in the lower order bits. This works
723 * because the Huffman code is a unique prefix, and by the flooding we are
724 * treating bits beyond the unique prefix as don't care bits.
725 *
726 * For example, we have table of length 2^max_code_length (1<<max_code_length).
727 * For a code of length, max_code_length, the position code2symbol[code] =
728 * symbol.
729 * For a code word of (max_length - 1), we fill code2symbol[code << 1] = symbol,
730 * as well as code2symbol[(code << 1) | 1] = symbol.
731 * And so on, so in general we fill:
732 * code2symbol[(code) << shift inclusive .. (code+1) << shift exclusive].
733 *
734 * To decode a message, we read in enough bits from input to fill the shift
735 * register with at least MAX_CODE_LENGTH bits.
736 * We look up in the table code2symbol to obtain the symbol.
737 * We look up the symbol in 'codes' to obtain the code length
738 * Finally, subtract off these bits from the shift register.
739 */
740 int
__wt_huffman_decode(WT_SESSION_IMPL * session,void * huffman_arg,const uint8_t * from_arg,size_t from_len,WT_ITEM * to_buf)741 __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg,
742 const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf)
743 {
744 WT_DECL_RET;
745 WT_HUFFMAN_OBJ *huffman;
746 WT_ITEM *tmp;
747 size_t from_bytes, len, max_len, outlen;
748 uint64_t from_len_bits;
749 uint32_t bits, mask, max;
750 uint16_t pattern;
751 const uint8_t *from;
752 uint8_t padding_info, symbol, *to, valid;
753
754 huffman = huffman_arg;
755 from = from_arg;
756 tmp = NULL;
757
758 /*
759 * We don't want to find all of our callers and ensure they don't pass
760 * 0-length byte strings, but there's no reason to do any work.
761 */
762 if (from_len == 0) {
763 to_buf->size = 0;
764 return (0);
765 }
766
767 /*
768 * The first 3 bits are the number of used bits in the last byte, unless
769 * they're 0, in which case there are 8 bits used in the last byte.
770 */
771 padding_info = (*from & 0xE0) >> (8 - WT_HUFFMAN_HEADER);
772 from_len_bits = from_len * 8;
773 if (padding_info != 0)
774 from_len_bits -= 8U - padding_info;
775
776 /* Number of bits that have codes. */
777 from_len_bits -= WT_HUFFMAN_HEADER;
778
779 /*
780 * Compute largest uncompressed output size, which is if all symbols are
781 * most frequent and so have smallest Huffman codes and therefore
782 * largest expansion. Use the shared system buffer while uncompressing,
783 * then allocate a new buffer of exactly the right size and copy the
784 * result into it.
785 */
786 max_len = (uint32_t)(from_len_bits / huffman->min_depth);
787 WT_ERR(__wt_scr_alloc(session, max_len, &tmp));
788 to = tmp->mem;
789
790 /* The first byte of input is a special case because of header bits. */
791 bits = *from++;
792 valid = 8 - WT_HUFFMAN_HEADER;
793 from_bytes = from_len - 1;
794
795 max = huffman->max_depth;
796 mask = (1U << max) - 1;
797 for (outlen = 0; from_len_bits > 0; outlen++) {
798 while (valid < max && from_bytes > 0) {
799 WT_ASSERT(session,
800 WT_PTR_IN_RANGE(from, from_arg, from_len));
801 bits = (bits << 8) | *from++;
802 valid += 8;
803 from_bytes--;
804 }
805 pattern = (uint16_t)
806 (valid >= max ? /* short patterns near end */
807 (bits >> (valid - max)) : (bits << (max - valid)));
808 symbol = huffman->code2symbol[pattern & mask];
809 len = huffman->codes[symbol].length;
810 valid -= (uint8_t)len;
811
812 /*
813 * from_len_bits is the total number of input bits, reduced by
814 * the number of bits we consume from input at each step. For
815 * all but the last step from_len_bits > len, then at the last
816 * step from_len_bits == len (in other words, from_len_bits -
817 * len = 0 input bits remaining). Generally, we cannot detect
818 * corruption during huffman decompression, this is one place
819 * where that's not true.
820 */
821 if (from_len_bits < len) /* corrupted */
822 WT_ERR_MSG(session, EINVAL,
823 "huffman decompression detected input corruption");
824 from_len_bits -= len;
825
826 WT_ASSERT(session,
827 WT_PTR_IN_RANGE(to, tmp->mem, tmp->memsize));
828 *to++ = symbol;
829 }
830
831 /* Return the number of bytes used. */
832 WT_ERR(__wt_buf_initsize(session, to_buf, outlen));
833 memcpy(to_buf->mem, tmp->mem, outlen);
834
835 #if __HUFFMAN_DETAIL
836 printf("decode: worst case %" PRIu32 " bytes -> actual %" PRIu32 "\n",
837 max_len, outlen);
838 #endif
839
840 err: __wt_scr_free(session, &tmp);
841 return (ret);
842 }
843
844 /*
845 * node_queue_close --
846 * Delete a queue from memory.
847 *
848 * It does not delete the pointed huffman tree nodes!
849 */
850 static void
node_queue_close(WT_SESSION_IMPL * session,NODE_QUEUE * queue)851 node_queue_close(WT_SESSION_IMPL *session, NODE_QUEUE *queue)
852 {
853 NODE_QUEUE_ELEM *elem, *next_elem;
854
855 /* Freeing each element of the queue's linked list. */
856 for (elem = queue->first; elem != NULL; elem = next_elem) {
857 next_elem = elem->next;
858 __wt_free(session, elem);
859 }
860
861 /* Freeing the queue record itself. */
862 __wt_free(session, queue);
863 }
864
865 /*
866 * node_queue_enqueue --
867 * Push a tree node to the end of the queue.
868 */
869 static int
node_queue_enqueue(WT_SESSION_IMPL * session,NODE_QUEUE * queue,WT_FREQTREE_NODE * node)870 node_queue_enqueue(
871 WT_SESSION_IMPL *session, NODE_QUEUE *queue, WT_FREQTREE_NODE *node)
872 {
873 NODE_QUEUE_ELEM *elem;
874
875 /* Allocating a new linked list element */
876 WT_RET(__wt_calloc_one(session, &elem));
877
878 /* It holds the tree node, and has no next element yet */
879 elem->node = node;
880 elem->next = NULL;
881
882 /* If the queue is empty, the first element will be the new one. */
883 if (queue->first == NULL)
884 queue->first = elem;
885
886 /*
887 * If the queue is not empty, the last element's next pointer must be
888 * updated.
889 */
890 if (queue->last != NULL)
891 queue->last->next = elem;
892
893 /* The last element is the new one */
894 queue->last = elem;
895
896 return (0);
897 }
898
899 /*
900 * node_queue_dequeue --
901 * Removes a node from the beginning of the queue and copies the node's
902 * pointer to the location referred by the retp parameter.
903 */
904 static void
node_queue_dequeue(WT_SESSION_IMPL * session,NODE_QUEUE * queue,WT_FREQTREE_NODE ** retp)905 node_queue_dequeue(
906 WT_SESSION_IMPL *session, NODE_QUEUE *queue, WT_FREQTREE_NODE **retp)
907 {
908 NODE_QUEUE_ELEM *first_elem;
909
910 /*
911 * Getting the first element of the queue and updating it to point to
912 * the next element as first.
913 */
914 first_elem = queue->first;
915 *retp = first_elem->node;
916 queue->first = first_elem->next;
917
918 /*
919 * If the last element was the dequeued element, we have to update it
920 * to NULL.
921 */
922 if (queue->last == first_elem)
923 queue->last = NULL;
924
925 /* Freeing the linked list element that has been dequeued */
926 __wt_free(session, first_elem);
927 }
928