1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 4. Neither the name MongoDB or the name WiredTiger
15  *    may be used to endorse or promote products derived from this software
16  *    without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY MONGODB INC. ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include "wt_internal.h"
32 
33 #define	__HUFFMAN_DETAIL	0	/* Set to 1 for debugging output. */
34 
35 /* Length of header in compressed message, in bits. */
36 #define	WT_HUFFMAN_HEADER 	3
37 
38 /*
39  * Maximum allowed length of Huffman code words, which otherwise can range up
40  * to (#symbols - 1) bits long.  Lower value to use less memory for tables,
41  * higher value for better compression.  Max value = 16 (or 32-7=25 or 64-7=57
42  * if adjust data types).  FYI, JPEG uses 16.  A side effect of limiting max
43  * code length is that the worst case compression (a message of the least
44  * frequent symbols) is shorter.
45  */
46 #define	MAX_CODE_LENGTH		16
47 
48 typedef struct __wt_freqtree_node {
49 	/*
50 	 * Data structure representing a node of the huffman tree. It holds a
51 	 * 64-bit weight and pointers to the left and right child nodes.  The
52 	 * node either has two child nodes or none.
53 	 */
54 	uint8_t  symbol;			/* only used in leaf nodes */
55 	uint64_t weight;
56 	struct __wt_freqtree_node *left;	/* bit 0 */
57 	struct __wt_freqtree_node *right;	/* bit 1 */
58 } WT_FREQTREE_NODE;
59 
60 typedef struct __wt_huffman_code {
61 	uint16_t pattern;		/* requirement: length of field's type
62 					 * in bits >= MAX_CODE_LENGTH.
63 					 */
64 	uint8_t length;
65 } WT_HUFFMAN_CODE;
66 
67 typedef struct __wt_huffman_obj {
68 	/*
69 	 * Data structure here defines specific instance of the encoder/decoder.
70 	 */
71 	u_int	numSymbols;		/* Symbols: UINT16_MAX or UINT8_MAX */
72 
73 	uint16_t max_depth, min_depth;	/* Tree max/min depths */
74 
75 	/*
76 	 * use: codes[symbol] = struct with pattern and length.
77 	 * Used in encoding and decoding.
78 	 * memory: codes[0-to-(number of symbols - 1)]
79 	 */
80 	WT_HUFFMAN_CODE *codes;
81 
82 	/*
83 	 * use: code2symbol[Huffman_code] = symbol.
84 	 * Used in decoding.
85 	 * memory: code2symbol[1 << max_code_length]
86 	 */
87 	uint8_t *code2symbol;
88 } WT_HUFFMAN_OBJ;
89 
90 /*
91  * Queue element data structure.
92  *
93  * Consists of a pointer to a huffman tree node, and a pointer to the next
94  * element in the queue.
95  */
96 typedef struct node_queue_elem {
97 	WT_FREQTREE_NODE *node;
98 	struct node_queue_elem *next;
99 } NODE_QUEUE_ELEM;
100 
101 /*
102  * Queue of huffman tree nodes.
103  *
104  * Contains a pointer to the beginning and the end of the queue, which is
105  * implemented as a linked list.
106  */
107 typedef struct node_queue {
108 	NODE_QUEUE_ELEM *first;
109 	NODE_QUEUE_ELEM *last;
110 } NODE_QUEUE;
111 
112 /*
113  * Internal data structure used to preserve the symbol when rearranging the
114  * frequency array.
115  */
116 typedef struct __indexed_byte {
117 	uint32_t symbol;	/* not uint8_t: match external data structure */
118 	uint32_t frequency;
119 } INDEXED_SYMBOL;
120 
121 static int WT_CDECL indexed_freq_compare(const void *, const void *);
122 static int WT_CDECL indexed_symbol_compare(const void *, const void *);
123 static void make_table(
124 	WT_SESSION_IMPL *, uint8_t *, uint16_t, WT_HUFFMAN_CODE *, u_int);
125 static void node_queue_close(WT_SESSION_IMPL *, NODE_QUEUE *);
126 static void node_queue_dequeue(
127 	WT_SESSION_IMPL *, NODE_QUEUE *, WT_FREQTREE_NODE **);
128 static int  node_queue_enqueue(
129 	WT_SESSION_IMPL *, NODE_QUEUE *, WT_FREQTREE_NODE *);
130 static uint32_t profile_tree(
131 	WT_FREQTREE_NODE *, uint16_t, uint16_t *, uint16_t *);
132 static void recursive_free_node(WT_SESSION_IMPL *, WT_FREQTREE_NODE *);
133 static void set_codes(WT_FREQTREE_NODE *, WT_HUFFMAN_CODE *, uint16_t, uint8_t);
134 
135 #define	node_queue_is_empty(queue)					\
136 	((queue) == NULL || (queue)->first == NULL)
137 
138 /*
139  * indexed_symbol_compare --
140  *	Qsort comparator to order the table by symbol, lowest to highest.
141  */
142 static int WT_CDECL
indexed_symbol_compare(const void * a,const void * b)143 indexed_symbol_compare(const void *a, const void *b)
144 {
145 	return (((INDEXED_SYMBOL *)a)->symbol >
146 	    ((INDEXED_SYMBOL *)b)->symbol ? 1 :
147 	    (((INDEXED_SYMBOL *)a)->symbol <
148 	    ((INDEXED_SYMBOL *)b)->symbol ? -1 : 0));
149 }
150 
151 /*
152  * indexed_freq_compare --
153  *	Qsort comparator to order the table by frequency (the most frequent
154  * symbols will be at the end of the array).
155  */
156 static int WT_CDECL
indexed_freq_compare(const void * a,const void * b)157 indexed_freq_compare(const void *a, const void *b)
158 {
159 	return (((INDEXED_SYMBOL *)a)->frequency >
160 	    ((INDEXED_SYMBOL *)b)->frequency ? 1 :
161 	    (((INDEXED_SYMBOL *)a)->frequency <
162 	    ((INDEXED_SYMBOL *)b)->frequency ? -1 : 0));
163 }
164 
165 /*
166  * profile_tree --
167  *	Traverses tree to determine #leaves under each node, max depth, min
168  *	depth of leaf.
169  */
170 static uint32_t
profile_tree(WT_FREQTREE_NODE * node,uint16_t len,uint16_t * max_depth,uint16_t * min_depth)171 profile_tree(WT_FREQTREE_NODE *node,
172     uint16_t len, uint16_t *max_depth, uint16_t *min_depth)
173 {
174 	uint32_t leaf_cnt;
175 
176 	if (node->left == NULL && node->right == NULL) {	/* leaf */
177 		leaf_cnt = 1;
178 		if (*max_depth < len)
179 			*max_depth = len;
180 		if (*min_depth > len)
181 			*min_depth = len;
182 	} else {
183 		/*
184 		 * internal node -- way tree constructed internal always has
185 		 * left and right children
186 		 */
187 		leaf_cnt =
188 		    profile_tree(node->left, len + 1, max_depth, min_depth) +
189 		    profile_tree(node->right, len + 1, max_depth, min_depth);
190 	}
191 	node->weight = leaf_cnt;		/* abuse weight field */
192 	return (leaf_cnt);
193 }
194 
195 /*
196  * set_codes --
197  *	Computes Huffman code for each symbol in tree.
198  *
199  * Method is standard way in the literature, except that limits maximum code
200  * length.  A known max code length is important for limiting memory use by
201  * the tables and for knowing how large data types need to be such as the field
202  * that holds the code pattern.
203  */
204 static void
set_codes(WT_FREQTREE_NODE * node,WT_HUFFMAN_CODE * codes,uint16_t pattern,uint8_t len)205 set_codes(WT_FREQTREE_NODE *node,
206     WT_HUFFMAN_CODE *codes, uint16_t pattern, uint8_t len)
207 {
208 	WT_HUFFMAN_CODE *code;
209 	uint16_t patternleft, patternright, half;
210 	uint8_t remaining;
211 
212 	if (node->left == NULL && node->right == NULL) {
213 		code = &codes[node->symbol];
214 		code->pattern = pattern;
215 		code->length = len;
216 #if __HUFFMAN_DETAIL
217 		printf("%" PRIx16 ": code %" PRIx16 ", len %" PRIu8 "\n",
218 		    node->symbol, pattern, len);
219 #endif
220 	} else {
221 		/*
222 		 * Check each subtree individually to see if can afford to split
223 		 * up bits into possibly shorter codes, or if need to employ all
224 		 * remaining bits up to MAX_CODE_LENGTH to consecutively number
225 		 * leaves.
226 		 */
227 		remaining = MAX_CODE_LENGTH - len;
228 		/*
229 		 * If not already in "low-bit mode", but need to be, open up
230 		 * lower-order bits for consecutive numbering.
231 		 */
232 		if (len < MAX_CODE_LENGTH &&
233 		    ((half = (uint16_t)(1 << (remaining - 1))) <
234 		    node->left->weight || half < node->right->weight)) {
235 			pattern = (uint16_t)(pattern << remaining);
236 			len = MAX_CODE_LENGTH;
237 		}
238 
239 		if (len < MAX_CODE_LENGTH) {
240 			patternleft = (uint16_t)((pattern << 1) | 0);
241 			patternright = (uint16_t)((pattern << 1) | 1);
242 			len++;
243 		} else {			/* "low bit mode" */
244 			patternleft = pattern;
245 			patternright = (uint16_t)(pattern + node->left->weight);
246 						/* len unchanged */
247 		}
248 
249 		set_codes(node->left, codes, patternleft, len);
250 		set_codes(node->right, codes, patternright, len);
251 	}
252 }
253 
254 /*
255  * make_table --
256  *	Computes Huffman table used for subsequent lookups in encoding and
257  * decoding.  With the table, encoding from a symbol to Huffman code and
258  * decoding from a code to a symbol are simple array lookups.
259  */
260 static void
make_table(WT_SESSION_IMPL * session,uint8_t * code2symbol,uint16_t max_depth,WT_HUFFMAN_CODE * codes,u_int symcnt)261 make_table(WT_SESSION_IMPL *session, uint8_t *code2symbol,
262     uint16_t max_depth, WT_HUFFMAN_CODE *codes, u_int symcnt)
263 {
264 	uint32_t j, c1, c2;	/* Exceeds uint16_t bounds at loop boundary. */
265 	uint16_t c, i;
266 	uint8_t len, shift;
267 
268 	/* Zero out, for assertion below. */
269 	for (j = 0, c2 = (1U << max_depth); j < c2; j++)
270 		code2symbol[j] = 0;
271 
272 	/*
273 	 * Here's the magic: flood all bit patterns for lower-order bits to
274 	 * point to same symbol.
275 	 */
276 	for (i = 0; i < symcnt; i++) {
277 		if ((len = codes[i].length) == 0)
278 			continue;
279 
280 		/*
281 		 * The size of the array index should be enough to hold largest
282 		 * index into symbol table.  Pre-existing symbols were packed
283 		 * 0-255, so 8 bits is enough.  Don't want to make it larger
284 		 * than necessary, we allocate (2 ^ max-code-length) of them.
285 		 */
286 		c = codes[i].pattern;
287 		shift = (uint8_t)(max_depth - len);
288 		c1 = (uint32_t)c << shift;
289 		c2 = (uint32_t)(c + 1) << shift;
290 		for (j = c1; j < c2; j++) {
291 			WT_ASSERT(session, code2symbol[j] == 0);
292 			code2symbol[j] = (uint8_t)i;
293 		}
294 	}
295 }
296 
297 /*
298  * recursive_free_node --
299  *	Recursively free the huffman frequency tree's nodes.
300  */
301 static void
recursive_free_node(WT_SESSION_IMPL * session,WT_FREQTREE_NODE * node)302 recursive_free_node(WT_SESSION_IMPL *session, WT_FREQTREE_NODE *node)
303 {
304 	if (node != NULL) {
305 		recursive_free_node(session, node->left);
306 		recursive_free_node(session, node->right);
307 		__wt_free(session, node);
308 	}
309 }
310 
311 /*
312  * __wt_huffman_open --
313  *	Take a frequency table and return a pointer to a descriptor object.
314  */
315 int
__wt_huffman_open(WT_SESSION_IMPL * session,void * symbol_frequency_array,u_int symcnt,u_int numbytes,void * retp)316 __wt_huffman_open(WT_SESSION_IMPL *session,
317     void *symbol_frequency_array, u_int symcnt, u_int numbytes, void *retp)
318 {
319 	INDEXED_SYMBOL *indexed_freqs, *sym;
320 	NODE_QUEUE *combined_nodes, *leaves;
321 	WT_DECL_RET;
322 	WT_FREQTREE_NODE *node, *node2, **refnode, *tempnode;
323 	WT_HUFFMAN_OBJ *huffman;
324 	uint64_t w1, w2;
325 	uint16_t i;
326 
327 	indexed_freqs = NULL;
328 	combined_nodes = leaves = NULL;
329 	node = node2 = tempnode = NULL;
330 
331 	WT_RET(__wt_calloc_one(session, &huffman));
332 
333 	/*
334 	 * The frequency table is 4B pairs of symbol and frequency.  The symbol
335 	 * is either 1 or 2 bytes and the frequency ranges from 1 to UINT32_MAX
336 	 * (a frequency of 0 means the value is never expected to appear in the
337 	 * input).  Validate the symbols are within range.
338 	 */
339 	if (numbytes != 1 && numbytes != 2)
340 		WT_ERR_MSG(session, EINVAL,
341 		    "illegal number of symbol bytes specified for a huffman "
342 		    "table");
343 
344 	if (symcnt == 0)
345 		WT_ERR_MSG(session, EINVAL,
346 		    "illegal number of symbols specified for a huffman table");
347 
348 	huffman->numSymbols = numbytes == 2 ? UINT16_MAX : UINT8_MAX;
349 
350 	/*
351 	 * Order the array by symbol and check for invalid symbols and
352 	 * duplicates.
353 	 */
354 	sym = symbol_frequency_array;
355 	__wt_qsort(sym, symcnt, sizeof(INDEXED_SYMBOL), indexed_symbol_compare);
356 	for (i = 0; i < symcnt; ++i) {
357 		if (i > 0 && sym[i].symbol == sym[i - 1].symbol)
358 			WT_ERR_MSG(session, EINVAL,
359 			    "duplicate symbol %" PRIu32 " (%#" PRIx32 ") "
360 			    "specified in a huffman table",
361 			    sym[i].symbol, sym[i].symbol);
362 		if (sym[i].symbol > huffman->numSymbols)
363 			WT_ERR_MSG(session, EINVAL,
364 			    "out-of-range symbol %" PRIu32 " (%#" PRIx32 ") "
365 			    "specified in a huffman table",
366 			    sym[i].symbol, sym[i].symbol);
367 	}
368 
369 	/*
370 	 * Massage frequencies.
371 	 */
372 	WT_ERR(__wt_calloc_def(session, 256, &indexed_freqs));
373 
374 	/*
375 	 * Minimum of frequency==1 so everybody gets a Huffman code, in case
376 	 * data evolves and we need to represent this value.
377 	 */
378 	for (i = 0; i < 256; i++) {
379 		sym = &indexed_freqs[i];
380 		sym->symbol = i;
381 		sym->frequency = 1;
382 	}
383 	/*
384 	 * Avoid large tables by splitting UTF-16 frequencies into high byte
385 	 * and low byte.
386 	 */
387 	for (i = 0; i < symcnt; i++) {
388 		sym = &((INDEXED_SYMBOL *)symbol_frequency_array)[i];
389 		indexed_freqs[sym->symbol & 0xff].frequency += sym->frequency;
390 		if (numbytes == 2)
391 			indexed_freqs[(sym->symbol >> 8) & 0xff].frequency +=
392 			    sym->frequency;
393 	}
394 	huffman->numSymbols = symcnt = 256;
395 
396 	/*
397 	 * The array must be sorted by frequency to be able to use a linear time
398 	 * construction algorithm.
399 	 */
400 	__wt_qsort((void *)indexed_freqs,
401 	    symcnt, sizeof(INDEXED_SYMBOL), indexed_freq_compare);
402 
403 	/* We need two node queues to build the tree. */
404 	WT_ERR(__wt_calloc_one(session, &leaves));
405 	WT_ERR(__wt_calloc_one(session, &combined_nodes));
406 
407 	/*
408 	 * Adding the leaves to the queue.
409 	 *
410 	 * Discard symbols with a frequency of 0; this assumes these symbols
411 	 * never occur in the source stream, and the purpose is to reduce the
412 	 * huffman tree's size.
413 	 */
414 	for (i = 0; i < symcnt; ++i)
415 		if (indexed_freqs[i].frequency > 0) {
416 			WT_ERR(__wt_calloc_one(session, &tempnode));
417 			tempnode->symbol = (uint8_t)indexed_freqs[i].symbol;
418 			tempnode->weight = indexed_freqs[i].frequency;
419 			WT_ERR(node_queue_enqueue(session, leaves, tempnode));
420 			tempnode = NULL;
421 		}
422 
423 	while (!node_queue_is_empty(leaves) ||
424 	    !node_queue_is_empty(combined_nodes)) {
425 		/*
426 		 * We have to get the node with the smaller weight, examining
427 		 * both queues' first element.  We are collecting pairs of these
428 		 * items, by alternating between node and node2:
429 		 */
430 		refnode = !node ? &node : &node2;
431 
432 		/*
433 		 * To decide which queue must be used, we get the weights of
434 		 * the first items from both:
435 		 */
436 		w1 = node_queue_is_empty(leaves) ?
437 		    UINT64_MAX : leaves->first->node->weight;
438 		w2 = node_queue_is_empty(combined_nodes) ?
439 		    UINT64_MAX : combined_nodes->first->node->weight;
440 
441 		/*
442 		 * Based on the two weights we finally can dequeue the smaller
443 		 * element and place it to the alternating target node pointer:
444 		 */
445 		if (w1 < w2)
446 			node_queue_dequeue(session, leaves, refnode);
447 		else
448 			node_queue_dequeue(session, combined_nodes, refnode);
449 
450 		/*
451 		 * In every second run, we have both node and node2 initialized.
452 		 */
453 		if (node != NULL && node2 != NULL) {
454 			WT_ERR(__wt_calloc_one(session, &tempnode));
455 
456 			/* The new weight is the sum of the two weights. */
457 			tempnode->weight = node->weight + node2->weight;
458 			tempnode->left = node;
459 			tempnode->right = node2;
460 
461 			/* Enqueue it to the combined nodes queue */
462 			WT_ERR(node_queue_enqueue(
463 			    session, combined_nodes, tempnode));
464 			tempnode = NULL;
465 
466 			/* Reset the state pointers */
467 			node = node2 = NULL;
468 		}
469 	}
470 
471 	/*
472 	 * The remaining node is in the node variable, this is the root of the
473 	 * tree. Calculate how many bytes it takes to hold numSymbols bytes
474 	 * bits.
475 	 */
476 	huffman->max_depth = 0;
477 	huffman->min_depth = MAX_CODE_LENGTH;
478 	(void)profile_tree(node, 0, &huffman->max_depth, &huffman->min_depth);
479 	if (huffman->max_depth > MAX_CODE_LENGTH)
480 		huffman->max_depth = MAX_CODE_LENGTH;
481 
482 	WT_ERR(__wt_calloc_def(session, huffman->numSymbols, &huffman->codes));
483 	set_codes(node, huffman->codes, 0, 0);
484 
485 	WT_ERR(__wt_calloc_def(
486 	    session, (size_t)1U << huffman->max_depth, &huffman->code2symbol));
487 	make_table(session, huffman->code2symbol,
488 	    huffman->max_depth, huffman->codes, huffman->numSymbols);
489 
490 #if __HUFFMAN_DETAIL
491 	{
492 	uint8_t symbol;
493 	uint32_t weighted_length;
494 
495 	printf("leaf depth %" PRIu16 "..%" PRIu16
496 	    ", memory use: codes %u# * %" WT_SIZET_FMT
497 	    "B + code2symbol %u# * %" WT_SIZET_FMT "B\n",
498 	    huffman->min_depth, huffman->max_depth,
499 	    huffman->numSymbols, sizeof(WT_HUFFMAN_CODE),
500 	    1U << huffman->max_depth, sizeof(uint16_t));
501 
502 	/*
503 	 * measure quality of computed Huffman codes, for different max bit
504 	 * lengths (say, 16 vs 24 vs 32)
505 	 */
506 	weighted_length = 0;
507 	for (i = 0; i < symcnt; i++) {
508 		symbol = indexed_freqs[i].symbol;
509 		weighted_length +=
510 		    indexed_freqs[i].frequency * huffman->codes[symbol].length;
511 		printf(
512 		    "\t%" PRIu16 "->%" PRIu16 ". %" PRIu32 " * %" PRIu8 "\n",
513 		    i, symbol,
514 		    indexed_freqs[i].frequency, huffman->codes[symbol].length);
515 	}
516 	printf("weighted length of all codes (the smaller the better): "
517 	    "%" PRIu32 "\n", weighted_length);
518 	}
519 #endif
520 
521 	*(void **)retp = huffman;
522 
523 err:	__wt_free(session, indexed_freqs);
524 	if (leaves != NULL)
525 		node_queue_close(session, leaves);
526 	if (combined_nodes != NULL)
527 		node_queue_close(session, combined_nodes);
528 	if (node != NULL)
529 		recursive_free_node(session, node);
530 	if (node2 != NULL)
531 		recursive_free_node(session, node2);
532 	__wt_free(session, tempnode);
533 	if (ret != 0)
534 		__wt_huffman_close(session, huffman);
535 	return (ret);
536 }
537 
538 /*
539  * __wt_huffman_close --
540  *	Discard a Huffman descriptor object.
541  */
542 void
__wt_huffman_close(WT_SESSION_IMPL * session,void * huffman_arg)543 __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg)
544 {
545 	WT_HUFFMAN_OBJ *huffman;
546 
547 	huffman = huffman_arg;
548 
549 	__wt_free(session, huffman->code2symbol);
550 	__wt_free(session, huffman->codes);
551 	__wt_free(session, huffman);
552 }
553 
554 #if __HUFFMAN_DETAIL
555 /*
556  * __wt_print_huffman_code --
557  *	Prints a symbol's Huffman code.
558  */
559 void
__wt_print_huffman_code(void * huffman_arg,uint16_t symbol)560 __wt_print_huffman_code(void *huffman_arg, uint16_t symbol)
561 {
562 	WT_HUFFMAN_CODE code;
563 	WT_HUFFMAN_OBJ *huffman;
564 
565 	huffman = huffman_arg;
566 
567 	if (symbol >= huffman->numSymbols)
568 		printf("symbol %" PRIu16 " out of range\n", symbol);
569 	else {
570 		code = huffman->codes[symbol];
571 		if (code.length == 0)
572 			printf(
573 			    "symbol %" PRIu16 " not defined -- 0 frequency\n",
574 			    symbol);
575 		else
576 			/* should print code as binary */
577 			printf(
578 			    "%" PRIu16 " -> code pattern "
579 			    "%" PRIx16 ", length %" PRIu8 "\n",
580 				symbol, code.pattern, code.length);
581 	}
582 }
583 #endif
584 
585 /*
586  * __wt_huffman_encode --
587  *	Take a byte string, encode it into the target.
588  *
589  * Translation from symbol to Huffman code is a simple array lookup.
590  *
591  * WT_HUFFMAN_OBJ contains an array called 'codes' with one WT_HUFFMAN_CODE per
592  * symbol.  Then, given a symbol:
593  *	pattern = codes[symbol].pattern;
594  *	length = codes[symbol].length;
595  *
596  * To encode byte-string, we iterate over the input symbols.  For each symbol,
597  * look it up via table, shift bits onto a shift register (an int long enough
598  * to hold the longest code word + up to 7 bits remaining from the previous),
599  * then drain out full bytes.  Finally, at the end flush remaining bits
600  * and write header bits.
601  */
602 int
__wt_huffman_encode(WT_SESSION_IMPL * session,void * huffman_arg,const uint8_t * from_arg,size_t from_len,WT_ITEM * to_buf)603 __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg,
604     const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf)
605 {
606 	WT_DECL_RET;
607 	WT_HUFFMAN_CODE code;
608 	WT_HUFFMAN_OBJ *huffman;
609 	WT_ITEM *tmp;
610 	size_t max_len, outlen, bytes;
611 	uint64_t bitpos;
612 	const uint8_t *from;
613 	uint8_t len, *out, padding_info, symbol;
614 
615 	/*
616 	 * Shift register to accumulate bits from input.
617 	 * Should be >= (MAX_CODE_LENGTH + 7), but also efficient to shift bits
618 	 * and preferably in a machine register.
619 	 */
620 	uint32_t bits;
621 
622 	/* Count of bits in shift register ('bits' above). */
623 	uint8_t valid;
624 
625 	huffman = huffman_arg;
626 	from = from_arg;
627 	tmp = NULL;
628 
629 	/*
630 	 * We don't want to find all of our callers and ensure they don't pass
631 	 * 0-length byte strings, but there's no reason to do any work.
632 	 */
633 	if (from_len == 0) {
634 		to_buf->size = 0;
635 		return (0);
636 	}
637 
638 	/*
639 	 * Compute the largest compressed output size, which is if all symbols
640 	 * are least frequent and so have largest Huffman codes, and compressed
641 	 * output may be larger than the input size.  This way we don't have to
642 	 * worry about resizing the buffer during compression.  Use the shared
643 	 * system buffer while compressing, then allocate a new buffer of the
644 	 * right size and copy the result into it.
645 	 */
646 	max_len = (WT_HUFFMAN_HEADER +
647 	    from_len * huffman->max_depth + 7 /* round up to full byte */) / 8;
648 	WT_ERR(__wt_scr_alloc(session, max_len, &tmp));
649 
650 	/*
651 	 * Leave the first 3 bits of the encoded value empty, it holds the
652 	 * number of bits actually used in the last byte of the encoded value.
653 	 */
654 	bits = 0;
655 	bitpos = WT_HUFFMAN_HEADER;
656 	valid = WT_HUFFMAN_HEADER;
657 	out = tmp->mem;
658 	for (bytes = 0; bytes < from_len; bytes++) {
659 		WT_ASSERT(session, WT_PTR_IN_RANGE(from, from_arg, from_len));
660 
661 		symbol = *from++;
662 
663 		/* Translate symbol into Huffman code and stuff into buffer. */
664 		code = huffman->codes[symbol];
665 		len = code.length;
666 		bits = (bits << len) | code.pattern;
667 		valid += len;
668 		bitpos += len;
669 		while (valid >= 8) {
670 			WT_ASSERT(session,
671 			    WT_PTR_IN_RANGE(out, tmp->mem, tmp->memsize));
672 			*out++ = (uint8_t)(bits >> (valid - 8));
673 			valid -= 8;
674 		}
675 	}
676 	if (valid > 0) {		/* Flush shift register. */
677 		WT_ASSERT(session,
678 		    WT_PTR_IN_RANGE(out, tmp->mem, tmp->memsize));
679 		*out = (uint8_t)(bits << (8 - valid));
680 	}
681 
682 	/*
683 	 * At this point, bitpos is the total number of used bits (including
684 	 * the 3 bits at the beginning of the buffer, which we'll set now to
685 	 * the number of bits used in the last byte). Note if the number of
686 	 * bits used in the last byte is 8, we set the 3 bits to 0, in other
687 	 * words, the first 3 bits of the encoded value are the number of bits
688 	 * used in the last byte, unless they're 0, in which case there are 8
689 	 * bits used in the last byte.
690 	 */
691 	padding_info = (uint8_t)((bitpos % 8) << (8 - WT_HUFFMAN_HEADER));
692 	((uint8_t *)tmp->mem)[0] |= padding_info;
693 
694 	/* Copy result of exact known size into caller's buffer. */
695 	outlen = (uint32_t)((bitpos + 7) / 8);
696 	WT_ERR(__wt_buf_initsize(session, to_buf, outlen));
697 	memcpy(to_buf->mem, tmp->mem, outlen);
698 
699 #if __HUFFMAN_DETAIL
700 	printf("encode: worst case %" PRIu32 " bytes -> actual %" PRIu32 "\n",
701 	    max_len, outlen);
702 #endif
703 
704 err:	__wt_scr_free(session, &tmp);
705 	return (ret);
706 
707 }
708 
709 /*
710  * __wt_huffman_decode --
711  *	Take a byte string, decode it into the target.
712  *
713  * Translation from Huffman code to symbol is a simple array lookup.
714  *
715  * WT_HUFFMAN_OBJ contains an array called 'code2symbol' indexed by code word
716  * and whose value is the corresponding symbol.
717  * From the symbol, we index into the 'codes' array to get the code length.
718  *
719  * When decoding a message, we don't know where the boundaries are between
720  * codes.  The trick is that we collect enough bits for the longest code word,
721  * and construct the table such that for codes with fewer bits we flood the
722  * table with all of the bit patterns in the lower order bits.  This works
723  * because the Huffman code is a unique prefix, and by the flooding we are
724  * treating bits beyond the unique prefix as don't care bits.
725  *
726  * For example, we have table of length 2^max_code_length (1<<max_code_length).
727  * For a code of length, max_code_length, the position code2symbol[code] =
728  *	symbol.
729  * For a code word of (max_length - 1), we fill code2symbol[code << 1] = symbol,
730  * as well as code2symbol[(code << 1) | 1] = symbol.
731  * And so on, so in general we fill:
732  * 	code2symbol[(code) << shift inclusive .. (code+1) << shift exclusive].
733  *
734  * To decode a message, we read in enough bits from input to fill the shift
735  * register with at least MAX_CODE_LENGTH bits.
736  * We look up in the table code2symbol to obtain the symbol.
737  * We look up the symbol in 'codes' to obtain the code length
738  * Finally, subtract off these bits from the shift register.
739  */
740 int
__wt_huffman_decode(WT_SESSION_IMPL * session,void * huffman_arg,const uint8_t * from_arg,size_t from_len,WT_ITEM * to_buf)741 __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg,
742     const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf)
743 {
744 	WT_DECL_RET;
745 	WT_HUFFMAN_OBJ *huffman;
746 	WT_ITEM *tmp;
747 	size_t from_bytes, len, max_len, outlen;
748 	uint64_t from_len_bits;
749 	uint32_t bits, mask, max;
750 	uint16_t pattern;
751 	const uint8_t *from;
752 	uint8_t padding_info, symbol, *to, valid;
753 
754 	huffman = huffman_arg;
755 	from = from_arg;
756 	tmp = NULL;
757 
758 	/*
759 	 * We don't want to find all of our callers and ensure they don't pass
760 	 * 0-length byte strings, but there's no reason to do any work.
761 	 */
762 	if (from_len == 0) {
763 		to_buf->size = 0;
764 		return (0);
765 	}
766 
767 	/*
768 	 * The first 3 bits are the number of used bits in the last byte, unless
769 	 * they're 0, in which case there are 8 bits used in the last byte.
770 	 */
771 	padding_info = (*from & 0xE0) >> (8 - WT_HUFFMAN_HEADER);
772 	from_len_bits = from_len * 8;
773 	if (padding_info != 0)
774 		from_len_bits -= 8U - padding_info;
775 
776 	/* Number of bits that have codes. */
777 	from_len_bits -= WT_HUFFMAN_HEADER;
778 
779 	/*
780 	 * Compute largest uncompressed output size, which is if all symbols are
781 	 * most frequent and so have smallest Huffman codes and therefore
782 	 * largest expansion.  Use the shared system buffer while uncompressing,
783 	 * then allocate a new buffer of exactly the right size and copy the
784 	 * result into it.
785 	 */
786 	max_len = (uint32_t)(from_len_bits / huffman->min_depth);
787 	WT_ERR(__wt_scr_alloc(session, max_len, &tmp));
788 	to = tmp->mem;
789 
790 	/* The first byte of input is a special case because of header bits. */
791 	bits = *from++;
792 	valid = 8 - WT_HUFFMAN_HEADER;
793 	from_bytes = from_len - 1;
794 
795 	max = huffman->max_depth;
796 	mask = (1U << max) - 1;
797 	for (outlen = 0; from_len_bits > 0; outlen++) {
798 		while (valid < max && from_bytes > 0) {
799 			WT_ASSERT(session,
800 			    WT_PTR_IN_RANGE(from, from_arg, from_len));
801 			bits = (bits << 8) | *from++;
802 			valid += 8;
803 			from_bytes--;
804 		}
805 		pattern = (uint16_t)
806 		    (valid >= max ?	/* short patterns near end */
807 		    (bits >> (valid - max)) : (bits << (max - valid)));
808 		symbol = huffman->code2symbol[pattern & mask];
809 		len = huffman->codes[symbol].length;
810 		valid -= (uint8_t)len;
811 
812 		/*
813 		 * from_len_bits is the total number of input bits, reduced by
814 		 * the number of bits we consume from input at each step.  For
815 		 * all but the last step from_len_bits > len, then at the last
816 		 * step from_len_bits == len (in other words, from_len_bits -
817 		 * len = 0 input bits remaining). Generally, we cannot detect
818 		 * corruption during huffman decompression, this is one place
819 		 * where that's not true.
820 		 */
821 		if (from_len_bits < len)	/* corrupted */
822 			WT_ERR_MSG(session, EINVAL,
823 			    "huffman decompression detected input corruption");
824 		from_len_bits -= len;
825 
826 		WT_ASSERT(session,
827 		    WT_PTR_IN_RANGE(to, tmp->mem, tmp->memsize));
828 		*to++ = symbol;
829 	}
830 
831 	/* Return the number of bytes used. */
832 	WT_ERR(__wt_buf_initsize(session, to_buf, outlen));
833 	memcpy(to_buf->mem, tmp->mem, outlen);
834 
835 #if __HUFFMAN_DETAIL
836 	printf("decode: worst case %" PRIu32 " bytes -> actual %" PRIu32 "\n",
837 	    max_len, outlen);
838 #endif
839 
840 err:	__wt_scr_free(session, &tmp);
841 	return (ret);
842 }
843 
844 /*
845  * node_queue_close --
846  *	Delete a queue from memory.
847  *
848  * It does not delete the pointed huffman tree nodes!
849  */
850 static void
node_queue_close(WT_SESSION_IMPL * session,NODE_QUEUE * queue)851 node_queue_close(WT_SESSION_IMPL *session, NODE_QUEUE *queue)
852 {
853 	NODE_QUEUE_ELEM *elem, *next_elem;
854 
855 	/* Freeing each element of the queue's linked list. */
856 	for (elem = queue->first; elem != NULL; elem = next_elem) {
857 		next_elem = elem->next;
858 		__wt_free(session, elem);
859 	}
860 
861 	/* Freeing the queue record itself. */
862 	__wt_free(session, queue);
863 }
864 
865 /*
866  * node_queue_enqueue --
867  *	Push a tree node to the end of the queue.
868  */
869 static int
node_queue_enqueue(WT_SESSION_IMPL * session,NODE_QUEUE * queue,WT_FREQTREE_NODE * node)870 node_queue_enqueue(
871     WT_SESSION_IMPL *session, NODE_QUEUE *queue, WT_FREQTREE_NODE *node)
872 {
873 	NODE_QUEUE_ELEM *elem;
874 
875 	/* Allocating a new linked list element */
876 	WT_RET(__wt_calloc_one(session, &elem));
877 
878 	/* It holds the tree node, and has no next element yet */
879 	elem->node = node;
880 	elem->next = NULL;
881 
882 	/* If the queue is empty, the first element will be the new one. */
883 	if (queue->first == NULL)
884 		queue->first = elem;
885 
886 	/*
887 	 * If the queue is not empty, the last element's next pointer must be
888 	 * updated.
889 	 */
890 	if (queue->last != NULL)
891 		queue->last->next = elem;
892 
893 	/* The last element is the new one */
894 	queue->last = elem;
895 
896 	return (0);
897 }
898 
899 /*
900  * node_queue_dequeue --
901  *	Removes a node from the beginning of the queue and copies the node's
902  *	pointer to the location referred by the retp parameter.
903  */
904 static void
node_queue_dequeue(WT_SESSION_IMPL * session,NODE_QUEUE * queue,WT_FREQTREE_NODE ** retp)905 node_queue_dequeue(
906     WT_SESSION_IMPL *session, NODE_QUEUE *queue, WT_FREQTREE_NODE **retp)
907 {
908 	NODE_QUEUE_ELEM *first_elem;
909 
910 	/*
911 	 * Getting the first element of the queue and updating it to point to
912 	 * the next element as first.
913 	 */
914 	first_elem = queue->first;
915 	*retp = first_elem->node;
916 	queue->first = first_elem->next;
917 
918 	/*
919 	 * If the last element was the dequeued element, we have to update it
920 	 * to NULL.
921 	 */
922 	if (queue->last == first_elem)
923 		queue->last = NULL;
924 
925 	/* Freeing the linked list element that has been dequeued */
926 	__wt_free(session, first_elem);
927 }
928