1 /*-------------------------------------------------------------------------
2  *
3  * integerset.c
4  *	  Data structure to hold a large set of 64-bit integers efficiently
5  *
6  * IntegerSet provides an in-memory data structure to hold a set of
7  * arbitrary 64-bit integers.  Internally, the values are stored in a
8  * B-tree, with a special packed representation at the leaf level using
9  * the Simple-8b algorithm, which can pack clusters of nearby values
10  * very tightly.
11  *
12  * Memory consumption depends on the number of values stored, but also
13  * on how far the values are from each other.  In the best case, with
14  * long runs of consecutive integers, memory consumption can be as low as
15  * 0.1 bytes per integer.  In the worst case, if integers are more than
16  * 2^32 apart, it uses about 8 bytes per integer.  In typical use, the
17  * consumption per integer is somewhere between those extremes, depending
18  * on the range of integers stored, and how "clustered" they are.
19  *
20  *
21  * Interface
22  * ---------
23  *
24  *	intset_create			- Create a new, empty set
25  *	intset_add_member		- Add an integer to the set
26  *	intset_is_member		- Test if an integer is in the set
27  *	intset_begin_iterate	- Begin iterating through all integers in set
28  *	intset_iterate_next		- Return next set member, if any
29  *
30  * intset_create() creates the set in the current memory context.  Subsequent
31  * operations that add to the data structure will continue to allocate from
32  * that same context, even if it's not current anymore.
33  *
34  * Note that there is no function to free an integer set.  If you need to do
35  * that, create a dedicated memory context to hold it, and destroy the memory
36  * context instead.
37  *
38  *
39  * Limitations
40  * -----------
41  *
42  * - Values must be added in order.  (Random insertions would require
43  *   splitting nodes, which hasn't been implemented.)
44  *
45  * - Values cannot be added while iteration is in progress.
46  *
47  * - No support for removing values.
48  *
49  * None of these limitations are fundamental to the data structure, so they
50  * could be lifted if needed, by writing some new code.  But the current
51  * users of this facility don't need them.
52  *
53  *
54  * References
55  * ----------
56  *
57  * Simple-8b encoding is based on:
58  *
59  * Vo Ngoc Anh, Alistair Moffat, Index compression using 64-bit words,
60  *   Software - Practice & Experience, v.40 n.2, p.131-147, February 2010
61  *   (https://doi.org/10.1002/spe.948)
62  *
63  *
64  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
65  * Portions Copyright (c) 1994, Regents of the University of California
66  *
67  * IDENTIFICATION
68  *	  src/backend/lib/integerset.c
69  *
70  *-------------------------------------------------------------------------
71  */
72 #include "postgres.h"
73 
74 #include "access/htup_details.h"
75 #include "lib/integerset.h"
76 #include "port/pg_bitutils.h"
77 #include "utils/memutils.h"
78 
79 
80 /*
81  * Maximum number of integers that can be encoded in a single Simple-8b
82  * codeword. (Defined here before anything else, so that we can size arrays
83  * using this.)
84  */
85 #define SIMPLE8B_MAX_VALUES_PER_CODEWORD 240
86 
87 /*
88  * Parameters for shape of the in-memory B-tree.
89  *
90  * These set the size of each internal and leaf node.  They don't necessarily
91  * need to be the same, because the tree is just an in-memory structure.
92  * With the default 64, each node is about 1 kb.
93  *
94  * If you change these, you must recalculate MAX_TREE_LEVELS, too!
95  */
96 #define MAX_INTERNAL_ITEMS	64
97 #define MAX_LEAF_ITEMS	64
98 
99 /*
100  * Maximum height of the tree.
101  *
102  * MAX_TREE_ITEMS is calculated from the "fan-out" of the B-tree.  The
103  * theoretical maximum number of items that we can store in a set is 2^64,
104  * so MAX_TREE_LEVELS should be set so that:
105  *
106  *   MAX_LEAF_ITEMS * MAX_INTERNAL_ITEMS ^ (MAX_TREE_LEVELS - 1) >= 2^64.
107  *
108  * In practice, we'll need far fewer levels, because you will run out of
109  * memory long before reaching that number, but let's be conservative.
110  */
111 #define MAX_TREE_LEVELS		11
112 
113 /*
114  * Node structures, for the in-memory B-tree.
115  *
116  * An internal node holds a number of downlink pointers to leaf nodes, or
117  * to internal nodes on a lower level.  For each downlink, the key value
118  * corresponding to the lower level node is stored in a sorted array.  The
119  * stored key values are low keys.  In other words, if the downlink has value
120  * X, then all items stored on that child are >= X.
121  *
122  * Each leaf node holds a number of "items", with a varying number of
123  * integers packed into each item.  Each item consists of two 64-bit words:
124  * The first word holds the first integer stored in the item, in plain format.
125  * The second word contains between 0 and 240 more integers, packed using
126  * Simple-8b encoding.  By storing the first integer in plain, unpacked,
127  * format, we can use binary search to quickly find an item that holds (or
128  * would hold) a particular integer.  And by storing the rest in packed form,
129  * we still get pretty good memory density, if there are clusters of integers
130  * with similar values.
131  *
132  * Each leaf node also has a pointer to the next leaf node, so that the leaf
133  * nodes can be easily walked from beginning to end when iterating.
134  */
135 typedef struct intset_node intset_node;
136 typedef struct intset_leaf_node intset_leaf_node;
137 typedef struct intset_internal_node intset_internal_node;
138 
139 /* Common structure of both leaf and internal nodes. */
140 struct intset_node
141 {
142 	uint16		level;			/* tree level of this node */
143 	uint16		num_items;		/* number of items in this node */
144 };
145 
146 /* Internal node */
147 struct intset_internal_node
148 {
149 	/* common header, must match intset_node */
150 	uint16		level;			/* >= 1 on internal nodes */
151 	uint16		num_items;
152 
153 	/*
154 	 * 'values' is an array of key values, and 'downlinks' are pointers to
155 	 * lower-level nodes, corresponding to the key values.
156 	 */
157 	uint64		values[MAX_INTERNAL_ITEMS];
158 	intset_node *downlinks[MAX_INTERNAL_ITEMS];
159 };
160 
161 /* Leaf node */
162 typedef struct
163 {
164 	uint64		first;			/* first integer in this item */
165 	uint64		codeword;		/* simple8b encoded differences from 'first' */
166 } leaf_item;
167 
168 #define MAX_VALUES_PER_LEAF_ITEM	(1 + SIMPLE8B_MAX_VALUES_PER_CODEWORD)
169 
170 struct intset_leaf_node
171 {
172 	/* common header, must match intset_node */
173 	uint16		level;			/* 0 on leafs */
174 	uint16		num_items;
175 
176 	intset_leaf_node *next;		/* right sibling, if any */
177 
178 	leaf_item	items[MAX_LEAF_ITEMS];
179 };
180 
181 /*
182  * We buffer insertions in a simple array, before packing and inserting them
183  * into the B-tree.  MAX_BUFFERED_VALUES sets the size of the buffer.  The
184  * encoder assumes that it is large enough that we can always fill a leaf
185  * item with buffered new items.  In other words, MAX_BUFFERED_VALUES must be
186  * larger than MAX_VALUES_PER_LEAF_ITEM.  For efficiency, make it much larger.
187  */
188 #define MAX_BUFFERED_VALUES			(MAX_VALUES_PER_LEAF_ITEM * 2)
189 
190 /*
191  * IntegerSet is the top-level object representing the set.
192  *
193  * The integers are stored in an in-memory B-tree structure, plus an array
194  * for newly-added integers.  IntegerSet also tracks information about memory
195  * usage, as well as the current position when iterating the set with
196  * intset_begin_iterate / intset_iterate_next.
197  */
198 struct IntegerSet
199 {
200 	/*
201 	 * 'context' is the memory context holding this integer set and all its
202 	 * tree nodes.
203 	 *
204 	 * 'mem_used' tracks the amount of memory used.  We don't do anything with
205 	 * it in integerset.c itself, but the callers can ask for it with
206 	 * intset_memory_usage().
207 	 */
208 	MemoryContext context;
209 	uint64		mem_used;
210 
211 	uint64		num_entries;	/* total # of values in the set */
212 	uint64		highest_value;	/* highest value stored in this set */
213 
214 	/*
215 	 * B-tree to hold the packed values.
216 	 *
217 	 * 'rightmost_nodes' hold pointers to the rightmost node on each level.
218 	 * rightmost_parent[0] is rightmost leaf, rightmost_parent[1] is its
219 	 * parent, and so forth, all the way up to the root. These are needed when
220 	 * adding new values. (Currently, we require that new values are added at
221 	 * the end.)
222 	 */
223 	int			num_levels;		/* height of the tree */
224 	intset_node *root;			/* root node */
225 	intset_node *rightmost_nodes[MAX_TREE_LEVELS];
226 	intset_leaf_node *leftmost_leaf;	/* leftmost leaf node */
227 
228 	/*
229 	 * Holding area for new items that haven't been inserted to the tree yet.
230 	 */
231 	uint64		buffered_values[MAX_BUFFERED_VALUES];
232 	int			num_buffered_values;
233 
234 	/*
235 	 * Iterator support.
236 	 *
237 	 * 'iter_values' is an array of integers ready to be returned to the
238 	 * caller; 'iter_num_values' is the length of that array, and
239 	 * 'iter_valueno' is the next index.  'iter_node' and 'iter_itemno' point
240 	 * to the leaf node, and item within the leaf node, to get the next batch
241 	 * of values from.
242 	 *
243 	 * Normally, 'iter_values' points to 'iter_values_buf', which holds items
244 	 * decoded from a leaf item.  But after we have scanned the whole B-tree,
245 	 * we iterate through all the unbuffered values, too, by pointing
246 	 * iter_values to 'buffered_values'.
247 	 */
248 	bool		iter_active;	/* is iteration in progress? */
249 
250 	const uint64 *iter_values;
251 	int			iter_num_values;	/* number of elements in 'iter_values' */
252 	int			iter_valueno;	/* next index into 'iter_values' */
253 
254 	intset_leaf_node *iter_node;	/* current leaf node */
255 	int			iter_itemno;	/* next item in 'iter_node' to decode */
256 
257 	uint64		iter_values_buf[MAX_VALUES_PER_LEAF_ITEM];
258 };
259 
260 /*
261  * Prototypes for internal functions.
262  */
263 static void intset_update_upper(IntegerSet *intset, int level,
264 								intset_node *child, uint64 child_key);
265 static void intset_flush_buffered_values(IntegerSet *intset);
266 
267 static int	intset_binsrch_uint64(uint64 value, uint64 *arr, int arr_elems,
268 								  bool nextkey);
269 static int	intset_binsrch_leaf(uint64 value, leaf_item *arr, int arr_elems,
270 								bool nextkey);
271 
272 static uint64 simple8b_encode(const uint64 *ints, int *num_encoded, uint64 base);
273 static int	simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base);
274 static bool simple8b_contains(uint64 codeword, uint64 key, uint64 base);
275 
276 
277 /*
278  * Create a new, initially empty, integer set.
279  *
280  * The integer set is created in the current memory context.
281  * We will do all subsequent allocations in the same context, too, regardless
282  * of which memory context is current when new integers are added to the set.
283  */
284 IntegerSet *
intset_create(void)285 intset_create(void)
286 {
287 	IntegerSet *intset;
288 
289 	intset = (IntegerSet *) palloc(sizeof(IntegerSet));
290 	intset->context = CurrentMemoryContext;
291 	intset->mem_used = GetMemoryChunkSpace(intset);
292 
293 	intset->num_entries = 0;
294 	intset->highest_value = 0;
295 
296 	intset->num_levels = 0;
297 	intset->root = NULL;
298 	memset(intset->rightmost_nodes, 0, sizeof(intset->rightmost_nodes));
299 	intset->leftmost_leaf = NULL;
300 
301 	intset->num_buffered_values = 0;
302 
303 	intset->iter_active = false;
304 	intset->iter_node = NULL;
305 	intset->iter_itemno = 0;
306 	intset->iter_valueno = 0;
307 	intset->iter_num_values = 0;
308 	intset->iter_values = NULL;
309 
310 	return intset;
311 }
312 
313 /*
314  * Allocate a new node.
315  */
316 static intset_internal_node *
intset_new_internal_node(IntegerSet * intset)317 intset_new_internal_node(IntegerSet *intset)
318 {
319 	intset_internal_node *n;
320 
321 	n = (intset_internal_node *) MemoryContextAlloc(intset->context,
322 													sizeof(intset_internal_node));
323 	intset->mem_used += GetMemoryChunkSpace(n);
324 
325 	n->level = 0;				/* caller must set */
326 	n->num_items = 0;
327 
328 	return n;
329 }
330 
331 static intset_leaf_node *
intset_new_leaf_node(IntegerSet * intset)332 intset_new_leaf_node(IntegerSet *intset)
333 {
334 	intset_leaf_node *n;
335 
336 	n = (intset_leaf_node *) MemoryContextAlloc(intset->context,
337 												sizeof(intset_leaf_node));
338 	intset->mem_used += GetMemoryChunkSpace(n);
339 
340 	n->level = 0;
341 	n->num_items = 0;
342 	n->next = NULL;
343 
344 	return n;
345 }
346 
347 /*
348  * Return the number of entries in the integer set.
349  */
350 uint64
intset_num_entries(IntegerSet * intset)351 intset_num_entries(IntegerSet *intset)
352 {
353 	return intset->num_entries;
354 }
355 
356 /*
357  * Return the amount of memory used by the integer set.
358  */
359 uint64
intset_memory_usage(IntegerSet * intset)360 intset_memory_usage(IntegerSet *intset)
361 {
362 	return intset->mem_used;
363 }
364 
365 /*
366  * Add a value to the set.
367  *
368  * Values must be added in order.
369  */
370 void
intset_add_member(IntegerSet * intset,uint64 x)371 intset_add_member(IntegerSet *intset, uint64 x)
372 {
373 	if (intset->iter_active)
374 		elog(ERROR, "cannot add new values to integer set while iteration is in progress");
375 
376 	if (x <= intset->highest_value && intset->num_entries > 0)
377 		elog(ERROR, "cannot add value to integer set out of order");
378 
379 	if (intset->num_buffered_values >= MAX_BUFFERED_VALUES)
380 	{
381 		/* Time to flush our buffer */
382 		intset_flush_buffered_values(intset);
383 		Assert(intset->num_buffered_values < MAX_BUFFERED_VALUES);
384 	}
385 
386 	/* Add it to the buffer of newly-added values */
387 	intset->buffered_values[intset->num_buffered_values] = x;
388 	intset->num_buffered_values++;
389 	intset->num_entries++;
390 	intset->highest_value = x;
391 }
392 
393 /*
394  * Take a batch of buffered values, and pack them into the B-tree.
395  */
396 static void
intset_flush_buffered_values(IntegerSet * intset)397 intset_flush_buffered_values(IntegerSet *intset)
398 {
399 	uint64	   *values = intset->buffered_values;
400 	uint64		num_values = intset->num_buffered_values;
401 	int			num_packed = 0;
402 	intset_leaf_node *leaf;
403 
404 	leaf = (intset_leaf_node *) intset->rightmost_nodes[0];
405 
406 	/*
407 	 * If the tree is completely empty, create the first leaf page, which is
408 	 * also the root.
409 	 */
410 	if (leaf == NULL)
411 	{
412 		/*
413 		 * This is the very first item in the set.
414 		 *
415 		 * Allocate root node. It's also a leaf.
416 		 */
417 		leaf = intset_new_leaf_node(intset);
418 
419 		intset->root = (intset_node *) leaf;
420 		intset->leftmost_leaf = leaf;
421 		intset->rightmost_nodes[0] = (intset_node *) leaf;
422 		intset->num_levels = 1;
423 	}
424 
425 	/*
426 	 * If there are less than MAX_VALUES_PER_LEAF_ITEM values in the buffer,
427 	 * stop.  In most cases, we cannot encode that many values in a single
428 	 * value, but this way, the encoder doesn't have to worry about running
429 	 * out of input.
430 	 */
431 	while (num_values - num_packed >= MAX_VALUES_PER_LEAF_ITEM)
432 	{
433 		leaf_item	item;
434 		int			num_encoded;
435 
436 		/*
437 		 * Construct the next leaf item, packing as many buffered values as
438 		 * possible.
439 		 */
440 		item.first = values[num_packed];
441 		item.codeword = simple8b_encode(&values[num_packed + 1],
442 										&num_encoded,
443 										item.first);
444 
445 		/*
446 		 * Add the item to the node, allocating a new node if the old one is
447 		 * full.
448 		 */
449 		if (leaf->num_items >= MAX_LEAF_ITEMS)
450 		{
451 			/* Allocate new leaf and link it to the tree */
452 			intset_leaf_node *old_leaf = leaf;
453 
454 			leaf = intset_new_leaf_node(intset);
455 			old_leaf->next = leaf;
456 			intset->rightmost_nodes[0] = (intset_node *) leaf;
457 			intset_update_upper(intset, 1, (intset_node *) leaf, item.first);
458 		}
459 		leaf->items[leaf->num_items++] = item;
460 
461 		num_packed += 1 + num_encoded;
462 	}
463 
464 	/*
465 	 * Move any remaining buffered values to the beginning of the array.
466 	 */
467 	if (num_packed < intset->num_buffered_values)
468 	{
469 		memmove(&intset->buffered_values[0],
470 				&intset->buffered_values[num_packed],
471 				(intset->num_buffered_values - num_packed) * sizeof(uint64));
472 	}
473 	intset->num_buffered_values -= num_packed;
474 }
475 
476 /*
477  * Insert a downlink into parent node, after creating a new node.
478  *
479  * Recurses if the parent node is full, too.
480  */
481 static void
intset_update_upper(IntegerSet * intset,int level,intset_node * child,uint64 child_key)482 intset_update_upper(IntegerSet *intset, int level, intset_node *child,
483 					uint64 child_key)
484 {
485 	intset_internal_node *parent;
486 
487 	Assert(level > 0);
488 
489 	/*
490 	 * Create a new root node, if necessary.
491 	 */
492 	if (level >= intset->num_levels)
493 	{
494 		intset_node *oldroot = intset->root;
495 		uint64		downlink_key;
496 
497 		/* MAX_TREE_LEVELS should be more than enough, this shouldn't happen */
498 		if (intset->num_levels == MAX_TREE_LEVELS)
499 			elog(ERROR, "could not expand integer set, maximum number of levels reached");
500 		intset->num_levels++;
501 
502 		/*
503 		 * Get the first value on the old root page, to be used as the
504 		 * downlink.
505 		 */
506 		if (intset->root->level == 0)
507 			downlink_key = ((intset_leaf_node *) oldroot)->items[0].first;
508 		else
509 			downlink_key = ((intset_internal_node *) oldroot)->values[0];
510 
511 		parent = intset_new_internal_node(intset);
512 		parent->level = level;
513 		parent->values[0] = downlink_key;
514 		parent->downlinks[0] = oldroot;
515 		parent->num_items = 1;
516 
517 		intset->root = (intset_node *) parent;
518 		intset->rightmost_nodes[level] = (intset_node *) parent;
519 	}
520 
521 	/*
522 	 * Place the downlink on the parent page.
523 	 */
524 	parent = (intset_internal_node *) intset->rightmost_nodes[level];
525 
526 	if (parent->num_items < MAX_INTERNAL_ITEMS)
527 	{
528 		parent->values[parent->num_items] = child_key;
529 		parent->downlinks[parent->num_items] = child;
530 		parent->num_items++;
531 	}
532 	else
533 	{
534 		/*
535 		 * Doesn't fit.  Allocate new parent, with the downlink as the first
536 		 * item on it, and recursively insert the downlink to the new parent
537 		 * to the grandparent.
538 		 */
539 		parent = intset_new_internal_node(intset);
540 		parent->level = level;
541 		parent->values[0] = child_key;
542 		parent->downlinks[0] = child;
543 		parent->num_items = 1;
544 
545 		intset->rightmost_nodes[level] = (intset_node *) parent;
546 
547 		intset_update_upper(intset, level + 1, (intset_node *) parent, child_key);
548 	}
549 }
550 
551 /*
552  * Does the set contain the given value?
553  */
554 bool
intset_is_member(IntegerSet * intset,uint64 x)555 intset_is_member(IntegerSet *intset, uint64 x)
556 {
557 	intset_node *node;
558 	intset_leaf_node *leaf;
559 	int			level;
560 	int			itemno;
561 	leaf_item  *item;
562 
563 	/*
564 	 * The value might be in the buffer of newly-added values.
565 	 */
566 	if (intset->num_buffered_values > 0 && x >= intset->buffered_values[0])
567 	{
568 		int			itemno;
569 
570 		itemno = intset_binsrch_uint64(x,
571 									   intset->buffered_values,
572 									   intset->num_buffered_values,
573 									   false);
574 		if (itemno >= intset->num_buffered_values)
575 			return false;
576 		else
577 			return (intset->buffered_values[itemno] == x);
578 	}
579 
580 	/*
581 	 * Start from the root, and walk down the B-tree to find the right leaf
582 	 * node.
583 	 */
584 	if (!intset->root)
585 		return false;
586 	node = intset->root;
587 	for (level = intset->num_levels - 1; level > 0; level--)
588 	{
589 		intset_internal_node *n = (intset_internal_node *) node;
590 
591 		Assert(node->level == level);
592 
593 		itemno = intset_binsrch_uint64(x, n->values, n->num_items, true);
594 		if (itemno == 0)
595 			return false;
596 		node = n->downlinks[itemno - 1];
597 	}
598 	Assert(node->level == 0);
599 	leaf = (intset_leaf_node *) node;
600 
601 	/*
602 	 * Binary search to find the right item on the leaf page
603 	 */
604 	itemno = intset_binsrch_leaf(x, leaf->items, leaf->num_items, true);
605 	if (itemno == 0)
606 		return false;
607 	item = &leaf->items[itemno - 1];
608 
609 	/* Is this a match to the first value on the item? */
610 	if (item->first == x)
611 		return true;
612 	Assert(x > item->first);
613 
614 	/* Is it in the packed codeword? */
615 	if (simple8b_contains(item->codeword, x, item->first))
616 		return true;
617 
618 	return false;
619 }
620 
621 /*
622  * Begin in-order scan through all the values.
623  *
624  * While the iteration is in-progress, you cannot add new values to the set.
625  */
626 void
intset_begin_iterate(IntegerSet * intset)627 intset_begin_iterate(IntegerSet *intset)
628 {
629 	/* Note that we allow an iteration to be abandoned midway */
630 	intset->iter_active = true;
631 	intset->iter_node = intset->leftmost_leaf;
632 	intset->iter_itemno = 0;
633 	intset->iter_valueno = 0;
634 	intset->iter_num_values = 0;
635 	intset->iter_values = intset->iter_values_buf;
636 }
637 
638 /*
639  * Returns the next integer, when iterating.
640  *
641  * intset_begin_iterate() must be called first.  intset_iterate_next() returns
642  * the next value in the set.  Returns true, if there was another value, and
643  * stores the value in *next.  Otherwise, returns false.
644  */
645 bool
intset_iterate_next(IntegerSet * intset,uint64 * next)646 intset_iterate_next(IntegerSet *intset, uint64 *next)
647 {
648 	Assert(intset->iter_active);
649 	for (;;)
650 	{
651 		/* Return next iter_values[] entry if any */
652 		if (intset->iter_valueno < intset->iter_num_values)
653 		{
654 			*next = intset->iter_values[intset->iter_valueno++];
655 			return true;
656 		}
657 
658 		/* Decode next item in current leaf node, if any */
659 		if (intset->iter_node &&
660 			intset->iter_itemno < intset->iter_node->num_items)
661 		{
662 			leaf_item  *item;
663 			int			num_decoded;
664 
665 			item = &intset->iter_node->items[intset->iter_itemno++];
666 
667 			intset->iter_values_buf[0] = item->first;
668 			num_decoded = simple8b_decode(item->codeword,
669 										  &intset->iter_values_buf[1],
670 										  item->first);
671 			intset->iter_num_values = num_decoded + 1;
672 			intset->iter_valueno = 0;
673 			continue;
674 		}
675 
676 		/* No more items on this leaf, step to next node */
677 		if (intset->iter_node)
678 		{
679 			intset->iter_node = intset->iter_node->next;
680 			intset->iter_itemno = 0;
681 			continue;
682 		}
683 
684 		/*
685 		 * We have reached the end of the B-tree.  But we might still have
686 		 * some integers in the buffer of newly-added values.
687 		 */
688 		if (intset->iter_values == (const uint64 *) intset->iter_values_buf)
689 		{
690 			intset->iter_values = intset->buffered_values;
691 			intset->iter_num_values = intset->num_buffered_values;
692 			intset->iter_valueno = 0;
693 			continue;
694 		}
695 
696 		break;
697 	}
698 
699 	/* No more results. */
700 	intset->iter_active = false;
701 	*next = 0;					/* prevent uninitialized-variable warnings */
702 	return false;
703 }
704 
705 /*
706  * intset_binsrch_uint64() -- search a sorted array of uint64s
707  *
708  * Returns the first position with key equal or less than the given key.
709  * The returned position would be the "insert" location for the given key,
710  * that is, the position where the new key should be inserted to.
711  *
712  * 'nextkey' affects the behavior on equal keys.  If true, and there is an
713  * equal key in the array, this returns the position immediately after the
714  * equal key.  If false, this returns the position of the equal key itself.
715  */
716 static int
intset_binsrch_uint64(uint64 item,uint64 * arr,int arr_elems,bool nextkey)717 intset_binsrch_uint64(uint64 item, uint64 *arr, int arr_elems, bool nextkey)
718 {
719 	int			low,
720 				high,
721 				mid;
722 
723 	low = 0;
724 	high = arr_elems;
725 	while (high > low)
726 	{
727 		mid = low + (high - low) / 2;
728 
729 		if (nextkey)
730 		{
731 			if (item >= arr[mid])
732 				low = mid + 1;
733 			else
734 				high = mid;
735 		}
736 		else
737 		{
738 			if (item > arr[mid])
739 				low = mid + 1;
740 			else
741 				high = mid;
742 		}
743 	}
744 
745 	return low;
746 }
747 
748 /* same, but for an array of leaf items */
749 static int
intset_binsrch_leaf(uint64 item,leaf_item * arr,int arr_elems,bool nextkey)750 intset_binsrch_leaf(uint64 item, leaf_item *arr, int arr_elems, bool nextkey)
751 {
752 	int			low,
753 				high,
754 				mid;
755 
756 	low = 0;
757 	high = arr_elems;
758 	while (high > low)
759 	{
760 		mid = low + (high - low) / 2;
761 
762 		if (nextkey)
763 		{
764 			if (item >= arr[mid].first)
765 				low = mid + 1;
766 			else
767 				high = mid;
768 		}
769 		else
770 		{
771 			if (item > arr[mid].first)
772 				low = mid + 1;
773 			else
774 				high = mid;
775 		}
776 	}
777 
778 	return low;
779 }
780 
781 /*
782  * Simple-8b encoding.
783  *
784  * The simple-8b algorithm packs between 1 and 240 integers into 64-bit words,
785  * called "codewords".  The number of integers packed into a single codeword
786  * depends on the integers being packed; small integers are encoded using
787  * fewer bits than large integers.  A single codeword can store a single
788  * 60-bit integer, or two 30-bit integers, for example.
789  *
790  * Since we're storing a unique, sorted, set of integers, we actually encode
791  * the *differences* between consecutive integers.  That way, clusters of
792  * integers that are close to each other are packed efficiently, regardless
793  * of their absolute values.
794  *
795  * In Simple-8b, each codeword consists of a 4-bit selector, which indicates
796  * how many integers are encoded in the codeword, and the encoded integers are
797  * packed into the remaining 60 bits.  The selector allows for 16 different
798  * ways of using the remaining 60 bits, called "modes".  The number of integers
799  * packed into a single codeword in each mode is listed in the simple8b_modes
800  * table below.  For example, consider the following codeword:
801  *
802  *      20-bit integer       20-bit integer       20-bit integer
803  * 1101 00000000000000010010 01111010000100100000 00000000000000010100
804  * ^
805  * selector
806  *
807  * The selector 1101 is 13 in decimal.  From the modes table below, we see
808  * that it means that the codeword encodes three 20-bit integers.  In decimal,
809  * those integers are 18, 500000 and 20.  Because we encode deltas rather than
810  * absolute values, the actual values that they represent are 18, 500018 and
811  * 500038.
812  *
813  * Modes 0 and 1 are a bit special; they encode a run of 240 or 120 zeroes
814  * (which means 240 or 120 consecutive integers, since we're encoding the
815  * deltas between integers), without using the rest of the codeword bits
816  * for anything.
817  *
818  * Simple-8b cannot encode integers larger than 60 bits.  Values larger than
819  * that are always stored in the 'first' field of a leaf item, never in the
820  * packed codeword.  If there is a sequence of integers that are more than
821  * 2^60 apart, the codeword will go unused on those items.  To represent that,
822  * we use a magic EMPTY_CODEWORD codeword value.
823  */
824 static const struct simple8b_mode
825 {
826 	uint8		bits_per_int;
827 	uint8		num_ints;
828 }			simple8b_modes[17] =
829 
830 {
831 	{0, 240},					/* mode  0: 240 zeroes */
832 	{0, 120},					/* mode  1: 120 zeroes */
833 	{1, 60},					/* mode  2: sixty 1-bit integers */
834 	{2, 30},					/* mode  3: thirty 2-bit integers */
835 	{3, 20},					/* mode  4: twenty 3-bit integers */
836 	{4, 15},					/* mode  5: fifteen 4-bit integers */
837 	{5, 12},					/* mode  6: twelve 5-bit integers */
838 	{6, 10},					/* mode  7: ten 6-bit integers */
839 	{7, 8},						/* mode  8: eight 7-bit integers (four bits
840 								 * are wasted) */
841 	{8, 7},						/* mode  9: seven 8-bit integers (four bits
842 								 * are wasted) */
843 	{10, 6},					/* mode 10: six 10-bit integers */
844 	{12, 5},					/* mode 11: five 12-bit integers */
845 	{15, 4},					/* mode 12: four 15-bit integers */
846 	{20, 3},					/* mode 13: three 20-bit integers */
847 	{30, 2},					/* mode 14: two 30-bit integers */
848 	{60, 1},					/* mode 15: one 60-bit integer */
849 
850 	{0, 0}						/* sentinel value */
851 };
852 
853 /*
854  * EMPTY_CODEWORD is a special value, used to indicate "no values".
855  * It is used if the next value is too large to be encoded with Simple-8b.
856  *
857  * This value looks like a mode-0 codeword, but we can distinguish it
858  * because a regular mode-0 codeword would have zeroes in the unused bits.
859  */
860 #define EMPTY_CODEWORD		UINT64CONST(0x0FFFFFFFFFFFFFFF)
861 
862 /*
863  * Encode a number of integers into a Simple-8b codeword.
864  *
865  * (What we actually encode are deltas between successive integers.
866  * "base" is the value before ints[0].)
867  *
868  * The input array must contain at least SIMPLE8B_MAX_VALUES_PER_CODEWORD
869  * elements, ensuring that we can produce a full codeword.
870  *
871  * Returns the encoded codeword, and sets *num_encoded to the number of
872  * input integers that were encoded.  That can be zero, if the first delta
873  * is too large to be encoded.
874  */
875 static uint64
simple8b_encode(const uint64 * ints,int * num_encoded,uint64 base)876 simple8b_encode(const uint64 *ints, int *num_encoded, uint64 base)
877 {
878 	int			selector;
879 	int			nints;
880 	int			bits;
881 	uint64		diff;
882 	uint64		last_val;
883 	uint64		codeword;
884 	int			i;
885 
886 	Assert(ints[0] > base);
887 
888 	/*
889 	 * Select the "mode" to use for this codeword.
890 	 *
891 	 * In each iteration, check if the next value can be represented in the
892 	 * current mode we're considering.  If it's too large, then step up the
893 	 * mode to a wider one, and repeat.  If it fits, move on to the next
894 	 * integer.  Repeat until the codeword is full, given the current mode.
895 	 *
896 	 * Note that we don't have any way to represent unused slots in the
897 	 * codeword, so we require each codeword to be "full".  It is always
898 	 * possible to produce a full codeword unless the very first delta is too
899 	 * large to be encoded.  For example, if the first delta is small but the
900 	 * second is too large to be encoded, we'll end up using the last "mode",
901 	 * which has nints == 1.
902 	 */
903 	selector = 0;
904 	nints = simple8b_modes[0].num_ints;
905 	bits = simple8b_modes[0].bits_per_int;
906 	diff = ints[0] - base - 1;
907 	last_val = ints[0];
908 	i = 0;						/* number of deltas we have accepted */
909 	for (;;)
910 	{
911 		if (diff >= (UINT64CONST(1) << bits))
912 		{
913 			/* too large, step up to next mode */
914 			selector++;
915 			nints = simple8b_modes[selector].num_ints;
916 			bits = simple8b_modes[selector].bits_per_int;
917 			/* we might already have accepted enough deltas for this mode */
918 			if (i >= nints)
919 				break;
920 		}
921 		else
922 		{
923 			/* accept this delta; then done if codeword is full */
924 			i++;
925 			if (i >= nints)
926 				break;
927 			/* examine next delta */
928 			Assert(ints[i] > last_val);
929 			diff = ints[i] - last_val - 1;
930 			last_val = ints[i];
931 		}
932 	}
933 
934 	if (nints == 0)
935 	{
936 		/*
937 		 * The first delta is too large to be encoded with Simple-8b.
938 		 *
939 		 * If there is at least one not-too-large integer in the input, we
940 		 * will encode it using mode 15 (or a more compact mode).  Hence, we
941 		 * can only get here if the *first* delta is >= 2^60.
942 		 */
943 		Assert(i == 0);
944 		*num_encoded = 0;
945 		return EMPTY_CODEWORD;
946 	}
947 
948 	/*
949 	 * Encode the integers using the selected mode.  Note that we shift them
950 	 * into the codeword in reverse order, so that they will come out in the
951 	 * correct order in the decoder.
952 	 */
953 	codeword = 0;
954 	if (bits > 0)
955 	{
956 		for (i = nints - 1; i > 0; i--)
957 		{
958 			diff = ints[i] - ints[i - 1] - 1;
959 			codeword |= diff;
960 			codeword <<= bits;
961 		}
962 		diff = ints[0] - base - 1;
963 		codeword |= diff;
964 	}
965 
966 	/* add selector to the codeword, and return */
967 	codeword |= (uint64) selector << 60;
968 
969 	*num_encoded = nints;
970 	return codeword;
971 }
972 
973 /*
974  * Decode a codeword into an array of integers.
975  * Returns the number of integers decoded.
976  */
977 static int
simple8b_decode(uint64 codeword,uint64 * decoded,uint64 base)978 simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base)
979 {
980 	int			selector = (codeword >> 60);
981 	int			nints = simple8b_modes[selector].num_ints;
982 	int			bits = simple8b_modes[selector].bits_per_int;
983 	uint64		mask = (UINT64CONST(1) << bits) - 1;
984 	uint64		curr_value;
985 
986 	if (codeword == EMPTY_CODEWORD)
987 		return 0;
988 
989 	curr_value = base;
990 	for (int i = 0; i < nints; i++)
991 	{
992 		uint64		diff = codeword & mask;
993 
994 		curr_value += 1 + diff;
995 		decoded[i] = curr_value;
996 		codeword >>= bits;
997 	}
998 	return nints;
999 }
1000 
1001 /*
1002  * This is very similar to simple8b_decode(), but instead of decoding all
1003  * the values to an array, it just checks if the given "key" is part of
1004  * the codeword.
1005  */
1006 static bool
simple8b_contains(uint64 codeword,uint64 key,uint64 base)1007 simple8b_contains(uint64 codeword, uint64 key, uint64 base)
1008 {
1009 	int			selector = (codeword >> 60);
1010 	int			nints = simple8b_modes[selector].num_ints;
1011 	int			bits = simple8b_modes[selector].bits_per_int;
1012 
1013 	if (codeword == EMPTY_CODEWORD)
1014 		return false;
1015 
1016 	if (bits == 0)
1017 	{
1018 		/* Special handling for 0-bit cases. */
1019 		return (key - base) <= nints;
1020 	}
1021 	else
1022 	{
1023 		uint64		mask = (UINT64CONST(1) << bits) - 1;
1024 		uint64		curr_value;
1025 
1026 		curr_value = base;
1027 		for (int i = 0; i < nints; i++)
1028 		{
1029 			uint64		diff = codeword & mask;
1030 
1031 			curr_value += 1 + diff;
1032 
1033 			if (curr_value >= key)
1034 			{
1035 				if (curr_value == key)
1036 					return true;
1037 				else
1038 					return false;
1039 			}
1040 
1041 			codeword >>= bits;
1042 		}
1043 	}
1044 	return false;
1045 }
1046