1 /*
2  * brin_bloom.c
3  *		Implementation of Bloom opclass for BRIN
4  *
5  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
6  * Portions Copyright (c) 1994, Regents of the University of California
7  *
8  *
9  * A BRIN opclass summarizing page range into a bloom filter.
10  *
11  * Bloom filters allow efficient testing whether a given page range contains
12  * a particular value. Therefore, if we summarize each page range into a small
13  * bloom filter, we can easily (and cheaply) test whether it contains values
14  * we get later.
15  *
16  * The index only supports equality operators, similarly to hash indexes.
17  * Bloom indexes are however much smaller, and support only bitmap scans.
18  *
19  * Note: Don't confuse this with bloom indexes, implemented in a contrib
20  * module. That extension implements an entirely new AM, building a bloom
21  * filter on multiple columns in a single row. This opclass works with an
22  * existing AM (BRIN) and builds bloom filter on a column.
23  *
24  *
25  * values vs. hashes
26  * -----------------
27  *
28  * The original column values are not used directly, but are first hashed
29  * using the regular type-specific hash function, producing a uint32 hash.
30  * And this hash value is then added to the summary - i.e. it's hashed
31  * again and added to the bloom filter.
32  *
33  * This allows the code to treat all data types (byval/byref/...) the same
34  * way, with only minimal space requirements, because we're working with
35  * hashes and not the original values. Everything is uint32.
36  *
37  * Of course, this assumes the built-in hash function is reasonably good,
38  * without too many collisions etc. But that does seem to be the case, at
39  * least based on past experience. After all, the same hash functions are
40  * used for hash indexes, hash partitioning and so on.
41  *
42  *
43  * hashing scheme
44  * --------------
45  *
46  * Bloom filters require a number of independent hash functions. There are
47  * different schemes how to construct them - for example we might use
48  * hash_uint32_extended with random seeds, but that seems fairly expensive.
49  * We use a scheme requiring only two functions described in this paper:
50  *
51  * Less Hashing, Same Performance:Building a Better Bloom Filter
52  * Adam Kirsch, Michael Mitzenmacher†, Harvard School of Engineering and
53  * Applied Sciences, Cambridge, Massachusetts [DOI 10.1002/rsa.20208]
54  *
55  * The two hash functions h1 and h2 are calculated using hard-coded seeds,
56  * and then combined using (h1 + i * h2) to generate the hash functions.
57  *
58  *
59  * sizing the bloom filter
60  * -----------------------
61  *
62  * Size of a bloom filter depends on the number of distinct values we will
63  * store in it, and the desired false positive rate. The higher the number
64  * of distinct values and/or the lower the false positive rate, the larger
65  * the bloom filter. On the other hand, we want to keep the index as small
66  * as possible - that's one of the basic advantages of BRIN indexes.
67  *
68  * Although the number of distinct elements (in a page range) depends on
69  * the data, we can consider it fixed. This simplifies the trade-off to
70  * just false positive rate vs. size.
71  *
72  * At the page range level, false positive rate is a probability the bloom
73  * filter matches a random value. For the whole index (with sufficiently
74  * many page ranges) it represents the fraction of the index ranges (and
75  * thus fraction of the table to be scanned) matching the random value.
76  *
77  * Furthermore, the size of the bloom filter is subject to implementation
78  * limits - it has to fit onto a single index page (8kB by default). As
79  * the bitmap is inherently random (when "full" about half the bits is set
80  * to 1, randomly), compression can't help very much.
81  *
82  * To reduce the size of a filter (to fit to a page), we have to either
83  * accept higher false positive rate (undesirable), or reduce the number
84  * of distinct items to be stored in the filter. We can't alter the input
85  * data, of course, but we may make the BRIN page ranges smaller - instead
86  * of the default 128 pages (1MB) we may build index with 16-page ranges,
87  * or something like that. This should reduce the number of distinct values
88  * in the page range, making the filter smaller (with fixed false positive
89  * rate). Even for random data sets this should help, as the number of rows
90  * per heap page is limited (to ~290 with very narrow tables, likely ~20
91  * in practice).
92  *
93  * Of course, good sizing decisions depend on having the necessary data,
94  * i.e. number of distinct values in a page range (of a given size) and
95  * table size (to estimate cost change due to change in false positive
96  * rate due to having larger index vs. scanning larger indexes). We may
97  * not have that data - for example when building an index on empty table
98  * it's not really possible. And for some data we only have estimates for
99  * the whole table and we can only estimate per-range values (ndistinct).
100  *
101  * Another challenge is that while the bloom filter is per-column, it's
102  * the whole index tuple that has to fit into a page. And for multi-column
103  * indexes that may include pieces we have no control over (not necessarily
104  * bloom filters, the other columns may use other BRIN opclasses). So it's
105  * not entirely clear how to distribute the space between those columns.
106  *
107  * The current logic, implemented in brin_bloom_get_ndistinct, attempts to
108  * make some basic sizing decisions, based on the size of BRIN ranges, and
109  * the maximum number of rows per range.
110  *
111  *
112  * IDENTIFICATION
113  *	  src/backend/access/brin/brin_bloom.c
114  */
115 #include "postgres.h"
116 
117 #include "access/genam.h"
118 #include "access/brin.h"
119 #include "access/brin_internal.h"
120 #include "access/brin_page.h"
121 #include "access/brin_tuple.h"
122 #include "access/hash.h"
123 #include "access/htup_details.h"
124 #include "access/reloptions.h"
125 #include "access/stratnum.h"
126 #include "catalog/pg_type.h"
127 #include "catalog/pg_amop.h"
128 #include "utils/builtins.h"
129 #include "utils/datum.h"
130 #include "utils/lsyscache.h"
131 #include "utils/rel.h"
132 #include "utils/syscache.h"
133 
134 #include <math.h>
135 
136 #define BloomEqualStrategyNumber	1
137 
138 /*
139  * Additional SQL level support functions. We only have one, which is
140  * used to calculate hash of the input value.
141  *
142  * Procedure numbers must not use values reserved for BRIN itself; see
143  * brin_internal.h.
144  */
145 #define		BLOOM_MAX_PROCNUMS		1	/* maximum support procs we need */
146 #define		PROCNUM_HASH			11	/* required */
147 
148 /*
149  * Subtract this from procnum to obtain index in BloomOpaque arrays
150  * (Must be equal to minimum of private procnums).
151  */
152 #define		PROCNUM_BASE			11
153 
154 /*
155  * Storage type for BRIN's reloptions.
156  */
157 typedef struct BloomOptions
158 {
159 	int32		vl_len_;		/* varlena header (do not touch directly!) */
160 	double		nDistinctPerRange;	/* number of distinct values per range */
161 	double		falsePositiveRate;	/* false positive for bloom filter */
162 } BloomOptions;
163 
164 /*
165  * The current min value (16) is somewhat arbitrary, but it's based
166  * on the fact that the filter header is ~20B alone, which is about
167  * the same as the filter bitmap for 16 distinct items with 1% false
168  * positive rate. So by allowing lower values we'd not gain much. In
169  * any case, the min should not be larger than MaxHeapTuplesPerPage
170  * (~290), which is the theoretical maximum for single-page ranges.
171  */
172 #define		BLOOM_MIN_NDISTINCT_PER_RANGE		16
173 
174 /*
175  * Used to determine number of distinct items, based on the number of rows
176  * in a page range. The 10% is somewhat similar to what estimate_num_groups
177  * does, so we use the same factor here.
178  */
179 #define		BLOOM_DEFAULT_NDISTINCT_PER_RANGE	-0.1	/* 10% of values */
180 
181 /*
182  * Allowed range and default value for the false positive range. The exact
183  * values are somewhat arbitrary, but were chosen considering the various
184  * parameters (size of filter vs. page size, etc.).
185  *
186  * The lower the false-positive rate, the more accurate the filter is, but
187  * it also gets larger - at some point this eliminates the main advantage
188  * of BRIN indexes, which is the tiny size. At 0.01% the index is about
189  * 10% of the table (assuming 290 distinct values per 8kB page).
190  *
191  * On the other hand, as the false-positive rate increases, larger part of
192  * the table has to be scanned due to mismatches - at 25% we're probably
193  * close to sequential scan being cheaper.
194  */
195 #define		BLOOM_MIN_FALSE_POSITIVE_RATE	0.0001	/* 0.01% fp rate */
196 #define		BLOOM_MAX_FALSE_POSITIVE_RATE	0.25	/* 25% fp rate */
197 #define		BLOOM_DEFAULT_FALSE_POSITIVE_RATE	0.01	/* 1% fp rate */
198 
199 #define BloomGetNDistinctPerRange(opts) \
200 	((opts) && (((BloomOptions *) (opts))->nDistinctPerRange != 0) ? \
201 	 (((BloomOptions *) (opts))->nDistinctPerRange) : \
202 	 BLOOM_DEFAULT_NDISTINCT_PER_RANGE)
203 
204 #define BloomGetFalsePositiveRate(opts) \
205 	((opts) && (((BloomOptions *) (opts))->falsePositiveRate != 0.0) ? \
206 	 (((BloomOptions *) (opts))->falsePositiveRate) : \
207 	 BLOOM_DEFAULT_FALSE_POSITIVE_RATE)
208 
209 /*
210  * And estimate of the largest bloom we can fit onto a page. This is not
211  * a perfect guarantee, for a couple of reasons. For example, the row may
212  * be larger because the index has multiple columns.
213  */
214 #define BloomMaxFilterSize \
215 	MAXALIGN_DOWN(BLCKSZ - \
216 				  (MAXALIGN(SizeOfPageHeaderData + \
217 							sizeof(ItemIdData)) + \
218 				   MAXALIGN(sizeof(BrinSpecialSpace)) + \
219 				   SizeOfBrinTuple))
220 
221 /*
222  * Seeds used to calculate two hash functions h1 and h2, which are then used
223  * to generate k hashes using the (h1 + i * h2) scheme.
224  */
225 #define BLOOM_SEED_1	0x71d924af
226 #define BLOOM_SEED_2	0xba48b314
227 
228 /*
229  * Bloom Filter
230  *
231  * Represents a bloom filter, built on hashes of the indexed values. That is,
232  * we compute a uint32 hash of the value, and then store this hash into the
233  * bloom filter (and compute additional hashes on it).
234  *
235  * XXX We could implement "sparse" bloom filters, keeping only the bytes that
236  * are not entirely 0. But while indexes don't support TOAST, the varlena can
237  * still be compressed. So this seems unnecessary, because the compression
238  * should do the same job.
239  *
240  * XXX We can also watch the number of bits set in the bloom filter, and then
241  * stop using it (and not store the bitmap, to save space) when the false
242  * positive rate gets too high. But even if the false positive rate exceeds the
243  * desired value, it still can eliminate some page ranges.
244  */
245 typedef struct BloomFilter
246 {
247 	/* varlena header (do not touch directly!) */
248 	int32		vl_len_;
249 
250 	/* space for various flags (unused for now) */
251 	uint16		flags;
252 
253 	/* fields for the HASHED phase */
254 	uint8		nhashes;		/* number of hash functions */
255 	uint32		nbits;			/* number of bits in the bitmap (size) */
256 	uint32		nbits_set;		/* number of bits set to 1 */
257 
258 	/* data of the bloom filter */
259 	char		data[FLEXIBLE_ARRAY_MEMBER];
260 
261 } BloomFilter;
262 
263 
264 /*
265  * bloom_init
266  * 		Initialize the Bloom Filter, allocate all the memory.
267  *
268  * The filter is initialized with optimal size for ndistinct expected values
269  * and the requested false positive rate. The filter is stored as varlena.
270  */
271 static BloomFilter *
bloom_init(int ndistinct,double false_positive_rate)272 bloom_init(int ndistinct, double false_positive_rate)
273 {
274 	Size		len;
275 	BloomFilter *filter;
276 
277 	int			nbits;			/* size of filter / number of bits */
278 	int			nbytes;			/* size of filter / number of bytes */
279 
280 	double		k;				/* number of hash functions */
281 
282 	Assert(ndistinct > 0);
283 	Assert((false_positive_rate >= BLOOM_MIN_FALSE_POSITIVE_RATE) &&
284 		   (false_positive_rate < BLOOM_MAX_FALSE_POSITIVE_RATE));
285 
286 	/* sizing bloom filter: -(n * ln(p)) / (ln(2))^2 */
287 	nbits = ceil(-(ndistinct * log(false_positive_rate)) / pow(log(2.0), 2));
288 
289 	/* round m to whole bytes */
290 	nbytes = ((nbits + 7) / 8);
291 	nbits = nbytes * 8;
292 
293 	/*
294 	 * Reject filters that are obviously too large to store on a page.
295 	 *
296 	 * Initially the bloom filter is just zeroes and so very compressible, but
297 	 * as we add values it gets more and more random, and so less and less
298 	 * compressible. So initially everything fits on the page, but we might
299 	 * get surprising failures later - we want to prevent that, so we reject
300 	 * bloom filter that are obviously too large.
301 	 *
302 	 * XXX It's not uncommon to oversize the bloom filter a bit, to defend
303 	 * against unexpected data anomalies (parts of table with more distinct
304 	 * values per range etc.). But we still need to make sure even the
305 	 * oversized filter fits on page, if such need arises.
306 	 *
307 	 * XXX This check is not perfect, because the index may have multiple
308 	 * filters that are small individually, but too large when combined.
309 	 */
310 	if (nbytes > BloomMaxFilterSize)
311 		elog(ERROR, "the bloom filter is too large (%d > %zu)", nbytes,
312 			 BloomMaxFilterSize);
313 
314 	/*
315 	 * round(log(2.0) * m / ndistinct), but assume round() may not be
316 	 * available on Windows
317 	 */
318 	k = log(2.0) * nbits / ndistinct;
319 	k = (k - floor(k) >= 0.5) ? ceil(k) : floor(k);
320 
321 	/*
322 	 * We allocate the whole filter. Most of it is going to be 0 bits, so the
323 	 * varlena is easy to compress.
324 	 */
325 	len = offsetof(BloomFilter, data) + nbytes;
326 
327 	filter = (BloomFilter *) palloc0(len);
328 
329 	filter->flags = 0;
330 	filter->nhashes = (int) k;
331 	filter->nbits = nbits;
332 
333 	SET_VARSIZE(filter, len);
334 
335 	return filter;
336 }
337 
338 
339 /*
340  * bloom_add_value
341  * 		Add value to the bloom filter.
342  */
343 static BloomFilter *
bloom_add_value(BloomFilter * filter,uint32 value,bool * updated)344 bloom_add_value(BloomFilter *filter, uint32 value, bool *updated)
345 {
346 	int			i;
347 	uint64		h1,
348 				h2;
349 
350 	/* compute the hashes, used for the bloom filter */
351 	h1 = hash_bytes_uint32_extended(value, BLOOM_SEED_1) % filter->nbits;
352 	h2 = hash_bytes_uint32_extended(value, BLOOM_SEED_2) % filter->nbits;
353 
354 	/* compute the requested number of hashes */
355 	for (i = 0; i < filter->nhashes; i++)
356 	{
357 		/* h1 + h2 + f(i) */
358 		uint32		h = (h1 + i * h2) % filter->nbits;
359 		uint32		byte = (h / 8);
360 		uint32		bit = (h % 8);
361 
362 		/* if the bit is not set, set it and remember we did that */
363 		if (!(filter->data[byte] & (0x01 << bit)))
364 		{
365 			filter->data[byte] |= (0x01 << bit);
366 			filter->nbits_set++;
367 			if (updated)
368 				*updated = true;
369 		}
370 	}
371 
372 	return filter;
373 }
374 
375 
376 /*
377  * bloom_contains_value
378  * 		Check if the bloom filter contains a particular value.
379  */
380 static bool
bloom_contains_value(BloomFilter * filter,uint32 value)381 bloom_contains_value(BloomFilter *filter, uint32 value)
382 {
383 	int			i;
384 	uint64		h1,
385 				h2;
386 
387 	/* calculate the two hashes */
388 	h1 = hash_bytes_uint32_extended(value, BLOOM_SEED_1) % filter->nbits;
389 	h2 = hash_bytes_uint32_extended(value, BLOOM_SEED_2) % filter->nbits;
390 
391 	/* compute the requested number of hashes */
392 	for (i = 0; i < filter->nhashes; i++)
393 	{
394 		/* h1 + h2 + f(i) */
395 		uint32		h = (h1 + i * h2) % filter->nbits;
396 		uint32		byte = (h / 8);
397 		uint32		bit = (h % 8);
398 
399 		/* if the bit is not set, the value is not there */
400 		if (!(filter->data[byte] & (0x01 << bit)))
401 			return false;
402 	}
403 
404 	/* all hashes found in bloom filter */
405 	return true;
406 }
407 
408 typedef struct BloomOpaque
409 {
410 	/*
411 	 * XXX At this point we only need a single proc (to compute the hash), but
412 	 * let's keep the array just like inclusion and minmax opclasses, for
413 	 * consistency. We may need additional procs in the future.
414 	 */
415 	FmgrInfo	extra_procinfos[BLOOM_MAX_PROCNUMS];
416 	bool		extra_proc_missing[BLOOM_MAX_PROCNUMS];
417 } BloomOpaque;
418 
419 static FmgrInfo *bloom_get_procinfo(BrinDesc *bdesc, uint16 attno,
420 									uint16 procnum);
421 
422 
423 Datum
brin_bloom_opcinfo(PG_FUNCTION_ARGS)424 brin_bloom_opcinfo(PG_FUNCTION_ARGS)
425 {
426 	BrinOpcInfo *result;
427 
428 	/*
429 	 * opaque->strategy_procinfos is initialized lazily; here it is set to
430 	 * all-uninitialized by palloc0 which sets fn_oid to InvalidOid.
431 	 *
432 	 * bloom indexes only store the filter as a single BYTEA column
433 	 */
434 
435 	result = palloc0(MAXALIGN(SizeofBrinOpcInfo(1)) +
436 					 sizeof(BloomOpaque));
437 	result->oi_nstored = 1;
438 	result->oi_regular_nulls = true;
439 	result->oi_opaque = (BloomOpaque *)
440 		MAXALIGN((char *) result + SizeofBrinOpcInfo(1));
441 	result->oi_typcache[0] = lookup_type_cache(PG_BRIN_BLOOM_SUMMARYOID, 0);
442 
443 	PG_RETURN_POINTER(result);
444 }
445 
446 /*
447  * brin_bloom_get_ndistinct
448  *		Determine the ndistinct value used to size bloom filter.
449  *
450  * Adjust the ndistinct value based on the pagesPerRange value. First,
451  * if it's negative, it's assumed to be relative to maximum number of
452  * tuples in the range (assuming each page gets MaxHeapTuplesPerPage
453  * tuples, which is likely a significant over-estimate). We also clamp
454  * the value, not to over-size the bloom filter unnecessarily.
455  *
456  * XXX We can only do this when the pagesPerRange value was supplied.
457  * If it wasn't, it has to be a read-only access to the index, in which
458  * case we don't really care. But perhaps we should fall-back to the
459  * default pagesPerRange value?
460  *
461  * XXX We might also fetch info about ndistinct estimate for the column,
462  * and compute the expected number of distinct values in a range. But
463  * that may be tricky due to data being sorted in various ways, so it
464  * seems better to rely on the upper estimate.
465  *
466  * XXX We might also calculate a better estimate of rows per BRIN range,
467  * instead of using MaxHeapTuplesPerPage (which probably produces values
468  * much higher than reality).
469  */
470 static int
brin_bloom_get_ndistinct(BrinDesc * bdesc,BloomOptions * opts)471 brin_bloom_get_ndistinct(BrinDesc *bdesc, BloomOptions *opts)
472 {
473 	double		ndistinct;
474 	double		maxtuples;
475 	BlockNumber pagesPerRange;
476 
477 	pagesPerRange = BrinGetPagesPerRange(bdesc->bd_index);
478 	ndistinct = BloomGetNDistinctPerRange(opts);
479 
480 	Assert(BlockNumberIsValid(pagesPerRange));
481 
482 	maxtuples = MaxHeapTuplesPerPage * pagesPerRange;
483 
484 	/*
485 	 * Similarly to n_distinct, negative values are relative - in this case to
486 	 * maximum number of tuples in the page range (maxtuples).
487 	 */
488 	if (ndistinct < 0)
489 		ndistinct = (-ndistinct) * maxtuples;
490 
491 	/*
492 	 * Positive values are to be used directly, but we still apply a couple of
493 	 * safeties to avoid using unreasonably small bloom filters.
494 	 */
495 	ndistinct = Max(ndistinct, BLOOM_MIN_NDISTINCT_PER_RANGE);
496 
497 	/*
498 	 * And don't use more than the maximum possible number of tuples, in the
499 	 * range, which would be entirely wasteful.
500 	 */
501 	ndistinct = Min(ndistinct, maxtuples);
502 
503 	return (int) ndistinct;
504 }
505 
506 /*
507  * Examine the given index tuple (which contains partial status of a certain
508  * page range) by comparing it to the given value that comes from another heap
509  * tuple.  If the new value is outside the bloom filter specified by the
510  * existing tuple values, update the index tuple and return true.  Otherwise,
511  * return false and do not modify in this case.
512  */
513 Datum
brin_bloom_add_value(PG_FUNCTION_ARGS)514 brin_bloom_add_value(PG_FUNCTION_ARGS)
515 {
516 	BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
517 	BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
518 	Datum		newval = PG_GETARG_DATUM(2);
519 	bool		isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_DATUM(3);
520 	BloomOptions *opts = (BloomOptions *) PG_GET_OPCLASS_OPTIONS();
521 	Oid			colloid = PG_GET_COLLATION();
522 	FmgrInfo   *hashFn;
523 	uint32		hashValue;
524 	bool		updated = false;
525 	AttrNumber	attno;
526 	BloomFilter *filter;
527 
528 	Assert(!isnull);
529 
530 	attno = column->bv_attno;
531 
532 	/*
533 	 * If this is the first non-null value, we need to initialize the bloom
534 	 * filter. Otherwise just extract the existing bloom filter from
535 	 * BrinValues.
536 	 */
537 	if (column->bv_allnulls)
538 	{
539 		filter = bloom_init(brin_bloom_get_ndistinct(bdesc, opts),
540 							BloomGetFalsePositiveRate(opts));
541 		column->bv_values[0] = PointerGetDatum(filter);
542 		column->bv_allnulls = false;
543 		updated = true;
544 	}
545 	else
546 		filter = (BloomFilter *) PG_DETOAST_DATUM(column->bv_values[0]);
547 
548 	/*
549 	 * Compute the hash of the new value, using the supplied hash function,
550 	 * and then add the hash value to the bloom filter.
551 	 */
552 	hashFn = bloom_get_procinfo(bdesc, attno, PROCNUM_HASH);
553 
554 	hashValue = DatumGetUInt32(FunctionCall1Coll(hashFn, colloid, newval));
555 
556 	filter = bloom_add_value(filter, hashValue, &updated);
557 
558 	column->bv_values[0] = PointerGetDatum(filter);
559 
560 	PG_RETURN_BOOL(updated);
561 }
562 
563 /*
564  * Given an index tuple corresponding to a certain page range and a scan key,
565  * return whether the scan key is consistent with the index tuple's bloom
566  * filter.  Return true if so, false otherwise.
567  */
568 Datum
brin_bloom_consistent(PG_FUNCTION_ARGS)569 brin_bloom_consistent(PG_FUNCTION_ARGS)
570 {
571 	BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
572 	BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
573 	ScanKey    *keys = (ScanKey *) PG_GETARG_POINTER(2);
574 	int			nkeys = PG_GETARG_INT32(3);
575 	Oid			colloid = PG_GET_COLLATION();
576 	AttrNumber	attno;
577 	Datum		value;
578 	Datum		matches;
579 	FmgrInfo   *finfo;
580 	uint32		hashValue;
581 	BloomFilter *filter;
582 	int			keyno;
583 
584 	filter = (BloomFilter *) PG_DETOAST_DATUM(column->bv_values[0]);
585 
586 	Assert(filter);
587 
588 	matches = true;
589 
590 	for (keyno = 0; keyno < nkeys; keyno++)
591 	{
592 		ScanKey		key = keys[keyno];
593 
594 		/* NULL keys are handled and filtered-out in bringetbitmap */
595 		Assert(!(key->sk_flags & SK_ISNULL));
596 
597 		attno = key->sk_attno;
598 		value = key->sk_argument;
599 
600 		switch (key->sk_strategy)
601 		{
602 			case BloomEqualStrategyNumber:
603 
604 				/*
605 				 * In the equality case (WHERE col = someval), we want to
606 				 * return the current page range if the minimum value in the
607 				 * range <= scan key, and the maximum value >= scan key.
608 				 */
609 				finfo = bloom_get_procinfo(bdesc, attno, PROCNUM_HASH);
610 
611 				hashValue = DatumGetUInt32(FunctionCall1Coll(finfo, colloid, value));
612 				matches &= bloom_contains_value(filter, hashValue);
613 
614 				break;
615 			default:
616 				/* shouldn't happen */
617 				elog(ERROR, "invalid strategy number %d", key->sk_strategy);
618 				matches = 0;
619 				break;
620 		}
621 
622 		if (!matches)
623 			break;
624 	}
625 
626 	PG_RETURN_DATUM(matches);
627 }
628 
629 /*
630  * Given two BrinValues, update the first of them as a union of the summary
631  * values contained in both.  The second one is untouched.
632  *
633  * XXX We assume the bloom filters have the same parameters for now. In the
634  * future we should have 'can union' function, to decide if we can combine
635  * two particular bloom filters.
636  */
637 Datum
brin_bloom_union(PG_FUNCTION_ARGS)638 brin_bloom_union(PG_FUNCTION_ARGS)
639 {
640 	int			i;
641 	int			nbytes;
642 	BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1);
643 	BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2);
644 	BloomFilter *filter_a;
645 	BloomFilter *filter_b;
646 
647 	Assert(col_a->bv_attno == col_b->bv_attno);
648 	Assert(!col_a->bv_allnulls && !col_b->bv_allnulls);
649 
650 	filter_a = (BloomFilter *) PG_DETOAST_DATUM(col_a->bv_values[0]);
651 	filter_b = (BloomFilter *) PG_DETOAST_DATUM(col_b->bv_values[0]);
652 
653 	/* make sure the filters use the same parameters */
654 	Assert(filter_a && filter_b);
655 	Assert(filter_a->nbits == filter_b->nbits);
656 	Assert(filter_a->nhashes == filter_b->nhashes);
657 	Assert((filter_a->nbits > 0) && (filter_a->nbits % 8 == 0));
658 
659 	nbytes = (filter_a->nbits) / 8;
660 
661 	/* simply OR the bitmaps */
662 	for (i = 0; i < nbytes; i++)
663 		filter_a->data[i] |= filter_b->data[i];
664 
665 	PG_RETURN_VOID();
666 }
667 
668 /*
669  * Cache and return inclusion opclass support procedure
670  *
671  * Return the procedure corresponding to the given function support number
672  * or null if it does not exist.
673  */
674 static FmgrInfo *
bloom_get_procinfo(BrinDesc * bdesc,uint16 attno,uint16 procnum)675 bloom_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum)
676 {
677 	BloomOpaque *opaque;
678 	uint16		basenum = procnum - PROCNUM_BASE;
679 
680 	/*
681 	 * We cache these in the opaque struct, to avoid repetitive syscache
682 	 * lookups.
683 	 */
684 	opaque = (BloomOpaque *) bdesc->bd_info[attno - 1]->oi_opaque;
685 
686 	/*
687 	 * If we already searched for this proc and didn't find it, don't bother
688 	 * searching again.
689 	 */
690 	if (opaque->extra_proc_missing[basenum])
691 		return NULL;
692 
693 	if (opaque->extra_procinfos[basenum].fn_oid == InvalidOid)
694 	{
695 		if (RegProcedureIsValid(index_getprocid(bdesc->bd_index, attno,
696 												procnum)))
697 		{
698 			fmgr_info_copy(&opaque->extra_procinfos[basenum],
699 						   index_getprocinfo(bdesc->bd_index, attno, procnum),
700 						   bdesc->bd_context);
701 		}
702 		else
703 		{
704 			opaque->extra_proc_missing[basenum] = true;
705 			return NULL;
706 		}
707 	}
708 
709 	return &opaque->extra_procinfos[basenum];
710 }
711 
712 Datum
brin_bloom_options(PG_FUNCTION_ARGS)713 brin_bloom_options(PG_FUNCTION_ARGS)
714 {
715 	local_relopts *relopts = (local_relopts *) PG_GETARG_POINTER(0);
716 
717 	init_local_reloptions(relopts, sizeof(BloomOptions));
718 
719 	add_local_real_reloption(relopts, "n_distinct_per_range",
720 							 "number of distinct items expected in a BRIN page range",
721 							 BLOOM_DEFAULT_NDISTINCT_PER_RANGE,
722 							 -1.0, INT_MAX, offsetof(BloomOptions, nDistinctPerRange));
723 
724 	add_local_real_reloption(relopts, "false_positive_rate",
725 							 "desired false-positive rate for the bloom filters",
726 							 BLOOM_DEFAULT_FALSE_POSITIVE_RATE,
727 							 BLOOM_MIN_FALSE_POSITIVE_RATE,
728 							 BLOOM_MAX_FALSE_POSITIVE_RATE,
729 							 offsetof(BloomOptions, falsePositiveRate));
730 
731 	PG_RETURN_VOID();
732 }
733 
734 /*
735  * brin_bloom_summary_in
736  *		- input routine for type brin_bloom_summary.
737  *
738  * brin_bloom_summary is only used internally to represent summaries
739  * in BRIN bloom indexes, so it has no operations of its own, and we
740  * disallow input too.
741  */
742 Datum
brin_bloom_summary_in(PG_FUNCTION_ARGS)743 brin_bloom_summary_in(PG_FUNCTION_ARGS)
744 {
745 	/*
746 	 * brin_bloom_summary stores the data in binary form and parsing text
747 	 * input is not needed, so disallow this.
748 	 */
749 	ereport(ERROR,
750 			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
751 			 errmsg("cannot accept a value of type %s", "pg_brin_bloom_summary")));
752 
753 	PG_RETURN_VOID();			/* keep compiler quiet */
754 }
755 
756 
757 /*
758  * brin_bloom_summary_out
759  *		- output routine for type brin_bloom_summary.
760  *
761  * BRIN bloom summaries are serialized into a bytea value, but we want
762  * to output something nicer humans can understand.
763  */
764 Datum
brin_bloom_summary_out(PG_FUNCTION_ARGS)765 brin_bloom_summary_out(PG_FUNCTION_ARGS)
766 {
767 	BloomFilter *filter;
768 	StringInfoData str;
769 
770 	/* detoast the data to get value with a full 4B header */
771 	filter = (BloomFilter *) PG_DETOAST_DATUM(PG_GETARG_BYTEA_PP(0));
772 
773 	initStringInfo(&str);
774 	appendStringInfoChar(&str, '{');
775 
776 	appendStringInfo(&str, "mode: hashed  nhashes: %u  nbits: %u  nbits_set: %u",
777 					 filter->nhashes, filter->nbits, filter->nbits_set);
778 
779 	appendStringInfoChar(&str, '}');
780 
781 	PG_RETURN_CSTRING(str.data);
782 }
783 
784 /*
785  * brin_bloom_summary_recv
786  *		- binary input routine for type brin_bloom_summary.
787  */
788 Datum
brin_bloom_summary_recv(PG_FUNCTION_ARGS)789 brin_bloom_summary_recv(PG_FUNCTION_ARGS)
790 {
791 	ereport(ERROR,
792 			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
793 			 errmsg("cannot accept a value of type %s", "pg_brin_bloom_summary")));
794 
795 	PG_RETURN_VOID();			/* keep compiler quiet */
796 }
797 
798 /*
799  * brin_bloom_summary_send
800  *		- binary output routine for type brin_bloom_summary.
801  *
802  * BRIN bloom summaries are serialized in a bytea value (although the
803  * type is named differently), so let's just send that.
804  */
805 Datum
brin_bloom_summary_send(PG_FUNCTION_ARGS)806 brin_bloom_summary_send(PG_FUNCTION_ARGS)
807 {
808 	return byteasend(fcinfo);
809 }
810