1 /*-------------------------------------------------------------------------
2  *
3  * tidbitmap.c
4  *	  PostgreSQL tuple-id (TID) bitmap package
5  *
6  * This module provides bitmap data structures that are spiritually
7  * similar to Bitmapsets, but are specially adapted to store sets of
8  * tuple identifiers (TIDs), or ItemPointers.  In particular, the division
9  * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10  * Also, since we wish to be able to store very large tuple sets in
11  * memory with this data structure, we support "lossy" storage, in which
12  * we no longer remember individual tuple offsets on a page but only the
13  * fact that a particular page needs to be visited.
14  *
15  * The "lossy" storage uses one bit per disk page, so at the standard 8K
16  * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17  * of memory.  People pushing around tables of that size should have a
18  * couple of Mb to spare, so we don't worry about providing a second level
19  * of lossiness.  In theory we could fall back to page ranges at some
20  * point, but for now that seems useless complexity.
21  *
22  * We also support the notion of candidate matches, or rechecking.  This
23  * means we know that a search need visit only some tuples on a page,
24  * but we are not certain that all of those tuples are real matches.
25  * So the eventual heap scan must recheck the quals for these tuples only,
26  * rather than rechecking the quals for all tuples on the page as in the
27  * lossy-bitmap case.  Rechecking can be specified when TIDs are inserted
28  * into a bitmap, and it can also happen internally when we AND a lossy
29  * and a non-lossy page.
30  *
31  *
32  * Copyright (c) 2003-2019, PostgreSQL Global Development Group
33  *
34  * IDENTIFICATION
35  *	  src/backend/nodes/tidbitmap.c
36  *
37  *-------------------------------------------------------------------------
38  */
39 #include "postgres.h"
40 
41 #include <limits.h>
42 
43 #include "access/htup_details.h"
44 #include "nodes/bitmapset.h"
45 #include "nodes/tidbitmap.h"
46 #include "storage/lwlock.h"
47 #include "utils/dsa.h"
48 #include "utils/hashutils.h"
49 
50 /*
51  * The maximum number of tuples per page is not large (typically 256 with
52  * 8K pages, or 1024 with 32K pages).  So there's not much point in making
53  * the per-page bitmaps variable size.  We just legislate that the size
54  * is this:
55  */
56 #define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage
57 
58 /*
59  * When we have to switch over to lossy storage, we use a data structure
60  * with one bit per page, where all pages having the same number DIV
61  * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
62  * and has the bit set for a given page, there must not be a per-page entry
63  * for that page in the page table.
64  *
65  * We actually store both exact pages and lossy chunks in the same hash
66  * table, using identical data structures.  (This is because the memory
67  * management for hashtables doesn't easily/efficiently allow space to be
68  * transferred from one hashtable to another.)  Therefore it's best
69  * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
70  * too different.  But we also want PAGES_PER_CHUNK to be a power of 2 to
71  * avoid expensive integer remainder operations.  So, define it like this:
72  */
73 #define PAGES_PER_CHUNK  (BLCKSZ / 32)
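/*
 * For the default 8K BLCKSZ this works out to 8192 / 32 = 256 pages per
 * chunk, which is a power of 2 and reasonably close to
 * MAX_TUPLES_PER_PAGE, as recommended above.
 */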
74 
75 /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
76 
77 #define WORDNUM(x)	((x) / BITS_PER_BITMAPWORD)
78 #define BITNUM(x)	((x) % BITS_PER_BITMAPWORD)
79 
80 /* number of active words for an exact page: */
81 #define WORDS_PER_PAGE	((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
82 /* number of active words for a lossy chunk: */
83 #define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
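/*
 * Worked example, assuming 64-bit bitmapwords and 8K pages (where
 * MaxHeapTuplesPerPage is around 291): WORDS_PER_PAGE = (291 - 1)/64 + 1
 * = 5 and WORDS_PER_CHUNK = (256 - 1)/64 + 1 = 4, so each hashtable
 * entry below carries Max(5, 4) = 5 bitmapwords.
 */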
84 
85 /*
86  * The hashtable entries are represented by this data structure.  For
87  * an exact page, blockno is the page number and bit k of the bitmap
88  * represents tuple offset k+1.  For a lossy chunk, blockno is the first
89  * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
90  * bit k represents page blockno+k.  Note that it is not possible to
91  * have exact storage for the first page of a chunk if we are using
92  * lossy storage for any page in the chunk's range, since the same
93  * hashtable entry has to serve both purposes.
94  *
95  * recheck is used only on exact pages --- it indicates that although
96  * only the stated tuples need be checked, the full index qual condition
97  * must be checked for each (ie, these are candidate matches).
98  */
99 typedef struct PagetableEntry
100 {
101 	BlockNumber blockno;		/* page number (hashtable key) */
102 	char		status;			/* hash entry status */
103 	bool		ischunk;		/* T = lossy storage, F = exact */
104 	bool		recheck;		/* should the tuples be rechecked? */
105 	bitmapword	words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
106 } PagetableEntry;
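/*
 * Encoding sketch, using the macros above: for an exact page, tuple
 * offset "off" lives at words[WORDNUM(off - 1)], bit BITNUM(off - 1),
 * so offset 1 is word 0 bit 0 and, with 64-bit words, offset 65 is
 * word 1 bit 0.  For a lossy chunk, page blockno+k is addressed the
 * same way with k in place of off-1.
 */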
107 
108 /*
109  * Holds array of pagetable entries.
110  */
111 typedef struct PTEntryArray
112 {
113 	pg_atomic_uint32 refcount;	/* number of iterators attached */
114 	PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
115 } PTEntryArray;
116 
117 /*
118  * We want to avoid the overhead of creating the hashtable, which is
119  * comparatively large, when not necessary. Particularly when we are using a
120  * bitmap scan on the inside of a nestloop join: a bitmap may well live only
121  * long enough to accumulate one entry in such cases.  We therefore avoid
122  * creating an actual hashtable until we need two pagetable entries.  When
123  * just one pagetable entry is needed, we store it in a fixed field of
124  * TIDBitMap.  (NOTE: we don't get rid of the hashtable if the bitmap later
125  * shrinks down to zero or one page again.  So, status can be TBM_HASH even
126  * when nentries is zero or one.)
127  */
128 typedef enum
129 {
130 	TBM_EMPTY,					/* no hashtable, nentries == 0 */
131 	TBM_ONE_PAGE,				/* entry1 contains the single entry */
132 	TBM_HASH					/* pagetable is valid, entry1 is not */
133 } TBMStatus;
134 
135 /*
136  * Current iterating state of the TBM.
137  */
138 typedef enum
139 {
140 	TBM_NOT_ITERATING,			/* not yet converted to page and chunk array */
141 	TBM_ITERATING_PRIVATE,		/* converted to local page and chunk array */
142 	TBM_ITERATING_SHARED		/* converted to shared page and chunk array */
143 } TBMIteratingState;
144 
145 /*
146  * Here is the representation for a whole TIDBitMap:
147  */
148 struct TIDBitmap
149 {
150 	NodeTag		type;			/* to make it a valid Node */
151 	MemoryContext mcxt;			/* memory context containing me */
152 	TBMStatus	status;			/* see codes above */
153 	struct pagetable_hash *pagetable;	/* hash table of PagetableEntry's */
154 	int			nentries;		/* number of entries in pagetable */
155 	int			maxentries;		/* limit on same to meet maxbytes */
156 	int			npages;			/* number of exact entries in pagetable */
157 	int			nchunks;		/* number of lossy entries in pagetable */
158 	TBMIteratingState iterating;	/* tbm_begin_iterate called? */
159 	uint32		lossify_start;	/* offset to start lossifying hashtable at */
160 	PagetableEntry entry1;		/* used when status == TBM_ONE_PAGE */
161 	/* these are valid when iterating is not TBM_NOT_ITERATING: */
162 	PagetableEntry **spages;	/* sorted exact-page list, or NULL */
163 	PagetableEntry **schunks;	/* sorted lossy-chunk list, or NULL */
164 	dsa_pointer dsapagetable;	/* dsa_pointer to the element array */
165 	dsa_pointer dsapagetableold;	/* dsa_pointer to the old element array */
166 	dsa_pointer ptpages;		/* dsa_pointer to the page array */
167 	dsa_pointer ptchunks;		/* dsa_pointer to the chunk array */
168 	dsa_area   *dsa;			/* reference to per-query dsa area */
169 };
170 
171 /*
172  * When iterating over a bitmap in sorted order, a TBMIterator is used to
173  * track our progress.  There can be several iterators scanning the same
174  * bitmap concurrently.  Note that the bitmap becomes read-only as soon as
175  * any iterator is created.
176  */
177 struct TBMIterator
178 {
179 	TIDBitmap  *tbm;			/* TIDBitmap we're iterating over */
180 	int			spageptr;		/* next spages index */
181 	int			schunkptr;		/* next schunks index */
182 	int			schunkbit;		/* next bit to check in current schunk */
183 	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
184 };
185 
186 /*
187  * Holds the shared members of the iterator so that multiple processes
188  * can jointly iterate.
189  */
190 typedef struct TBMSharedIteratorState
191 {
192 	int			nentries;		/* number of entries in pagetable */
193 	int			maxentries;		/* limit on same to meet maxbytes */
194 	int			npages;			/* number of exact entries in pagetable */
195 	int			nchunks;		/* number of lossy entries in pagetable */
196 	dsa_pointer pagetable;		/* dsa pointers to head of pagetable data */
197 	dsa_pointer spages;			/* dsa pointer to page array */
198 	dsa_pointer schunks;		/* dsa pointer to chunk array */
199 	LWLock		lock;			/* lock to protect below members */
200 	int			spageptr;		/* next spages index */
201 	int			schunkptr;		/* next schunks index */
202 	int			schunkbit;		/* next bit to check in current schunk */
203 } TBMSharedIteratorState;
204 
205 /*
206  * pagetable iteration array.
207  */
208 typedef struct PTIterationArray
209 {
210 	pg_atomic_uint32 refcount;	/* number of iterators attached */
211 	int			index[FLEXIBLE_ARRAY_MEMBER];	/* index array */
212 } PTIterationArray;
213 
214 /*
215  * Same as TBMIterator, but used for joint iteration; therefore it also
216  * holds a reference to the shared state.
217  */
218 struct TBMSharedIterator
219 {
220 	TBMSharedIteratorState *state;	/* shared state */
221 	PTEntryArray *ptbase;		/* pagetable element array */
222 	PTIterationArray *ptpages;	/* sorted exact page index list */
223 	PTIterationArray *ptchunks; /* sorted lossy page index list */
224 	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
225 };
226 
227 /* Local function prototypes */
228 static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
229 static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
230 							   const TIDBitmap *b);
231 static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
232 												BlockNumber pageno);
233 static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
234 static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
235 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
236 static void tbm_lossify(TIDBitmap *tbm);
237 static int	tbm_comparator(const void *left, const void *right);
238 static int	tbm_shared_comparator(const void *left, const void *right,
239 								  void *arg);
240 
241 /* define hashtable mapping block numbers to PagetableEntry's */
242 #define SH_USE_NONDEFAULT_ALLOCATOR
243 #define SH_PREFIX pagetable
244 #define SH_ELEMENT_TYPE PagetableEntry
245 #define SH_KEY_TYPE BlockNumber
246 #define SH_KEY blockno
247 #define SH_HASH_KEY(tb, key) murmurhash32(key)
248 #define SH_EQUAL(tb, a, b) ((a) == (b))
249 #define SH_SCOPE static inline
250 #define SH_DEFINE
251 #define SH_DECLARE
252 #include "lib/simplehash.h"
253 
254 
255 /*
256  * tbm_create - create an initially-empty bitmap
257  *
258  * The bitmap will live in the memory context that is CurrentMemoryContext
259  * at the time of this call.  It will be limited to (approximately) maxbytes
260  * total memory consumption.  If the DSA passed to this function is not NULL
261  * then the memory for storing elements of the underlying page table will
262  * be allocated from the DSA.
263  */
264 TIDBitmap *
265 tbm_create(long maxbytes, dsa_area *dsa)
266 {
267 	TIDBitmap  *tbm;
268 
269 	/* Create the TIDBitmap struct and zero all its fields */
270 	tbm = makeNode(TIDBitmap);
271 
272 	tbm->mcxt = CurrentMemoryContext;
273 	tbm->status = TBM_EMPTY;
274 
275 	tbm->maxentries = (int) tbm_calculate_entries(maxbytes);
276 	tbm->lossify_start = 0;
277 	tbm->dsa = dsa;
278 	tbm->dsapagetable = InvalidDsaPointer;
279 	tbm->dsapagetableold = InvalidDsaPointer;
280 	tbm->ptpages = InvalidDsaPointer;
281 	tbm->ptchunks = InvalidDsaPointer;
282 
283 	return tbm;
284 }
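/*
 * Usage sketch (hypothetical caller, not part of this file; assumes a
 * byte budget derived from work_mem and a tids[] array filled in by an
 * index scan):
 *
 *		TIDBitmap  *tbm = tbm_create(work_mem * 1024L, NULL);
 *
 *		tbm_add_tuples(tbm, tids, ntids, false);	-- exact TIDs
 *		tbm_add_page(tbm, blkno);					-- whole page, lossy
 *		... consume the bitmap via tbm_begin_iterate/tbm_iterate ...
 *		tbm_free(tbm);
 */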
285 
286 /*
287  * Actually create the hashtable.  Since this is a moderately expensive
288  * proposition, we don't do it until we have to.
289  */
290 static void
291 tbm_create_pagetable(TIDBitmap *tbm)
292 {
293 	Assert(tbm->status != TBM_HASH);
294 	Assert(tbm->pagetable == NULL);
295 
296 	tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
297 
298 	/* If entry1 is valid, push it into the hashtable */
299 	if (tbm->status == TBM_ONE_PAGE)
300 	{
301 		PagetableEntry *page;
302 		bool		found;
303 		char		oldstatus;
304 
305 		page = pagetable_insert(tbm->pagetable,
306 								tbm->entry1.blockno,
307 								&found);
308 		Assert(!found);
309 		oldstatus = page->status;
310 		memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
311 		page->status = oldstatus;
312 	}
313 
314 	tbm->status = TBM_HASH;
315 }
316 
317 /*
318  * tbm_free - free a TIDBitmap
319  */
320 void
321 tbm_free(TIDBitmap *tbm)
322 {
323 	if (tbm->pagetable)
324 		pagetable_destroy(tbm->pagetable);
325 	if (tbm->spages)
326 		pfree(tbm->spages);
327 	if (tbm->schunks)
328 		pfree(tbm->schunks);
329 	pfree(tbm);
330 }
331 
332 /*
333  * tbm_free_shared_area - free shared state
334  *
335  * Free the shared iterator state.  Also free the shared pagetable and
336  * iterator arrays if they are no longer referenced by any shared iterator,
337  * i.e. when their refcounts drop to zero.
338  */
339 void
340 tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
341 {
342 	TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
343 	PTEntryArray *ptbase;
344 	PTIterationArray *ptpages;
345 	PTIterationArray *ptchunks;
346 
347 	if (DsaPointerIsValid(istate->pagetable))
348 	{
349 		ptbase = dsa_get_address(dsa, istate->pagetable);
350 		if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
351 			dsa_free(dsa, istate->pagetable);
352 	}
353 	if (DsaPointerIsValid(istate->spages))
354 	{
355 		ptpages = dsa_get_address(dsa, istate->spages);
356 		if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
357 			dsa_free(dsa, istate->spages);
358 	}
359 	if (DsaPointerIsValid(istate->schunks))
360 	{
361 		ptchunks = dsa_get_address(dsa, istate->schunks);
362 		if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
363 			dsa_free(dsa, istate->schunks);
364 	}
365 
366 	dsa_free(dsa, dp);
367 }
368 
369 /*
370  * tbm_add_tuples - add some tuple IDs to a TIDBitmap
371  *
372  * If recheck is true, then the recheck flag will be set in the
373  * TBMIterateResult when any of these tuples are reported out.
374  */
375 void
376 tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
377 			   bool recheck)
378 {
379 	BlockNumber currblk = InvalidBlockNumber;
380 	PagetableEntry *page = NULL;	/* only valid when currblk is valid */
381 	int			i;
382 
383 	Assert(tbm->iterating == TBM_NOT_ITERATING);
384 	for (i = 0; i < ntids; i++)
385 	{
386 		BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
387 		OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
388 		int			wordnum,
389 					bitnum;
390 
391 		/* safety check to ensure we don't overrun bit array bounds */
392 		if (off < 1 || off > MAX_TUPLES_PER_PAGE)
393 			elog(ERROR, "tuple offset out of range: %u", off);
394 
395 		/*
396 		 * Look up target page unless we already did.  This saves cycles when
397 		 * the input includes consecutive tuples on the same page, which is
398 		 * common enough to justify an extra test here.
399 		 */
400 		if (blk != currblk)
401 		{
402 			if (tbm_page_is_lossy(tbm, blk))
403 				page = NULL;	/* remember page is lossy */
404 			else
405 				page = tbm_get_pageentry(tbm, blk);
406 			currblk = blk;
407 		}
408 
409 		if (page == NULL)
410 			continue;			/* whole page is already marked */
411 
412 		if (page->ischunk)
413 		{
414 			/* The page is a lossy chunk header, set bit for itself */
415 			wordnum = bitnum = 0;
416 		}
417 		else
418 		{
419 			/* Page is exact, so set bit for individual tuple */
420 			wordnum = WORDNUM(off - 1);
421 			bitnum = BITNUM(off - 1);
422 		}
423 		page->words[wordnum] |= ((bitmapword) 1 << bitnum);
424 		page->recheck |= recheck;
425 
426 		if (tbm->nentries > tbm->maxentries)
427 		{
428 			tbm_lossify(tbm);
429 			/* Page could have been converted to lossy, so force new lookup */
430 			currblk = InvalidBlockNumber;
431 		}
432 	}
433 }
434 
435 /*
436  * tbm_add_page - add a whole page to a TIDBitmap
437  *
438  * This causes the whole page to be reported (with the recheck flag)
439  * when the TIDBitmap is scanned.
440  */
441 void
442 tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
443 {
444 	/* Enter the page in the bitmap, or mark it lossy if already present */
445 	tbm_mark_page_lossy(tbm, pageno);
446 	/* If we went over the memory limit, lossify some more pages */
447 	if (tbm->nentries > tbm->maxentries)
448 		tbm_lossify(tbm);
449 }
450 
451 /*
452  * tbm_union - set union
453  *
454  * a is modified in-place, b is not changed
455  */
456 void
457 tbm_union(TIDBitmap *a, const TIDBitmap *b)
458 {
459 	Assert(a->iterating == TBM_NOT_ITERATING);
460 	/* Nothing to do if b is empty */
461 	if (b->nentries == 0)
462 		return;
463 	/* Scan through chunks and pages in b, merge into a */
464 	if (b->status == TBM_ONE_PAGE)
465 		tbm_union_page(a, &b->entry1);
466 	else
467 	{
468 		pagetable_iterator i;
469 		PagetableEntry *bpage;
470 
471 		Assert(b->status == TBM_HASH);
472 		pagetable_start_iterate(b->pagetable, &i);
473 		while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
474 			tbm_union_page(a, bpage);
475 	}
476 }
477 
478 /* Process one page of b during a union op */
479 static void
480 tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
481 {
482 	PagetableEntry *apage;
483 	int			wordnum;
484 
485 	if (bpage->ischunk)
486 	{
487 		/* Scan b's chunk, mark each indicated page lossy in a */
488 		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
489 		{
490 			bitmapword	w = bpage->words[wordnum];
491 
492 			if (w != 0)
493 			{
494 				BlockNumber pg;
495 
496 				pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
497 				while (w != 0)
498 				{
499 					if (w & 1)
500 						tbm_mark_page_lossy(a, pg);
501 					pg++;
502 					w >>= 1;
503 				}
504 			}
505 		}
506 	}
507 	else if (tbm_page_is_lossy(a, bpage->blockno))
508 	{
509 		/* page is already lossy in a, nothing to do */
510 		return;
511 	}
512 	else
513 	{
514 		apage = tbm_get_pageentry(a, bpage->blockno);
515 		if (apage->ischunk)
516 		{
517 			/* The page is a lossy chunk header, set bit for itself */
518 			apage->words[0] |= ((bitmapword) 1 << 0);
519 		}
520 		else
521 		{
522 			/* Both pages are exact, merge at the bit level */
523 			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
524 				apage->words[wordnum] |= bpage->words[wordnum];
525 			apage->recheck |= bpage->recheck;
526 		}
527 	}
528 
529 	if (a->nentries > a->maxentries)
530 		tbm_lossify(a);
531 }
532 
533 /*
534  * tbm_intersect - set intersection
535  *
536  * a is modified in-place, b is not changed
537  */
538 void
539 tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
540 {
541 	Assert(a->iterating == TBM_NOT_ITERATING);
542 	/* Nothing to do if a is empty */
543 	if (a->nentries == 0)
544 		return;
545 	/* Scan through chunks and pages in a, try to match to b */
546 	if (a->status == TBM_ONE_PAGE)
547 	{
548 		if (tbm_intersect_page(a, &a->entry1, b))
549 		{
550 			/* Page is now empty, remove it from a */
551 			Assert(!a->entry1.ischunk);
552 			a->npages--;
553 			a->nentries--;
554 			Assert(a->nentries == 0);
555 			a->status = TBM_EMPTY;
556 		}
557 	}
558 	else
559 	{
560 		pagetable_iterator i;
561 		PagetableEntry *apage;
562 
563 		Assert(a->status == TBM_HASH);
564 		pagetable_start_iterate(a->pagetable, &i);
565 		while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
566 		{
567 			if (tbm_intersect_page(a, apage, b))
568 			{
569 				/* Page or chunk is now empty, remove it from a */
570 				if (apage->ischunk)
571 					a->nchunks--;
572 				else
573 					a->npages--;
574 				a->nentries--;
575 				if (!pagetable_delete(a->pagetable, apage->blockno))
576 					elog(ERROR, "hash table corrupted");
577 			}
578 		}
579 	}
580 }
581 
582 /*
583  * Process one page of a during an intersection op
584  *
585  * Returns true if apage is now empty and should be deleted from a
586  */
587 static bool
588 tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
589 {
590 	const PagetableEntry *bpage;
591 	int			wordnum;
592 
593 	if (apage->ischunk)
594 	{
595 		/* Scan each bit in chunk, try to clear */
596 		bool		candelete = true;
597 
598 		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
599 		{
600 			bitmapword	w = apage->words[wordnum];
601 
602 			if (w != 0)
603 			{
604 				bitmapword	neww = w;
605 				BlockNumber pg;
606 				int			bitnum;
607 
608 				pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
609 				bitnum = 0;
610 				while (w != 0)
611 				{
612 					if (w & 1)
613 					{
614 						if (!tbm_page_is_lossy(b, pg) &&
615 							tbm_find_pageentry(b, pg) == NULL)
616 						{
617 							/* Page is not in b at all, lose lossy bit */
618 							neww &= ~((bitmapword) 1 << bitnum);
619 						}
620 					}
621 					pg++;
622 					bitnum++;
623 					w >>= 1;
624 				}
625 				apage->words[wordnum] = neww;
626 				if (neww != 0)
627 					candelete = false;
628 			}
629 		}
630 		return candelete;
631 	}
632 	else if (tbm_page_is_lossy(b, apage->blockno))
633 	{
634 		/*
635 		 * Some of the tuples in 'a' might not satisfy the quals for 'b', but
636 		 * because the page 'b' is lossy, we don't know which ones. Therefore
637 		 * we mark 'a' as requiring rechecks, to indicate that at most those
638 		 * tuples set in 'a' are matches.
639 		 */
640 		apage->recheck = true;
641 		return false;
642 	}
643 	else
644 	{
645 		bool		candelete = true;
646 
647 		bpage = tbm_find_pageentry(b, apage->blockno);
648 		if (bpage != NULL)
649 		{
650 			/* Both pages are exact, merge at the bit level */
651 			Assert(!bpage->ischunk);
652 			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
653 			{
654 				apage->words[wordnum] &= bpage->words[wordnum];
655 				if (apage->words[wordnum] != 0)
656 					candelete = false;
657 			}
658 			apage->recheck |= bpage->recheck;
659 		}
660 		/* If there is no matching b page, we can just delete the a page */
661 		return candelete;
662 	}
663 }
664 
665 /*
666  * tbm_is_empty - is a TIDBitmap completely empty?
667  */
668 bool
669 tbm_is_empty(const TIDBitmap *tbm)
670 {
671 	return (tbm->nentries == 0);
672 }
673 
674 /*
675  * tbm_begin_iterate - prepare to iterate through a TIDBitmap
676  *
677  * The TBMIterator struct is created in the caller's memory context.
678  * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
679  * okay to just allow the memory context to be released, too.  It is caller's
680  * responsibility not to touch the TBMIterator anymore once the TIDBitmap
681  * is freed.
682  *
683  * NB: after this is called, it is no longer allowed to modify the contents
684  * of the bitmap.  However, you can call this multiple times to scan the
685  * contents repeatedly, including parallel scans.
686  */
687 TBMIterator *
688 tbm_begin_iterate(TIDBitmap *tbm)
689 {
690 	TBMIterator *iterator;
691 
692 	Assert(tbm->iterating != TBM_ITERATING_SHARED);
693 
694 	/*
695 	 * Create the TBMIterator struct, with enough trailing space to serve the
696 	 * needs of the TBMIterateResult sub-struct.
697 	 */
698 	iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
699 									  MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
700 	iterator->tbm = tbm;
701 
702 	/*
703 	 * Initialize iteration pointers.
704 	 */
705 	iterator->spageptr = 0;
706 	iterator->schunkptr = 0;
707 	iterator->schunkbit = 0;
708 
709 	/*
710 	 * If we have a hashtable, create and fill the sorted page lists, unless
711 	 * we already did that for a previous iterator.  Note that the lists are
712 	 * attached to the bitmap not the iterator, so they can be used by more
713 	 * than one iterator.
714 	 */
715 	if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
716 	{
717 		pagetable_iterator i;
718 		PagetableEntry *page;
719 		int			npages;
720 		int			nchunks;
721 
722 		if (!tbm->spages && tbm->npages > 0)
723 			tbm->spages = (PagetableEntry **)
724 				MemoryContextAlloc(tbm->mcxt,
725 								   tbm->npages * sizeof(PagetableEntry *));
726 		if (!tbm->schunks && tbm->nchunks > 0)
727 			tbm->schunks = (PagetableEntry **)
728 				MemoryContextAlloc(tbm->mcxt,
729 								   tbm->nchunks * sizeof(PagetableEntry *));
730 
731 		npages = nchunks = 0;
732 		pagetable_start_iterate(tbm->pagetable, &i);
733 		while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
734 		{
735 			if (page->ischunk)
736 				tbm->schunks[nchunks++] = page;
737 			else
738 				tbm->spages[npages++] = page;
739 		}
740 		Assert(npages == tbm->npages);
741 		Assert(nchunks == tbm->nchunks);
742 		if (npages > 1)
743 			qsort(tbm->spages, npages, sizeof(PagetableEntry *),
744 				  tbm_comparator);
745 		if (nchunks > 1)
746 			qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
747 				  tbm_comparator);
748 	}
749 
750 	tbm->iterating = TBM_ITERATING_PRIVATE;
751 
752 	return iterator;
753 }
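/*
 * Iteration sketch (hypothetical caller, not part of this file).  Once
 * the bitmap is fully built, pages come back in block-number order:
 *
 *		TBMIterator *it = tbm_begin_iterate(tbm);
 *		TBMIterateResult *res;
 *
 *		while ((res = tbm_iterate(it)) != NULL)
 *		{
 *			if (res->ntuples < 0)
 *				... lossy: visit every tuple on page res->blockno ...
 *			else
 *				... exact: visit res->offsets[0 .. res->ntuples - 1] ...
 *		}
 *		tbm_end_iterate(it);
 */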
754 
755 /*
756  * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
757  *
758  * The necessary shared state will be allocated from the DSA passed to
759  * tbm_create, so that multiple processes can attach to it and iterate jointly.
760  *
761  * This converts the pagetable hash into arrays of indexes (one for exact
762  * pages, one for lossy chunks) into the pagetable element array.
763  */
764 dsa_pointer
765 tbm_prepare_shared_iterate(TIDBitmap *tbm)
766 {
767 	dsa_pointer dp;
768 	TBMSharedIteratorState *istate;
769 	PTEntryArray *ptbase = NULL;
770 	PTIterationArray *ptpages = NULL;
771 	PTIterationArray *ptchunks = NULL;
772 
773 	Assert(tbm->dsa != NULL);
774 	Assert(tbm->iterating != TBM_ITERATING_PRIVATE);
775 
776 	/*
777 	 * Allocate a TBMSharedIteratorState from the DSA to hold the shared
778 	 * members and the lock; multiple workers will use it for shared iteration.
779 	 */
780 	dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
781 	istate = dsa_get_address(tbm->dsa, dp);
782 
783 	/*
784 	 * If we're not already iterating, create and fill the sorted page lists.
785 	 * (If we are, the sorted page lists are already stored in the TIDBitmap,
786 	 * and we can just reuse them.)
787 	 */
788 	if (tbm->iterating == TBM_NOT_ITERATING)
789 	{
790 		pagetable_iterator i;
791 		PagetableEntry *page;
792 		int			idx;
793 		int			npages;
794 		int			nchunks;
795 
796 		/*
797 		 * Allocate the page and chunk array memory from the DSA to share
798 		 * across multiple processes.
799 		 */
800 		if (tbm->npages)
801 		{
802 			tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
803 										tbm->npages * sizeof(int));
804 			ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
805 			pg_atomic_init_u32(&ptpages->refcount, 0);
806 		}
807 		if (tbm->nchunks)
808 		{
809 			tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
810 										 tbm->nchunks * sizeof(int));
811 			ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
812 			pg_atomic_init_u32(&ptchunks->refcount, 0);
813 		}
814 
815 		/*
816 		 * If TBM status is TBM_HASH then iterate over the pagetable and
817 		 * convert it to page and chunk arrays.  But if it's in the
818 		 * TBM_ONE_PAGE mode then directly allocate the space for one entry
819 		 * from the DSA.
820 		 */
821 		npages = nchunks = 0;
822 		if (tbm->status == TBM_HASH)
823 		{
824 			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
825 
826 			pagetable_start_iterate(tbm->pagetable, &i);
827 			while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
828 			{
829 				idx = page - ptbase->ptentry;
830 				if (page->ischunk)
831 					ptchunks->index[nchunks++] = idx;
832 				else
833 					ptpages->index[npages++] = idx;
834 			}
835 
836 			Assert(npages == tbm->npages);
837 			Assert(nchunks == tbm->nchunks);
838 		}
839 		else if (tbm->status == TBM_ONE_PAGE)
840 		{
841 			/*
842 			 * In one page mode allocate the space for one pagetable entry,
843 			 * initialize it, and directly store its index (i.e. 0) in the
844 			 * page array.
845 			 */
846 			tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
847 											 sizeof(PagetableEntry));
848 			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
849 			memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
850 			ptpages->index[0] = 0;
851 		}
852 
853 		if (ptbase != NULL)
854 			pg_atomic_init_u32(&ptbase->refcount, 0);
855 		if (npages > 1)
856 			qsort_arg((void *) (ptpages->index), npages, sizeof(int),
857 					  tbm_shared_comparator, (void *) ptbase->ptentry);
858 		if (nchunks > 1)
859 			qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int),
860 					  tbm_shared_comparator, (void *) ptbase->ptentry);
861 	}
862 
863 	/*
864 	 * Store the TBM members in the shared state so that we can share them
865 	 * across multiple processes.
866 	 */
867 	istate->nentries = tbm->nentries;
868 	istate->maxentries = tbm->maxentries;
869 	istate->npages = tbm->npages;
870 	istate->nchunks = tbm->nchunks;
871 	istate->pagetable = tbm->dsapagetable;
872 	istate->spages = tbm->ptpages;
873 	istate->schunks = tbm->ptchunks;
874 
875 	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
876 	ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
877 	ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
878 
879 	/*
880 	 * For every shared iterator referring to the pagetable and iterator
881 	 * arrays, increase the refcount by 1, so that when freeing a shared
882 	 * iterator we don't free those arrays until their refcounts reach 0.
883 	 */
884 	if (ptbase != NULL)
885 		pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
886 	if (ptpages != NULL)
887 		pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
888 	if (ptchunks != NULL)
889 		pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
890 
891 	/* Initialize the iterator lock */
892 	LWLockInitialize(&istate->lock, LWTRANCHE_TBM);
893 
894 	/* Initialize the shared iterator state */
895 	istate->schunkbit = 0;
896 	istate->schunkptr = 0;
897 	istate->spageptr = 0;
898 
899 	tbm->iterating = TBM_ITERATING_SHARED;
900 
901 	return dp;
902 }
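/*
 * Shared-iteration sketch (hypothetical; assumes "dsa" is the per-query
 * DSA area that was passed to tbm_create).  The leader publishes the
 * returned dsa_pointer, then every participant, leader included,
 * attaches and iterates; each page is returned to exactly one backend:
 *
 *		dsa_pointer dp = tbm_prepare_shared_iterate(tbm);	-- leader
 *		... make dp available to workers via shared memory ...
 *
 *		TBMSharedIterator *it = tbm_attach_shared_iterate(dsa, dp);
 *		while ((res = tbm_shared_iterate(it)) != NULL)
 *			... process one page, as for tbm_iterate ...
 *		tbm_end_shared_iterate(it);
 */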
903 
904 /*
905  * tbm_extract_page_tuple - extract the tuple offsets from a page
906  *
907  * The extracted offsets are stored into TBMIterateResult.
908  */
909 static inline int
910 tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
911 {
912 	int			wordnum;
913 	int			ntuples = 0;
914 
915 	for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
916 	{
917 		bitmapword	w = page->words[wordnum];
918 
919 		if (w != 0)
920 		{
921 			int			off = wordnum * BITS_PER_BITMAPWORD + 1;
922 
923 			while (w != 0)
924 			{
925 				if (w & 1)
926 					output->offsets[ntuples++] = (OffsetNumber) off;
927 				off++;
928 				w >>= 1;
929 			}
930 		}
931 	}
932 
933 	return ntuples;
934 }
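/*
 * Worked example, assuming 64-bit bitmapwords: if words[0] == 0x5, bits
 * 0 and 2 are set and the loop above emits offsets 1 and 3; a set bit 0
 * in words[1] would emit offset 65 (wordnum * 64 + bit + 1).
 */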
935 
936 /*
937  *	tbm_advance_schunkbit - advance *schunkbitp to the chunk's next set bit
938  */
939 static inline void
940 tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
941 {
942 	int			schunkbit = *schunkbitp;
943 
944 	while (schunkbit < PAGES_PER_CHUNK)
945 	{
946 		int			wordnum = WORDNUM(schunkbit);
947 		int			bitnum = BITNUM(schunkbit);
948 
949 		if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
950 			break;
951 		schunkbit++;
952 	}
953 
954 	*schunkbitp = schunkbit;
955 }
956 
957 /*
958  * tbm_iterate - scan through next page of a TIDBitmap
959  *
960  * Returns a TBMIterateResult representing one page, or NULL if there are
961  * no more pages to scan.  Pages are guaranteed to be delivered in numerical
962  * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
963  * remember the exact tuples to look at on this page --- the caller must
964  * examine all tuples on the page and check if they meet the intended
965  * condition.  If result->recheck is true, only the indicated tuples need
966  * be examined, but the condition must be rechecked anyway.  (For ease of
967  * testing, recheck is always set true when ntuples < 0.)
968  */
969 TBMIterateResult *
970 tbm_iterate(TBMIterator *iterator)
971 {
972 	TIDBitmap  *tbm = iterator->tbm;
973 	TBMIterateResult *output = &(iterator->output);
974 
975 	Assert(tbm->iterating == TBM_ITERATING_PRIVATE);
976 
977 	/*
978 	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
979 	 * schunkbit to the next set bit.
980 	 */
981 	while (iterator->schunkptr < tbm->nchunks)
982 	{
983 		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
984 		int			schunkbit = iterator->schunkbit;
985 
986 		tbm_advance_schunkbit(chunk, &schunkbit);
987 		if (schunkbit < PAGES_PER_CHUNK)
988 		{
989 			iterator->schunkbit = schunkbit;
990 			break;
991 		}
992 		/* advance to next chunk */
993 		iterator->schunkptr++;
994 		iterator->schunkbit = 0;
995 	}
996 
997 	/*
998 	 * If both chunk and per-page data remain, must output the numerically
999 	 * earlier page.
1000 	 */
1001 	if (iterator->schunkptr < tbm->nchunks)
1002 	{
1003 		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
1004 		BlockNumber chunk_blockno;
1005 
1006 		chunk_blockno = chunk->blockno + iterator->schunkbit;
1007 		if (iterator->spageptr >= tbm->npages ||
1008 			chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
1009 		{
1010 			/* Return a lossy page indicator from the chunk */
1011 			output->blockno = chunk_blockno;
1012 			output->ntuples = -1;
1013 			output->recheck = true;
1014 			iterator->schunkbit++;
1015 			return output;
1016 		}
1017 	}
1018 
1019 	if (iterator->spageptr < tbm->npages)
1020 	{
1021 		PagetableEntry *page;
1022 		int			ntuples;
1023 
1024 		/* In ONE_PAGE state, we don't allocate an spages[] array */
1025 		if (tbm->status == TBM_ONE_PAGE)
1026 			page = &tbm->entry1;
1027 		else
1028 			page = tbm->spages[iterator->spageptr];
1029 
1030 		/* scan bitmap to extract individual offset numbers */
1031 		ntuples = tbm_extract_page_tuple(page, output);
1032 		output->blockno = page->blockno;
1033 		output->ntuples = ntuples;
1034 		output->recheck = page->recheck;
1035 		iterator->spageptr++;
1036 		return output;
1037 	}
1038 
1039 	/* Nothing more in the bitmap */
1040 	return NULL;
1041 }
1042 
1043 /*
1044  *	tbm_shared_iterate - scan through next page of a TIDBitmap
1045  *
1046  *	As above, but this will iterate using an iterator which is shared
1047  *	across multiple processes.  We need to acquire the iterator LWLock
1048  *	before accessing the shared members.
1049  */
1050 TBMIterateResult *
1051 tbm_shared_iterate(TBMSharedIterator *iterator)
1052 {
1053 	TBMIterateResult *output = &iterator->output;
1054 	TBMSharedIteratorState *istate = iterator->state;
1055 	PagetableEntry *ptbase = NULL;
1056 	int		   *idxpages = NULL;
1057 	int		   *idxchunks = NULL;
1058 
1059 	if (iterator->ptbase != NULL)
1060 		ptbase = iterator->ptbase->ptentry;
1061 	if (iterator->ptpages != NULL)
1062 		idxpages = iterator->ptpages->index;
1063 	if (iterator->ptchunks != NULL)
1064 		idxchunks = iterator->ptchunks->index;
1065 
1066 	/* Acquire the LWLock before accessing the shared members */
1067 	LWLockAcquire(&istate->lock, LW_EXCLUSIVE);
1068 
1069 	/*
1070 	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
1071 	 * schunkbit to the next set bit.
1072 	 */
1073 	while (istate->schunkptr < istate->nchunks)
1074 	{
1075 		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1076 		int			schunkbit = istate->schunkbit;
1077 
1078 		tbm_advance_schunkbit(chunk, &schunkbit);
1079 		if (schunkbit < PAGES_PER_CHUNK)
1080 		{
1081 			istate->schunkbit = schunkbit;
1082 			break;
1083 		}
1084 		/* advance to next chunk */
1085 		istate->schunkptr++;
1086 		istate->schunkbit = 0;
1087 	}
1088 
1089 	/*
1090 	 * If both chunk and per-page data remain, must output the numerically
1091 	 * earlier page.
1092 	 */
1093 	if (istate->schunkptr < istate->nchunks)
1094 	{
1095 		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1096 		BlockNumber chunk_blockno;
1097 
1098 		chunk_blockno = chunk->blockno + istate->schunkbit;
1099 
1100 		if (istate->spageptr >= istate->npages ||
1101 			chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
1102 		{
1103 			/* Return a lossy page indicator from the chunk */
1104 			output->blockno = chunk_blockno;
1105 			output->ntuples = -1;
1106 			output->recheck = true;
1107 			istate->schunkbit++;
1108 
1109 			LWLockRelease(&istate->lock);
1110 			return output;
1111 		}
1112 	}
1113 
1114 	if (istate->spageptr < istate->npages)
1115 	{
1116 		PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
1117 		int			ntuples;
1118 
1119 		/* scan bitmap to extract individual offset numbers */
1120 		ntuples = tbm_extract_page_tuple(page, output);
1121 		output->blockno = page->blockno;
1122 		output->ntuples = ntuples;
1123 		output->recheck = page->recheck;
1124 		istate->spageptr++;
1125 
1126 		LWLockRelease(&istate->lock);
1127 
1128 		return output;
1129 	}
1130 
1131 	LWLockRelease(&istate->lock);
1132 
1133 	/* Nothing more in the bitmap */
1134 	return NULL;
1135 }
1136 
1137 /*
1138  * tbm_end_iterate - finish an iteration over a TIDBitmap
1139  *
1140  * Currently this is just a pfree, but it might do more someday.  (For
1141  * instance, it could be useful to count open iterators and allow the
1142  * bitmap to return to read/write status when there are no more iterators.)
1143  */
1144 void
1145 tbm_end_iterate(TBMIterator *iterator)
1146 {
1147 	pfree(iterator);
1148 }
1149 
1150 /*
1151  * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
1152  *
1153  * This doesn't free any of the shared state associated with the iterator,
1154  * just our backend-private state.
1155  */
1156 void
1157 tbm_end_shared_iterate(TBMSharedIterator *iterator)
1158 {
1159 	pfree(iterator);
1160 }
1161 
1162 /*
1163  * tbm_find_pageentry - find a PagetableEntry for the pageno
1164  *
1165  * Returns NULL if there is no non-lossy entry for the pageno.
1166  */
1167 static const PagetableEntry *
1168 tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
1169 {
1170 	const PagetableEntry *page;
1171 
1172 	if (tbm->nentries == 0)		/* in case pagetable doesn't exist */
1173 		return NULL;
1174 
1175 	if (tbm->status == TBM_ONE_PAGE)
1176 	{
1177 		page = &tbm->entry1;
1178 		if (page->blockno != pageno)
1179 			return NULL;
1180 		Assert(!page->ischunk);
1181 		return page;
1182 	}
1183 
1184 	page = pagetable_lookup(tbm->pagetable, pageno);
1185 	if (page == NULL)
1186 		return NULL;
1187 	if (page->ischunk)
1188 		return NULL;			/* don't want a lossy chunk header */
1189 	return page;
1190 }
1191 
1192 /*
1193  * tbm_get_pageentry - find or create a PagetableEntry for the pageno
1194  *
1195  * If new, the entry is marked as an exact (non-chunk) entry.
1196  *
1197  * This may cause the table to exceed the desired memory size.  It is
1198  * up to the caller to call tbm_lossify() at the next safe point if so.
1199  */
1200 static PagetableEntry *
1201 tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
1202 {
1203 	PagetableEntry *page;
1204 	bool		found;
1205 
1206 	if (tbm->status == TBM_EMPTY)
1207 	{
1208 		/* Use the fixed slot */
1209 		page = &tbm->entry1;
1210 		found = false;
1211 		tbm->status = TBM_ONE_PAGE;
1212 	}
1213 	else
1214 	{
1215 		if (tbm->status == TBM_ONE_PAGE)
1216 		{
1217 			page = &tbm->entry1;
1218 			if (page->blockno == pageno)
1219 				return page;
1220 			/* Time to switch from one page to a hashtable */
1221 			tbm_create_pagetable(tbm);
1222 		}
1223 
1224 		/* Look up or create an entry */
1225 		page = pagetable_insert(tbm->pagetable, pageno, &found);
1226 	}
1227 
1228 	/* Initialize it if not present before */
1229 	if (!found)
1230 	{
1231 		char		oldstatus = page->status;
1232 
1233 		MemSet(page, 0, sizeof(PagetableEntry));
1234 		page->status = oldstatus;
1235 		page->blockno = pageno;
1236 		/* must count it too */
1237 		tbm->nentries++;
1238 		tbm->npages++;
1239 	}
1240 
1241 	return page;
1242 }
1243 
1244 /*
1245  * tbm_page_is_lossy - is the page marked as lossily stored?
1246  */
1247 static bool
1248 tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
1249 {
1250 	PagetableEntry *page;
1251 	BlockNumber chunk_pageno;
1252 	int			bitno;
1253 
1254 	/* we can skip the lookup if there are no lossy chunks */
1255 	if (tbm->nchunks == 0)
1256 		return false;
1257 	Assert(tbm->status == TBM_HASH);
1258 
1259 	bitno = pageno % PAGES_PER_CHUNK;
1260 	chunk_pageno = pageno - bitno;
1261 
1262 	page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1263 
1264 	if (page != NULL && page->ischunk)
1265 	{
1266 		int			wordnum = WORDNUM(bitno);
1267 		int			bitnum = BITNUM(bitno);
1268 
1269 		if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1270 			return true;
1271 	}
1272 	return false;
1273 }
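/*
 * Chunk-addressing example, assuming PAGES_PER_CHUNK == 256 (8K blocks):
 * for pageno 1234, bitno = 1234 % 256 = 210 and chunk_pageno = 1234 -
 * 210 = 1024, so the lookup above probes the entry keyed by block 1024
 * and tests bit 210 of its bitmap.
 */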
1274 
1275 /*
1276  * tbm_mark_page_lossy - mark the page number as lossily stored
1277  *
1278  * This may cause the table to exceed the desired memory size.  It is
1279  * up to the caller to call tbm_lossify() at the next safe point if so.
1280  */
1281 static void
1282 tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
1283 {
1284 	PagetableEntry *page;
1285 	bool		found;
1286 	BlockNumber chunk_pageno;
1287 	int			bitno;
1288 	int			wordnum;
1289 	int			bitnum;
1290 
1291 	/* We force the bitmap into hashtable mode whenever it's lossy */
1292 	if (tbm->status != TBM_HASH)
1293 		tbm_create_pagetable(tbm);
1294 
1295 	bitno = pageno % PAGES_PER_CHUNK;
1296 	chunk_pageno = pageno - bitno;
1297 
1298 	/*
1299 	 * Remove any extant non-lossy entry for the page.  If the page is its own
1300 	 * chunk header, however, we skip this and handle the case below.
1301 	 */
1302 	if (bitno != 0)
1303 	{
1304 		if (pagetable_delete(tbm->pagetable, pageno))
1305 		{
1306 			/* It was present, so adjust counts */
1307 			tbm->nentries--;
1308 			tbm->npages--;		/* assume it must have been non-lossy */
1309 		}
1310 	}
1311 
1312 	/* Look up or create entry for chunk-header page */
1313 	page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
1314 
1315 	/* Initialize it if not present before */
1316 	if (!found)
1317 	{
1318 		char		oldstatus = page->status;
1319 
1320 		MemSet(page, 0, sizeof(PagetableEntry));
1321 		page->status = oldstatus;
1322 		page->blockno = chunk_pageno;
1323 		page->ischunk = true;
1324 		/* must count it too */
1325 		tbm->nentries++;
1326 		tbm->nchunks++;
1327 	}
1328 	else if (!page->ischunk)
1329 	{
1330 		char		oldstatus = page->status;
1331 
1332 		/* chunk header page was formerly non-lossy, make it lossy */
1333 		MemSet(page, 0, sizeof(PagetableEntry));
1334 		page->status = oldstatus;
1335 		page->blockno = chunk_pageno;
1336 		page->ischunk = true;
1337 		/* we assume it had some tuple bit(s) set, so mark it lossy */
1338 		page->words[0] = ((bitmapword) 1 << 0);
1339 		/* adjust counts */
1340 		tbm->nchunks++;
1341 		tbm->npages--;
1342 	}
1343 
1344 	/* Now set the original target page's bit */
1345 	wordnum = WORDNUM(bitno);
1346 	bitnum = BITNUM(bitno);
1347 	page->words[wordnum] |= ((bitmapword) 1 << bitnum);
1348 }
1349 
1350 /*
1351  * tbm_lossify - lose some information to get back under the memory limit
1352  */
1353 static void
1354 tbm_lossify(TIDBitmap *tbm)
1355 {
1356 	pagetable_iterator i;
1357 	PagetableEntry *page;
1358 
1359 	/*
1360 	 * XXX Really stupid implementation: this just lossifies pages in
1361 	 * essentially random order.  We should be paying some attention to the
1362 	 * number of bits set in each page, instead.
1363 	 *
1364 	 * Since we are called as soon as nentries exceeds maxentries, we should
1365 	 * push nentries down to significantly less than maxentries, or else we'll
1366 	 * just end up doing this again very soon.  We shoot for maxentries/2.
1367 	 */
1368 	Assert(tbm->iterating == TBM_NOT_ITERATING);
1369 	Assert(tbm->status == TBM_HASH);
1370 
1371 	pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
1372 	while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
1373 	{
1374 		if (page->ischunk)
1375 			continue;			/* already a chunk header */
1376 
1377 		/*
1378 		 * If the page would become a chunk header, we won't save anything by
1379 		 * converting it to lossy, so skip it.
1380 		 */
1381 		if ((page->blockno % PAGES_PER_CHUNK) == 0)
1382 			continue;
1383 
1384 		/* This does the dirty work ... */
1385 		tbm_mark_page_lossy(tbm, page->blockno);
1386 
1387 		if (tbm->nentries <= tbm->maxentries / 2)
1388 		{
1389 			/*
1390 			 * We have made enough room. Remember where to start lossifying
1391 			 * next round, so we evenly iterate over the hashtable.
1392 			 */
1393 			tbm->lossify_start = i.cur;
1394 			break;
1395 		}
1396 
1397 		/*
1398 		 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
1399 		 * hashtable and may have deleted the non-lossy chunk.  We can
1400 		 * continue the same hash table scan, since failure to visit one
1401 		 * element, or visiting the newly inserted element, isn't fatal.
1402 		 */
1403 	}
1404 
1405 	/*
1406 	 * With a big bitmap and small work_mem, it's possible that we cannot get
1407 	 * under maxentries.  Again, if that happens, we'd end up uselessly
1408 	 * calling tbm_lossify over and over.  To prevent this from becoming a
1409 	 * performance sink, force maxentries up to at least double the current
1410 	 * number of entries.  (In essence, we're admitting inability to fit
1411 	 * within work_mem when we do this.)  Note that this test will not fire if
1412 	 * we broke out of the loop early; and if we didn't, the current number of
1413 	 * entries is simply not reducible any further.
1414 	 */
1415 	if (tbm->nentries > tbm->maxentries / 2)
1416 		tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
1417 }
1418 
1419 /*
1420  * qsort comparator to handle PagetableEntry pointers.
1421  */
1422 static int
1423 tbm_comparator(const void *left, const void *right)
1424 {
1425 	BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1426 	BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1427 
1428 	if (l < r)
1429 		return -1;
1430 	else if (l > r)
1431 		return 1;
1432 	return 0;
1433 }
1434 
1435 /*
1436  * As above, but this will get index into PagetableEntry array.  Therefore,
1437  * it needs to get actual PagetableEntry using the index before comparing the
1438  * blockno.
1439  */
1440 static int
1441 tbm_shared_comparator(const void *left, const void *right, void *arg)
1442 {
1443 	PagetableEntry *base = (PagetableEntry *) arg;
1444 	PagetableEntry *lpage = &base[*(int *) left];
1445 	PagetableEntry *rpage = &base[*(int *) right];
1446 
1447 	if (lpage->blockno < rpage->blockno)
1448 		return -1;
1449 	else if (lpage->blockno > rpage->blockno)
1450 		return 1;
1451 	return 0;
1452 }
1453 
1454 /*
1455  *	tbm_attach_shared_iterate
1456  *
1457  *	Allocate a backend-private iterator and attach the shared iterator state
1458  *	to it so that multiple processes can iterate jointly.
1459  *
1460  *	We also convert the DSA pointers to local pointers and store them in
1461  *	our private iterator.
1462  */
1463 TBMSharedIterator *
1464 tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
1465 {
1466 	TBMSharedIterator *iterator;
1467 	TBMSharedIteratorState *istate;
1468 
1469 	/*
1470 	 * Create the TBMSharedIterator struct, with enough trailing space to
1471 	 * serve the needs of the TBMIterateResult sub-struct.
1472 	 */
1473 	iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
1474 											 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
1475 
1476 	istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
1477 
1478 	iterator->state = istate;
1479 
1480 	iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
1481 
1482 	if (istate->npages)
1483 		iterator->ptpages = dsa_get_address(dsa, istate->spages);
1484 	if (istate->nchunks)
1485 		iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
1486 
1487 	return iterator;
1488 }
1489 
1490 /*
1491  * pagetable_allocate
1492  *
1493  * Callback function for allocating the memory for hashtable elements.
1494  * Allocate memory for hashtable elements, using DSA if available.
1495  */
1496 static inline void *
1497 pagetable_allocate(pagetable_hash *pagetable, Size size)
1498 {
1499 	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
1500 	PTEntryArray *ptbase;
1501 
1502 	if (tbm->dsa == NULL)
1503 		return MemoryContextAllocExtended(pagetable->ctx, size,
1504 										  MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
1505 
1506 	/*
1507 	 * Save the dsapagetable reference in dsapagetableold before allocating
1508 	 * new memory so that pagetable_free can free the old entry.
1509 	 */
1510 	tbm->dsapagetableold = tbm->dsapagetable;
1511 	tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
1512 											  sizeof(PTEntryArray) + size,
1513 											  DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
1514 	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
1515 
1516 	return ptbase->ptentry;
1517 }
1518 
1519 /*
1520  * pagetable_free
1521  *
1522  * Callback function for freeing hash table elements.
1523  */
1524 static inline void
1525 pagetable_free(pagetable_hash *pagetable, void *pointer)
1526 {
1527 	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
1528 
1529 	/* pfree the input pointer if DSA is not available */
1530 	if (tbm->dsa == NULL)
1531 		pfree(pointer);
1532 	else if (DsaPointerIsValid(tbm->dsapagetableold))
1533 	{
1534 		dsa_free(tbm->dsa, tbm->dsapagetableold);
1535 		tbm->dsapagetableold = InvalidDsaPointer;
1536 	}
1537 }
1538 
1539 /*
1540  * tbm_calculate_entries
1541  *
1542  * Estimate number of hashtable entries we can have within maxbytes.
1543  */
1544 long
1545 tbm_calculate_entries(double maxbytes)
1546 {
1547 	long		nbuckets;
1548 
1549 	/*
1550 	 * Estimate number of hashtable entries we can have within maxbytes. This
1551 	 * estimates the hash cost as sizeof(PagetableEntry), which is good enough
1552 	 * for our purpose.  Also count an extra Pointer per entry for the arrays
1553 	 * created during iteration readout.
1554 	 */
1555 	nbuckets = maxbytes /
1556 		(sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
1557 	nbuckets = Min(nbuckets, INT_MAX - 1);	/* safety limit */
1558 	nbuckets = Max(nbuckets, 16);	/* sanity limit */
1559 
1560 	return nbuckets;
1561 }
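/*
 * Illustrative arithmetic (sizes vary with BLCKSZ and bitmapword width):
 * with 8K blocks and 64-bit words, sizeof(PagetableEntry) is about 48
 * bytes; adding the two Pointers gives roughly 64 bytes per entry, so a
 * 4MB budget allows about 4 * 1024 * 1024 / 64 = 65536 entries.
 */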
1562