1 /*-------------------------------------------------------------------------
2  *
3  * tidbitmap.c
4  *	  PostgreSQL tuple-id (TID) bitmap package
5  *
6  * This module provides bitmap data structures that are spiritually
7  * similar to Bitmapsets, but are specially adapted to store sets of
8  * tuple identifiers (TIDs), or ItemPointers.  In particular, the division
9  * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10  * Also, since we wish to be able to store very large tuple sets in
11  * memory with this data structure, we support "lossy" storage, in which
12  * we no longer remember individual tuple offsets on a page but only the
13  * fact that a particular page needs to be visited.
14  *
15  * The "lossy" storage uses one bit per disk page, so at the standard 8K
16  * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17  * of memory.  People pushing around tables of that size should have a
18  * couple of Mb to spare, so we don't worry about providing a second level
19  * of lossiness.  In theory we could fall back to page ranges at some
20  * point, but for now that seems useless complexity.
21  *
22  * We also support the notion of candidate matches, or rechecking.  This
23  * means we know that a search need visit only some tuples on a page,
24  * but we are not certain that all of those tuples are real matches.
25  * So the eventual heap scan must recheck the quals for these tuples only,
26  * rather than rechecking the quals for all tuples on the page as in the
27  * lossy-bitmap case.  Rechecking can be specified when TIDs are inserted
28  * into a bitmap, and it can also happen internally when we AND a lossy
29  * and a non-lossy page.
30  *
31  *
32  * Copyright (c) 2003-2017, PostgreSQL Global Development Group
33  *
34  * IDENTIFICATION
35  *	  src/backend/nodes/tidbitmap.c
36  *
37  *-------------------------------------------------------------------------
38  */
39 #include "postgres.h"
40 
41 #include <limits.h>
42 
43 #include "access/htup_details.h"
44 #include "nodes/bitmapset.h"
45 #include "nodes/tidbitmap.h"
46 #include "storage/lwlock.h"
47 #include "utils/dsa.h"
48 #include "utils/hashutils.h"
49 
50 /*
51  * The maximum number of tuples per page is not large (typically 256 with
52  * 8K pages, or 1024 with 32K pages).  So there's not much point in making
53  * the per-page bitmaps variable size.  We just legislate that the size
54  * is this:
55  */
56 #define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage
57 
58 /*
59  * When we have to switch over to lossy storage, we use a data structure
60  * with one bit per page, where all pages having the same number DIV
61  * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
62  * and has the bit set for a given page, there must not be a per-page entry
63  * for that page in the page table.
64  *
65  * We actually store both exact pages and lossy chunks in the same hash
66  * table, using identical data structures.  (This is because the memory
67  * management for hashtables doesn't easily/efficiently allow space to be
68  * transferred easily from one hashtable to another.)  Therefore it's best
69  * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
70  * too different.  But we also want PAGES_PER_CHUNK to be a power of 2 to
71  * avoid expensive integer remainder operations.  So, define it like this:
72  */
73 #define PAGES_PER_CHUNK  (BLCKSZ / 32)
74 
75 /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
76 
77 #define WORDNUM(x)	((x) / BITS_PER_BITMAPWORD)
78 #define BITNUM(x)	((x) % BITS_PER_BITMAPWORD)
79 
80 /* number of active words for an exact page: */
81 #define WORDS_PER_PAGE	((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
82 /* number of active words for a lossy chunk: */
83 #define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
84 
85 /*
86  * The hashtable entries are represented by this data structure.  For
87  * an exact page, blockno is the page number and bit k of the bitmap
88  * represents tuple offset k+1.  For a lossy chunk, blockno is the first
89  * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
90  * bit k represents page blockno+k.  Note that it is not possible to
91  * have exact storage for the first page of a chunk if we are using
92  * lossy storage for any page in the chunk's range, since the same
93  * hashtable entry has to serve both purposes.
94  *
95  * recheck is used only on exact pages --- it indicates that although
96  * only the stated tuples need be checked, the full index qual condition
97  * must be checked for each (ie, these are candidate matches).
98  */
99 typedef struct PagetableEntry
100 {
101 	BlockNumber blockno;		/* page number (hashtable key) */
102 	char		status;			/* hash entry status */
103 	bool		ischunk;		/* T = lossy storage, F = exact */
104 	bool		recheck;		/* should the tuples be rechecked? */
105 	bitmapword	words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
106 } PagetableEntry;
107 
108 /*
109  * Holds array of pagetable entries.
110  */
111 typedef struct PTEntryArray
112 {
113 	pg_atomic_uint32 refcount;	/* no. of iterator attached */
114 	PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
115 } PTEntryArray;
116 
117 /*
118  * We want to avoid the overhead of creating the hashtable, which is
119  * comparatively large, when not necessary. Particularly when we are using a
120  * bitmap scan on the inside of a nestloop join: a bitmap may well live only
121  * long enough to accumulate one entry in such cases.  We therefore avoid
122  * creating an actual hashtable until we need two pagetable entries.  When
123  * just one pagetable entry is needed, we store it in a fixed field of
124  * TIDBitMap.  (NOTE: we don't get rid of the hashtable if the bitmap later
125  * shrinks down to zero or one page again.  So, status can be TBM_HASH even
126  * when nentries is zero or one.)
127  */
128 typedef enum
129 {
130 	TBM_EMPTY,					/* no hashtable, nentries == 0 */
131 	TBM_ONE_PAGE,				/* entry1 contains the single entry */
132 	TBM_HASH					/* pagetable is valid, entry1 is not */
133 } TBMStatus;
134 
135 /*
136  * Current iterating state of the TBM.
137  */
138 typedef enum
139 {
140 	TBM_NOT_ITERATING,			/* not yet converted to page and chunk array */
141 	TBM_ITERATING_PRIVATE,		/* converted to local page and chunk array */
142 	TBM_ITERATING_SHARED		/* converted to shared page and chunk array */
143 } TBMIteratingState;
144 
145 /*
146  * Here is the representation for a whole TIDBitMap:
147  */
148 struct TIDBitmap
149 {
150 	NodeTag		type;			/* to make it a valid Node */
151 	MemoryContext mcxt;			/* memory context containing me */
152 	TBMStatus	status;			/* see codes above */
153 	struct pagetable_hash *pagetable;	/* hash table of PagetableEntry's */
154 	int			nentries;		/* number of entries in pagetable */
155 	int			maxentries;		/* limit on same to meet maxbytes */
156 	int			npages;			/* number of exact entries in pagetable */
157 	int			nchunks;		/* number of lossy entries in pagetable */
158 	TBMIteratingState iterating;	/* tbm_begin_iterate called? */
159 	uint32		lossify_start;	/* offset to start lossifying hashtable at */
160 	PagetableEntry entry1;		/* used when status == TBM_ONE_PAGE */
161 	/* these are valid when iterating is true: */
162 	PagetableEntry **spages;	/* sorted exact-page list, or NULL */
163 	PagetableEntry **schunks;	/* sorted lossy-chunk list, or NULL */
164 	dsa_pointer dsapagetable;	/* dsa_pointer to the element array */
165 	dsa_pointer dsapagetableold;	/* dsa_pointer to the old element array */
166 	dsa_pointer ptpages;		/* dsa_pointer to the page array */
167 	dsa_pointer ptchunks;		/* dsa_pointer to the chunk array */
168 	dsa_area   *dsa;			/* reference to per-query dsa area */
169 };
170 
171 /*
172  * When iterating over a bitmap in sorted order, a TBMIterator is used to
173  * track our progress.  There can be several iterators scanning the same
174  * bitmap concurrently.  Note that the bitmap becomes read-only as soon as
175  * any iterator is created.
176  */
177 struct TBMIterator
178 {
179 	TIDBitmap  *tbm;			/* TIDBitmap we're iterating over */
180 	int			spageptr;		/* next spages index */
181 	int			schunkptr;		/* next schunks index */
182 	int			schunkbit;		/* next bit to check in current schunk */
183 	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
184 };
185 
186 /*
187  * Holds the shared members of the iterator so that multiple processes
188  * can jointly iterate.
189  */
190 typedef struct TBMSharedIteratorState
191 {
192 	int			nentries;		/* number of entries in pagetable */
193 	int			maxentries;		/* limit on same to meet maxbytes */
194 	int			npages;			/* number of exact entries in pagetable */
195 	int			nchunks;		/* number of lossy entries in pagetable */
196 	dsa_pointer pagetable;		/* dsa pointers to head of pagetable data */
197 	dsa_pointer spages;			/* dsa pointer to page array */
198 	dsa_pointer schunks;		/* dsa pointer to chunk array */
199 	LWLock		lock;			/* lock to protect below members */
200 	int			spageptr;		/* next spages index */
201 	int			schunkptr;		/* next schunks index */
202 	int			schunkbit;		/* next bit to check in current schunk */
203 } TBMSharedIteratorState;
204 
205 /*
206  * pagetable iteration array.
207  */
208 typedef struct PTIterationArray
209 {
210 	pg_atomic_uint32 refcount;	/* no. of iterator attached */
211 	int			index[FLEXIBLE_ARRAY_MEMBER];	/* index array */
212 } PTIterationArray;
213 
214 /*
215  * same as TBMIterator, but it is used for joint iteration, therefore this
216  * also holds a reference to the shared state.
217  */
218 struct TBMSharedIterator
219 {
220 	TBMSharedIteratorState *state;	/* shared state */
221 	PTEntryArray *ptbase;		/* pagetable element array */
222 	PTIterationArray *ptpages;	/* sorted exact page index list */
223 	PTIterationArray *ptchunks; /* sorted lossy page index list */
224 	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
225 };
226 
227 /* Local function prototypes */
228 static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
229 static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
230 				   const TIDBitmap *b);
231 static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
232 				   BlockNumber pageno);
233 static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
234 static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
235 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
236 static void tbm_lossify(TIDBitmap *tbm);
237 static int	tbm_comparator(const void *left, const void *right);
238 static int tbm_shared_comparator(const void *left, const void *right,
239 					  void *arg);
240 
241 /* define hashtable mapping block numbers to PagetableEntry's */
242 #define SH_USE_NONDEFAULT_ALLOCATOR
243 #define SH_PREFIX pagetable
244 #define SH_ELEMENT_TYPE PagetableEntry
245 #define SH_KEY_TYPE BlockNumber
246 #define SH_KEY blockno
247 #define SH_HASH_KEY(tb, key) murmurhash32(key)
248 #define SH_EQUAL(tb, a, b) a == b
249 #define SH_SCOPE static inline
250 #define SH_DEFINE
251 #define SH_DECLARE
252 #include "lib/simplehash.h"
253 
254 
255 /*
256  * tbm_create - create an initially-empty bitmap
257  *
258  * The bitmap will live in the memory context that is CurrentMemoryContext
259  * at the time of this call.  It will be limited to (approximately) maxbytes
260  * total memory consumption.  If the DSA passed to this function is not NULL
261  * then the memory for storing elements of the underlying page table will
262  * be allocated from the DSA.
263  */
264 TIDBitmap *
tbm_create(long maxbytes,dsa_area * dsa)265 tbm_create(long maxbytes, dsa_area *dsa)
266 {
267 	TIDBitmap  *tbm;
268 	long		nbuckets;
269 
270 	/* Create the TIDBitmap struct and zero all its fields */
271 	tbm = makeNode(TIDBitmap);
272 
273 	tbm->mcxt = CurrentMemoryContext;
274 	tbm->status = TBM_EMPTY;
275 
276 	/*
277 	 * Estimate number of hashtable entries we can have within maxbytes. This
278 	 * estimates the hash cost as sizeof(PagetableEntry), which is good enough
279 	 * for our purpose.  Also count an extra Pointer per entry for the arrays
280 	 * created during iteration readout.
281 	 */
282 	nbuckets = maxbytes /
283 		(sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
284 	nbuckets = Min(nbuckets, INT_MAX - 1);	/* safety limit */
285 	nbuckets = Max(nbuckets, 16);	/* sanity limit */
286 	tbm->maxentries = (int) nbuckets;
287 	tbm->lossify_start = 0;
288 	tbm->dsa = dsa;
289 	tbm->dsapagetable = InvalidDsaPointer;
290 	tbm->dsapagetableold = InvalidDsaPointer;
291 	tbm->ptpages = InvalidDsaPointer;
292 	tbm->ptchunks = InvalidDsaPointer;
293 
294 	return tbm;
295 }
296 
297 /*
298  * Actually create the hashtable.  Since this is a moderately expensive
299  * proposition, we don't do it until we have to.
300  */
301 static void
tbm_create_pagetable(TIDBitmap * tbm)302 tbm_create_pagetable(TIDBitmap *tbm)
303 {
304 	Assert(tbm->status != TBM_HASH);
305 	Assert(tbm->pagetable == NULL);
306 
307 	tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
308 
309 	/* If entry1 is valid, push it into the hashtable */
310 	if (tbm->status == TBM_ONE_PAGE)
311 	{
312 		PagetableEntry *page;
313 		bool		found;
314 		char		oldstatus;
315 
316 		page = pagetable_insert(tbm->pagetable,
317 								tbm->entry1.blockno,
318 								&found);
319 		Assert(!found);
320 		oldstatus = page->status;
321 		memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
322 		page->status = oldstatus;
323 	}
324 
325 	tbm->status = TBM_HASH;
326 }
327 
328 /*
329  * tbm_free - free a TIDBitmap
330  */
331 void
tbm_free(TIDBitmap * tbm)332 tbm_free(TIDBitmap *tbm)
333 {
334 	if (tbm->pagetable)
335 		pagetable_destroy(tbm->pagetable);
336 	if (tbm->spages)
337 		pfree(tbm->spages);
338 	if (tbm->schunks)
339 		pfree(tbm->schunks);
340 	pfree(tbm);
341 }
342 
343 /*
344  * tbm_free_shared_area - free shared state
345  *
346  * Free shared iterator state, Also free shared pagetable and iterator arrays
347  * memory if they are not referred by any of the shared iterator i.e recount
348  * is becomes 0.
349  */
350 void
tbm_free_shared_area(dsa_area * dsa,dsa_pointer dp)351 tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
352 {
353 	TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
354 	PTEntryArray *ptbase;
355 	PTIterationArray *ptpages;
356 	PTIterationArray *ptchunks;
357 
358 	if (DsaPointerIsValid(istate->pagetable))
359 	{
360 		ptbase = dsa_get_address(dsa, istate->pagetable);
361 		if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
362 			dsa_free(dsa, istate->pagetable);
363 	}
364 	if (DsaPointerIsValid(istate->spages))
365 	{
366 		ptpages = dsa_get_address(dsa, istate->spages);
367 		if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
368 			dsa_free(dsa, istate->spages);
369 	}
370 	if (DsaPointerIsValid(istate->schunks))
371 	{
372 		ptchunks = dsa_get_address(dsa, istate->schunks);
373 		if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
374 			dsa_free(dsa, istate->schunks);
375 	}
376 
377 	dsa_free(dsa, dp);
378 }
379 
380 /*
381  * tbm_add_tuples - add some tuple IDs to a TIDBitmap
382  *
383  * If recheck is true, then the recheck flag will be set in the
384  * TBMIterateResult when any of these tuples are reported out.
385  */
386 void
tbm_add_tuples(TIDBitmap * tbm,const ItemPointer tids,int ntids,bool recheck)387 tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
388 			   bool recheck)
389 {
390 	BlockNumber currblk = InvalidBlockNumber;
391 	PagetableEntry *page = NULL;	/* only valid when currblk is valid */
392 	int			i;
393 
394 	Assert(tbm->iterating == TBM_NOT_ITERATING);
395 	for (i = 0; i < ntids; i++)
396 	{
397 		BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
398 		OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
399 		int			wordnum,
400 					bitnum;
401 
402 		/* safety check to ensure we don't overrun bit array bounds */
403 		if (off < 1 || off > MAX_TUPLES_PER_PAGE)
404 			elog(ERROR, "tuple offset out of range: %u", off);
405 
406 		/*
407 		 * Look up target page unless we already did.  This saves cycles when
408 		 * the input includes consecutive tuples on the same page, which is
409 		 * common enough to justify an extra test here.
410 		 */
411 		if (blk != currblk)
412 		{
413 			if (tbm_page_is_lossy(tbm, blk))
414 				page = NULL;	/* remember page is lossy */
415 			else
416 				page = tbm_get_pageentry(tbm, blk);
417 			currblk = blk;
418 		}
419 
420 		if (page == NULL)
421 			continue;			/* whole page is already marked */
422 
423 		if (page->ischunk)
424 		{
425 			/* The page is a lossy chunk header, set bit for itself */
426 			wordnum = bitnum = 0;
427 		}
428 		else
429 		{
430 			/* Page is exact, so set bit for individual tuple */
431 			wordnum = WORDNUM(off - 1);
432 			bitnum = BITNUM(off - 1);
433 		}
434 		page->words[wordnum] |= ((bitmapword) 1 << bitnum);
435 		page->recheck |= recheck;
436 
437 		if (tbm->nentries > tbm->maxentries)
438 		{
439 			tbm_lossify(tbm);
440 			/* Page could have been converted to lossy, so force new lookup */
441 			currblk = InvalidBlockNumber;
442 		}
443 	}
444 }
445 
446 /*
447  * tbm_add_page - add a whole page to a TIDBitmap
448  *
449  * This causes the whole page to be reported (with the recheck flag)
450  * when the TIDBitmap is scanned.
451  */
452 void
tbm_add_page(TIDBitmap * tbm,BlockNumber pageno)453 tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
454 {
455 	/* Enter the page in the bitmap, or mark it lossy if already present */
456 	tbm_mark_page_lossy(tbm, pageno);
457 	/* If we went over the memory limit, lossify some more pages */
458 	if (tbm->nentries > tbm->maxentries)
459 		tbm_lossify(tbm);
460 }
461 
462 /*
463  * tbm_union - set union
464  *
465  * a is modified in-place, b is not changed
466  */
467 void
tbm_union(TIDBitmap * a,const TIDBitmap * b)468 tbm_union(TIDBitmap *a, const TIDBitmap *b)
469 {
470 	Assert(!a->iterating);
471 	/* Nothing to do if b is empty */
472 	if (b->nentries == 0)
473 		return;
474 	/* Scan through chunks and pages in b, merge into a */
475 	if (b->status == TBM_ONE_PAGE)
476 		tbm_union_page(a, &b->entry1);
477 	else
478 	{
479 		pagetable_iterator i;
480 		PagetableEntry *bpage;
481 
482 		Assert(b->status == TBM_HASH);
483 		pagetable_start_iterate(b->pagetable, &i);
484 		while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
485 			tbm_union_page(a, bpage);
486 	}
487 }
488 
/* Process one page of b during a union op */
static void
tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
{
	PagetableEntry *apage;
	int			wordnum;

	if (bpage->ischunk)
	{
		/* Scan b's chunk, mark each indicated page lossy in a */
		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
		{
			bitmapword	w = bpage->words[wordnum];

			if (w != 0)
			{
				BlockNumber pg;

				/* first page number represented by this word */
				pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
				/* walk the word's bits from LSB upward */
				while (w != 0)
				{
					if (w & 1)
						tbm_mark_page_lossy(a, pg);
					pg++;
					w >>= 1;
				}
			}
		}
	}
	else if (tbm_page_is_lossy(a, bpage->blockno))
	{
		/* page is already lossy in a, nothing to do */
		return;
	}
	else
	{
		apage = tbm_get_pageentry(a, bpage->blockno);
		if (apage->ischunk)
		{
			/* The page is a lossy chunk header, set bit for itself */
			apage->words[0] |= ((bitmapword) 1 << 0);
		}
		else
		{
			/* Both pages are exact, merge at the bit level */
			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
				apage->words[wordnum] |= bpage->words[wordnum];
			apage->recheck |= bpage->recheck;
		}
	}

	/* Keep a within its memory budget after any insertions above */
	if (a->nentries > a->maxentries)
		tbm_lossify(a);
}
543 
544 /*
545  * tbm_intersect - set intersection
546  *
547  * a is modified in-place, b is not changed
548  */
549 void
tbm_intersect(TIDBitmap * a,const TIDBitmap * b)550 tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
551 {
552 	Assert(!a->iterating);
553 	/* Nothing to do if a is empty */
554 	if (a->nentries == 0)
555 		return;
556 	/* Scan through chunks and pages in a, try to match to b */
557 	if (a->status == TBM_ONE_PAGE)
558 	{
559 		if (tbm_intersect_page(a, &a->entry1, b))
560 		{
561 			/* Page is now empty, remove it from a */
562 			Assert(!a->entry1.ischunk);
563 			a->npages--;
564 			a->nentries--;
565 			Assert(a->nentries == 0);
566 			a->status = TBM_EMPTY;
567 		}
568 	}
569 	else
570 	{
571 		pagetable_iterator i;
572 		PagetableEntry *apage;
573 
574 		Assert(a->status == TBM_HASH);
575 		pagetable_start_iterate(a->pagetable, &i);
576 		while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
577 		{
578 			if (tbm_intersect_page(a, apage, b))
579 			{
580 				/* Page or chunk is now empty, remove it from a */
581 				if (apage->ischunk)
582 					a->nchunks--;
583 				else
584 					a->npages--;
585 				a->nentries--;
586 				if (!pagetable_delete(a->pagetable, apage->blockno))
587 					elog(ERROR, "hash table corrupted");
588 			}
589 		}
590 	}
591 }
592 
593 /*
594  * Process one page of a during an intersection op
595  *
596  * Returns TRUE if apage is now empty and should be deleted from a
597  */
598 static bool
tbm_intersect_page(TIDBitmap * a,PagetableEntry * apage,const TIDBitmap * b)599 tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
600 {
601 	const PagetableEntry *bpage;
602 	int			wordnum;
603 
604 	if (apage->ischunk)
605 	{
606 		/* Scan each bit in chunk, try to clear */
607 		bool		candelete = true;
608 
609 		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
610 		{
611 			bitmapword	w = apage->words[wordnum];
612 
613 			if (w != 0)
614 			{
615 				bitmapword	neww = w;
616 				BlockNumber pg;
617 				int			bitnum;
618 
619 				pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
620 				bitnum = 0;
621 				while (w != 0)
622 				{
623 					if (w & 1)
624 					{
625 						if (!tbm_page_is_lossy(b, pg) &&
626 							tbm_find_pageentry(b, pg) == NULL)
627 						{
628 							/* Page is not in b at all, lose lossy bit */
629 							neww &= ~((bitmapword) 1 << bitnum);
630 						}
631 					}
632 					pg++;
633 					bitnum++;
634 					w >>= 1;
635 				}
636 				apage->words[wordnum] = neww;
637 				if (neww != 0)
638 					candelete = false;
639 			}
640 		}
641 		return candelete;
642 	}
643 	else if (tbm_page_is_lossy(b, apage->blockno))
644 	{
645 		/*
646 		 * Some of the tuples in 'a' might not satisfy the quals for 'b', but
647 		 * because the page 'b' is lossy, we don't know which ones. Therefore
648 		 * we mark 'a' as requiring rechecks, to indicate that at most those
649 		 * tuples set in 'a' are matches.
650 		 */
651 		apage->recheck = true;
652 		return false;
653 	}
654 	else
655 	{
656 		bool		candelete = true;
657 
658 		bpage = tbm_find_pageentry(b, apage->blockno);
659 		if (bpage != NULL)
660 		{
661 			/* Both pages are exact, merge at the bit level */
662 			Assert(!bpage->ischunk);
663 			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
664 			{
665 				apage->words[wordnum] &= bpage->words[wordnum];
666 				if (apage->words[wordnum] != 0)
667 					candelete = false;
668 			}
669 			apage->recheck |= bpage->recheck;
670 		}
671 		/* If there is no matching b page, we can just delete the a page */
672 		return candelete;
673 	}
674 }
675 
676 /*
677  * tbm_is_empty - is a TIDBitmap completely empty?
678  */
679 bool
tbm_is_empty(const TIDBitmap * tbm)680 tbm_is_empty(const TIDBitmap *tbm)
681 {
682 	return (tbm->nentries == 0);
683 }
684 
685 /*
686  * tbm_begin_iterate - prepare to iterate through a TIDBitmap
687  *
688  * The TBMIterator struct is created in the caller's memory context.
689  * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
690  * okay to just allow the memory context to be released, too.  It is caller's
691  * responsibility not to touch the TBMIterator anymore once the TIDBitmap
692  * is freed.
693  *
694  * NB: after this is called, it is no longer allowed to modify the contents
695  * of the bitmap.  However, you can call this multiple times to scan the
696  * contents repeatedly, including parallel scans.
697  */
698 TBMIterator *
tbm_begin_iterate(TIDBitmap * tbm)699 tbm_begin_iterate(TIDBitmap *tbm)
700 {
701 	TBMIterator *iterator;
702 
703 	Assert(tbm->iterating != TBM_ITERATING_SHARED);
704 
705 	/*
706 	 * Create the TBMIterator struct, with enough trailing space to serve the
707 	 * needs of the TBMIterateResult sub-struct.
708 	 */
709 	iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
710 									  MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
711 	iterator->tbm = tbm;
712 
713 	/*
714 	 * Initialize iteration pointers.
715 	 */
716 	iterator->spageptr = 0;
717 	iterator->schunkptr = 0;
718 	iterator->schunkbit = 0;
719 
720 	/*
721 	 * If we have a hashtable, create and fill the sorted page lists, unless
722 	 * we already did that for a previous iterator.  Note that the lists are
723 	 * attached to the bitmap not the iterator, so they can be used by more
724 	 * than one iterator.
725 	 */
726 	if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
727 	{
728 		pagetable_iterator i;
729 		PagetableEntry *page;
730 		int			npages;
731 		int			nchunks;
732 
733 		if (!tbm->spages && tbm->npages > 0)
734 			tbm->spages = (PagetableEntry **)
735 				MemoryContextAlloc(tbm->mcxt,
736 								   tbm->npages * sizeof(PagetableEntry *));
737 		if (!tbm->schunks && tbm->nchunks > 0)
738 			tbm->schunks = (PagetableEntry **)
739 				MemoryContextAlloc(tbm->mcxt,
740 								   tbm->nchunks * sizeof(PagetableEntry *));
741 
742 		npages = nchunks = 0;
743 		pagetable_start_iterate(tbm->pagetable, &i);
744 		while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
745 		{
746 			if (page->ischunk)
747 				tbm->schunks[nchunks++] = page;
748 			else
749 				tbm->spages[npages++] = page;
750 		}
751 		Assert(npages == tbm->npages);
752 		Assert(nchunks == tbm->nchunks);
753 		if (npages > 1)
754 			qsort(tbm->spages, npages, sizeof(PagetableEntry *),
755 				  tbm_comparator);
756 		if (nchunks > 1)
757 			qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
758 				  tbm_comparator);
759 	}
760 
761 	tbm->iterating = TBM_ITERATING_PRIVATE;
762 
763 	return iterator;
764 }
765 
766 /*
767  * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
768  *
769  * The necessary shared state will be allocated from the DSA passed to
770  * tbm_create, so that multiple processes can attach to it and iterate jointly.
771  *
772  * This will convert the pagetable hash into page and chunk array of the index
773  * into pagetable array.
774  */
775 dsa_pointer
tbm_prepare_shared_iterate(TIDBitmap * tbm)776 tbm_prepare_shared_iterate(TIDBitmap *tbm)
777 {
778 	dsa_pointer dp;
779 	TBMSharedIteratorState *istate;
780 	PTEntryArray *ptbase = NULL;
781 	PTIterationArray *ptpages = NULL;
782 	PTIterationArray *ptchunks = NULL;
783 
784 	Assert(tbm->dsa != NULL);
785 	Assert(tbm->iterating != TBM_ITERATING_PRIVATE);
786 
787 	/*
788 	 * Allocate TBMSharedIteratorState from DSA to hold the shared members and
789 	 * lock, this will also be used by multiple worker for shared iterate.
790 	 */
791 	dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
792 	istate = dsa_get_address(tbm->dsa, dp);
793 
794 	/*
795 	 * If we're not already iterating, create and fill the sorted page lists.
796 	 * (If we are, the sorted page lists are already stored in the TIDBitmap,
797 	 * and we can just reuse them.)
798 	 */
799 	if (tbm->iterating == TBM_NOT_ITERATING)
800 	{
801 		pagetable_iterator i;
802 		PagetableEntry *page;
803 		int			idx;
804 		int			npages;
805 		int			nchunks;
806 
807 		/*
808 		 * Allocate the page and chunk array memory from the DSA to share
809 		 * across multiple processes.
810 		 */
811 		if (tbm->npages)
812 		{
813 			tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
814 										tbm->npages * sizeof(int));
815 			ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
816 			pg_atomic_init_u32(&ptpages->refcount, 0);
817 		}
818 		if (tbm->nchunks)
819 		{
820 			tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
821 										 tbm->nchunks * sizeof(int));
822 			ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
823 			pg_atomic_init_u32(&ptchunks->refcount, 0);
824 		}
825 
826 		/*
827 		 * If TBM status is TBM_HASH then iterate over the pagetable and
828 		 * convert it to page and chunk arrays.  But if it's in the
829 		 * TBM_ONE_PAGE mode then directly allocate the space for one entry
830 		 * from the DSA.
831 		 */
832 		npages = nchunks = 0;
833 		if (tbm->status == TBM_HASH)
834 		{
835 			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
836 
837 			pagetable_start_iterate(tbm->pagetable, &i);
838 			while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
839 			{
840 				idx = page - ptbase->ptentry;
841 				if (page->ischunk)
842 					ptchunks->index[nchunks++] = idx;
843 				else
844 					ptpages->index[npages++] = idx;
845 			}
846 
847 			Assert(npages == tbm->npages);
848 			Assert(nchunks == tbm->nchunks);
849 		}
850 		else if (tbm->status == TBM_ONE_PAGE)
851 		{
852 			/*
853 			 * In one page mode allocate the space for one pagetable entry,
854 			 * initialize it, and directly store its index (i.e. 0) in the
855 			 * page array.
856 			 */
857 			tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
858 											 sizeof(PagetableEntry));
859 			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
860 			memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
861 			ptpages->index[0] = 0;
862 		}
863 
864 		if (ptbase != NULL)
865 			pg_atomic_init_u32(&ptbase->refcount, 0);
866 		if (npages > 1)
867 			qsort_arg((void *) (ptpages->index), npages, sizeof(int),
868 					  tbm_shared_comparator, (void *) ptbase->ptentry);
869 		if (nchunks > 1)
870 			qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int),
871 					  tbm_shared_comparator, (void *) ptbase->ptentry);
872 	}
873 
874 	/*
875 	 * Store the TBM members in the shared state so that we can share them
876 	 * across multiple processes.
877 	 */
878 	istate->nentries = tbm->nentries;
879 	istate->maxentries = tbm->maxentries;
880 	istate->npages = tbm->npages;
881 	istate->nchunks = tbm->nchunks;
882 	istate->pagetable = tbm->dsapagetable;
883 	istate->spages = tbm->ptpages;
884 	istate->schunks = tbm->ptchunks;
885 
886 	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
887 	ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
888 	ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
889 
890 	/*
891 	 * For every shared iterator, referring to pagetable and iterator array,
892 	 * increase the refcount by 1 so that while freeing the shared iterator we
893 	 * don't free pagetable and iterator array until its refcount becomes 0.
894 	 */
895 	if (ptbase != NULL)
896 		pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
897 	if (ptpages != NULL)
898 		pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
899 	if (ptchunks != NULL)
900 		pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
901 
902 	/* Initialize the iterator lock */
903 	LWLockInitialize(&istate->lock, LWTRANCHE_TBM);
904 
905 	/* Initialize the shared iterator state */
906 	istate->schunkbit = 0;
907 	istate->schunkptr = 0;
908 	istate->spageptr = 0;
909 
910 	tbm->iterating = TBM_ITERATING_SHARED;
911 
912 	return dp;
913 }
914 
915 /*
916  * tbm_extract_page_tuple - extract the tuple offsets from a page
917  *
918  * The extracted offsets are stored into TBMIterateResult.
919  */
920 static inline int
tbm_extract_page_tuple(PagetableEntry * page,TBMIterateResult * output)921 tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
922 {
923 	int			wordnum;
924 	int			ntuples = 0;
925 
926 	for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
927 	{
928 		bitmapword	w = page->words[wordnum];
929 
930 		if (w != 0)
931 		{
932 			int			off = wordnum * BITS_PER_BITMAPWORD + 1;
933 
934 			while (w != 0)
935 			{
936 				if (w & 1)
937 					output->offsets[ntuples++] = (OffsetNumber) off;
938 				off++;
939 				w >>= 1;
940 			}
941 		}
942 	}
943 
944 	return ntuples;
945 }
946 
947 /*
948  *	tbm_advance_schunkbit - Advance the chunkbit
949  */
950 static inline void
tbm_advance_schunkbit(PagetableEntry * chunk,int * schunkbitp)951 tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
952 {
953 	int			schunkbit = *schunkbitp;
954 
955 	while (schunkbit < PAGES_PER_CHUNK)
956 	{
957 		int			wordnum = WORDNUM(schunkbit);
958 		int			bitnum = BITNUM(schunkbit);
959 
960 		if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
961 			break;
962 		schunkbit++;
963 	}
964 
965 	*schunkbitp = schunkbit;
966 }
967 
968 /*
969  * tbm_iterate - scan through next page of a TIDBitmap
970  *
971  * Returns a TBMIterateResult representing one page, or NULL if there are
972  * no more pages to scan.  Pages are guaranteed to be delivered in numerical
973  * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
974  * remember the exact tuples to look at on this page --- the caller must
975  * examine all tuples on the page and check if they meet the intended
976  * condition.  If result->recheck is true, only the indicated tuples need
977  * be examined, but the condition must be rechecked anyway.  (For ease of
978  * testing, recheck is always set true when ntuples < 0.)
979  */
980 TBMIterateResult *
tbm_iterate(TBMIterator * iterator)981 tbm_iterate(TBMIterator *iterator)
982 {
983 	TIDBitmap  *tbm = iterator->tbm;
984 	TBMIterateResult *output = &(iterator->output);
985 
986 	Assert(tbm->iterating == TBM_ITERATING_PRIVATE);
987 
988 	/*
989 	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
990 	 * schunkbit to the next set bit.
991 	 */
992 	while (iterator->schunkptr < tbm->nchunks)
993 	{
994 		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
995 		int			schunkbit = iterator->schunkbit;
996 
997 		tbm_advance_schunkbit(chunk, &schunkbit);
998 		if (schunkbit < PAGES_PER_CHUNK)
999 		{
1000 			iterator->schunkbit = schunkbit;
1001 			break;
1002 		}
1003 		/* advance to next chunk */
1004 		iterator->schunkptr++;
1005 		iterator->schunkbit = 0;
1006 	}
1007 
1008 	/*
1009 	 * If both chunk and per-page data remain, must output the numerically
1010 	 * earlier page.
1011 	 */
1012 	if (iterator->schunkptr < tbm->nchunks)
1013 	{
1014 		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
1015 		BlockNumber chunk_blockno;
1016 
1017 		chunk_blockno = chunk->blockno + iterator->schunkbit;
1018 		if (iterator->spageptr >= tbm->npages ||
1019 			chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
1020 		{
1021 			/* Return a lossy page indicator from the chunk */
1022 			output->blockno = chunk_blockno;
1023 			output->ntuples = -1;
1024 			output->recheck = true;
1025 			iterator->schunkbit++;
1026 			return output;
1027 		}
1028 	}
1029 
1030 	if (iterator->spageptr < tbm->npages)
1031 	{
1032 		PagetableEntry *page;
1033 		int			ntuples;
1034 
1035 		/* In ONE_PAGE state, we don't allocate an spages[] array */
1036 		if (tbm->status == TBM_ONE_PAGE)
1037 			page = &tbm->entry1;
1038 		else
1039 			page = tbm->spages[iterator->spageptr];
1040 
1041 		/* scan bitmap to extract individual offset numbers */
1042 		ntuples = tbm_extract_page_tuple(page, output);
1043 		output->blockno = page->blockno;
1044 		output->ntuples = ntuples;
1045 		output->recheck = page->recheck;
1046 		iterator->spageptr++;
1047 		return output;
1048 	}
1049 
1050 	/* Nothing more in the bitmap */
1051 	return NULL;
1052 }
1053 
1054 /*
1055  *	tbm_shared_iterate - scan through next page of a TIDBitmap
1056  *
1057  *	As above, but this will iterate using an iterator which is shared
1058  *	across multiple processes.  We need to acquire the iterator LWLock,
1059  *	before accessing the shared members.
1060  */
1061 TBMIterateResult *
tbm_shared_iterate(TBMSharedIterator * iterator)1062 tbm_shared_iterate(TBMSharedIterator *iterator)
1063 {
1064 	TBMIterateResult *output = &iterator->output;
1065 	TBMSharedIteratorState *istate = iterator->state;
1066 	PagetableEntry *ptbase = NULL;
1067 	int		   *idxpages = NULL;
1068 	int		   *idxchunks = NULL;
1069 
1070 	if (iterator->ptbase != NULL)
1071 		ptbase = iterator->ptbase->ptentry;
1072 	if (iterator->ptpages != NULL)
1073 		idxpages = iterator->ptpages->index;
1074 	if (iterator->ptchunks != NULL)
1075 		idxchunks = iterator->ptchunks->index;
1076 
1077 	/* Acquire the LWLock before accessing the shared members */
1078 	LWLockAcquire(&istate->lock, LW_EXCLUSIVE);
1079 
1080 	/*
1081 	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
1082 	 * schunkbit to the next set bit.
1083 	 */
1084 	while (istate->schunkptr < istate->nchunks)
1085 	{
1086 		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1087 		int			schunkbit = istate->schunkbit;
1088 
1089 		tbm_advance_schunkbit(chunk, &schunkbit);
1090 		if (schunkbit < PAGES_PER_CHUNK)
1091 		{
1092 			istate->schunkbit = schunkbit;
1093 			break;
1094 		}
1095 		/* advance to next chunk */
1096 		istate->schunkptr++;
1097 		istate->schunkbit = 0;
1098 	}
1099 
1100 	/*
1101 	 * If both chunk and per-page data remain, must output the numerically
1102 	 * earlier page.
1103 	 */
1104 	if (istate->schunkptr < istate->nchunks)
1105 	{
1106 		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1107 		BlockNumber chunk_blockno;
1108 
1109 		chunk_blockno = chunk->blockno + istate->schunkbit;
1110 
1111 		if (istate->spageptr >= istate->npages ||
1112 			chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
1113 		{
1114 			/* Return a lossy page indicator from the chunk */
1115 			output->blockno = chunk_blockno;
1116 			output->ntuples = -1;
1117 			output->recheck = true;
1118 			istate->schunkbit++;
1119 
1120 			LWLockRelease(&istate->lock);
1121 			return output;
1122 		}
1123 	}
1124 
1125 	if (istate->spageptr < istate->npages)
1126 	{
1127 		PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
1128 		int			ntuples;
1129 
1130 		/* scan bitmap to extract individual offset numbers */
1131 		ntuples = tbm_extract_page_tuple(page, output);
1132 		output->blockno = page->blockno;
1133 		output->ntuples = ntuples;
1134 		output->recheck = page->recheck;
1135 		istate->spageptr++;
1136 
1137 		LWLockRelease(&istate->lock);
1138 
1139 		return output;
1140 	}
1141 
1142 	LWLockRelease(&istate->lock);
1143 
1144 	/* Nothing more in the bitmap */
1145 	return NULL;
1146 }
1147 
1148 /*
1149  * tbm_end_iterate - finish an iteration over a TIDBitmap
1150  *
1151  * Currently this is just a pfree, but it might do more someday.  (For
1152  * instance, it could be useful to count open iterators and allow the
1153  * bitmap to return to read/write status when there are no more iterators.)
1154  */
1155 void
tbm_end_iterate(TBMIterator * iterator)1156 tbm_end_iterate(TBMIterator *iterator)
1157 {
1158 	pfree(iterator);
1159 }
1160 
1161 /*
1162  * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
1163  *
1164  * This doesn't free any of the shared state associated with the iterator,
1165  * just our backend-private state.
1166  */
1167 void
tbm_end_shared_iterate(TBMSharedIterator * iterator)1168 tbm_end_shared_iterate(TBMSharedIterator *iterator)
1169 {
1170 	pfree(iterator);
1171 }
1172 
1173 /*
1174  * tbm_find_pageentry - find a PagetableEntry for the pageno
1175  *
1176  * Returns NULL if there is no non-lossy entry for the pageno.
1177  */
1178 static const PagetableEntry *
tbm_find_pageentry(const TIDBitmap * tbm,BlockNumber pageno)1179 tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
1180 {
1181 	const PagetableEntry *page;
1182 
1183 	if (tbm->nentries == 0)		/* in case pagetable doesn't exist */
1184 		return NULL;
1185 
1186 	if (tbm->status == TBM_ONE_PAGE)
1187 	{
1188 		page = &tbm->entry1;
1189 		if (page->blockno != pageno)
1190 			return NULL;
1191 		Assert(!page->ischunk);
1192 		return page;
1193 	}
1194 
1195 	page = pagetable_lookup(tbm->pagetable, pageno);
1196 	if (page == NULL)
1197 		return NULL;
1198 	if (page->ischunk)
1199 		return NULL;			/* don't want a lossy chunk header */
1200 	return page;
1201 }
1202 
1203 /*
1204  * tbm_get_pageentry - find or create a PagetableEntry for the pageno
1205  *
1206  * If new, the entry is marked as an exact (non-chunk) entry.
1207  *
1208  * This may cause the table to exceed the desired memory size.  It is
1209  * up to the caller to call tbm_lossify() at the next safe point if so.
1210  */
1211 static PagetableEntry *
tbm_get_pageentry(TIDBitmap * tbm,BlockNumber pageno)1212 tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
1213 {
1214 	PagetableEntry *page;
1215 	bool		found;
1216 
1217 	if (tbm->status == TBM_EMPTY)
1218 	{
1219 		/* Use the fixed slot */
1220 		page = &tbm->entry1;
1221 		found = false;
1222 		tbm->status = TBM_ONE_PAGE;
1223 	}
1224 	else
1225 	{
1226 		if (tbm->status == TBM_ONE_PAGE)
1227 		{
1228 			page = &tbm->entry1;
1229 			if (page->blockno == pageno)
1230 				return page;
1231 			/* Time to switch from one page to a hashtable */
1232 			tbm_create_pagetable(tbm);
1233 		}
1234 
1235 		/* Look up or create an entry */
1236 		page = pagetable_insert(tbm->pagetable, pageno, &found);
1237 	}
1238 
1239 	/* Initialize it if not present before */
1240 	if (!found)
1241 	{
1242 		char		oldstatus = page->status;
1243 
1244 		MemSet(page, 0, sizeof(PagetableEntry));
1245 		page->status = oldstatus;
1246 		page->blockno = pageno;
1247 		/* must count it too */
1248 		tbm->nentries++;
1249 		tbm->npages++;
1250 	}
1251 
1252 	return page;
1253 }
1254 
1255 /*
1256  * tbm_page_is_lossy - is the page marked as lossily stored?
1257  */
1258 static bool
tbm_page_is_lossy(const TIDBitmap * tbm,BlockNumber pageno)1259 tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
1260 {
1261 	PagetableEntry *page;
1262 	BlockNumber chunk_pageno;
1263 	int			bitno;
1264 
1265 	/* we can skip the lookup if there are no lossy chunks */
1266 	if (tbm->nchunks == 0)
1267 		return false;
1268 	Assert(tbm->status == TBM_HASH);
1269 
1270 	bitno = pageno % PAGES_PER_CHUNK;
1271 	chunk_pageno = pageno - bitno;
1272 
1273 	page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1274 
1275 	if (page != NULL && page->ischunk)
1276 	{
1277 		int			wordnum = WORDNUM(bitno);
1278 		int			bitnum = BITNUM(bitno);
1279 
1280 		if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1281 			return true;
1282 	}
1283 	return false;
1284 }
1285 
1286 /*
1287  * tbm_mark_page_lossy - mark the page number as lossily stored
1288  *
1289  * This may cause the table to exceed the desired memory size.  It is
1290  * up to the caller to call tbm_lossify() at the next safe point if so.
1291  */
1292 static void
tbm_mark_page_lossy(TIDBitmap * tbm,BlockNumber pageno)1293 tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
1294 {
1295 	PagetableEntry *page;
1296 	bool		found;
1297 	BlockNumber chunk_pageno;
1298 	int			bitno;
1299 	int			wordnum;
1300 	int			bitnum;
1301 
1302 	/* We force the bitmap into hashtable mode whenever it's lossy */
1303 	if (tbm->status != TBM_HASH)
1304 		tbm_create_pagetable(tbm);
1305 
1306 	bitno = pageno % PAGES_PER_CHUNK;
1307 	chunk_pageno = pageno - bitno;
1308 
1309 	/*
1310 	 * Remove any extant non-lossy entry for the page.  If the page is its own
1311 	 * chunk header, however, we skip this and handle the case below.
1312 	 */
1313 	if (bitno != 0)
1314 	{
1315 		if (pagetable_delete(tbm->pagetable, pageno))
1316 		{
1317 			/* It was present, so adjust counts */
1318 			tbm->nentries--;
1319 			tbm->npages--;		/* assume it must have been non-lossy */
1320 		}
1321 	}
1322 
1323 	/* Look up or create entry for chunk-header page */
1324 	page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
1325 
1326 	/* Initialize it if not present before */
1327 	if (!found)
1328 	{
1329 		char		oldstatus = page->status;
1330 
1331 		MemSet(page, 0, sizeof(PagetableEntry));
1332 		page->status = oldstatus;
1333 		page->blockno = chunk_pageno;
1334 		page->ischunk = true;
1335 		/* must count it too */
1336 		tbm->nentries++;
1337 		tbm->nchunks++;
1338 	}
1339 	else if (!page->ischunk)
1340 	{
1341 		char		oldstatus = page->status;
1342 
1343 		/* chunk header page was formerly non-lossy, make it lossy */
1344 		MemSet(page, 0, sizeof(PagetableEntry));
1345 		page->status = oldstatus;
1346 		page->blockno = chunk_pageno;
1347 		page->ischunk = true;
1348 		/* we assume it had some tuple bit(s) set, so mark it lossy */
1349 		page->words[0] = ((bitmapword) 1 << 0);
1350 		/* adjust counts */
1351 		tbm->nchunks++;
1352 		tbm->npages--;
1353 	}
1354 
1355 	/* Now set the original target page's bit */
1356 	wordnum = WORDNUM(bitno);
1357 	bitnum = BITNUM(bitno);
1358 	page->words[wordnum] |= ((bitmapword) 1 << bitnum);
1359 }
1360 
1361 /*
1362  * tbm_lossify - lose some information to get back under the memory limit
1363  */
1364 static void
tbm_lossify(TIDBitmap * tbm)1365 tbm_lossify(TIDBitmap *tbm)
1366 {
1367 	pagetable_iterator i;
1368 	PagetableEntry *page;
1369 
1370 	/*
1371 	 * XXX Really stupid implementation: this just lossifies pages in
1372 	 * essentially random order.  We should be paying some attention to the
1373 	 * number of bits set in each page, instead.
1374 	 *
1375 	 * Since we are called as soon as nentries exceeds maxentries, we should
1376 	 * push nentries down to significantly less than maxentries, or else we'll
1377 	 * just end up doing this again very soon.  We shoot for maxentries/2.
1378 	 */
1379 	Assert(tbm->iterating == TBM_NOT_ITERATING);
1380 	Assert(tbm->status == TBM_HASH);
1381 
1382 	pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
1383 	while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
1384 	{
1385 		if (page->ischunk)
1386 			continue;			/* already a chunk header */
1387 
1388 		/*
1389 		 * If the page would become a chunk header, we won't save anything by
1390 		 * converting it to lossy, so skip it.
1391 		 */
1392 		if ((page->blockno % PAGES_PER_CHUNK) == 0)
1393 			continue;
1394 
1395 		/* This does the dirty work ... */
1396 		tbm_mark_page_lossy(tbm, page->blockno);
1397 
1398 		if (tbm->nentries <= tbm->maxentries / 2)
1399 		{
1400 			/*
1401 			 * We have made enough room. Remember where to start lossifying
1402 			 * next round, so we evenly iterate over the hashtable.
1403 			 */
1404 			tbm->lossify_start = i.cur;
1405 			break;
1406 		}
1407 
1408 		/*
1409 		 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
1410 		 * hashtable and may have deleted the non-lossy chunk.  We can
1411 		 * continue the same hash table scan, since failure to visit one
1412 		 * element or visiting the newly inserted element, isn't fatal.
1413 		 */
1414 	}
1415 
1416 	/*
1417 	 * With a big bitmap and small work_mem, it's possible that we cannot get
1418 	 * under maxentries.  Again, if that happens, we'd end up uselessly
1419 	 * calling tbm_lossify over and over.  To prevent this from becoming a
1420 	 * performance sink, force maxentries up to at least double the current
1421 	 * number of entries.  (In essence, we're admitting inability to fit
1422 	 * within work_mem when we do this.)  Note that this test will not fire if
1423 	 * we broke out of the loop early; and if we didn't, the current number of
1424 	 * entries is simply not reducible any further.
1425 	 */
1426 	if (tbm->nentries > tbm->maxentries / 2)
1427 		tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
1428 }
1429 
1430 /*
1431  * qsort comparator to handle PagetableEntry pointers.
1432  */
1433 static int
tbm_comparator(const void * left,const void * right)1434 tbm_comparator(const void *left, const void *right)
1435 {
1436 	BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1437 	BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1438 
1439 	if (l < r)
1440 		return -1;
1441 	else if (l > r)
1442 		return 1;
1443 	return 0;
1444 }
1445 
1446 /*
1447  * As above, but this will get index into PagetableEntry array.  Therefore,
1448  * it needs to get actual PagetableEntry using the index before comparing the
1449  * blockno.
1450  */
1451 static int
tbm_shared_comparator(const void * left,const void * right,void * arg)1452 tbm_shared_comparator(const void *left, const void *right, void *arg)
1453 {
1454 	PagetableEntry *base = (PagetableEntry *) arg;
1455 	PagetableEntry *lpage = &base[*(int *) left];
1456 	PagetableEntry *rpage = &base[*(int *) right];
1457 
1458 	if (lpage->blockno < rpage->blockno)
1459 		return -1;
1460 	else if (lpage->blockno > rpage->blockno)
1461 		return 1;
1462 	return 0;
1463 }
1464 
1465 /*
1466  *	tbm_attach_shared_iterate
1467  *
1468  *	Allocate a backend-private iterator and attach the shared iterator state
1469  *	to it so that multiple processed can iterate jointly.
1470  *
1471  *	We also converts the DSA pointers to local pointers and store them into
1472  *	our private iterator.
1473  */
1474 TBMSharedIterator *
tbm_attach_shared_iterate(dsa_area * dsa,dsa_pointer dp)1475 tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
1476 {
1477 	TBMSharedIterator *iterator;
1478 	TBMSharedIteratorState *istate;
1479 
1480 	/*
1481 	 * Create the TBMSharedIterator struct, with enough trailing space to
1482 	 * serve the needs of the TBMIterateResult sub-struct.
1483 	 */
1484 	iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
1485 											 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
1486 
1487 	istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
1488 
1489 	iterator->state = istate;
1490 
1491 	iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
1492 
1493 	if (istate->npages)
1494 		iterator->ptpages = dsa_get_address(dsa, istate->spages);
1495 	if (istate->nchunks)
1496 		iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
1497 
1498 	return iterator;
1499 }
1500 
1501 /*
1502  * pagetable_allocate
1503  *
1504  * Callback function for allocating the memory for hashtable elements.
1505  * Allocate memory for hashtable elements, using DSA if available.
1506  */
1507 static inline void *
pagetable_allocate(pagetable_hash * pagetable,Size size)1508 pagetable_allocate(pagetable_hash *pagetable, Size size)
1509 {
1510 	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
1511 	PTEntryArray *ptbase;
1512 
1513 	if (tbm->dsa == NULL)
1514 		return MemoryContextAllocExtended(pagetable->ctx, size,
1515 										  MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
1516 
1517 	/*
1518 	 * Save the dsapagetable reference in dsapagetableold before allocating
1519 	 * new memory so that pagetable_free can free the old entry.
1520 	 */
1521 	tbm->dsapagetableold = tbm->dsapagetable;
1522 	tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
1523 											  sizeof(PTEntryArray) + size,
1524 											  DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
1525 	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
1526 
1527 	return ptbase->ptentry;
1528 }
1529 
1530 /*
1531  * pagetable_free
1532  *
1533  * Callback function for freeing hash table elements.
1534  */
1535 static inline void
pagetable_free(pagetable_hash * pagetable,void * pointer)1536 pagetable_free(pagetable_hash *pagetable, void *pointer)
1537 {
1538 	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
1539 
1540 	/* pfree the input pointer if DSA is not available */
1541 	if (tbm->dsa == NULL)
1542 		pfree(pointer);
1543 	else if (DsaPointerIsValid(tbm->dsapagetableold))
1544 	{
1545 		dsa_free(tbm->dsa, tbm->dsapagetableold);
1546 		tbm->dsapagetableold = InvalidDsaPointer;
1547 	}
1548 }
1549