1 /*-------------------------------------------------------------------------
2 *
3 * tidbitmap.c
4 * PostgreSQL tuple-id (TID) bitmap package
5 *
6 * This module provides bitmap data structures that are spiritually
7 * similar to Bitmapsets, but are specially adapted to store sets of
8 * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9 * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10 * Also, since we wish to be able to store very large tuple sets in
11 * memory with this data structure, we support "lossy" storage, in which
12 * we no longer remember individual tuple offsets on a page but only the
13 * fact that a particular page needs to be visited.
14 *
15 * The "lossy" storage uses one bit per disk page, so at the standard 8K
16 * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17 * of memory. People pushing around tables of that size should have a
18 * couple of Mb to spare, so we don't worry about providing a second level
19 * of lossiness. In theory we could fall back to page ranges at some
20 * point, but for now that seems useless complexity.
21 *
22 * We also support the notion of candidate matches, or rechecking. This
23 * means we know that a search need visit only some tuples on a page,
24 * but we are not certain that all of those tuples are real matches.
25 * So the eventual heap scan must recheck the quals for these tuples only,
26 * rather than rechecking the quals for all tuples on the page as in the
27 * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28 * into a bitmap, and it can also happen internally when we AND a lossy
29 * and a non-lossy page.
30 *
31 *
32 * Copyright (c) 2003-2017, PostgreSQL Global Development Group
33 *
34 * IDENTIFICATION
35 * src/backend/nodes/tidbitmap.c
36 *
37 *-------------------------------------------------------------------------
38 */
39 #include "postgres.h"
40
41 #include <limits.h>
42
43 #include "access/htup_details.h"
44 #include "nodes/bitmapset.h"
45 #include "nodes/tidbitmap.h"
46 #include "storage/lwlock.h"
47 #include "utils/dsa.h"
48 #include "utils/hashutils.h"
49
/*
 * The maximum number of tuples per page is not large (typically 256 with
 * 8K pages, or 1024 with 32K pages).  So there's not much point in making
 * the per-page bitmaps variable size.  We just legislate that the size
 * is this:
 */
#define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage

/*
 * When we have to switch over to lossy storage, we use a data structure
 * with one bit per page, where all pages having the same number DIV
 * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
 * and has the bit set for a given page, there must not be a per-page entry
 * for that page in the page table.
 *
 * We actually store both exact pages and lossy chunks in the same hash
 * table, using identical data structures.  (This is because the memory
 * management for hashtables doesn't easily/efficiently allow space to be
 * transferred easily from one hashtable to another.)  Therefore it's best
 * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
 * too different.  But we also want PAGES_PER_CHUNK to be a power of 2 to
 * avoid expensive integer remainder operations.  So, define it like this:
 */
#define PAGES_PER_CHUNK  (BLCKSZ / 32)

/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */

/* word index and bit-within-word index of bit x in a words[] array */
#define WORDNUM(x)	((x) / BITS_PER_BITMAPWORD)
#define BITNUM(x)	((x) % BITS_PER_BITMAPWORD)

/* number of active words for an exact page: */
#define WORDS_PER_PAGE	((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
/* number of active words for a lossy chunk: */
#define WORDS_PER_CHUNK	((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
84
/*
 * The hashtable entries are represented by this data structure.  For
 * an exact page, blockno is the page number and bit k of the bitmap
 * represents tuple offset k+1.  For a lossy chunk, blockno is the first
 * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
 * bit k represents page blockno+k.  Note that it is not possible to
 * have exact storage for the first page of a chunk if we are using
 * lossy storage for any page in the chunk's range, since the same
 * hashtable entry has to serve both purposes.
 *
 * recheck is used only on exact pages --- it indicates that although
 * only the stated tuples need be checked, the full index qual condition
 * must be checked for each (ie, these are candidate matches).
 */
typedef struct PagetableEntry
{
	BlockNumber blockno;		/* page number (hashtable key) */
	char		status;			/* hash entry status (simplehash bookkeeping) */
	bool		ischunk;		/* T = lossy storage, F = exact */
	bool		recheck;		/* should the tuples be rechecked? */
	bitmapword	words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];	/* the bitmap */
} PagetableEntry;
107
/*
 * Holds array of pagetable entries, stored in DSA memory so that it can be
 * shared across processes during a shared iteration.
 */
typedef struct PTEntryArray
{
	pg_atomic_uint32 refcount;	/* no. of iterators attached */
	PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];	/* the entries */
} PTEntryArray;
116
/*
 * We want to avoid the overhead of creating the hashtable, which is
 * comparatively large, when not necessary.  Particularly when we are using a
 * bitmap scan on the inside of a nestloop join: a bitmap may well live only
 * long enough to accumulate one entry in such cases.  We therefore avoid
 * creating an actual hashtable until we need two pagetable entries.  When
 * just one pagetable entry is needed, we store it in a fixed field of
 * TIDBitMap.  (NOTE: we don't get rid of the hashtable if the bitmap later
 * shrinks down to zero or one page again.  So, status can be TBM_HASH even
 * when nentries is zero or one.)
 */
typedef enum
{
	TBM_EMPTY,					/* no hashtable, nentries == 0 */
	TBM_ONE_PAGE,				/* entry1 contains the single entry */
	TBM_HASH					/* pagetable is valid, entry1 is not */
} TBMStatus;
134
/*
 * Current iterating state of the TBM.  Once iteration starts, the bitmap is
 * effectively read-only; the private and shared readout layouts differ, so
 * the two iteration modes cannot be mixed on one bitmap.
 */
typedef enum
{
	TBM_NOT_ITERATING,			/* not yet converted to page and chunk array */
	TBM_ITERATING_PRIVATE,		/* converted to local page and chunk array */
	TBM_ITERATING_SHARED		/* converted to shared page and chunk array */
} TBMIteratingState;
144
/*
 * Here is the representation for a whole TIDBitMap:
 */
struct TIDBitmap
{
	NodeTag		type;			/* to make it a valid Node */
	MemoryContext mcxt;			/* memory context containing me */
	TBMStatus	status;			/* see codes above */
	struct pagetable_hash *pagetable;	/* hash table of PagetableEntry's */
	int			nentries;		/* number of entries in pagetable */
	int			maxentries;		/* limit on same to meet maxbytes */
	int			npages;			/* number of exact entries in pagetable */
	int			nchunks;		/* number of lossy entries in pagetable */
	TBMIteratingState iterating;	/* tbm_begin_iterate called? */
	uint32		lossify_start;	/* offset to start lossifying hashtable at */
	PagetableEntry entry1;		/* used when status == TBM_ONE_PAGE */
	/* these are valid when iterating is true: */
	PagetableEntry **spages;	/* sorted exact-page list, or NULL */
	PagetableEntry **schunks;	/* sorted lossy-chunk list, or NULL */
	dsa_pointer dsapagetable;	/* dsa_pointer to the element array */
	dsa_pointer dsapagetableold;	/* dsa_pointer to the old element array */
	dsa_pointer ptpages;		/* dsa_pointer to the page array */
	dsa_pointer ptchunks;		/* dsa_pointer to the chunk array */
	dsa_area   *dsa;			/* reference to per-query dsa area */
};
170
/*
 * When iterating over a bitmap in sorted order, a TBMIterator is used to
 * track our progress.  There can be several iterators scanning the same
 * bitmap concurrently.  Note that the bitmap becomes read-only as soon as
 * any iterator is created.
 */
struct TBMIterator
{
	TIDBitmap  *tbm;			/* TIDBitmap we're iterating over */
	int			spageptr;		/* next spages index */
	int			schunkptr;		/* next schunks index */
	int			schunkbit;		/* next bit to check in current schunk */
	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
};
185
/*
 * Holds the shared members of the iterator so that multiple processes
 * can jointly iterate.  Lives in DSA memory; the dsa_pointer members refer
 * to the shared pagetable and the sorted index arrays.
 */
typedef struct TBMSharedIteratorState
{
	int			nentries;		/* number of entries in pagetable */
	int			maxentries;		/* limit on same to meet maxbytes */
	int			npages;			/* number of exact entries in pagetable */
	int			nchunks;		/* number of lossy entries in pagetable */
	dsa_pointer pagetable;		/* dsa pointers to head of pagetable data */
	dsa_pointer spages;			/* dsa pointer to page array */
	dsa_pointer schunks;		/* dsa pointer to chunk array */
	LWLock		lock;			/* lock to protect below members */
	int			spageptr;		/* next spages index */
	int			schunkptr;		/* next schunks index */
	int			schunkbit;		/* next bit to check in current schunk */
} TBMSharedIteratorState;
204
/*
 * pagetable iteration array: sorted indexes into the shared PTEntryArray,
 * giving the block-number-ordered readout sequence.
 */
typedef struct PTIterationArray
{
	pg_atomic_uint32 refcount;	/* no. of iterators attached */
	int			index[FLEXIBLE_ARRAY_MEMBER];	/* index array */
} PTIterationArray;
213
/*
 * same as TBMIterator, but it is used for joint iteration, therefore this
 * also holds a reference to the shared state.  The pointer members are this
 * backend's mappings of the DSA objects named in TBMSharedIteratorState.
 */
struct TBMSharedIterator
{
	TBMSharedIteratorState *state;	/* shared state */
	PTEntryArray *ptbase;		/* pagetable element array */
	PTIterationArray *ptpages;	/* sorted exact page index list */
	PTIterationArray *ptchunks; /* sorted lossy page index list */
	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
};
226
227 /* Local function prototypes */
228 static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
229 static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
230 const TIDBitmap *b);
231 static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
232 BlockNumber pageno);
233 static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
234 static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
235 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
236 static void tbm_lossify(TIDBitmap *tbm);
237 static int tbm_comparator(const void *left, const void *right);
238 static int tbm_shared_comparator(const void *left, const void *right,
239 void *arg);
240
241 /* define hashtable mapping block numbers to PagetableEntry's */
242 #define SH_USE_NONDEFAULT_ALLOCATOR
243 #define SH_PREFIX pagetable
244 #define SH_ELEMENT_TYPE PagetableEntry
245 #define SH_KEY_TYPE BlockNumber
246 #define SH_KEY blockno
247 #define SH_HASH_KEY(tb, key) murmurhash32(key)
248 #define SH_EQUAL(tb, a, b) a == b
249 #define SH_SCOPE static inline
250 #define SH_DEFINE
251 #define SH_DECLARE
252 #include "lib/simplehash.h"
253
254
255 /*
256 * tbm_create - create an initially-empty bitmap
257 *
258 * The bitmap will live in the memory context that is CurrentMemoryContext
259 * at the time of this call. It will be limited to (approximately) maxbytes
260 * total memory consumption. If the DSA passed to this function is not NULL
261 * then the memory for storing elements of the underlying page table will
262 * be allocated from the DSA.
263 */
264 TIDBitmap *
tbm_create(long maxbytes,dsa_area * dsa)265 tbm_create(long maxbytes, dsa_area *dsa)
266 {
267 TIDBitmap *tbm;
268 long nbuckets;
269
270 /* Create the TIDBitmap struct and zero all its fields */
271 tbm = makeNode(TIDBitmap);
272
273 tbm->mcxt = CurrentMemoryContext;
274 tbm->status = TBM_EMPTY;
275
276 /*
277 * Estimate number of hashtable entries we can have within maxbytes. This
278 * estimates the hash cost as sizeof(PagetableEntry), which is good enough
279 * for our purpose. Also count an extra Pointer per entry for the arrays
280 * created during iteration readout.
281 */
282 nbuckets = maxbytes /
283 (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
284 nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
285 nbuckets = Max(nbuckets, 16); /* sanity limit */
286 tbm->maxentries = (int) nbuckets;
287 tbm->lossify_start = 0;
288 tbm->dsa = dsa;
289 tbm->dsapagetable = InvalidDsaPointer;
290 tbm->dsapagetableold = InvalidDsaPointer;
291 tbm->ptpages = InvalidDsaPointer;
292 tbm->ptchunks = InvalidDsaPointer;
293
294 return tbm;
295 }
296
/*
 * Actually create the hashtable.  Since this is a moderately expensive
 * proposition, we don't do it until we have to.
 */
static void
tbm_create_pagetable(TIDBitmap *tbm)
{
	Assert(tbm->status != TBM_HASH);
	Assert(tbm->pagetable == NULL);

	tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);

	/* If entry1 is valid, push it into the hashtable */
	if (tbm->status == TBM_ONE_PAGE)
	{
		PagetableEntry *page;
		bool		found;
		char		oldstatus;

		page = pagetable_insert(tbm->pagetable,
								tbm->entry1.blockno,
								&found);
		Assert(!found);

		/*
		 * Copy the whole entry over the new hash slot, but preserve the
		 * slot's simplehash status byte, which the memcpy would otherwise
		 * clobber with entry1's (meaningless) status.
		 */
		oldstatus = page->status;
		memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
		page->status = oldstatus;
	}

	tbm->status = TBM_HASH;
}
327
328 /*
329 * tbm_free - free a TIDBitmap
330 */
331 void
tbm_free(TIDBitmap * tbm)332 tbm_free(TIDBitmap *tbm)
333 {
334 if (tbm->pagetable)
335 pagetable_destroy(tbm->pagetable);
336 if (tbm->spages)
337 pfree(tbm->spages);
338 if (tbm->schunks)
339 pfree(tbm->schunks);
340 pfree(tbm);
341 }
342
343 /*
344 * tbm_free_shared_area - free shared state
345 *
346 * Free shared iterator state, Also free shared pagetable and iterator arrays
347 * memory if they are not referred by any of the shared iterator i.e recount
348 * is becomes 0.
349 */
350 void
tbm_free_shared_area(dsa_area * dsa,dsa_pointer dp)351 tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
352 {
353 TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
354 PTEntryArray *ptbase;
355 PTIterationArray *ptpages;
356 PTIterationArray *ptchunks;
357
358 if (DsaPointerIsValid(istate->pagetable))
359 {
360 ptbase = dsa_get_address(dsa, istate->pagetable);
361 if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
362 dsa_free(dsa, istate->pagetable);
363 }
364 if (DsaPointerIsValid(istate->spages))
365 {
366 ptpages = dsa_get_address(dsa, istate->spages);
367 if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
368 dsa_free(dsa, istate->spages);
369 }
370 if (DsaPointerIsValid(istate->schunks))
371 {
372 ptchunks = dsa_get_address(dsa, istate->schunks);
373 if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
374 dsa_free(dsa, istate->schunks);
375 }
376
377 dsa_free(dsa, dp);
378 }
379
/*
 * tbm_add_tuples - add some tuple IDs to a TIDBitmap
 *
 * If recheck is true, then the recheck flag will be set in the
 * TBMIterateResult when any of these tuples are reported out.
 */
void
tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
			   bool recheck)
{
	BlockNumber currblk = InvalidBlockNumber;
	PagetableEntry *page = NULL;	/* only valid when currblk is valid */
	int			i;

	/* The bitmap must still be in its mutable phase */
	Assert(tbm->iterating == TBM_NOT_ITERATING);
	for (i = 0; i < ntids; i++)
	{
		BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
		OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
		int			wordnum,
					bitnum;

		/* safety check to ensure we don't overrun bit array bounds */
		if (off < 1 || off > MAX_TUPLES_PER_PAGE)
			elog(ERROR, "tuple offset out of range: %u", off);

		/*
		 * Look up target page unless we already did.  This saves cycles when
		 * the input includes consecutive tuples on the same page, which is
		 * common enough to justify an extra test here.
		 */
		if (blk != currblk)
		{
			if (tbm_page_is_lossy(tbm, blk))
				page = NULL;	/* remember page is lossy */
			else
				page = tbm_get_pageentry(tbm, blk);
			currblk = blk;
		}

		if (page == NULL)
			continue;			/* whole page is already marked */

		if (page->ischunk)
		{
			/* The page is a lossy chunk header, set bit for itself */
			wordnum = bitnum = 0;
		}
		else
		{
			/* Page is exact, so set bit for individual tuple */
			/* tuple offset k is stored as bit k-1 (see PagetableEntry) */
			wordnum = WORDNUM(off - 1);
			bitnum = BITNUM(off - 1);
		}
		page->words[wordnum] |= ((bitmapword) 1 << bitnum);
		page->recheck |= recheck;

		if (tbm->nentries > tbm->maxentries)
		{
			tbm_lossify(tbm);
			/* Page could have been converted to lossy, so force new lookup */
			currblk = InvalidBlockNumber;
		}
	}
}
445
446 /*
447 * tbm_add_page - add a whole page to a TIDBitmap
448 *
449 * This causes the whole page to be reported (with the recheck flag)
450 * when the TIDBitmap is scanned.
451 */
452 void
tbm_add_page(TIDBitmap * tbm,BlockNumber pageno)453 tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
454 {
455 /* Enter the page in the bitmap, or mark it lossy if already present */
456 tbm_mark_page_lossy(tbm, pageno);
457 /* If we went over the memory limit, lossify some more pages */
458 if (tbm->nentries > tbm->maxentries)
459 tbm_lossify(tbm);
460 }
461
462 /*
463 * tbm_union - set union
464 *
465 * a is modified in-place, b is not changed
466 */
467 void
tbm_union(TIDBitmap * a,const TIDBitmap * b)468 tbm_union(TIDBitmap *a, const TIDBitmap *b)
469 {
470 Assert(!a->iterating);
471 /* Nothing to do if b is empty */
472 if (b->nentries == 0)
473 return;
474 /* Scan through chunks and pages in b, merge into a */
475 if (b->status == TBM_ONE_PAGE)
476 tbm_union_page(a, &b->entry1);
477 else
478 {
479 pagetable_iterator i;
480 PagetableEntry *bpage;
481
482 Assert(b->status == TBM_HASH);
483 pagetable_start_iterate(b->pagetable, &i);
484 while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
485 tbm_union_page(a, bpage);
486 }
487 }
488
/* Process one page of b during a union op */
static void
tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
{
	PagetableEntry *apage;
	int			wordnum;

	if (bpage->ischunk)
	{
		/* Scan b's chunk, mark each indicated page lossy in a */
		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
		{
			bitmapword	w = bpage->words[wordnum];

			if (w != 0)
			{
				BlockNumber pg;

				/* first page number represented by this word */
				pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
				/* walk the set bits, shifting the word right as we go */
				while (w != 0)
				{
					if (w & 1)
						tbm_mark_page_lossy(a, pg);
					pg++;
					w >>= 1;
				}
			}
		}
	}
	else if (tbm_page_is_lossy(a, bpage->blockno))
	{
		/* page is already lossy in a, nothing to do */
		return;
	}
	else
	{
		apage = tbm_get_pageentry(a, bpage->blockno);
		if (apage->ischunk)
		{
			/* The page is a lossy chunk header, set bit for itself */
			apage->words[0] |= ((bitmapword) 1 << 0);
		}
		else
		{
			/* Both pages are exact, merge at the bit level */
			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
				apage->words[wordnum] |= bpage->words[wordnum];
			apage->recheck |= bpage->recheck;
		}
	}

	/* The union may have created new entries; lossify if over budget */
	if (a->nentries > a->maxentries)
		tbm_lossify(a);
}
543
544 /*
545 * tbm_intersect - set intersection
546 *
547 * a is modified in-place, b is not changed
548 */
549 void
tbm_intersect(TIDBitmap * a,const TIDBitmap * b)550 tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
551 {
552 Assert(!a->iterating);
553 /* Nothing to do if a is empty */
554 if (a->nentries == 0)
555 return;
556 /* Scan through chunks and pages in a, try to match to b */
557 if (a->status == TBM_ONE_PAGE)
558 {
559 if (tbm_intersect_page(a, &a->entry1, b))
560 {
561 /* Page is now empty, remove it from a */
562 Assert(!a->entry1.ischunk);
563 a->npages--;
564 a->nentries--;
565 Assert(a->nentries == 0);
566 a->status = TBM_EMPTY;
567 }
568 }
569 else
570 {
571 pagetable_iterator i;
572 PagetableEntry *apage;
573
574 Assert(a->status == TBM_HASH);
575 pagetable_start_iterate(a->pagetable, &i);
576 while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
577 {
578 if (tbm_intersect_page(a, apage, b))
579 {
580 /* Page or chunk is now empty, remove it from a */
581 if (apage->ischunk)
582 a->nchunks--;
583 else
584 a->npages--;
585 a->nentries--;
586 if (!pagetable_delete(a->pagetable, apage->blockno))
587 elog(ERROR, "hash table corrupted");
588 }
589 }
590 }
591 }
592
/*
 * Process one page of a during an intersection op
 *
 * Returns TRUE if apage is now empty and should be deleted from a
 */
static bool
tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
{
	const PagetableEntry *bpage;
	int			wordnum;

	if (apage->ischunk)
	{
		/* Scan each bit in chunk, try to clear */
		bool		candelete = true;

		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
		{
			bitmapword	w = apage->words[wordnum];

			if (w != 0)
			{
				bitmapword	neww = w;
				BlockNumber pg;
				int			bitnum;

				/* first page number represented by this word */
				pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
				bitnum = 0;
				/* walk set bits in w; clear matching bit in neww if absent in b */
				while (w != 0)
				{
					if (w & 1)
					{
						if (!tbm_page_is_lossy(b, pg) &&
							tbm_find_pageentry(b, pg) == NULL)
						{
							/* Page is not in b at all, lose lossy bit */
							neww &= ~((bitmapword) 1 << bitnum);
						}
					}
					pg++;
					bitnum++;
					w >>= 1;
				}
				apage->words[wordnum] = neww;
				if (neww != 0)
					candelete = false;
			}
		}
		return candelete;
	}
	else if (tbm_page_is_lossy(b, apage->blockno))
	{
		/*
		 * Some of the tuples in 'a' might not satisfy the quals for 'b', but
		 * because the page 'b' is lossy, we don't know which ones. Therefore
		 * we mark 'a' as requiring rechecks, to indicate that at most those
		 * tuples set in 'a' are matches.
		 */
		apage->recheck = true;
		return false;
	}
	else
	{
		bool		candelete = true;

		bpage = tbm_find_pageentry(b, apage->blockno);
		if (bpage != NULL)
		{
			/* Both pages are exact, merge at the bit level */
			Assert(!bpage->ischunk);
			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
			{
				apage->words[wordnum] &= bpage->words[wordnum];
				if (apage->words[wordnum] != 0)
					candelete = false;
			}
			apage->recheck |= bpage->recheck;
		}
		/* If there is no matching b page, we can just delete the a page */
		return candelete;
	}
}
675
676 /*
677 * tbm_is_empty - is a TIDBitmap completely empty?
678 */
679 bool
tbm_is_empty(const TIDBitmap * tbm)680 tbm_is_empty(const TIDBitmap *tbm)
681 {
682 return (tbm->nentries == 0);
683 }
684
/*
 * tbm_begin_iterate - prepare to iterate through a TIDBitmap
 *
 * The TBMIterator struct is created in the caller's memory context.
 * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
 * okay to just allow the memory context to be released, too.  It is caller's
 * responsibility not to touch the TBMIterator anymore once the TIDBitmap
 * is freed.
 *
 * NB: after this is called, it is no longer allowed to modify the contents
 * of the bitmap.  However, you can call this multiple times to scan the
 * contents repeatedly, including parallel scans.
 */
TBMIterator *
tbm_begin_iterate(TIDBitmap *tbm)
{
	TBMIterator *iterator;

	/* Cannot mix private iteration with an already-started shared one */
	Assert(tbm->iterating != TBM_ITERATING_SHARED);

	/*
	 * Create the TBMIterator struct, with enough trailing space to serve the
	 * needs of the TBMIterateResult sub-struct.
	 */
	iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
									  MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
	iterator->tbm = tbm;

	/*
	 * Initialize iteration pointers.
	 */
	iterator->spageptr = 0;
	iterator->schunkptr = 0;
	iterator->schunkbit = 0;

	/*
	 * If we have a hashtable, create and fill the sorted page lists, unless
	 * we already did that for a previous iterator.  Note that the lists are
	 * attached to the bitmap not the iterator, so they can be used by more
	 * than one iterator.
	 */
	if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
	{
		pagetable_iterator i;
		PagetableEntry *page;
		int			npages;
		int			nchunks;

		if (!tbm->spages && tbm->npages > 0)
			tbm->spages = (PagetableEntry **)
				MemoryContextAlloc(tbm->mcxt,
								   tbm->npages * sizeof(PagetableEntry *));
		if (!tbm->schunks && tbm->nchunks > 0)
			tbm->schunks = (PagetableEntry **)
				MemoryContextAlloc(tbm->mcxt,
								   tbm->nchunks * sizeof(PagetableEntry *));

		/* Partition hashtable entries into the page and chunk lists */
		npages = nchunks = 0;
		pagetable_start_iterate(tbm->pagetable, &i);
		while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
		{
			if (page->ischunk)
				tbm->schunks[nchunks++] = page;
			else
				tbm->spages[npages++] = page;
		}
		Assert(npages == tbm->npages);
		Assert(nchunks == tbm->nchunks);
		/* Sort both lists into ascending block-number order for readout */
		if (npages > 1)
			qsort(tbm->spages, npages, sizeof(PagetableEntry *),
				  tbm_comparator);
		if (nchunks > 1)
			qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
				  tbm_comparator);
	}

	tbm->iterating = TBM_ITERATING_PRIVATE;

	return iterator;
}
765
/*
 * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
 *
 * The necessary shared state will be allocated from the DSA passed to
 * tbm_create, so that multiple processes can attach to it and iterate jointly.
 *
 * This will convert the pagetable hash into page and chunk array of the index
 * into pagetable array.
 */
dsa_pointer
tbm_prepare_shared_iterate(TIDBitmap *tbm)
{
	dsa_pointer dp;
	TBMSharedIteratorState *istate;
	PTEntryArray *ptbase = NULL;
	PTIterationArray *ptpages = NULL;
	PTIterationArray *ptchunks = NULL;

	Assert(tbm->dsa != NULL);
	Assert(tbm->iterating != TBM_ITERATING_PRIVATE);

	/*
	 * Allocate TBMSharedIteratorState from DSA to hold the shared members and
	 * lock, this will also be used by multiple worker for shared iterate.
	 */
	dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
	istate = dsa_get_address(tbm->dsa, dp);

	/*
	 * If we're not already iterating, create and fill the sorted page lists.
	 * (If we are, the sorted page lists are already stored in the TIDBitmap,
	 * and we can just reuse them.)
	 */
	if (tbm->iterating == TBM_NOT_ITERATING)
	{
		pagetable_iterator i;
		PagetableEntry *page;
		int			idx;
		int			npages;
		int			nchunks;

		/*
		 * Allocate the page and chunk array memory from the DSA to share
		 * across multiple processes.
		 */
		if (tbm->npages)
		{
			tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
										tbm->npages * sizeof(int));
			ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
			pg_atomic_init_u32(&ptpages->refcount, 0);
		}
		if (tbm->nchunks)
		{
			tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
										 tbm->nchunks * sizeof(int));
			ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
			pg_atomic_init_u32(&ptchunks->refcount, 0);
		}

		/*
		 * If TBM status is TBM_HASH then iterate over the pagetable and
		 * convert it to page and chunk arrays.  But if it's in the
		 * TBM_ONE_PAGE mode then directly allocate the space for one entry
		 * from the DSA.
		 */
		npages = nchunks = 0;
		if (tbm->status == TBM_HASH)
		{
			/* the hashtable elements already live in DSA memory */
			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);

			pagetable_start_iterate(tbm->pagetable, &i);
			while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
			{
				/* store array index of the entry, not a raw pointer */
				idx = page - ptbase->ptentry;
				if (page->ischunk)
					ptchunks->index[nchunks++] = idx;
				else
					ptpages->index[npages++] = idx;
			}

			Assert(npages == tbm->npages);
			Assert(nchunks == tbm->nchunks);
		}
		else if (tbm->status == TBM_ONE_PAGE)
		{
			/*
			 * In one page mode allocate the space for one pagetable entry,
			 * initialize it, and directly store its index (i.e. 0) in the
			 * page array.
			 */
			tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
											 sizeof(PagetableEntry));
			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
			memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
			ptpages->index[0] = 0;
		}

		if (ptbase != NULL)
			pg_atomic_init_u32(&ptbase->refcount, 0);
		/* sort the index arrays into block-number order for ordered readout */
		if (npages > 1)
			qsort_arg((void *) (ptpages->index), npages, sizeof(int),
					  tbm_shared_comparator, (void *) ptbase->ptentry);
		if (nchunks > 1)
			qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int),
					  tbm_shared_comparator, (void *) ptbase->ptentry);
	}

	/*
	 * Store the TBM members in the shared state so that we can share them
	 * across multiple processes.
	 */
	istate->nentries = tbm->nentries;
	istate->maxentries = tbm->maxentries;
	istate->npages = tbm->npages;
	istate->nchunks = tbm->nchunks;
	istate->pagetable = tbm->dsapagetable;
	istate->spages = tbm->ptpages;
	istate->schunks = tbm->ptchunks;

	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
	ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
	ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);

	/*
	 * For every shared iterator, referring to pagetable and iterator array,
	 * increase the refcount by 1 so that while freeing the shared iterator we
	 * don't free pagetable and iterator array until its refcount becomes 0.
	 */
	if (ptbase != NULL)
		pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
	if (ptpages != NULL)
		pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
	if (ptchunks != NULL)
		pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);

	/* Initialize the iterator lock */
	LWLockInitialize(&istate->lock, LWTRANCHE_TBM);

	/* Initialize the shared iterator state */
	istate->schunkbit = 0;
	istate->schunkptr = 0;
	istate->spageptr = 0;

	tbm->iterating = TBM_ITERATING_SHARED;

	return dp;
}
914
915 /*
916 * tbm_extract_page_tuple - extract the tuple offsets from a page
917 *
918 * The extracted offsets are stored into TBMIterateResult.
919 */
920 static inline int
tbm_extract_page_tuple(PagetableEntry * page,TBMIterateResult * output)921 tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
922 {
923 int wordnum;
924 int ntuples = 0;
925
926 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
927 {
928 bitmapword w = page->words[wordnum];
929
930 if (w != 0)
931 {
932 int off = wordnum * BITS_PER_BITMAPWORD + 1;
933
934 while (w != 0)
935 {
936 if (w & 1)
937 output->offsets[ntuples++] = (OffsetNumber) off;
938 off++;
939 w >>= 1;
940 }
941 }
942 }
943
944 return ntuples;
945 }
946
947 /*
948 * tbm_advance_schunkbit - Advance the chunkbit
949 */
950 static inline void
tbm_advance_schunkbit(PagetableEntry * chunk,int * schunkbitp)951 tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
952 {
953 int schunkbit = *schunkbitp;
954
955 while (schunkbit < PAGES_PER_CHUNK)
956 {
957 int wordnum = WORDNUM(schunkbit);
958 int bitnum = BITNUM(schunkbit);
959
960 if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
961 break;
962 schunkbit++;
963 }
964
965 *schunkbitp = schunkbit;
966 }
967
968 /*
969 * tbm_iterate - scan through next page of a TIDBitmap
970 *
971 * Returns a TBMIterateResult representing one page, or NULL if there are
972 * no more pages to scan. Pages are guaranteed to be delivered in numerical
973 * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to
974 * remember the exact tuples to look at on this page --- the caller must
975 * examine all tuples on the page and check if they meet the intended
976 * condition. If result->recheck is true, only the indicated tuples need
977 * be examined, but the condition must be rechecked anyway. (For ease of
978 * testing, recheck is always set true when ntuples < 0.)
979 */
980 TBMIterateResult *
tbm_iterate(TBMIterator * iterator)981 tbm_iterate(TBMIterator *iterator)
982 {
983 TIDBitmap *tbm = iterator->tbm;
984 TBMIterateResult *output = &(iterator->output);
985
986 Assert(tbm->iterating == TBM_ITERATING_PRIVATE);
987
988 /*
989 * If lossy chunk pages remain, make sure we've advanced schunkptr/
990 * schunkbit to the next set bit.
991 */
992 while (iterator->schunkptr < tbm->nchunks)
993 {
994 PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
995 int schunkbit = iterator->schunkbit;
996
997 tbm_advance_schunkbit(chunk, &schunkbit);
998 if (schunkbit < PAGES_PER_CHUNK)
999 {
1000 iterator->schunkbit = schunkbit;
1001 break;
1002 }
1003 /* advance to next chunk */
1004 iterator->schunkptr++;
1005 iterator->schunkbit = 0;
1006 }
1007
1008 /*
1009 * If both chunk and per-page data remain, must output the numerically
1010 * earlier page.
1011 */
1012 if (iterator->schunkptr < tbm->nchunks)
1013 {
1014 PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
1015 BlockNumber chunk_blockno;
1016
1017 chunk_blockno = chunk->blockno + iterator->schunkbit;
1018 if (iterator->spageptr >= tbm->npages ||
1019 chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
1020 {
1021 /* Return a lossy page indicator from the chunk */
1022 output->blockno = chunk_blockno;
1023 output->ntuples = -1;
1024 output->recheck = true;
1025 iterator->schunkbit++;
1026 return output;
1027 }
1028 }
1029
1030 if (iterator->spageptr < tbm->npages)
1031 {
1032 PagetableEntry *page;
1033 int ntuples;
1034
1035 /* In ONE_PAGE state, we don't allocate an spages[] array */
1036 if (tbm->status == TBM_ONE_PAGE)
1037 page = &tbm->entry1;
1038 else
1039 page = tbm->spages[iterator->spageptr];
1040
1041 /* scan bitmap to extract individual offset numbers */
1042 ntuples = tbm_extract_page_tuple(page, output);
1043 output->blockno = page->blockno;
1044 output->ntuples = ntuples;
1045 output->recheck = page->recheck;
1046 iterator->spageptr++;
1047 return output;
1048 }
1049
1050 /* Nothing more in the bitmap */
1051 return NULL;
1052 }
1053
/*
 * tbm_shared_iterate - scan through next page of a TIDBitmap
 *
 * As above, but this will iterate using an iterator which is shared
 * across multiple processes.  We need to acquire the iterator LWLock,
 * before accessing the shared members.
 */
TBMIterateResult *
tbm_shared_iterate(TBMSharedIterator *iterator)
{
	TBMIterateResult *output = &iterator->output;
	TBMSharedIteratorState *istate = iterator->state;
	PagetableEntry *ptbase = NULL;
	int		   *idxpages = NULL;
	int		   *idxchunks = NULL;

	/*
	 * Resolve our backend-local base pointers.  The page/chunk index arrays
	 * may be absent when the bitmap contains no exact pages or no lossy
	 * chunks, respectively; the corresponding counters below are then zero,
	 * so the NULL arrays are never dereferenced.
	 */
	if (iterator->ptbase != NULL)
		ptbase = iterator->ptbase->ptentry;
	if (iterator->ptpages != NULL)
		idxpages = iterator->ptpages->index;
	if (iterator->ptchunks != NULL)
		idxchunks = iterator->ptchunks->index;

	/* Acquire the LWLock before accessing the shared members */
	LWLockAcquire(&istate->lock, LW_EXCLUSIVE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		int			schunkbit = istate->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			istate->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		istate->schunkptr++;
		istate->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		BlockNumber chunk_blockno;

		chunk_blockno = chunk->blockno + istate->schunkbit;

		if (istate->spageptr >= istate->npages ||
			chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;
			output->recheck = true;
			istate->schunkbit++;

			/* release the lock before returning on every exit path */
			LWLockRelease(&istate->lock);
			return output;
		}
	}

	if (istate->spageptr < istate->npages)
	{
		PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
		int			ntuples;

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		istate->spageptr++;

		LWLockRelease(&istate->lock);

		return output;
	}

	LWLockRelease(&istate->lock);

	/* Nothing more in the bitmap */
	return NULL;
}
1147
/*
 * tbm_end_iterate - finish an iteration over a TIDBitmap
 *
 * Currently this is just a pfree, but it might do more someday.  (For
 * instance, it could be useful to count open iterators and allow the
 * bitmap to return to read/write status when there are no more iterators.)
 */
void
tbm_end_iterate(TBMIterator *iterator)
{
	/* Release only the iterator; the underlying TIDBitmap is untouched */
	pfree(iterator);
}
1160
/*
 * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
 *
 * This doesn't free any of the shared state associated with the iterator,
 * just our backend-private state.
 */
void
tbm_end_shared_iterate(TBMSharedIterator *iterator)
{
	/* Shared (DSA) state stays alive for other attached backends */
	pfree(iterator);
}
1172
1173 /*
1174 * tbm_find_pageentry - find a PagetableEntry for the pageno
1175 *
1176 * Returns NULL if there is no non-lossy entry for the pageno.
1177 */
1178 static const PagetableEntry *
tbm_find_pageentry(const TIDBitmap * tbm,BlockNumber pageno)1179 tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
1180 {
1181 const PagetableEntry *page;
1182
1183 if (tbm->nentries == 0) /* in case pagetable doesn't exist */
1184 return NULL;
1185
1186 if (tbm->status == TBM_ONE_PAGE)
1187 {
1188 page = &tbm->entry1;
1189 if (page->blockno != pageno)
1190 return NULL;
1191 Assert(!page->ischunk);
1192 return page;
1193 }
1194
1195 page = pagetable_lookup(tbm->pagetable, pageno);
1196 if (page == NULL)
1197 return NULL;
1198 if (page->ischunk)
1199 return NULL; /* don't want a lossy chunk header */
1200 return page;
1201 }
1202
1203 /*
1204 * tbm_get_pageentry - find or create a PagetableEntry for the pageno
1205 *
1206 * If new, the entry is marked as an exact (non-chunk) entry.
1207 *
1208 * This may cause the table to exceed the desired memory size. It is
1209 * up to the caller to call tbm_lossify() at the next safe point if so.
1210 */
1211 static PagetableEntry *
tbm_get_pageentry(TIDBitmap * tbm,BlockNumber pageno)1212 tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
1213 {
1214 PagetableEntry *page;
1215 bool found;
1216
1217 if (tbm->status == TBM_EMPTY)
1218 {
1219 /* Use the fixed slot */
1220 page = &tbm->entry1;
1221 found = false;
1222 tbm->status = TBM_ONE_PAGE;
1223 }
1224 else
1225 {
1226 if (tbm->status == TBM_ONE_PAGE)
1227 {
1228 page = &tbm->entry1;
1229 if (page->blockno == pageno)
1230 return page;
1231 /* Time to switch from one page to a hashtable */
1232 tbm_create_pagetable(tbm);
1233 }
1234
1235 /* Look up or create an entry */
1236 page = pagetable_insert(tbm->pagetable, pageno, &found);
1237 }
1238
1239 /* Initialize it if not present before */
1240 if (!found)
1241 {
1242 char oldstatus = page->status;
1243
1244 MemSet(page, 0, sizeof(PagetableEntry));
1245 page->status = oldstatus;
1246 page->blockno = pageno;
1247 /* must count it too */
1248 tbm->nentries++;
1249 tbm->npages++;
1250 }
1251
1252 return page;
1253 }
1254
1255 /*
1256 * tbm_page_is_lossy - is the page marked as lossily stored?
1257 */
1258 static bool
tbm_page_is_lossy(const TIDBitmap * tbm,BlockNumber pageno)1259 tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
1260 {
1261 PagetableEntry *page;
1262 BlockNumber chunk_pageno;
1263 int bitno;
1264
1265 /* we can skip the lookup if there are no lossy chunks */
1266 if (tbm->nchunks == 0)
1267 return false;
1268 Assert(tbm->status == TBM_HASH);
1269
1270 bitno = pageno % PAGES_PER_CHUNK;
1271 chunk_pageno = pageno - bitno;
1272
1273 page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1274
1275 if (page != NULL && page->ischunk)
1276 {
1277 int wordnum = WORDNUM(bitno);
1278 int bitnum = BITNUM(bitno);
1279
1280 if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1281 return true;
1282 }
1283 return false;
1284 }
1285
/*
 * tbm_mark_page_lossy - mark the page number as lossily stored
 *
 * This may cause the table to exceed the desired memory size.  It is
 * up to the caller to call tbm_lossify() at the next safe point if so.
 */
static void
tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	bool		found;
	BlockNumber chunk_pageno;
	int			bitno;
	int			wordnum;
	int			bitnum;

	/* We force the bitmap into hashtable mode whenever it's lossy */
	if (tbm->status != TBM_HASH)
		tbm_create_pagetable(tbm);

	/* Compute the page's bit position within its chunk, and the header page */
	bitno = pageno % PAGES_PER_CHUNK;
	chunk_pageno = pageno - bitno;

	/*
	 * Remove any extant non-lossy entry for the page.  If the page is its own
	 * chunk header, however, we skip this and handle the case below.
	 */
	if (bitno != 0)
	{
		if (pagetable_delete(tbm->pagetable, pageno))
		{
			/* It was present, so adjust counts */
			tbm->nentries--;
			tbm->npages--;		/* assume it must have been non-lossy */
		}
	}

	/* Look up or create entry for chunk-header page */
	page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);

	/* Initialize it if not present before */
	if (!found)
	{
		/* status byte belongs to the hashtable implementation; preserve it */
		char		oldstatus = page->status;

		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* must count it too */
		tbm->nentries++;
		tbm->nchunks++;
	}
	else if (!page->ischunk)
	{
		char		oldstatus = page->status;

		/* chunk header page was formerly non-lossy, make it lossy */
		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* we assume it had some tuple bit(s) set, so mark it lossy */
		page->words[0] = ((bitmapword) 1 << 0);
		/* adjust counts: nentries is unchanged (entry was reused) */
		tbm->nchunks++;
		tbm->npages--;
	}

	/* Now set the original target page's bit */
	wordnum = WORDNUM(bitno);
	bitnum = BITNUM(bitno);
	page->words[wordnum] |= ((bitmapword) 1 << bitnum);
}
1360
/*
 * tbm_lossify - lose some information to get back under the memory limit
 */
static void
tbm_lossify(TIDBitmap *tbm)
{
	pagetable_iterator i;
	PagetableEntry *page;

	/*
	 * XXX Really stupid implementation: this just lossifies pages in
	 * essentially random order.  We should be paying some attention to the
	 * number of bits set in each page, instead.
	 *
	 * Since we are called as soon as nentries exceeds maxentries, we should
	 * push nentries down to significantly less than maxentries, or else we'll
	 * just end up doing this again very soon.  We shoot for maxentries/2.
	 */
	Assert(tbm->iterating == TBM_NOT_ITERATING);
	Assert(tbm->status == TBM_HASH);

	/* resume where the previous lossify round stopped, for even coverage */
	pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
	while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
	{
		if (page->ischunk)
			continue;			/* already a chunk header */

		/*
		 * If the page would become a chunk header, we won't save anything by
		 * converting it to lossy, so skip it.
		 */
		if ((page->blockno % PAGES_PER_CHUNK) == 0)
			continue;

		/* This does the dirty work ... */
		tbm_mark_page_lossy(tbm, page->blockno);

		if (tbm->nentries <= tbm->maxentries / 2)
		{
			/*
			 * We have made enough room.  Remember where to start lossifying
			 * next round, so we evenly iterate over the hashtable.
			 */
			tbm->lossify_start = i.cur;
			break;
		}

		/*
		 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
		 * hashtable and may have deleted the non-lossy chunk.  We can
		 * continue the same hash table scan, since failure to visit one
		 * element or visiting the newly inserted element, isn't fatal.
		 */
	}

	/*
	 * With a big bitmap and small work_mem, it's possible that we cannot get
	 * under maxentries.  Again, if that happens, we'd end up uselessly
	 * calling tbm_lossify over and over.  To prevent this from becoming a
	 * performance sink, force maxentries up to at least double the current
	 * number of entries.  (In essence, we're admitting inability to fit
	 * within work_mem when we do this.)  Note that this test will not fire if
	 * we broke out of the loop early; and if we didn't, the current number of
	 * entries is simply not reducible any further.
	 */
	if (tbm->nentries > tbm->maxentries / 2)
		tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
}
1429
1430 /*
1431 * qsort comparator to handle PagetableEntry pointers.
1432 */
1433 static int
tbm_comparator(const void * left,const void * right)1434 tbm_comparator(const void *left, const void *right)
1435 {
1436 BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1437 BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1438
1439 if (l < r)
1440 return -1;
1441 else if (l > r)
1442 return 1;
1443 return 0;
1444 }
1445
1446 /*
1447 * As above, but this will get index into PagetableEntry array. Therefore,
1448 * it needs to get actual PagetableEntry using the index before comparing the
1449 * blockno.
1450 */
1451 static int
tbm_shared_comparator(const void * left,const void * right,void * arg)1452 tbm_shared_comparator(const void *left, const void *right, void *arg)
1453 {
1454 PagetableEntry *base = (PagetableEntry *) arg;
1455 PagetableEntry *lpage = &base[*(int *) left];
1456 PagetableEntry *rpage = &base[*(int *) right];
1457
1458 if (lpage->blockno < rpage->blockno)
1459 return -1;
1460 else if (lpage->blockno > rpage->blockno)
1461 return 1;
1462 return 0;
1463 }
1464
/*
 * tbm_attach_shared_iterate
 *
 * Allocate a backend-private iterator and attach the shared iterator state
 * to it so that multiple processes can iterate jointly.
 *
 * We also convert the DSA pointers to local pointers and store them into
 * our private iterator.
 */
TBMSharedIterator *
tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
{
	TBMSharedIterator *iterator;
	TBMSharedIteratorState *istate;

	/*
	 * Create the TBMSharedIterator struct, with enough trailing space to
	 * serve the needs of the TBMIterateResult sub-struct.
	 */
	iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
											 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));

	istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);

	iterator->state = istate;

	iterator->ptbase = dsa_get_address(dsa, istate->pagetable);

	/*
	 * The page/chunk index arrays exist only when the bitmap actually has
	 * exact pages / lossy chunks; otherwise leave the (palloc0'd) pointers
	 * NULL, which tbm_shared_iterate copes with.
	 */
	if (istate->npages)
		iterator->ptpages = dsa_get_address(dsa, istate->spages);
	if (istate->nchunks)
		iterator->ptchunks = dsa_get_address(dsa, istate->schunks);

	return iterator;
}
1500
1501 /*
1502 * pagetable_allocate
1503 *
1504 * Callback function for allocating the memory for hashtable elements.
1505 * Allocate memory for hashtable elements, using DSA if available.
1506 */
1507 static inline void *
pagetable_allocate(pagetable_hash * pagetable,Size size)1508 pagetable_allocate(pagetable_hash *pagetable, Size size)
1509 {
1510 TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1511 PTEntryArray *ptbase;
1512
1513 if (tbm->dsa == NULL)
1514 return MemoryContextAllocExtended(pagetable->ctx, size,
1515 MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
1516
1517 /*
1518 * Save the dsapagetable reference in dsapagetableold before allocating
1519 * new memory so that pagetable_free can free the old entry.
1520 */
1521 tbm->dsapagetableold = tbm->dsapagetable;
1522 tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
1523 sizeof(PTEntryArray) + size,
1524 DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
1525 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
1526
1527 return ptbase->ptentry;
1528 }
1529
1530 /*
1531 * pagetable_free
1532 *
1533 * Callback function for freeing hash table elements.
1534 */
1535 static inline void
pagetable_free(pagetable_hash * pagetable,void * pointer)1536 pagetable_free(pagetable_hash *pagetable, void *pointer)
1537 {
1538 TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1539
1540 /* pfree the input pointer if DSA is not available */
1541 if (tbm->dsa == NULL)
1542 pfree(pointer);
1543 else if (DsaPointerIsValid(tbm->dsapagetableold))
1544 {
1545 dsa_free(tbm->dsa, tbm->dsapagetableold);
1546 tbm->dsapagetableold = InvalidDsaPointer;
1547 }
1548 }
1549