/*-------------------------------------------------------------------------
 *
 * tidbitmap.c
 *	  PostgreSQL tuple-id (TID) bitmap package
 *
 * This module provides bitmap data structures that are spiritually
 * similar to Bitmapsets, but are specially adapted to store sets of
 * tuple identifiers (TIDs), or ItemPointers. In particular, the division
 * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
 * Also, since we wish to be able to store very large tuple sets in
 * memory with this data structure, we support "lossy" storage, in which
 * we no longer remember individual tuple offsets on a page but only the
 * fact that a particular page needs to be visited.
 *
 * The "lossy" storage uses one bit per disk page, so at the standard 8K
 * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
 * of memory. People pushing around tables of that size should have a
 * couple of Mb to spare, so we don't worry about providing a second level
 * of lossiness. In theory we could fall back to page ranges at some
 * point, but for now that seems useless complexity.
 *
 * We also support the notion of candidate matches, or rechecking. This
 * means we know that a search need visit only some tuples on a page,
 * but we are not certain that all of those tuples are real matches.
 * So the eventual heap scan must recheck the quals for these tuples only,
 * rather than rechecking the quals for all tuples on the page as in the
 * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
 * into a bitmap, and it can also happen internally when we AND a lossy
 * and a non-lossy page.
 *
 *
 * Copyright (c) 2003-2021, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/backend/nodes/tidbitmap.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <limits.h>

#include "access/htup_details.h"
#include "common/hashfn.h"
#include "nodes/bitmapset.h"
#include "nodes/tidbitmap.h"
#include "storage/lwlock.h"
#include "utils/dsa.h"

/*
 * The maximum number of tuples per page is not large (typically 256 with
 * 8K pages, or 1024 with 32K pages). So there's not much point in making
 * the per-page bitmaps variable size. We just legislate that the size
 * is this:
 */
#define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage

/*
 * When we have to switch over to lossy storage, we use a data structure
 * with one bit per page, where all pages having the same number DIV
 * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
 * and has the bit set for a given page, there must not be a per-page entry
 * for that page in the page table.
 *
 * We actually store both exact pages and lossy chunks in the same hash
 * table, using identical data structures. (This is because the memory
 * management for hashtables doesn't easily/efficiently allow space to be
 * transferred from one hashtable to another.) Therefore it's best
 * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
 * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
 * avoid expensive integer remainder operations. So, define it like this:
 */
#define PAGES_PER_CHUNK  (BLCKSZ / 32)
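
/*
 * Illustrative numbers only (not used by the code): with the standard 8K
 * BLCKSZ, PAGES_PER_CHUNK is 8192 / 32 = 256, so one lossy chunk summarizes
 * 256 heap pages (2MB of table data) in a single PagetableEntry. Assuming
 * a 64-bit bitmapword, such a chunk needs (256 - 1) / 64 + 1 = 4 words of
 * bit space, comparable to WORDS_PER_PAGE below, as intended.
 */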

/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */

#define WORDNUM(x)	((x) / BITS_PER_BITMAPWORD)
#define BITNUM(x)	((x) % BITS_PER_BITMAPWORD)

/* number of active words for an exact page: */
#define WORDS_PER_PAGE	((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
/* number of active words for a lossy chunk: */
#define WORDS_PER_CHUNK	((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)

/*
 * The hashtable entries are represented by this data structure. For
 * an exact page, blockno is the page number and bit k of the bitmap
 * represents tuple offset k+1. For a lossy chunk, blockno is the first
 * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
 * bit k represents page blockno+k. Note that it is not possible to
 * have exact storage for the first page of a chunk if we are using
 * lossy storage for any page in the chunk's range, since the same
 * hashtable entry has to serve both purposes.
 *
 * recheck is used only on exact pages --- it indicates that although
 * only the stated tuples need be checked, the full index qual condition
 * must be checked for each (ie, these are candidate matches).
 */
typedef struct PagetableEntry
{
    BlockNumber blockno;        /* page number (hashtable key) */
    char        status;         /* hash entry status */
    bool        ischunk;        /* T = lossy storage, F = exact */
    bool        recheck;        /* should the tuples be rechecked? */
    bitmapword  words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
} PagetableEntry;

/*
 * Holds array of pagetable entries.
 */
typedef struct PTEntryArray
{
    pg_atomic_uint32 refcount;  /* number of iterators attached */
    PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
} PTEntryArray;

/*
 * We want to avoid the overhead of creating the hashtable, which is
 * comparatively large, when not necessary. Particularly when we are using a
 * bitmap scan on the inside of a nestloop join: a bitmap may well live only
 * long enough to accumulate one entry in such cases. We therefore avoid
 * creating an actual hashtable until we need two pagetable entries. When
 * just one pagetable entry is needed, we store it in a fixed field of
 * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
 * shrinks down to zero or one page again. So, status can be TBM_HASH even
 * when nentries is zero or one.)
 */
typedef enum
{
    TBM_EMPTY,                  /* no hashtable, nentries == 0 */
    TBM_ONE_PAGE,               /* entry1 contains the single entry */
    TBM_HASH                    /* pagetable is valid, entry1 is not */
} TBMStatus;

/*
 * Current iterating state of the TBM.
 */
typedef enum
{
    TBM_NOT_ITERATING,          /* not yet converted to page and chunk array */
    TBM_ITERATING_PRIVATE,      /* converted to local page and chunk array */
    TBM_ITERATING_SHARED        /* converted to shared page and chunk array */
} TBMIteratingState;

/*
 * Here is the representation for a whole TIDBitMap:
 */
struct TIDBitmap
{
    NodeTag     type;           /* to make it a valid Node */
    MemoryContext mcxt;         /* memory context containing me */
    TBMStatus   status;         /* see codes above */
    struct pagetable_hash *pagetable;   /* hash table of PagetableEntry's */
    int         nentries;       /* number of entries in pagetable */
    int         maxentries;     /* limit on same to meet maxbytes */
    int         npages;         /* number of exact entries in pagetable */
    int         nchunks;        /* number of lossy entries in pagetable */
    TBMIteratingState iterating;    /* tbm_begin_iterate called? */
    uint32      lossify_start;  /* offset to start lossifying hashtable at */
    PagetableEntry entry1;      /* used when status == TBM_ONE_PAGE */
    /* these are valid when iterating is true: */
    PagetableEntry **spages;    /* sorted exact-page list, or NULL */
    PagetableEntry **schunks;   /* sorted lossy-chunk list, or NULL */
    dsa_pointer dsapagetable;   /* dsa_pointer to the element array */
    dsa_pointer dsapagetableold;    /* dsa_pointer to the old element array */
    dsa_pointer ptpages;        /* dsa_pointer to the page array */
    dsa_pointer ptchunks;       /* dsa_pointer to the chunk array */
    dsa_area   *dsa;            /* reference to per-query dsa area */
};

/*
 * When iterating over a bitmap in sorted order, a TBMIterator is used to
 * track our progress. There can be several iterators scanning the same
 * bitmap concurrently. Note that the bitmap becomes read-only as soon as
 * any iterator is created.
 */
struct TBMIterator
{
    TIDBitmap  *tbm;            /* TIDBitmap we're iterating over */
    int         spageptr;       /* next spages index */
    int         schunkptr;      /* next schunks index */
    int         schunkbit;      /* next bit to check in current schunk */
    TBMIterateResult output;    /* MUST BE LAST (because variable-size) */
};

/*
 * Holds the shared members of the iterator so that multiple processes
 * can jointly iterate.
 */
typedef struct TBMSharedIteratorState
{
    int         nentries;       /* number of entries in pagetable */
    int         maxentries;     /* limit on same to meet maxbytes */
    int         npages;         /* number of exact entries in pagetable */
    int         nchunks;        /* number of lossy entries in pagetable */
    dsa_pointer pagetable;      /* dsa pointers to head of pagetable data */
    dsa_pointer spages;         /* dsa pointer to page array */
    dsa_pointer schunks;        /* dsa pointer to chunk array */
    LWLock      lock;           /* lock to protect below members */
    int         spageptr;       /* next spages index */
    int         schunkptr;      /* next schunks index */
    int         schunkbit;      /* next bit to check in current schunk */
} TBMSharedIteratorState;

/*
 * pagetable iteration array.
 */
typedef struct PTIterationArray
{
    pg_atomic_uint32 refcount;  /* number of iterators attached */
    int         index[FLEXIBLE_ARRAY_MEMBER];   /* index array */
} PTIterationArray;

/*
 * Same as TBMIterator, but used for joint iteration; therefore it also
 * holds a reference to the shared state.
 */
struct TBMSharedIterator
{
    TBMSharedIteratorState *state;  /* shared state */
    PTEntryArray *ptbase;       /* pagetable element array */
    PTIterationArray *ptpages;  /* sorted exact page index list */
    PTIterationArray *ptchunks; /* sorted lossy page index list */
    TBMIterateResult output;    /* MUST BE LAST (because variable-size) */
};

/* Local function prototypes */
static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
                               const TIDBitmap *b);
static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
                                                BlockNumber pageno);
static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
static void tbm_lossify(TIDBitmap *tbm);
static int  tbm_comparator(const void *left, const void *right);
static int  tbm_shared_comparator(const void *left, const void *right,
                                  void *arg);

/* define hashtable mapping block numbers to PagetableEntry's */
#define SH_USE_NONDEFAULT_ALLOCATOR
#define SH_PREFIX pagetable
#define SH_ELEMENT_TYPE PagetableEntry
#define SH_KEY_TYPE BlockNumber
#define SH_KEY blockno
#define SH_HASH_KEY(tb, key) murmurhash32(key)
#define SH_EQUAL(tb, a, b) a == b
#define SH_SCOPE static inline
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"


/*
 * tbm_create - create an initially-empty bitmap
 *
 * The bitmap will live in the memory context that is CurrentMemoryContext
 * at the time of this call. It will be limited to (approximately) maxbytes
 * total memory consumption. If the DSA passed to this function is not NULL
 * then the memory for storing elements of the underlying page table will
 * be allocated from the DSA.
 */
TIDBitmap *
tbm_create(long maxbytes, dsa_area *dsa)
{
    TIDBitmap  *tbm;

    /* Create the TIDBitmap struct and zero all its fields */
    tbm = makeNode(TIDBitmap);

    tbm->mcxt = CurrentMemoryContext;
    tbm->status = TBM_EMPTY;

    tbm->maxentries = (int) tbm_calculate_entries(maxbytes);
    tbm->lossify_start = 0;
    tbm->dsa = dsa;
    tbm->dsapagetable = InvalidDsaPointer;
    tbm->dsapagetableold = InvalidDsaPointer;
    tbm->ptpages = InvalidDsaPointer;
    tbm->ptchunks = InvalidDsaPointer;

    return tbm;
}
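
/*
 * Illustrative usage sketch (not part of this module's code; the driving
 * loop is assumed, not provided here). A typical private, single-backend
 * caller builds a bitmap, adds TIDs, then walks it in block-number order:
 *
 *		TIDBitmap  *tbm = tbm_create(work_mem * 1024L, NULL);
 *		TBMIterator *it;
 *		TBMIterateResult *res;
 *
 *		tbm_add_tuples(tbm, tids, ntids, false);	-- repeat per index page
 *		it = tbm_begin_iterate(tbm);
 *		while ((res = tbm_iterate(it)) != NULL)
 *		{
 *			-- visit heap block res->blockno
 *			-- if res->ntuples >= 0, look only at res->offsets[0..ntuples-1]
 *			-- if res->ntuples < 0 the page is lossy; check every tuple
 *			-- recheck the quals whenever res->recheck is set
 *		}
 *		tbm_end_iterate(it);
 *		tbm_free(tbm);
 */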

/*
 * Actually create the hashtable. Since this is a moderately expensive
 * proposition, we don't do it until we have to.
 */
static void
tbm_create_pagetable(TIDBitmap *tbm)
{
    Assert(tbm->status != TBM_HASH);
    Assert(tbm->pagetable == NULL);

    tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);

    /* If entry1 is valid, push it into the hashtable */
    if (tbm->status == TBM_ONE_PAGE)
    {
        PagetableEntry *page;
        bool        found;
        char        oldstatus;

        page = pagetable_insert(tbm->pagetable,
                                tbm->entry1.blockno,
                                &found);
        Assert(!found);
        oldstatus = page->status;
        memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
        page->status = oldstatus;
    }

    tbm->status = TBM_HASH;
}

/*
 * tbm_free - free a TIDBitmap
 */
void
tbm_free(TIDBitmap *tbm)
{
    if (tbm->pagetable)
        pagetable_destroy(tbm->pagetable);
    if (tbm->spages)
        pfree(tbm->spages);
    if (tbm->schunks)
        pfree(tbm->schunks);
    pfree(tbm);
}

/*
 * tbm_free_shared_area - free shared state
 *
 * Free the shared iterator state. Also free the shared pagetable and
 * iterator arrays if they are no longer referenced by any shared iterator,
 * i.e. their refcount has dropped to 0.
 */
void
tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
{
    TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
    PTEntryArray *ptbase;
    PTIterationArray *ptpages;
    PTIterationArray *ptchunks;

    if (DsaPointerIsValid(istate->pagetable))
    {
        ptbase = dsa_get_address(dsa, istate->pagetable);
        if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
            dsa_free(dsa, istate->pagetable);
    }
    if (DsaPointerIsValid(istate->spages))
    {
        ptpages = dsa_get_address(dsa, istate->spages);
        if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
            dsa_free(dsa, istate->spages);
    }
    if (DsaPointerIsValid(istate->schunks))
    {
        ptchunks = dsa_get_address(dsa, istate->schunks);
        if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
            dsa_free(dsa, istate->schunks);
    }

    dsa_free(dsa, dp);
}

/*
 * tbm_add_tuples - add some tuple IDs to a TIDBitmap
 *
 * If recheck is true, then the recheck flag will be set in the
 * TBMIterateResult when any of these tuples are reported out.
 */
void
tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
               bool recheck)
{
    BlockNumber currblk = InvalidBlockNumber;
    PagetableEntry *page = NULL;    /* only valid when currblk is valid */
    int         i;

    Assert(tbm->iterating == TBM_NOT_ITERATING);
    for (i = 0; i < ntids; i++)
    {
        BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
        OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
        int         wordnum,
                    bitnum;

        /* safety check to ensure we don't overrun bit array bounds */
        if (off < 1 || off > MAX_TUPLES_PER_PAGE)
            elog(ERROR, "tuple offset out of range: %u", off);

        /*
         * Look up target page unless we already did. This saves cycles when
         * the input includes consecutive tuples on the same page, which is
         * common enough to justify an extra test here.
         */
        if (blk != currblk)
        {
            if (tbm_page_is_lossy(tbm, blk))
                page = NULL;    /* remember page is lossy */
            else
                page = tbm_get_pageentry(tbm, blk);
            currblk = blk;
        }

        if (page == NULL)
            continue;           /* whole page is already marked */

        if (page->ischunk)
        {
            /* The page is a lossy chunk header, set bit for itself */
            wordnum = bitnum = 0;
        }
        else
        {
            /* Page is exact, so set bit for individual tuple */
            wordnum = WORDNUM(off - 1);
            bitnum = BITNUM(off - 1);
        }
        page->words[wordnum] |= ((bitmapword) 1 << bitnum);
        page->recheck |= recheck;

        if (tbm->nentries > tbm->maxentries)
        {
            tbm_lossify(tbm);
            /* Page could have been converted to lossy, so force new lookup */
            currblk = InvalidBlockNumber;
        }
    }
}

/*
 * tbm_add_page - add a whole page to a TIDBitmap
 *
 * This causes the whole page to be reported (with the recheck flag)
 * when the TIDBitmap is scanned.
 */
void
tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
{
    /* Enter the page in the bitmap, or mark it lossy if already present */
    tbm_mark_page_lossy(tbm, pageno);
    /* If we went over the memory limit, lossify some more pages */
    if (tbm->nentries > tbm->maxentries)
        tbm_lossify(tbm);
}

/*
 * tbm_union - set union
 *
 * a is modified in-place, b is not changed
 */
void
tbm_union(TIDBitmap *a, const TIDBitmap *b)
{
    Assert(!a->iterating);
    /* Nothing to do if b is empty */
    if (b->nentries == 0)
        return;
    /* Scan through chunks and pages in b, merge into a */
    if (b->status == TBM_ONE_PAGE)
        tbm_union_page(a, &b->entry1);
    else
    {
        pagetable_iterator i;
        PagetableEntry *bpage;

        Assert(b->status == TBM_HASH);
        pagetable_start_iterate(b->pagetable, &i);
        while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
            tbm_union_page(a, bpage);
    }
}

/* Process one page of b during a union op */
static void
tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
{
    PagetableEntry *apage;
    int         wordnum;

    if (bpage->ischunk)
    {
        /* Scan b's chunk, mark each indicated page lossy in a */
        for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
        {
            bitmapword  w = bpage->words[wordnum];

            if (w != 0)
            {
                BlockNumber pg;

                pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
                while (w != 0)
                {
                    if (w & 1)
                        tbm_mark_page_lossy(a, pg);
                    pg++;
                    w >>= 1;
                }
            }
        }
    }
    else if (tbm_page_is_lossy(a, bpage->blockno))
    {
        /* page is already lossy in a, nothing to do */
        return;
    }
    else
    {
        apage = tbm_get_pageentry(a, bpage->blockno);
        if (apage->ischunk)
        {
            /* The page is a lossy chunk header, set bit for itself */
            apage->words[0] |= ((bitmapword) 1 << 0);
        }
        else
        {
            /* Both pages are exact, merge at the bit level */
            for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
                apage->words[wordnum] |= bpage->words[wordnum];
            apage->recheck |= bpage->recheck;
        }
    }

    if (a->nentries > a->maxentries)
        tbm_lossify(a);
}

/*
 * tbm_intersect - set intersection
 *
 * a is modified in-place, b is not changed
 */
void
tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
{
    Assert(!a->iterating);
    /* Nothing to do if a is empty */
    if (a->nentries == 0)
        return;
    /* Scan through chunks and pages in a, try to match to b */
    if (a->status == TBM_ONE_PAGE)
    {
        if (tbm_intersect_page(a, &a->entry1, b))
        {
            /* Page is now empty, remove it from a */
            Assert(!a->entry1.ischunk);
            a->npages--;
            a->nentries--;
            Assert(a->nentries == 0);
            a->status = TBM_EMPTY;
        }
    }
    else
    {
        pagetable_iterator i;
        PagetableEntry *apage;

        Assert(a->status == TBM_HASH);
        pagetable_start_iterate(a->pagetable, &i);
        while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
        {
            if (tbm_intersect_page(a, apage, b))
            {
                /* Page or chunk is now empty, remove it from a */
                if (apage->ischunk)
                    a->nchunks--;
                else
                    a->npages--;
                a->nentries--;
                if (!pagetable_delete(a->pagetable, apage->blockno))
                    elog(ERROR, "hash table corrupted");
            }
        }
    }
}

/*
 * Process one page of a during an intersection op
 *
 * Returns true if apage is now empty and should be deleted from a
 */
static bool
tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
{
    const PagetableEntry *bpage;
    int         wordnum;

    if (apage->ischunk)
    {
        /* Scan each bit in chunk, try to clear */
        bool        candelete = true;

        for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
        {
            bitmapword  w = apage->words[wordnum];

            if (w != 0)
            {
                bitmapword  neww = w;
                BlockNumber pg;
                int         bitnum;

                pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
                bitnum = 0;
                while (w != 0)
                {
                    if (w & 1)
                    {
                        if (!tbm_page_is_lossy(b, pg) &&
                            tbm_find_pageentry(b, pg) == NULL)
                        {
                            /* Page is not in b at all, lose lossy bit */
                            neww &= ~((bitmapword) 1 << bitnum);
                        }
                    }
                    pg++;
                    bitnum++;
                    w >>= 1;
                }
                apage->words[wordnum] = neww;
                if (neww != 0)
                    candelete = false;
            }
        }
        return candelete;
    }
    else if (tbm_page_is_lossy(b, apage->blockno))
    {
        /*
         * Some of the tuples in 'a' might not satisfy the quals for 'b', but
         * because the page 'b' is lossy, we don't know which ones. Therefore
         * we mark 'a' as requiring rechecks, to indicate that at most those
         * tuples set in 'a' are matches.
         */
        apage->recheck = true;
        return false;
    }
    else
    {
        bool        candelete = true;

        bpage = tbm_find_pageentry(b, apage->blockno);
        if (bpage != NULL)
        {
            /* Both pages are exact, merge at the bit level */
            Assert(!bpage->ischunk);
            for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
            {
                apage->words[wordnum] &= bpage->words[wordnum];
                if (apage->words[wordnum] != 0)
                    candelete = false;
            }
            apage->recheck |= bpage->recheck;
        }
        /* If there is no matching b page, we can just delete the a page */
        return candelete;
    }
}

/*
 * tbm_is_empty - is a TIDBitmap completely empty?
 */
bool
tbm_is_empty(const TIDBitmap *tbm)
{
    return (tbm->nentries == 0);
}
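
/*
 * Illustrative only: bitmaps produced by separate index scans can be
 * combined before iteration begins, modifying the first bitmap in place
 * (this is the pattern used for BitmapAnd/BitmapOr processing):
 *
 *		tbm_intersect(tbm_a, tbm_b);	-- AND semantics
 *		tbm_union(tbm_a, tbm_b);		-- OR semantics
 *
 * Intersecting an exact page with a lossy one keeps the exact page's bits
 * but forces recheck, as tbm_intersect_page above explains.
 */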

/*
 * tbm_begin_iterate - prepare to iterate through a TIDBitmap
 *
 * The TBMIterator struct is created in the caller's memory context.
 * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
 * okay to just allow the memory context to be released, too. It is the
 * caller's responsibility not to touch the TBMIterator anymore once the
 * TIDBitmap is freed.
 *
 * NB: after this is called, it is no longer allowed to modify the contents
 * of the bitmap. However, you can call this multiple times to scan the
 * contents repeatedly, including parallel scans.
 */
TBMIterator *
tbm_begin_iterate(TIDBitmap *tbm)
{
    TBMIterator *iterator;

    Assert(tbm->iterating != TBM_ITERATING_SHARED);

    /*
     * Create the TBMIterator struct, with enough trailing space to serve the
     * needs of the TBMIterateResult sub-struct.
     */
    iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
                                      MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
    iterator->tbm = tbm;

    /*
     * Initialize iteration pointers.
     */
    iterator->spageptr = 0;
    iterator->schunkptr = 0;
    iterator->schunkbit = 0;

    /*
     * If we have a hashtable, create and fill the sorted page lists, unless
     * we already did that for a previous iterator. Note that the lists are
     * attached to the bitmap not the iterator, so they can be used by more
     * than one iterator.
     */
    if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
    {
        pagetable_iterator i;
        PagetableEntry *page;
        int         npages;
        int         nchunks;

        if (!tbm->spages && tbm->npages > 0)
            tbm->spages = (PagetableEntry **)
                MemoryContextAlloc(tbm->mcxt,
                                   tbm->npages * sizeof(PagetableEntry *));
        if (!tbm->schunks && tbm->nchunks > 0)
            tbm->schunks = (PagetableEntry **)
                MemoryContextAlloc(tbm->mcxt,
                                   tbm->nchunks * sizeof(PagetableEntry *));

        npages = nchunks = 0;
        pagetable_start_iterate(tbm->pagetable, &i);
        while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
        {
            if (page->ischunk)
                tbm->schunks[nchunks++] = page;
            else
                tbm->spages[npages++] = page;
        }
        Assert(npages == tbm->npages);
        Assert(nchunks == tbm->nchunks);
        if (npages > 1)
            qsort(tbm->spages, npages, sizeof(PagetableEntry *),
                  tbm_comparator);
        if (nchunks > 1)
            qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
                  tbm_comparator);
    }

    tbm->iterating = TBM_ITERATING_PRIVATE;

    return iterator;
}

/*
 * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
 *
 * The necessary shared state will be allocated from the DSA passed to
 * tbm_create, so that multiple processes can attach to it and iterate jointly.
 *
 * This will convert the pagetable hash into page and chunk arrays of
 * indexes into the pagetable element array.
 */
dsa_pointer
tbm_prepare_shared_iterate(TIDBitmap *tbm)
{
    dsa_pointer dp;
    TBMSharedIteratorState *istate;
    PTEntryArray *ptbase = NULL;
    PTIterationArray *ptpages = NULL;
    PTIterationArray *ptchunks = NULL;

    Assert(tbm->dsa != NULL);
    Assert(tbm->iterating != TBM_ITERATING_PRIVATE);

    /*
     * Allocate TBMSharedIteratorState from DSA to hold the shared members
     * and lock; this will also be used by multiple workers for shared
     * iteration.
     */
    dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
    istate = dsa_get_address(tbm->dsa, dp);

    /*
     * If we're not already iterating, create and fill the sorted page lists.
     * (If we are, the sorted page lists are already stored in the TIDBitmap,
     * and we can just reuse them.)
     */
    if (tbm->iterating == TBM_NOT_ITERATING)
    {
        pagetable_iterator i;
        PagetableEntry *page;
        int         idx;
        int         npages;
        int         nchunks;

        /*
         * Allocate the page and chunk array memory from the DSA to share
         * across multiple processes.
         */
        if (tbm->npages)
        {
            tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
                                        tbm->npages * sizeof(int));
            ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
            pg_atomic_init_u32(&ptpages->refcount, 0);
        }
        if (tbm->nchunks)
        {
            tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
                                         tbm->nchunks * sizeof(int));
            ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
            pg_atomic_init_u32(&ptchunks->refcount, 0);
        }

        /*
         * If TBM status is TBM_HASH then iterate over the pagetable and
         * convert it to page and chunk arrays. But if it's in the
         * TBM_ONE_PAGE mode then directly allocate the space for one entry
         * from the DSA.
         */
        npages = nchunks = 0;
        if (tbm->status == TBM_HASH)
        {
            ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);

            pagetable_start_iterate(tbm->pagetable, &i);
            while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
            {
                idx = page - ptbase->ptentry;
                if (page->ischunk)
                    ptchunks->index[nchunks++] = idx;
                else
                    ptpages->index[npages++] = idx;
            }

            Assert(npages == tbm->npages);
            Assert(nchunks == tbm->nchunks);
        }
        else if (tbm->status == TBM_ONE_PAGE)
        {
            /*
             * In one page mode allocate the space for one pagetable entry,
             * initialize it, and directly store its index (i.e. 0) in the
             * page array.
             */
            tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
                                             sizeof(PagetableEntry));
            ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
            memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
            ptpages->index[0] = 0;
        }

        if (ptbase != NULL)
            pg_atomic_init_u32(&ptbase->refcount, 0);
        if (npages > 1)
            qsort_arg((void *) (ptpages->index), npages, sizeof(int),
                      tbm_shared_comparator, (void *) ptbase->ptentry);
        if (nchunks > 1)
            qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int),
                      tbm_shared_comparator, (void *) ptbase->ptentry);
    }

    /*
     * Store the TBM members in the shared state so that we can share them
     * across multiple processes.
     */
    istate->nentries = tbm->nentries;
    istate->maxentries = tbm->maxentries;
    istate->npages = tbm->npages;
    istate->nchunks = tbm->nchunks;
    istate->pagetable = tbm->dsapagetable;
    istate->spages = tbm->ptpages;
    istate->schunks = tbm->ptchunks;

    ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
    ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
    ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);

    /*
     * For every shared iterator referring to the pagetable and iterator
     * arrays, increase the refcount by 1, so that when a shared iterator is
     * freed we don't free the pagetable and iterator arrays until their
     * refcount drops to 0.
     */
    if (ptbase != NULL)
        pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
    if (ptpages != NULL)
        pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
    if (ptchunks != NULL)
        pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);

    /* Initialize the iterator lock */
    LWLockInitialize(&istate->lock, LWTRANCHE_SHARED_TIDBITMAP);

    /* Initialize the shared iterator state */
    istate->schunkbit = 0;
    istate->schunkptr = 0;
    istate->spageptr = 0;

    tbm->iterating = TBM_ITERATING_SHARED;

    return dp;
}
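
/*
 * Illustrative sketch of joint (parallel) iteration; the leader/worker
 * orchestration and shared-memory plumbing are assumed, not shown:
 *
 *		In the leader:
 *			dsa_pointer dp = tbm_prepare_shared_iterate(tbm);
 *			-- publish dp to the workers
 *
 *		In each participant (leader or worker):
 *			TBMSharedIterator *it = tbm_attach_shared_iterate(dsa, dp);
 *			while ((res = tbm_shared_iterate(it)) != NULL)
 *				-- process res->blockno / res->offsets as for tbm_iterate
 *			tbm_end_shared_iterate(it);
 *
 *		When done, tbm_free_shared_area(dsa, dp) releases the shared state.
 */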

/*
 * tbm_extract_page_tuple - extract the tuple offsets from a page
 *
 * The extracted offsets are stored into TBMIterateResult.
 */
static inline int
tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
{
    int         wordnum;
    int         ntuples = 0;

    for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
    {
        bitmapword  w = page->words[wordnum];

        if (w != 0)
        {
            int         off = wordnum * BITS_PER_BITMAPWORD + 1;

            while (w != 0)
            {
                if (w & 1)
                    output->offsets[ntuples++] = (OffsetNumber) off;
                off++;
                w >>= 1;
            }
        }
    }

    return ntuples;
}

/*
 * tbm_advance_schunkbit - Advance the schunkbit
 */
static inline void
tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
{
    int         schunkbit = *schunkbitp;

    while (schunkbit < PAGES_PER_CHUNK)
    {
        int         wordnum = WORDNUM(schunkbit);
        int         bitnum = BITNUM(schunkbit);

        if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
            break;
        schunkbit++;
    }

    *schunkbitp = schunkbit;
}

/*
 * tbm_iterate - scan through next page of a TIDBitmap
 *
 * Returns a TBMIterateResult representing one page, or NULL if there are
 * no more pages to scan. Pages are guaranteed to be delivered in numerical
 * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to
 * remember the exact tuples to look at on this page --- the caller must
 * examine all tuples on the page and check if they meet the intended
 * condition. If result->recheck is true, only the indicated tuples need
 * be examined, but the condition must be rechecked anyway. (For ease of
 * testing, recheck is always set true when ntuples < 0.)
 */
TBMIterateResult *
tbm_iterate(TBMIterator *iterator)
{
    TIDBitmap  *tbm = iterator->tbm;
    TBMIterateResult *output = &(iterator->output);

    Assert(tbm->iterating == TBM_ITERATING_PRIVATE);

    /*
     * If lossy chunk pages remain, make sure we've advanced schunkptr/
     * schunkbit to the next set bit.
     */
    while (iterator->schunkptr < tbm->nchunks)
    {
        PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
        int         schunkbit = iterator->schunkbit;

        tbm_advance_schunkbit(chunk, &schunkbit);
        if (schunkbit < PAGES_PER_CHUNK)
        {
            iterator->schunkbit = schunkbit;
            break;
        }
        /* advance to next chunk */
        iterator->schunkptr++;
        iterator->schunkbit = 0;
    }

    /*
     * If both chunk and per-page data remain, must output the numerically
     * earlier page.
     */
    if (iterator->schunkptr < tbm->nchunks)
    {
        PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
        BlockNumber chunk_blockno;

        chunk_blockno = chunk->blockno + iterator->schunkbit;
        if (iterator->spageptr >= tbm->npages ||
            chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
        {
            /* Return a lossy page indicator from the chunk */
            output->blockno = chunk_blockno;
            output->ntuples = -1;
            output->recheck = true;
            iterator->schunkbit++;
            return output;
        }
    }

    if (iterator->spageptr < tbm->npages)
    {
        PagetableEntry *page;
        int         ntuples;

        /* In TBM_ONE_PAGE state, we don't allocate an spages[] array */
        if (tbm->status == TBM_ONE_PAGE)
            page = &tbm->entry1;
        else
            page = tbm->spages[iterator->spageptr];

        /* scan bitmap to extract individual offset numbers */
        ntuples = tbm_extract_page_tuple(page, output);
        output->blockno = page->blockno;
        output->ntuples = ntuples;
        output->recheck = page->recheck;
        iterator->spageptr++;
        return output;
    }

    /* Nothing more in the bitmap */
    return NULL;
}

/*
 * tbm_shared_iterate - scan through next page of a TIDBitmap
 *
 * As above, but this will iterate using an iterator which is shared
 * across multiple processes. We need to acquire the iterator LWLock
 * before accessing the shared members.
 */
TBMIterateResult *
tbm_shared_iterate(TBMSharedIterator *iterator)
{
    TBMIterateResult *output = &iterator->output;
    TBMSharedIteratorState *istate = iterator->state;
    PagetableEntry *ptbase = NULL;
    int        *idxpages = NULL;
    int        *idxchunks = NULL;

    if (iterator->ptbase != NULL)
        ptbase = iterator->ptbase->ptentry;
    if (iterator->ptpages != NULL)
        idxpages = iterator->ptpages->index;
    if (iterator->ptchunks != NULL)
        idxchunks = iterator->ptchunks->index;

    /* Acquire the LWLock before accessing the shared members */
    LWLockAcquire(&istate->lock, LW_EXCLUSIVE);

    /*
     * If lossy chunk pages remain, make sure we've advanced schunkptr/
     * schunkbit to the next set bit.
     */
    while (istate->schunkptr < istate->nchunks)
    {
        PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
        int         schunkbit = istate->schunkbit;

        tbm_advance_schunkbit(chunk, &schunkbit);
        if (schunkbit < PAGES_PER_CHUNK)
        {
            istate->schunkbit = schunkbit;
            break;
        }
        /* advance to next chunk */
        istate->schunkptr++;
        istate->schunkbit = 0;
    }

    /*
     * If both chunk and per-page data remain, must output the numerically
     * earlier page.
     */
    if (istate->schunkptr < istate->nchunks)
    {
        PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
        BlockNumber chunk_blockno;

        chunk_blockno = chunk->blockno + istate->schunkbit;

        if (istate->spageptr >= istate->npages ||
            chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
        {
            /* Return a lossy page indicator from the chunk */
            output->blockno = chunk_blockno;
            output->ntuples = -1;
            output->recheck = true;
            istate->schunkbit++;

            LWLockRelease(&istate->lock);
            return output;
        }
    }

    if (istate->spageptr < istate->npages)
    {
        PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
        int         ntuples;

        /* scan bitmap to extract individual offset numbers */
        ntuples = tbm_extract_page_tuple(page, output);
        output->blockno = page->blockno;
        output->ntuples = ntuples;
        output->recheck = page->recheck;
        istate->spageptr++;

        LWLockRelease(&istate->lock);

        return output;
    }

    LWLockRelease(&istate->lock);

    /* Nothing more in the bitmap */
    return NULL;
}

/*
 * tbm_end_iterate - finish an iteration over a TIDBitmap
 *
 * Currently this is just a pfree, but it might do more someday. (For
 * instance, it could be useful to count open iterators and allow the
 * bitmap to return to read/write status when there are no more iterators.)
 */
void
tbm_end_iterate(TBMIterator *iterator)
{
    pfree(iterator);
}

/*
 * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
 *
 * This doesn't free any of the shared state associated with the iterator,
 * just our backend-private state.
 */
void
tbm_end_shared_iterate(TBMSharedIterator *iterator)
{
    pfree(iterator);
}

/*
 * tbm_find_pageentry - find a PagetableEntry for the pageno
 *
 * Returns NULL if there is no non-lossy entry for the pageno.
 */
static const PagetableEntry *
tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
{
    const PagetableEntry *page;

    if (tbm->nentries == 0)     /* in case pagetable doesn't exist */
        return NULL;

    if (tbm->status == TBM_ONE_PAGE)
    {
        page = &tbm->entry1;
        if (page->blockno != pageno)
            return NULL;
        Assert(!page->ischunk);
        return page;
    }

    page = pagetable_lookup(tbm->pagetable, pageno);
    if (page == NULL)
        return NULL;
    if (page->ischunk)
        return NULL;            /* don't want a lossy chunk header */
    return page;
}

/*
 * tbm_get_pageentry - find or create a PagetableEntry for the pageno
 *
 * If new, the entry is marked as an exact (non-chunk) entry.
 *
 * This may cause the table to exceed the desired memory size. It is
 * up to the caller to call tbm_lossify() at the next safe point if so.
 */
static PagetableEntry *
tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
{
    PagetableEntry *page;
    bool        found;

    if (tbm->status == TBM_EMPTY)
    {
        /* Use the fixed slot */
        page = &tbm->entry1;
        found = false;
        tbm->status = TBM_ONE_PAGE;
    }
    else
    {
        if (tbm->status == TBM_ONE_PAGE)
        {
            page = &tbm->entry1;
            if (page->blockno == pageno)
                return page;
            /* Time to switch from one page to a hashtable */
            tbm_create_pagetable(tbm);
        }

        /* Look up or create an entry */
        page = pagetable_insert(tbm->pagetable, pageno, &found);
    }

    /* Initialize it if not present before */
    if (!found)
    {
        char        oldstatus = page->status;

        MemSet(page, 0, sizeof(PagetableEntry));
        page->status = oldstatus;
        page->blockno = pageno;
        /* must count it too */
        tbm->nentries++;
        tbm->npages++;
    }

    return page;
}

/*
 * tbm_page_is_lossy - is the page marked as lossily stored?
 */
static bool
tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
{
    PagetableEntry *page;
    BlockNumber chunk_pageno;
    int         bitno;

    /* we can skip the lookup if there are no lossy chunks */
    if (tbm->nchunks == 0)
        return false;
    Assert(tbm->status == TBM_HASH);

    bitno = pageno % PAGES_PER_CHUNK;
    chunk_pageno = pageno - bitno;

    page = pagetable_lookup(tbm->pagetable, chunk_pageno);

    if (page != NULL && page->ischunk)
    {
        int         wordnum = WORDNUM(bitno);
        int         bitnum = BITNUM(bitno);

        if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
            return true;
    }
    return false;
}
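
/*
 * Worked example (illustrative only; assumes the standard 8K BLCKSZ, so
 * PAGES_PER_CHUNK = 256, and a 64-bit bitmapword): for pageno = 1234,
 * bitno = 1234 % 256 = 210 and chunk_pageno = 1234 - 210 = 1024. The page
 * is lossy iff the entry keyed by block 1024 has ischunk set and bit 210
 * set, i.e. word 210 / 64 = 3, bit 210 % 64 = 18. tbm_mark_page_lossy
 * below sets exactly that bit using the same arithmetic.
 */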

/*
 * tbm_mark_page_lossy - mark the page number as lossily stored
 *
 * This may cause the table to exceed the desired memory size. It is
 * up to the caller to call tbm_lossify() at the next safe point if so.
 */
static void
tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
{
    PagetableEntry *page;
    bool        found;
    BlockNumber chunk_pageno;
    int         bitno;
    int         wordnum;
    int         bitnum;

    /* We force the bitmap into hashtable mode whenever it's lossy */
    if (tbm->status != TBM_HASH)
        tbm_create_pagetable(tbm);

    bitno = pageno % PAGES_PER_CHUNK;
    chunk_pageno = pageno - bitno;

    /*
     * Remove any extant non-lossy entry for the page. If the page is its own
     * chunk header, however, we skip this and handle the case below.
     */
    if (bitno != 0)
    {
        if (pagetable_delete(tbm->pagetable, pageno))
        {
            /* It was present, so adjust counts */
            tbm->nentries--;
            tbm->npages--;      /* assume it must have been non-lossy */
        }
    }

    /* Look up or create entry for chunk-header page */
    page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);

    /* Initialize it if not present before */
    if (!found)
    {
        char        oldstatus = page->status;

        MemSet(page, 0, sizeof(PagetableEntry));
        page->status = oldstatus;
        page->blockno = chunk_pageno;
        page->ischunk = true;
        /* must count it too */
        tbm->nentries++;
        tbm->nchunks++;
    }
    else if (!page->ischunk)
    {
        char        oldstatus = page->status;

        /* chunk header page was formerly non-lossy, make it lossy */
        MemSet(page, 0, sizeof(PagetableEntry));
        page->status = oldstatus;
        page->blockno = chunk_pageno;
        page->ischunk = true;
        /* we assume it had some tuple bit(s) set, so mark it lossy */
        page->words[0] = ((bitmapword) 1 << 0);
        /* adjust counts */
        tbm->nchunks++;
        tbm->npages--;
    }

    /* Now set the original target page's bit */
    wordnum = WORDNUM(bitno);
    bitnum = BITNUM(bitno);
    page->words[wordnum] |= ((bitmapword) 1 << bitnum);
}

/*
 * tbm_lossify - lose some information to get back under the memory limit
 */
static void
tbm_lossify(TIDBitmap *tbm)
{
    pagetable_iterator i;
    PagetableEntry *page;

    /*
     * XXX Really stupid implementation: this just lossifies pages in
     * essentially random order. We should be paying some attention to the
     * number of bits set in each page, instead.
     *
     * Since we are called as soon as nentries exceeds maxentries, we should
     * push nentries down to significantly less than maxentries, or else we'll
     * just end up doing this again very soon. We shoot for maxentries/2.
     */
    Assert(tbm->iterating == TBM_NOT_ITERATING);
    Assert(tbm->status == TBM_HASH);

    pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
    while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
    {
        if (page->ischunk)
            continue;           /* already a chunk header */

        /*
         * If the page would become a chunk header, we won't save anything by
         * converting it to lossy, so skip it.
         */
        if ((page->blockno % PAGES_PER_CHUNK) == 0)
            continue;

        /* This does the dirty work ... */
        tbm_mark_page_lossy(tbm, page->blockno);

        if (tbm->nentries <= tbm->maxentries / 2)
        {
            /*
             * We have made enough room. Remember where to start lossifying
             * next round, so we evenly iterate over the hashtable.
             */
            tbm->lossify_start = i.cur;
            break;
        }

        /*
         * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
         * hashtable and may have deleted the non-lossy chunk. We can
         * continue the same hash table scan, since failure to visit one
         * element or visiting the newly inserted element isn't fatal.
         */
    }

    /*
     * With a big bitmap and small work_mem, it's possible that we cannot get
     * under maxentries. Again, if that happens, we'd end up uselessly
     * calling tbm_lossify over and over. To prevent this from becoming a
     * performance sink, force maxentries up to at least double the current
     * number of entries. (In essence, we're admitting inability to fit
     * within work_mem when we do this.) Note that this test will not fire if
     * we broke out of the loop early; and if we didn't, the current number of
     * entries is simply not reducible any further.
     */
    if (tbm->nentries > tbm->maxentries / 2)
        tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
}

/*
 * qsort comparator to handle PagetableEntry pointers.
 */
static int
tbm_comparator(const void *left, const void *right)
{
    BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
    BlockNumber r = (*((PagetableEntry *const *) right))->blockno;

    if (l < r)
        return -1;
    else if (l > r)
        return 1;
    return 0;
}

/*
 * As above, but this takes an index into the PagetableEntry array; therefore
 * it needs to look up the actual PagetableEntry using the index before
 * comparing the blocknos.
 */
static int
tbm_shared_comparator(const void *left, const void *right, void *arg)
{
    PagetableEntry *base = (PagetableEntry *) arg;
    PagetableEntry *lpage = &base[*(int *) left];
    PagetableEntry *rpage = &base[*(int *) right];

    if (lpage->blockno < rpage->blockno)
        return -1;
    else if (lpage->blockno > rpage->blockno)
        return 1;
    return 0;
}

/*
 * tbm_attach_shared_iterate
 *
 * Allocate a backend-private iterator and attach the shared iterator state
 * to it so that multiple processes can iterate jointly.
 *
 * We also convert the DSA pointers to local pointers and store them in
 * our private iterator.
 */
TBMSharedIterator *
tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
{
    TBMSharedIterator *iterator;
    TBMSharedIteratorState *istate;

    /*
     * Create the TBMSharedIterator struct, with enough trailing space to
     * serve the needs of the TBMIterateResult sub-struct.
     */
    iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
                                             MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));

    istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);

    iterator->state = istate;

    iterator->ptbase = dsa_get_address(dsa, istate->pagetable);

    if (istate->npages)
        iterator->ptpages = dsa_get_address(dsa, istate->spages);
    if (istate->nchunks)
        iterator->ptchunks = dsa_get_address(dsa, istate->schunks);

    return iterator;
}

/*
 * pagetable_allocate
 *
 * Callback function for allocating the memory for hashtable elements.
 * Allocate memory for hashtable elements, using DSA if available.
 */
static inline void *
pagetable_allocate(pagetable_hash *pagetable, Size size)
{
    TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
    PTEntryArray *ptbase;

    if (tbm->dsa == NULL)
        return MemoryContextAllocExtended(pagetable->ctx, size,
                                          MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);

    /*
     * Save the dsapagetable reference in dsapagetableold before allocating
     * new memory so that pagetable_free can free the old entry.
     */
    tbm->dsapagetableold = tbm->dsapagetable;
    tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
                                              sizeof(PTEntryArray) + size,
                                              DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
    ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);

    return ptbase->ptentry;
}

/*
 * pagetable_free
 *
 * Callback function for freeing hash table elements.
 */
static inline void
pagetable_free(pagetable_hash *pagetable, void *pointer)
{
    TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;

    /* pfree the input pointer if DSA is not available */
    if (tbm->dsa == NULL)
        pfree(pointer);
    else if (DsaPointerIsValid(tbm->dsapagetableold))
    {
        dsa_free(tbm->dsa, tbm->dsapagetableold);
        tbm->dsapagetableold = InvalidDsaPointer;
    }
}

/*
 * tbm_calculate_entries
 *
 * Estimate number of hashtable entries we can have within maxbytes.
 */
long
tbm_calculate_entries(double maxbytes)
{
    long        nbuckets;

    /*
     * Estimate number of hashtable entries we can have within maxbytes. This
     * estimates the hash cost as sizeof(PagetableEntry), which is good enough
     * for our purpose. Also count an extra Pointer per entry for the arrays
     * created during iteration readout.
     */
    nbuckets = maxbytes /
        (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
    nbuckets = Min(nbuckets, INT_MAX - 1);  /* safety limit */
    nbuckets = Max(nbuckets, 16);   /* sanity limit */

    return nbuckets;
}
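
/*
 * Illustrative sizing only (the exact figures depend on the build): if
 * sizeof(PagetableEntry) were 48 bytes and a Pointer 8 bytes, each entry is
 * costed at 48 + 8 + 8 = 64 bytes, so a 4MB maxbytes budget allows about
 * 4194304 / 64 = 65536 entries before tbm_lossify must start trading exact
 * pages for lossy chunks.
 */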