1 /*------------------------------------------------------------------------- 2 * 3 * freelist.c 4 * routines for managing the buffer pool's replacement strategy. 5 * 6 * 7 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group 8 * Portions Copyright (c) 1994, Regents of the University of California 9 * 10 * 11 * IDENTIFICATION 12 * src/backend/storage/buffer/freelist.c 13 * 14 *------------------------------------------------------------------------- 15 */ 16 #include "postgres.h" 17 18 #include "port/atomics.h" 19 #include "storage/buf_internals.h" 20 #include "storage/bufmgr.h" 21 #include "storage/proc.h" 22 23 #define INT_ACCESS_ONCE(var) ((int)(*((volatile int *)&(var)))) 24 25 26 /* 27 * The shared freelist control information. 28 */ 29 typedef struct 30 { 31 /* Spinlock: protects the values below */ 32 slock_t buffer_strategy_lock; 33 34 /* 35 * Clock sweep hand: index of next buffer to consider grabbing. Note that 36 * this isn't a concrete buffer - we only ever increase the value. So, to 37 * get an actual buffer, it needs to be used modulo NBuffers. 38 */ 39 pg_atomic_uint32 nextVictimBuffer; 40 41 int firstFreeBuffer; /* Head of list of unused buffers */ 42 int lastFreeBuffer; /* Tail of list of unused buffers */ 43 44 /* 45 * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is, 46 * when the list is empty) 47 */ 48 49 /* 50 * Statistics. These counters should be wide enough that they can't 51 * overflow during a single bgwriter cycle. 52 */ 53 uint32 completePasses; /* Complete cycles of the clock sweep */ 54 pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */ 55 56 /* 57 * Bgworker process to be notified upon activity or -1 if none. See 58 * StrategyNotifyBgWriter. 59 */ 60 int bgwprocno; 61 } BufferStrategyControl; 62 63 /* Pointers to shared state */ 64 static BufferStrategyControl *StrategyControl = NULL; 65 66 /* 67 * Private (non-shared) state for managing a ring of shared buffers to re-use. 68 * This is currently the only kind of BufferAccessStrategy object, but someday 69 * we might have more kinds. 70 */ 71 typedef struct BufferAccessStrategyData 72 { 73 /* Overall strategy type */ 74 BufferAccessStrategyType btype; 75 /* Number of elements in buffers[] array */ 76 int ring_size; 77 78 /* 79 * Index of the "current" slot in the ring, ie, the one most recently 80 * returned by GetBufferFromRing. 81 */ 82 int current; 83 84 /* 85 * True if the buffer just returned by StrategyGetBuffer had been in the 86 * ring already. 87 */ 88 bool current_was_in_ring; 89 90 /* 91 * Array of buffer numbers. InvalidBuffer (that is, zero) indicates we 92 * have not yet selected a buffer for this ring slot. For allocation 93 * simplicity this is palloc'd together with the fixed fields of the 94 * struct. 95 */ 96 Buffer buffers[FLEXIBLE_ARRAY_MEMBER]; 97 } BufferAccessStrategyData; 98 99 100 /* Prototypes for internal functions */ 101 static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy, 102 uint32 *buf_state); 103 static void AddBufferToRing(BufferAccessStrategy strategy, 104 BufferDesc *buf); 105 106 /* 107 * ClockSweepTick - Helper routine for StrategyGetBuffer() 108 * 109 * Move the clock hand one buffer ahead of its current position and return the 110 * id of the buffer now under the hand. 111 */ 112 static inline uint32 113 ClockSweepTick(void) 114 { 115 uint32 victim; 116 117 /* 118 * Atomically move hand ahead one buffer - if there's several processes 119 * doing this, this can lead to buffers being returned slightly out of 120 * apparent order. 121 */ 122 victim = 123 pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1); 124 125 if (victim >= NBuffers) 126 { 127 uint32 originalVictim = victim; 128 129 /* always wrap what we look up in BufferDescriptors */ 130 victim = victim % NBuffers; 131 132 /* 133 * If we're the one that just caused a wraparound, force 134 * completePasses to be incremented while holding the spinlock. We 135 * need the spinlock so StrategySyncStart() can return a consistent 136 * value consisting of nextVictimBuffer and completePasses. 137 */ 138 if (victim == 0) 139 { 140 uint32 expected; 141 uint32 wrapped; 142 bool success = false; 143 144 expected = originalVictim + 1; 145 146 while (!success) 147 { 148 /* 149 * Acquire the spinlock while increasing completePasses. That 150 * allows other readers to read nextVictimBuffer and 151 * completePasses in a consistent manner which is required for 152 * StrategySyncStart(). In theory delaying the increment 153 * could lead to an overflow of nextVictimBuffers, but that's 154 * highly unlikely and wouldn't be particularly harmful. 155 */ 156 SpinLockAcquire(&StrategyControl->buffer_strategy_lock); 157 158 wrapped = expected % NBuffers; 159 160 success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer, 161 &expected, wrapped); 162 if (success) 163 StrategyControl->completePasses++; 164 SpinLockRelease(&StrategyControl->buffer_strategy_lock); 165 } 166 } 167 } 168 return victim; 169 } 170 171 /* 172 * have_free_buffer -- a lockless check to see if there is a free buffer in 173 * buffer pool. 174 * 175 * If the result is true that will become stale once free buffers are moved out 176 * by other operations, so the caller who strictly want to use a free buffer 177 * should not call this. 178 */ 179 bool 180 have_free_buffer(void) 181 { 182 if (StrategyControl->firstFreeBuffer >= 0) 183 return true; 184 else 185 return false; 186 } 187 188 /* 189 * StrategyGetBuffer 190 * 191 * Called by the bufmgr to get the next candidate buffer to use in 192 * BufferAlloc(). The only hard requirement BufferAlloc() has is that 193 * the selected buffer must not currently be pinned by anyone. 194 * 195 * strategy is a BufferAccessStrategy object, or NULL for default strategy. 196 * 197 * To ensure that no one else can pin the buffer before we do, we must 198 * return the buffer with the buffer header spinlock still held. 199 */ 200 BufferDesc * 201 StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state) 202 { 203 BufferDesc *buf; 204 int bgwprocno; 205 int trycounter; 206 uint32 local_buf_state; /* to avoid repeated (de-)referencing */ 207 208 /* 209 * If given a strategy object, see whether it can select a buffer. We 210 * assume strategy objects don't need buffer_strategy_lock. 211 */ 212 if (strategy != NULL) 213 { 214 buf = GetBufferFromRing(strategy, buf_state); 215 if (buf != NULL) 216 return buf; 217 } 218 219 /* 220 * If asked, we need to waken the bgwriter. Since we don't want to rely on 221 * a spinlock for this we force a read from shared memory once, and then 222 * set the latch based on that value. We need to go through that length 223 * because otherwise bgwprocno might be reset while/after we check because 224 * the compiler might just reread from memory. 225 * 226 * This can possibly set the latch of the wrong process if the bgwriter 227 * dies in the wrong moment. But since PGPROC->procLatch is never 228 * deallocated the worst consequence of that is that we set the latch of 229 * some arbitrary process. 230 */ 231 bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno); 232 if (bgwprocno != -1) 233 { 234 /* reset bgwprocno first, before setting the latch */ 235 StrategyControl->bgwprocno = -1; 236 237 /* 238 * Not acquiring ProcArrayLock here which is slightly icky. It's 239 * actually fine because procLatch isn't ever freed, so we just can 240 * potentially set the wrong process' (or no process') latch. 241 */ 242 SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch); 243 } 244 245 /* 246 * We count buffer allocation requests so that the bgwriter can estimate 247 * the rate of buffer consumption. Note that buffers recycled by a 248 * strategy object are intentionally not counted here. 249 */ 250 pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1); 251 252 /* 253 * First check, without acquiring the lock, whether there's buffers in the 254 * freelist. Since we otherwise don't require the spinlock in every 255 * StrategyGetBuffer() invocation, it'd be sad to acquire it here - 256 * uselessly in most cases. That obviously leaves a race where a buffer is 257 * put on the freelist but we don't see the store yet - but that's pretty 258 * harmless, it'll just get used during the next buffer acquisition. 259 * 260 * If there's buffers on the freelist, acquire the spinlock to pop one 261 * buffer of the freelist. Then check whether that buffer is usable and 262 * repeat if not. 263 * 264 * Note that the freeNext fields are considered to be protected by the 265 * buffer_strategy_lock not the individual buffer spinlocks, so it's OK to 266 * manipulate them without holding the spinlock. 267 */ 268 if (StrategyControl->firstFreeBuffer >= 0) 269 { 270 while (true) 271 { 272 /* Acquire the spinlock to remove element from the freelist */ 273 SpinLockAcquire(&StrategyControl->buffer_strategy_lock); 274 275 if (StrategyControl->firstFreeBuffer < 0) 276 { 277 SpinLockRelease(&StrategyControl->buffer_strategy_lock); 278 break; 279 } 280 281 buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer); 282 Assert(buf->freeNext != FREENEXT_NOT_IN_LIST); 283 284 /* Unconditionally remove buffer from freelist */ 285 StrategyControl->firstFreeBuffer = buf->freeNext; 286 buf->freeNext = FREENEXT_NOT_IN_LIST; 287 288 /* 289 * Release the lock so someone else can access the freelist while 290 * we check out this buffer. 291 */ 292 SpinLockRelease(&StrategyControl->buffer_strategy_lock); 293 294 /* 295 * If the buffer is pinned or has a nonzero usage_count, we cannot 296 * use it; discard it and retry. (This can only happen if VACUUM 297 * put a valid buffer in the freelist and then someone else used 298 * it before we got to it. It's probably impossible altogether as 299 * of 8.3, but we'd better check anyway.) 300 */ 301 local_buf_state = LockBufHdr(buf); 302 if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0 303 && BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0) 304 { 305 if (strategy != NULL) 306 AddBufferToRing(strategy, buf); 307 *buf_state = local_buf_state; 308 return buf; 309 } 310 UnlockBufHdr(buf, local_buf_state); 311 312 } 313 } 314 315 /* Nothing on the freelist, so run the "clock sweep" algorithm */ 316 trycounter = NBuffers; 317 for (;;) 318 { 319 buf = GetBufferDescriptor(ClockSweepTick()); 320 321 /* 322 * If the buffer is pinned or has a nonzero usage_count, we cannot use 323 * it; decrement the usage_count (unless pinned) and keep scanning. 324 */ 325 local_buf_state = LockBufHdr(buf); 326 327 if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0) 328 { 329 if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0) 330 { 331 local_buf_state -= BUF_USAGECOUNT_ONE; 332 333 trycounter = NBuffers; 334 } 335 else 336 { 337 /* Found a usable buffer */ 338 if (strategy != NULL) 339 AddBufferToRing(strategy, buf); 340 *buf_state = local_buf_state; 341 return buf; 342 } 343 } 344 else if (--trycounter == 0) 345 { 346 /* 347 * We've scanned all the buffers without making any state changes, 348 * so all the buffers are pinned (or were when we looked at them). 349 * We could hope that someone will free one eventually, but it's 350 * probably better to fail than to risk getting stuck in an 351 * infinite loop. 352 */ 353 UnlockBufHdr(buf, local_buf_state); 354 elog(ERROR, "no unpinned buffers available"); 355 } 356 UnlockBufHdr(buf, local_buf_state); 357 } 358 } 359 360 /* 361 * StrategyFreeBuffer: put a buffer on the freelist 362 */ 363 void 364 StrategyFreeBuffer(BufferDesc *buf) 365 { 366 SpinLockAcquire(&StrategyControl->buffer_strategy_lock); 367 368 /* 369 * It is possible that we are told to put something in the freelist that 370 * is already in it; don't screw up the list if so. 371 */ 372 if (buf->freeNext == FREENEXT_NOT_IN_LIST) 373 { 374 buf->freeNext = StrategyControl->firstFreeBuffer; 375 if (buf->freeNext < 0) 376 StrategyControl->lastFreeBuffer = buf->buf_id; 377 StrategyControl->firstFreeBuffer = buf->buf_id; 378 } 379 380 SpinLockRelease(&StrategyControl->buffer_strategy_lock); 381 } 382 383 /* 384 * StrategySyncStart -- tell BufferSync where to start syncing 385 * 386 * The result is the buffer index of the best buffer to sync first. 387 * BufferSync() will proceed circularly around the buffer array from there. 388 * 389 * In addition, we return the completed-pass count (which is effectively 390 * the higher-order bits of nextVictimBuffer) and the count of recent buffer 391 * allocs if non-NULL pointers are passed. The alloc count is reset after 392 * being read. 393 */ 394 int 395 StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc) 396 { 397 uint32 nextVictimBuffer; 398 int result; 399 400 SpinLockAcquire(&StrategyControl->buffer_strategy_lock); 401 nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer); 402 result = nextVictimBuffer % NBuffers; 403 404 if (complete_passes) 405 { 406 *complete_passes = StrategyControl->completePasses; 407 408 /* 409 * Additionally add the number of wraparounds that happened before 410 * completePasses could be incremented. C.f. ClockSweepTick(). 411 */ 412 *complete_passes += nextVictimBuffer / NBuffers; 413 } 414 415 if (num_buf_alloc) 416 { 417 *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0); 418 } 419 SpinLockRelease(&StrategyControl->buffer_strategy_lock); 420 return result; 421 } 422 423 /* 424 * StrategyNotifyBgWriter -- set or clear allocation notification latch 425 * 426 * If bgwprocno isn't -1, the next invocation of StrategyGetBuffer will 427 * set that latch. Pass -1 to clear the pending notification before it 428 * happens. This feature is used by the bgwriter process to wake itself up 429 * from hibernation, and is not meant for anybody else to use. 430 */ 431 void 432 StrategyNotifyBgWriter(int bgwprocno) 433 { 434 /* 435 * We acquire buffer_strategy_lock just to ensure that the store appears 436 * atomic to StrategyGetBuffer. The bgwriter should call this rather 437 * infrequently, so there's no performance penalty from being safe. 438 */ 439 SpinLockAcquire(&StrategyControl->buffer_strategy_lock); 440 StrategyControl->bgwprocno = bgwprocno; 441 SpinLockRelease(&StrategyControl->buffer_strategy_lock); 442 } 443 444 445 /* 446 * StrategyShmemSize 447 * 448 * estimate the size of shared memory used by the freelist-related structures. 449 * 450 * Note: for somewhat historical reasons, the buffer lookup hashtable size 451 * is also determined here. 452 */ 453 Size 454 StrategyShmemSize(void) 455 { 456 Size size = 0; 457 458 /* size of lookup hash table ... see comment in StrategyInitialize */ 459 size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS)); 460 461 /* size of the shared replacement strategy control block */ 462 size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl))); 463 464 return size; 465 } 466 467 /* 468 * StrategyInitialize -- initialize the buffer cache replacement 469 * strategy. 470 * 471 * Assumes: All of the buffers are already built into a linked list. 472 * Only called by postmaster and only during initialization. 473 */ 474 void 475 StrategyInitialize(bool init) 476 { 477 bool found; 478 479 /* 480 * Initialize the shared buffer lookup hashtable. 481 * 482 * Since we can't tolerate running out of lookup table entries, we must be 483 * sure to specify an adequate table size here. The maximum steady-state 484 * usage is of course NBuffers entries, but BufferAlloc() tries to insert 485 * a new entry before deleting the old. In principle this could be 486 * happening in each partition concurrently, so we could need as many as 487 * NBuffers + NUM_BUFFER_PARTITIONS entries. 488 */ 489 InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS); 490 491 /* 492 * Get or create the shared strategy control block 493 */ 494 StrategyControl = (BufferStrategyControl *) 495 ShmemInitStruct("Buffer Strategy Status", 496 sizeof(BufferStrategyControl), 497 &found); 498 499 if (!found) 500 { 501 /* 502 * Only done once, usually in postmaster 503 */ 504 Assert(init); 505 506 SpinLockInit(&StrategyControl->buffer_strategy_lock); 507 508 /* 509 * Grab the whole linked list of free buffers for our strategy. We 510 * assume it was previously set up by InitBufferPool(). 511 */ 512 StrategyControl->firstFreeBuffer = 0; 513 StrategyControl->lastFreeBuffer = NBuffers - 1; 514 515 /* Initialize the clock sweep pointer */ 516 pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0); 517 518 /* Clear statistics */ 519 StrategyControl->completePasses = 0; 520 pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0); 521 522 /* No pending notification */ 523 StrategyControl->bgwprocno = -1; 524 } 525 else 526 Assert(!init); 527 } 528 529 530 /* ---------------------------------------------------------------- 531 * Backend-private buffer ring management 532 * ---------------------------------------------------------------- 533 */ 534 535 536 /* 537 * GetAccessStrategy -- create a BufferAccessStrategy object 538 * 539 * The object is allocated in the current memory context. 540 */ 541 BufferAccessStrategy 542 GetAccessStrategy(BufferAccessStrategyType btype) 543 { 544 BufferAccessStrategy strategy; 545 int ring_size; 546 547 /* 548 * Select ring size to use. See buffer/README for rationales. 549 * 550 * Note: if you change the ring size for BAS_BULKREAD, see also 551 * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c. 552 */ 553 switch (btype) 554 { 555 case BAS_NORMAL: 556 /* if someone asks for NORMAL, just give 'em a "default" object */ 557 return NULL; 558 559 case BAS_BULKREAD: 560 ring_size = 256 * 1024 / BLCKSZ; 561 break; 562 case BAS_BULKWRITE: 563 ring_size = 16 * 1024 * 1024 / BLCKSZ; 564 break; 565 case BAS_VACUUM: 566 ring_size = 256 * 1024 / BLCKSZ; 567 break; 568 569 default: 570 elog(ERROR, "unrecognized buffer access strategy: %d", 571 (int) btype); 572 return NULL; /* keep compiler quiet */ 573 } 574 575 /* Make sure ring isn't an undue fraction of shared buffers */ 576 ring_size = Min(NBuffers / 8, ring_size); 577 578 /* Allocate the object and initialize all elements to zeroes */ 579 strategy = (BufferAccessStrategy) 580 palloc0(offsetof(BufferAccessStrategyData, buffers) + 581 ring_size * sizeof(Buffer)); 582 583 /* Set fields that don't start out zero */ 584 strategy->btype = btype; 585 strategy->ring_size = ring_size; 586 587 return strategy; 588 } 589 590 /* 591 * FreeAccessStrategy -- release a BufferAccessStrategy object 592 * 593 * A simple pfree would do at the moment, but we would prefer that callers 594 * don't assume that much about the representation of BufferAccessStrategy. 595 */ 596 void 597 FreeAccessStrategy(BufferAccessStrategy strategy) 598 { 599 /* don't crash if called on a "default" strategy */ 600 if (strategy != NULL) 601 pfree(strategy); 602 } 603 604 /* 605 * GetBufferFromRing -- returns a buffer from the ring, or NULL if the 606 * ring is empty. 607 * 608 * The bufhdr spin lock is held on the returned buffer. 609 */ 610 static BufferDesc * 611 GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state) 612 { 613 BufferDesc *buf; 614 Buffer bufnum; 615 uint32 local_buf_state; /* to avoid repeated (de-)referencing */ 616 617 618 /* Advance to next ring slot */ 619 if (++strategy->current >= strategy->ring_size) 620 strategy->current = 0; 621 622 /* 623 * If the slot hasn't been filled yet, tell the caller to allocate a new 624 * buffer with the normal allocation strategy. He will then fill this 625 * slot by calling AddBufferToRing with the new buffer. 626 */ 627 bufnum = strategy->buffers[strategy->current]; 628 if (bufnum == InvalidBuffer) 629 { 630 strategy->current_was_in_ring = false; 631 return NULL; 632 } 633 634 /* 635 * If the buffer is pinned we cannot use it under any circumstances. 636 * 637 * If usage_count is 0 or 1 then the buffer is fair game (we expect 1, 638 * since our own previous usage of the ring element would have left it 639 * there, but it might've been decremented by clock sweep since then). A 640 * higher usage_count indicates someone else has touched the buffer, so we 641 * shouldn't re-use it. 642 */ 643 buf = GetBufferDescriptor(bufnum - 1); 644 local_buf_state = LockBufHdr(buf); 645 if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0 646 && BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1) 647 { 648 strategy->current_was_in_ring = true; 649 *buf_state = local_buf_state; 650 return buf; 651 } 652 UnlockBufHdr(buf, local_buf_state); 653 654 /* 655 * Tell caller to allocate a new buffer with the normal allocation 656 * strategy. He'll then replace this ring element via AddBufferToRing. 657 */ 658 strategy->current_was_in_ring = false; 659 return NULL; 660 } 661 662 /* 663 * AddBufferToRing -- add a buffer to the buffer ring 664 * 665 * Caller must hold the buffer header spinlock on the buffer. Since this 666 * is called with the spinlock held, it had better be quite cheap. 667 */ 668 static void 669 AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf) 670 { 671 strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf); 672 } 673 674 /* 675 * StrategyRejectBuffer -- consider rejecting a dirty buffer 676 * 677 * When a nondefault strategy is used, the buffer manager calls this function 678 * when it turns out that the buffer selected by StrategyGetBuffer needs to 679 * be written out and doing so would require flushing WAL too. This gives us 680 * a chance to choose a different victim. 681 * 682 * Returns true if buffer manager should ask for a new victim, and false 683 * if this buffer should be written and re-used. 684 */ 685 bool 686 StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf) 687 { 688 /* We only do this in bulkread mode */ 689 if (strategy->btype != BAS_BULKREAD) 690 return false; 691 692 /* Don't muck with behavior of normal buffer-replacement strategy */ 693 if (!strategy->current_was_in_ring || 694 strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf)) 695 return false; 696 697 /* 698 * Remove the dirty buffer from the ring; necessary to prevent infinite 699 * loop if all ring members are dirty. 700 */ 701 strategy->buffers[strategy->current] = InvalidBuffer; 702 703 return true; 704 } 705