1 /*------------------------------------------------------------------------- 2 * 3 * buf_internals.h 4 * Internal definitions for buffer manager and the buffer replacement 5 * strategy. 6 * 7 * 8 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group 9 * Portions Copyright (c) 1994, Regents of the University of California 10 * 11 * src/include/storage/buf_internals.h 12 * 13 *------------------------------------------------------------------------- 14 */ 15 #ifndef BUFMGR_INTERNALS_H 16 #define BUFMGR_INTERNALS_H 17 18 #include "port/atomics.h" 19 #include "storage/buf.h" 20 #include "storage/bufmgr.h" 21 #include "storage/condition_variable.h" 22 #include "storage/latch.h" 23 #include "storage/lwlock.h" 24 #include "storage/shmem.h" 25 #include "storage/smgr.h" 26 #include "storage/spin.h" 27 #include "utils/relcache.h" 28 29 /* 30 * Buffer state is a single 32-bit variable where following data is combined. 31 * 32 * - 18 bits refcount 33 * - 4 bits usage count 34 * - 10 bits of flags 35 * 36 * Combining these values allows to perform some operations without locking 37 * the buffer header, by modifying them together with a CAS loop. 38 * 39 * The definition of buffer state components is below. 40 */ 41 #define BUF_REFCOUNT_ONE 1 42 #define BUF_REFCOUNT_MASK ((1U << 18) - 1) 43 #define BUF_USAGECOUNT_MASK 0x003C0000U 44 #define BUF_USAGECOUNT_ONE (1U << 18) 45 #define BUF_USAGECOUNT_SHIFT 18 46 #define BUF_FLAG_MASK 0xFFC00000U 47 48 /* Get refcount and usagecount from buffer state */ 49 #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK) 50 #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT) 51 52 /* 53 * Flags for buffer descriptors 54 * 55 * Note: BM_TAG_VALID essentially means that there is a buffer hashtable 56 * entry associated with the buffer's tag. 57 */ 58 #define BM_LOCKED (1U << 22) /* buffer header is locked */ 59 #define BM_DIRTY (1U << 23) /* data needs writing */ 60 #define BM_VALID (1U << 24) /* data is valid */ 61 #define BM_TAG_VALID (1U << 25) /* tag is assigned */ 62 #define BM_IO_IN_PROGRESS (1U << 26) /* read or write in progress */ 63 #define BM_IO_ERROR (1U << 27) /* previous I/O failed */ 64 #define BM_JUST_DIRTIED (1U << 28) /* dirtied since write started */ 65 #define BM_PIN_COUNT_WAITER (1U << 29) /* have waiter for sole pin */ 66 #define BM_CHECKPOINT_NEEDED (1U << 30) /* must write for checkpoint */ 67 #define BM_PERMANENT (1U << 31) /* permanent buffer (not unlogged, 68 * or init fork) */ 69 /* 70 * The maximum allowed value of usage_count represents a tradeoff between 71 * accuracy and speed of the clock-sweep buffer management algorithm. A 72 * large value (comparable to NBuffers) would approximate LRU semantics. 73 * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of 74 * clock sweeps to find a free buffer, so in practice we don't want the 75 * value to be very large. 76 */ 77 #define BM_MAX_USAGE_COUNT 5 78 79 /* 80 * Buffer tag identifies which disk block the buffer contains. 81 * 82 * Note: the BufferTag data must be sufficient to determine where to write the 83 * block, without reference to pg_class or pg_tablespace entries. It's 84 * possible that the backend flushing the buffer doesn't even believe the 85 * relation is visible yet (its xact may have started before the xact that 86 * created the rel). The storage manager must be able to cope anyway. 87 * 88 * Note: if there's any pad bytes in the struct, INIT_BUFFERTAG will have 89 * to be fixed to zero them, since this struct is used as a hash key. 90 */ 91 typedef struct buftag 92 { 93 RelFileNode rnode; /* physical relation identifier */ 94 ForkNumber forkNum; 95 BlockNumber blockNum; /* blknum relative to begin of reln */ 96 } BufferTag; 97 98 #define CLEAR_BUFFERTAG(a) \ 99 ( \ 100 (a).rnode.spcNode = InvalidOid, \ 101 (a).rnode.dbNode = InvalidOid, \ 102 (a).rnode.relNode = InvalidOid, \ 103 (a).forkNum = InvalidForkNumber, \ 104 (a).blockNum = InvalidBlockNumber \ 105 ) 106 107 #define INIT_BUFFERTAG(a,xx_rnode,xx_forkNum,xx_blockNum) \ 108 ( \ 109 (a).rnode = (xx_rnode), \ 110 (a).forkNum = (xx_forkNum), \ 111 (a).blockNum = (xx_blockNum) \ 112 ) 113 114 #define BUFFERTAGS_EQUAL(a,b) \ 115 ( \ 116 RelFileNodeEquals((a).rnode, (b).rnode) && \ 117 (a).blockNum == (b).blockNum && \ 118 (a).forkNum == (b).forkNum \ 119 ) 120 121 /* 122 * The shared buffer mapping table is partitioned to reduce contention. 123 * To determine which partition lock a given tag requires, compute the tag's 124 * hash code with BufTableHashCode(), then apply BufMappingPartitionLock(). 125 * NB: NUM_BUFFER_PARTITIONS must be a power of 2! 126 */ 127 #define BufTableHashPartition(hashcode) \ 128 ((hashcode) % NUM_BUFFER_PARTITIONS) 129 #define BufMappingPartitionLock(hashcode) \ 130 (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \ 131 BufTableHashPartition(hashcode)].lock) 132 #define BufMappingPartitionLockByIndex(i) \ 133 (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock) 134 135 /* 136 * BufferDesc -- shared descriptor/state data for a single shared buffer. 137 * 138 * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change 139 * the tag, state or wait_backend_pid fields. In general, buffer header lock 140 * is a spinlock which is combined with flags, refcount and usagecount into 141 * single atomic variable. This layout allow us to do some operations in a 142 * single atomic operation, without actually acquiring and releasing spinlock; 143 * for instance, increase or decrease refcount. buf_id field never changes 144 * after initialization, so does not need locking. freeNext is protected by 145 * the buffer_strategy_lock not buffer header lock. The LWLock can take care 146 * of itself. The buffer header lock is *not* used to control access to the 147 * data in the buffer! 148 * 149 * It's assumed that nobody changes the state field while buffer header lock 150 * is held. Thus buffer header lock holder can do complex updates of the 151 * state variable in single write, simultaneously with lock release (cleaning 152 * BM_LOCKED flag). On the other hand, updating of state without holding 153 * buffer header lock is restricted to CAS, which insure that BM_LOCKED flag 154 * is not set. Atomic increment/decrement, OR/AND etc. are not allowed. 155 * 156 * An exception is that if we have the buffer pinned, its tag can't change 157 * underneath us, so we can examine the tag without locking the buffer header. 158 * Also, in places we do one-time reads of the flags without bothering to 159 * lock the buffer header; this is generally for situations where we don't 160 * expect the flag bit being tested to be changing. 161 * 162 * We can't physically remove items from a disk page if another backend has 163 * the buffer pinned. Hence, a backend may need to wait for all other pins 164 * to go away. This is signaled by storing its own PID into 165 * wait_backend_pid and setting flag bit BM_PIN_COUNT_WAITER. At present, 166 * there can be only one such waiter per buffer. 167 * 168 * We use this same struct for local buffer headers, but the locks are not 169 * used and not all of the flag bits are useful either. To avoid unnecessary 170 * overhead, manipulations of the state field should be done without actual 171 * atomic operations (i.e. only pg_atomic_read_u32() and 172 * pg_atomic_unlocked_write_u32()). 173 * 174 * Be careful to avoid increasing the size of the struct when adding or 175 * reordering members. Keeping it below 64 bytes (the most common CPU 176 * cache line size) is fairly important for performance. 177 * 178 * Per-buffer I/O condition variables are currently kept outside this struct in 179 * a separate array. They could be moved in here and still fit within that 180 * limit on common systems, but for now that is not done. 181 */ 182 typedef struct BufferDesc 183 { 184 BufferTag tag; /* ID of page contained in buffer */ 185 int buf_id; /* buffer's index number (from 0) */ 186 187 /* state of the tag, containing flags, refcount and usagecount */ 188 pg_atomic_uint32 state; 189 190 int wait_backend_pid; /* backend PID of pin-count waiter */ 191 int freeNext; /* link in freelist chain */ 192 LWLock content_lock; /* to lock access to buffer contents */ 193 } BufferDesc; 194 195 /* 196 * Concurrent access to buffer headers has proven to be more efficient if 197 * they're cache line aligned. So we force the start of the BufferDescriptors 198 * array to be on a cache line boundary and force the elements to be cache 199 * line sized. 200 * 201 * XXX: As this is primarily matters in highly concurrent workloads which 202 * probably all are 64bit these days, and the space wastage would be a bit 203 * more noticeable on 32bit systems, we don't force the stride to be cache 204 * line sized on those. If somebody does actual performance testing, we can 205 * reevaluate. 206 * 207 * Note that local buffer descriptors aren't forced to be aligned - as there's 208 * no concurrent access to those it's unlikely to be beneficial. 209 * 210 * We use a 64-byte cache line size here, because that's the most common 211 * size. Making it bigger would be a waste of memory. Even if running on a 212 * platform with either 32 or 128 byte line sizes, it's good to align to 213 * boundaries and avoid false sharing. 214 */ 215 #define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1) 216 217 typedef union BufferDescPadded 218 { 219 BufferDesc bufferdesc; 220 char pad[BUFFERDESC_PAD_TO_SIZE]; 221 } BufferDescPadded; 222 223 #define GetBufferDescriptor(id) (&BufferDescriptors[(id)].bufferdesc) 224 #define GetLocalBufferDescriptor(id) (&LocalBufferDescriptors[(id)]) 225 226 #define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1) 227 228 #define BufferDescriptorGetIOCV(bdesc) \ 229 (&(BufferIOCVArray[(bdesc)->buf_id]).cv) 230 #define BufferDescriptorGetContentLock(bdesc) \ 231 ((LWLock*) (&(bdesc)->content_lock)) 232 233 extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray; 234 235 /* 236 * The freeNext field is either the index of the next freelist entry, 237 * or one of these special values: 238 */ 239 #define FREENEXT_END_OF_LIST (-1) 240 #define FREENEXT_NOT_IN_LIST (-2) 241 242 /* 243 * Functions for acquiring/releasing a shared buffer header's spinlock. Do 244 * not apply these to local buffers! 245 */ 246 extern uint32 LockBufHdr(BufferDesc *desc); 247 #define UnlockBufHdr(desc, s) \ 248 do { \ 249 pg_write_barrier(); \ 250 pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \ 251 } while (0) 252 253 254 /* 255 * The PendingWriteback & WritebackContext structure are used to keep 256 * information about pending flush requests to be issued to the OS. 257 */ 258 typedef struct PendingWriteback 259 { 260 /* could store different types of pending flushes here */ 261 BufferTag tag; 262 } PendingWriteback; 263 264 /* struct forward declared in bufmgr.h */ 265 typedef struct WritebackContext 266 { 267 /* pointer to the max number of writeback requests to coalesce */ 268 int *max_pending; 269 270 /* current number of pending writeback requests */ 271 int nr_pending; 272 273 /* pending requests */ 274 PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]; 275 } WritebackContext; 276 277 /* in buf_init.c */ 278 extern PGDLLIMPORT BufferDescPadded *BufferDescriptors; 279 extern PGDLLIMPORT WritebackContext BackendWritebackContext; 280 281 /* in localbuf.c */ 282 extern BufferDesc *LocalBufferDescriptors; 283 284 /* in bufmgr.c */ 285 286 /* 287 * Structure to sort buffers per file on checkpoints. 288 * 289 * This structure is allocated per buffer in shared memory, so it should be 290 * kept as small as possible. 291 */ 292 typedef struct CkptSortItem 293 { 294 Oid tsId; 295 Oid relNode; 296 ForkNumber forkNum; 297 BlockNumber blockNum; 298 int buf_id; 299 } CkptSortItem; 300 301 extern CkptSortItem *CkptBufferIds; 302 303 /* 304 * Internal buffer management routines 305 */ 306 /* bufmgr.c */ 307 extern void WritebackContextInit(WritebackContext *context, int *max_pending); 308 extern void IssuePendingWritebacks(WritebackContext *context); 309 extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag); 310 311 /* freelist.c */ 312 extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy, 313 uint32 *buf_state); 314 extern void StrategyFreeBuffer(BufferDesc *buf); 315 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy, 316 BufferDesc *buf); 317 318 extern int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc); 319 extern void StrategyNotifyBgWriter(int bgwprocno); 320 321 extern Size StrategyShmemSize(void); 322 extern void StrategyInitialize(bool init); 323 extern bool have_free_buffer(void); 324 325 /* buf_table.c */ 326 extern Size BufTableShmemSize(int size); 327 extern void InitBufTable(int size); 328 extern uint32 BufTableHashCode(BufferTag *tagPtr); 329 extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode); 330 extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id); 331 extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode); 332 333 /* localbuf.c */ 334 extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, 335 ForkNumber forkNum, 336 BlockNumber blockNum); 337 extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, 338 BlockNumber blockNum, bool *foundPtr); 339 extern void MarkLocalBufferDirty(Buffer buffer); 340 extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum, 341 BlockNumber firstDelBlock); 342 extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode); 343 extern void AtEOXact_LocalBuffers(bool isCommit); 344 345 #endif /* BUFMGR_INTERNALS_H */ 346