/*-------------------------------------------------------------------------
 *
 * buf_internals.h
 *	  Internal definitions for buffer manager and the buffer replacement
 *	  strategy.
 *
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/storage/buf_internals.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef BUFMGR_INTERNALS_H
#define BUFMGR_INTERNALS_H

#include "storage/buf.h"
#include "storage/bufmgr.h"
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
#include "port/atomics.h"
#include "storage/spin.h"
#include "utils/relcache.h"


/*
 * Buffer state is a single 32-bit variable into which the following data
 * is combined:
 *
 * - 18 bits refcount
 * - 4 bits usage count
 * - 10 bits of flags
 *
 * Combining these values allows some operations to be performed without
 * locking the buffer header, by modifying them together with a CAS loop.
 *
 * The definition of buffer state components is below.
 */
#define BUF_REFCOUNT_ONE 1
#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
#define BUF_USAGECOUNT_MASK 0x003C0000U
#define BUF_USAGECOUNT_ONE (1U << 18)
#define BUF_USAGECOUNT_SHIFT 18
#define BUF_FLAG_MASK 0xFFC00000U

/* Get refcount and usagecount from buffer state */
#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
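
/*
 * Illustrative sketch (not part of the original header): with the masks
 * above, both counters and the flag bits can be decoded from one unlocked
 * read of the state word.  The descriptor variable "bufHdr" is hypothetical.
 *
 *		uint32	buf_state = pg_atomic_read_u32(&bufHdr->state);
 *		uint32	refcount = BUF_STATE_GET_REFCOUNT(buf_state);
 *		uint32	usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
 *		uint32	flags = buf_state & BUF_FLAG_MASK;
 */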

/*
 * Flags for buffer descriptors
 *
 * Note: TAG_VALID essentially means that there is a buffer hashtable
 * entry associated with the buffer's tag.
 */
#define BM_LOCKED				(1U << 22)	/* buffer header is locked */
#define BM_DIRTY				(1U << 23)	/* data needs writing */
#define BM_VALID				(1U << 24)	/* data is valid */
#define BM_TAG_VALID			(1U << 25)	/* tag is assigned */
#define BM_IO_IN_PROGRESS		(1U << 26)	/* read or write in progress */
#define BM_IO_ERROR				(1U << 27)	/* previous I/O failed */
#define BM_JUST_DIRTIED			(1U << 28)	/* dirtied since write started */
#define BM_PIN_COUNT_WAITER		(1U << 29)	/* have waiter for sole pin */
#define BM_CHECKPOINT_NEEDED	(1U << 30)	/* must write for checkpoint */
#define BM_PERMANENT			(1U << 31)	/* permanent buffer (not unlogged,
											 * or init fork) */
/*
 * The maximum allowed value of usage_count represents a tradeoff between
 * accuracy and speed of the clock-sweep buffer management algorithm.  A
 * large value (comparable to NBuffers) would approximate LRU semantics.
 * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
 * clock sweeps to find a free buffer, so in practice we don't want the
 * value to be very large.
 */
#define BM_MAX_USAGE_COUNT	5
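
/*
 * Illustrative sketch (not part of the original header): the clock sweep in
 * freelist.c ticks a candidate's usage count down roughly as below; only a
 * buffer whose refcount and usage count are both zero is chosen as a victim.
 * This is a simplified sketch ("buf" is hypothetical); the real loop also
 * handles strategy rings and pinned buffers.
 *
 *		uint32	buf_state = LockBufHdr(buf);
 *
 *		if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
 *			BUF_STATE_GET_USAGECOUNT(buf_state) != 0)
 *			buf_state -= BUF_USAGECOUNT_ONE;	// spare it this cycle
 *		UnlockBufHdr(buf, buf_state);
 */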

/*
 * Buffer tag identifies which disk block the buffer contains.
 *
 * Note: the BufferTag data must be sufficient to determine where to write the
 * block, without reference to pg_class or pg_tablespace entries.  It's
 * possible that the backend flushing the buffer doesn't even believe the
 * relation is visible yet (its xact may have started before the xact that
 * created the rel).  The storage manager must be able to cope anyway.
 *
 * Note: if there are any pad bytes in the struct, INIT_BUFFERTAG will have
 * to be fixed to zero them, since this struct is used as a hash key.
 */
typedef struct buftag
{
	RelFileNode rnode;			/* physical relation identifier */
	ForkNumber	forkNum;
	BlockNumber blockNum;		/* blknum relative to begin of reln */
} BufferTag;

#define CLEAR_BUFFERTAG(a) \
( \
	(a).rnode.spcNode = InvalidOid, \
	(a).rnode.dbNode = InvalidOid, \
	(a).rnode.relNode = InvalidOid, \
	(a).forkNum = InvalidForkNumber, \
	(a).blockNum = InvalidBlockNumber \
)

#define INIT_BUFFERTAG(a,xx_rnode,xx_forkNum,xx_blockNum) \
( \
	(a).rnode = (xx_rnode), \
	(a).forkNum = (xx_forkNum), \
	(a).blockNum = (xx_blockNum) \
)

#define BUFFERTAGS_EQUAL(a,b) \
( \
	RelFileNodeEquals((a).rnode, (b).rnode) && \
	(a).blockNum == (b).blockNum && \
	(a).forkNum == (b).forkNum \
)
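
/*
 * Illustrative sketch (not part of the original header): filling in and
 * comparing tags, roughly as done in bufmgr.c's buffer allocation path.
 * The "smgr", "forkNum", "blockNum" and "bufHdr" names stand for
 * hypothetical caller-supplied values.
 *
 *		BufferTag	newTag;
 *
 *		INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
 *		if (BUFFERTAGS_EQUAL(newTag, bufHdr->tag))
 *			... the buffer already holds the desired block ...
 */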

/*
 * The shared buffer mapping table is partitioned to reduce contention.
 * To determine which partition lock a given tag requires, compute the tag's
 * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
 * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
 */
#define BufTableHashPartition(hashcode) \
	((hashcode) % NUM_BUFFER_PARTITIONS)
#define BufMappingPartitionLock(hashcode) \
	(&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
		BufTableHashPartition(hashcode)].lock)
#define BufMappingPartitionLockByIndex(i) \
	(&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock)
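
/*
 * Illustrative sketch (not part of the original header): the lookup sequence
 * described above, with hypothetical variable names.  The real code in
 * bufmgr.c pins the found buffer before releasing the partition lock, so the
 * buffer cannot be evicted in between.
 *
 *		uint32	newHash = BufTableHashCode(&newTag);
 *		LWLock *newPartitionLock = BufMappingPartitionLock(newHash);
 *		int		buf_id;
 *
 *		LWLockAcquire(newPartitionLock, LW_SHARED);
 *		buf_id = BufTableLookup(&newTag, newHash);	// -1 if not present
 *		...
 *		LWLockRelease(newPartitionLock);
 */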

/*
 *	BufferDesc -- shared descriptor/state data for a single shared buffer.
 *
 * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
 * the tag, state or wait_backend_pid fields.  In general, buffer header lock
 * is a spinlock which is combined with flags, refcount and usagecount into
 * single atomic variable.  This layout allows us to do some operations in a
 * single atomic operation, without actually acquiring and releasing the
 * spinlock; for instance, increasing or decreasing the refcount.  The buf_id
 * field never changes after initialization, so does not need locking.
 * freeNext is protected by the buffer_strategy_lock, not the buffer header
 * lock.  The LWLock can take care of itself.  The buffer header lock is
 * *not* used to control access to the data in the buffer!
 *
 * It's assumed that nobody changes the state field while buffer header lock
 * is held.  Thus the buffer header lock holder can do complex updates of the
 * state variable in a single write, simultaneously with lock release
 * (clearing the BM_LOCKED flag).  On the other hand, updating of state
 * without holding the buffer header lock is restricted to CAS, which ensures
 * that the BM_LOCKED flag is not set.  Atomic increment/decrement, OR/AND
 * etc. are not allowed.
 *
 * An exception is that if we have the buffer pinned, its tag can't change
 * underneath us, so we can examine the tag without locking the buffer header.
 * Also, in places we do one-time reads of the flags without bothering to
 * lock the buffer header; this is generally for situations where we don't
 * expect the flag bit being tested to be changing.
 *
 * We can't physically remove items from a disk page if another backend has
 * the buffer pinned.  Hence, a backend may need to wait for all other pins
 * to go away.  This is signaled by storing its own PID into
 * wait_backend_pid and setting flag bit BM_PIN_COUNT_WAITER.  At present,
 * there can be only one such waiter per buffer.
 *
 * We use this same struct for local buffer headers, but the locks are not
 * used and not all of the flag bits are useful either. To avoid unnecessary
 * overhead, manipulations of the state field should be done without actual
 * atomic operations (i.e. only pg_atomic_read_u32() and
 * pg_atomic_unlocked_write_u32()).
 *
 * Be careful to avoid increasing the size of the struct when adding or
 * reordering members.  Keeping it below 64 bytes (the most common CPU
 * cache line size) is fairly important for performance.
 */
typedef struct BufferDesc
{
	BufferTag	tag;			/* ID of page contained in buffer */
	int			buf_id;			/* buffer's index number (from 0) */

	/* state of the tag, containing flags, refcount and usagecount */
	pg_atomic_uint32 state;

	int			wait_backend_pid;	/* backend PID of pin-count waiter */
	int			freeNext;		/* link in freelist chain */

	LWLock		content_lock;	/* to lock access to buffer contents */
} BufferDesc;
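
/*
 * Illustrative sketch (not part of the original header): pinning a buffer by
 * updating the state word with a CAS loop, as described above, without
 * taking the header spinlock.  This is a simplification of what bufmgr.c
 * does ("buf" is hypothetical); the real code waits more politely when it
 * sees BM_LOCKED set.
 *
 *		uint32	old_state = pg_atomic_read_u32(&buf->state);
 *
 *		for (;;)
 *		{
 *			uint32	new_state;
 *
 *			if (old_state & BM_LOCKED)
 *			{
 *				// must not CAS while the header spinlock is held
 *				old_state = pg_atomic_read_u32(&buf->state);
 *				continue;
 *			}
 *			new_state = old_state + BUF_REFCOUNT_ONE;
 *			if (BUF_STATE_GET_USAGECOUNT(new_state) < BM_MAX_USAGE_COUNT)
 *				new_state += BUF_USAGECOUNT_ONE;
 *			if (pg_atomic_compare_exchange_u32(&buf->state,
 *											   &old_state, new_state))
 *				break;		// on failure old_state has been refreshed
 *		}
 */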

/*
 * Concurrent access to buffer headers has proven to be more efficient if
 * they're cache line aligned. So we force the start of the BufferDescriptors
 * array to be on a cache line boundary and force the elements to be cache
 * line sized.
 *
 * XXX: As this primarily matters in highly concurrent workloads, which these
 * days probably all run on 64-bit hardware, and the space wastage would be
 * more noticeable on 32-bit systems, we don't force the stride to be cache
 * line sized on those. If somebody does actual performance testing, we can
 * reevaluate.
 *
 * Note that local buffer descriptors aren't forced to be aligned - as there's
 * no concurrent access to those it's unlikely to be beneficial.
 *
 * We use a 64-byte cache line size here, because that's the most common
 * size. Making it bigger would be a waste of memory. Even if running on a
 * platform with either 32 or 128 byte line sizes, it's good to align to
 * boundaries and avoid false sharing.
 */
#define BUFFERDESC_PAD_TO_SIZE	(SIZEOF_VOID_P == 8 ? 64 : 1)

typedef union BufferDescPadded
{
	BufferDesc	bufferdesc;
	char		pad[BUFFERDESC_PAD_TO_SIZE];
} BufferDescPadded;

#define GetBufferDescriptor(id) (&BufferDescriptors[(id)].bufferdesc)
#define GetLocalBufferDescriptor(id) (&LocalBufferDescriptors[(id)])

#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)

#define BufferDescriptorGetIOLock(bdesc) \
	(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
#define BufferDescriptorGetContentLock(bdesc) \
	((LWLock*) (&(bdesc)->content_lock))

extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
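
/*
 * Illustrative sketch (not part of the original header): translating between
 * a descriptor, its Buffer number, and its content lock ("buf_id" is a
 * hypothetical valid shared buffer index).
 *
 *		BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
 *		Buffer		buffer = BufferDescriptorGetBuffer(bufHdr);
 *
 *		LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
 *		... examine the page via BufferGetPage(buffer) ...
 *		LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
 */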

/*
 * The freeNext field is either the index of the next freelist entry,
 * or one of these special values:
 */
#define FREENEXT_END_OF_LIST	(-1)
#define FREENEXT_NOT_IN_LIST	(-2)

/*
 * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
 * not apply these to local buffers!
 */
extern uint32 LockBufHdr(BufferDesc *desc);
#define UnlockBufHdr(desc, s)	\
	do {	\
		pg_write_barrier(); \
		pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \
	} while (0)
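
/*
 * Illustrative sketch (not part of the original header): the usual pattern
 * for state updates that do require the header spinlock ("bufHdr" is
 * hypothetical).  UnlockBufHdr() writes back the modified state word and
 * clears BM_LOCKED in the same store.
 *
 *		uint32	buf_state = LockBufHdr(bufHdr);
 *
 *		buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
 *		UnlockBufHdr(bufHdr, buf_state);
 */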


/*
 * The PendingWriteback & WritebackContext structures are used to keep
 * information about pending flush requests to be issued to the OS.
 */
typedef struct PendingWriteback
{
	/* could store different types of pending flushes here */
	BufferTag	tag;
} PendingWriteback;

/* struct forward declared in bufmgr.h */
typedef struct WritebackContext
{
	/* pointer to the max number of writeback requests to coalesce */
	int		   *max_pending;

	/* current number of pending writeback requests */
	int			nr_pending;

	/* pending requests */
	PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
} WritebackContext;

/* in buf_init.c */
extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
extern PGDLLIMPORT WritebackContext BackendWritebackContext;

/* in localbuf.c */
extern BufferDesc *LocalBufferDescriptors;

/* in bufmgr.c */

/*
 * Structure used to sort buffers per file during checkpoints.
 *
 * This structure is allocated per buffer in shared memory, so it should be
 * kept as small as possible.
 */
typedef struct CkptSortItem
{
	Oid			tsId;
	Oid			relNode;
	ForkNumber	forkNum;
	BlockNumber blockNum;
	int			buf_id;
} CkptSortItem;

extern CkptSortItem *CkptBufferIds;
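
/*
 * Illustrative sketch (not part of the original header): checkpointing code
 * in bufmgr.c fills one CkptSortItem per dirty buffer and sorts the array so
 * that blocks of the same file come out in order.  A comparator along these
 * lines (hypothetical name, hedged simplification) captures the intended
 * ordering:
 *
 *		static int
 *		ckpt_item_cmp(const CkptSortItem *a, const CkptSortItem *b)
 *		{
 *			if (a->tsId != b->tsId)
 *				return (a->tsId < b->tsId) ? -1 : 1;
 *			if (a->relNode != b->relNode)
 *				return (a->relNode < b->relNode) ? -1 : 1;
 *			if (a->forkNum != b->forkNum)
 *				return (a->forkNum < b->forkNum) ? -1 : 1;
 *			if (a->blockNum != b->blockNum)
 *				return (a->blockNum < b->blockNum) ? -1 : 1;
 *			return 0;
 *		}
 */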

/*
 * Internal buffer management routines
 */
/* bufmgr.c */
extern void WritebackContextInit(WritebackContext *context, int *max_pending);
extern void IssuePendingWritebacks(WritebackContext *context);
extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
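
/*
 * Illustrative sketch (not part of the original header): accumulating and
 * issuing writeback hints.  checkpoint_flush_after is used here only as an
 * example source for *max_pending; the other names are hypothetical.
 *
 *		WritebackContext wb_context;
 *
 *		WritebackContextInit(&wb_context, &checkpoint_flush_after);
 *		...
 *		ScheduleBufferTagForWriteback(&wb_context, &bufHdr->tag);
 *		...
 *		IssuePendingWritebacks(&wb_context);	// flush anything still queued
 */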

/* freelist.c */
extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
				  uint32 *buf_state);
extern void StrategyFreeBuffer(BufferDesc *buf);
extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
					 BufferDesc *buf);

extern int	StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
extern void StrategyNotifyBgWriter(int bgwprocno);

extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);

/* buf_table.c */
extern Size BufTableShmemSize(int size);
extern void InitBufTable(int size);
extern uint32 BufTableHashCode(BufferTag *tagPtr);
extern int	BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
extern int	BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);

/* localbuf.c */
extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
					BlockNumber blockNum);
extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
				 BlockNumber blockNum, bool *foundPtr);
extern void MarkLocalBufferDirty(Buffer buffer);
extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
							BlockNumber firstDelBlock);
extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode);
extern void AtEOXact_LocalBuffers(bool isCommit);

#endif							/* BUFMGR_INTERNALS_H */