/*-------------------------------------------------------------------------
 *
 * visibilitymap.c
 *	  bitmap for tracking visibility of heap tuples
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/visibilitymap.c
 *
 * INTERFACE ROUTINES
 *		visibilitymap_clear  - clear bits for one page in the visibility map
 *		visibilitymap_pin	 - pin a map page for setting a bit
 *		visibilitymap_pin_ok - check whether correct map page is already pinned
 *		visibilitymap_set	 - set a bit in a previously pinned page
 *		visibilitymap_get_status - get status of bits
 *		visibilitymap_count  - count number of bits set in visibility map
 *		visibilitymap_truncate	- truncate the visibility map
 *
 * NOTES
 *
 * The visibility map is a bitmap with two bits (all-visible and all-frozen)
 * per heap page. A set all-visible bit means that all tuples on the page are
 * known visible to all transactions, and therefore the page doesn't need to
 * be vacuumed. A set all-frozen bit means that all tuples on the page are
 * completely frozen, and therefore the page doesn't need to be vacuumed even
 * when a whole-table-scanning vacuum is required (e.g. an anti-wraparound
 * vacuum). The all-frozen bit must be set only when the page is already
 * all-visible.
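 *
 * Within the map, the two bits for a heap page are adjacent: the low bit of
 * the pair is the all-visible bit and the high bit is the all-frozen bit
 * (the VISIBILITYMAP_ALL_VISIBLE and VISIBILITYMAP_ALL_FROZEN flags from
 * access/visibilitymap.h).  For example, a pair reading binary 11 means the
 * page is both all-visible and all-frozen, while 01 means all-visible only;
 * 10 should never occur, per the rule above.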
 *
 * The map is conservative in the sense that we make sure that whenever a bit
 * is set, we know the condition is true, but if a bit is not set, it might or
 * might not be true.
 *
 * Clearing visibility map bits is not separately WAL-logged.  The callers
 * must make sure that whenever a bit is cleared, the bit is cleared on WAL
 * replay of the updating operation as well.
 *
 * When we *set* a visibility map bit during VACUUM, we must write WAL.  This
 * may seem counterintuitive, since the bit is basically a hint: if it is
 * clear, it may still be the case that every tuple on the page is visible to
 * all transactions; we just don't know that for certain.  The difficulty is
 * that there are two bits which are typically set together: the
 * PD_ALL_VISIBLE bit on the page itself, and the visibility map bit.  If a
 * crash occurs after the visibility map page makes it to disk and before the
 * updated heap page makes it to disk, redo must set the bit on the heap page.
 * Otherwise, the next insert, update, or delete on the heap page will fail to
 * realize that the visibility map bit must be cleared, possibly causing
 * index-only scans to return wrong answers.
 *
 * VACUUM will normally skip pages for which the visibility map bit is set;
 * such pages can't contain any dead tuples and therefore don't need vacuuming.
 *
 * LOCKING
 *
 * In heapam.c, whenever a page is modified so that not all tuples on the
 * page are visible to everyone anymore, the corresponding bit in the
 * visibility map is cleared. In order to be crash-safe, we need to do this
 * while still holding a lock on the heap page and in the same critical
 * section that logs the page modification. However, we don't want to hold
 * the buffer lock over any I/O that may be required to read in the visibility
 * map page.  To avoid this, we examine the heap page before locking it;
 * if the page-level PD_ALL_VISIBLE bit is set, we pin the visibility map
 * page.  Then, we lock the buffer.  But this creates a race condition: there
 * is a possibility that in the time it takes to lock the buffer, the
 * PD_ALL_VISIBLE bit gets set.  If that happens, we have to unlock the
 * buffer, pin the visibility map page, and relock the buffer.  This shouldn't
 * happen often, because only VACUUM currently sets visibility map bits,
 * and the race will only occur if VACUUM processes a given page at almost
 * exactly the same time that someone tries to further modify it.
 *
 * To set a bit, you need to hold a lock on the heap page. That prevents
 * the race condition where VACUUM sees that all tuples on the page are
 * visible to everyone, but another backend modifies the page before VACUUM
 * sets the bit in the visibility map.
 *
 * When a bit is set, the LSN of the visibility map page is updated to make
 * sure that the visibility map update doesn't get written to disk before the
 * WAL record of the changes that made it possible to set the bit is flushed.
 * But when a bit is cleared, we don't have to do that because it's always
 * safe to clear a bit in the map from a correctness point of view.
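 *
 * A rough caller-side sketch of the bit-clearing protocol described above
 * (illustrative only; heapbuf and heapblk are placeholders, and the real
 * callers live in heapam.c):
 *
 *		vmbuffer = InvalidBuffer;
 *		if (PageIsAllVisible(BufferGetPage(heapbuf)))
 *			visibilitymap_pin(rel, heapblk, &vmbuffer);		(may do I/O)
 *		LockBuffer(heapbuf, BUFFER_LOCK_EXCLUSIVE);
 *		if (PageIsAllVisible(BufferGetPage(heapbuf)) &&
 *			!visibilitymap_pin_ok(heapblk, vmbuffer))
 *		{
 *			(lost the race: the bit was set after we looked, so redo the pin)
 *			LockBuffer(heapbuf, BUFFER_LOCK_UNLOCK);
 *			visibilitymap_pin(rel, heapblk, &vmbuffer);
 *			LockBuffer(heapbuf, BUFFER_LOCK_EXCLUSIVE);
 *		}
 *		(modify the page; within the critical section that logs the change:)
 *		visibilitymap_clear(rel, heapblk, vmbuffer, VISIBILITYMAP_VALID_BITS);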
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/heapam_xlog.h"
#include "access/visibilitymap.h"
#include "access/xlog.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/inval.h"


/*#define TRACE_VISIBILITYMAP */

/*
 * Size of the bitmap on each visibility map page, in bytes. There are no
 * extra headers, so the whole page minus the standard page header is
 * used for the bitmap.
 */
#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))

/* Number of heap blocks we can represent in one byte */
#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)

/* Number of heap blocks we can represent in one visibility map page. */
#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)

/* Mapping from heap block number to the right bit in the visibility map */
#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
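
/*
 * Worked example (illustrative; assumes the default 8192-byte BLCKSZ, a
 * MAXALIGN'd page header of 24 bytes, and BITS_PER_HEAPBLOCK = 2): MAPSIZE is
 * then 8168 bytes, HEAPBLOCKS_PER_BYTE is 4, and HEAPBLOCKS_PER_PAGE is
 * 32672.  Heap block 1000000 therefore lands on map block 30
 * (1000000 / 32672), in map byte 4960 ((1000000 % 32672) / 4), at bit
 * offset 0 ((1000000 % 4) * 2).
 */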

/* tables for fast counting of set bits for visible and frozen */
static const uint8 number_of_ones_for_visible[256] = {
	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4
};
static const uint8 number_of_ones_for_frozen[256] = {
	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4
};
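
/*
 * In each map byte the all-visible bits occupy bit positions 0, 2, 4 and 6,
 * and the all-frozen bits positions 1, 3, 5 and 7, so the tables above are
 * simply the population counts of the even and odd bit positions,
 * respectively.  For example, for the byte 0x07 (binary 00000111) the
 * visible table yields 2 and the frozen table yields 1: the first heap block
 * covered by that byte is all-visible and all-frozen, the second is
 * all-visible only.
 */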

/* prototypes for internal routines */
static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
static void vm_extend(Relation rel, BlockNumber nvmblocks);


/*
 *	visibilitymap_clear - clear specified bits for one page in visibility map
 *
 * You must pass a buffer containing the correct map page to this function.
 * Call visibilitymap_pin first to pin the right one. This function doesn't do
 * any I/O.  Returns true if any bits have been cleared and false otherwise.
 */
bool
visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf, uint8 flags)
{
	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
	int			mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
	int			mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
	uint8		mask = flags << mapOffset;
	char	   *map;
	bool		cleared = false;

	Assert(flags & VISIBILITYMAP_VALID_BITS);

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk);
#endif

	if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
		elog(ERROR, "wrong buffer passed to visibilitymap_clear");

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	map = PageGetContents(BufferGetPage(buf));

	if (map[mapByte] & mask)
	{
		map[mapByte] &= ~mask;

		MarkBufferDirty(buf);
		cleared = true;
	}

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);

	return cleared;
}

/*
 *	visibilitymap_pin - pin a map page for setting a bit
 *
 * Setting a bit in the visibility map is a two-phase operation. First, call
 * visibilitymap_pin, to pin the visibility map page containing the bit for
 * the heap page. Because that can require I/O to read the map page, you
 * shouldn't hold a lock on the heap page while doing that. Then, call
 * visibilitymap_set to actually set the bit.
 *
 * On entry, *buf should be InvalidBuffer or a valid buffer returned by
 * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same
 * relation. On return, *buf is a valid buffer with the map page containing
 * the bit for heapBlk.
 *
 * If the page doesn't exist in the map file yet, it is extended.
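 *
 * Typical usage together with visibilitymap_set (a sketch only, loosely
 * modeled on how VACUUM uses these functions; heapbuf and
 * visibility_cutoff_xid are placeholders):
 *
 *		Buffer		vmbuffer = InvalidBuffer;
 *
 *		visibilitymap_pin(rel, blkno, &vmbuffer);	(no heap page lock held)
 *		LockBuffer(heapbuf, BUFFER_LOCK_EXCLUSIVE);
 *		(decide the page is all-visible, set PD_ALL_VISIBLE, mark it dirty)
 *		visibilitymap_set(rel, blkno, heapbuf, InvalidXLogRecPtr,
 *						  vmbuffer, visibility_cutoff_xid,
 *						  VISIBILITYMAP_ALL_VISIBLE);
 *		LockBuffer(heapbuf, BUFFER_LOCK_UNLOCK);
 *		ReleaseBuffer(vmbuffer);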
 */
void
visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *buf)
{
	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);

	/* Reuse the old pinned buffer if possible */
	if (BufferIsValid(*buf))
	{
		if (BufferGetBlockNumber(*buf) == mapBlock)
			return;

		ReleaseBuffer(*buf);
	}
	*buf = vm_readbuf(rel, mapBlock, true);
}

/*
 *	visibilitymap_pin_ok - do we already have the correct page pinned?
 *
 * On entry, buf should be InvalidBuffer or a valid buffer returned by
 * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same
 * relation.  The return value indicates whether the buffer covers the
 * given heapBlk.
 */
bool
visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
{
	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);

	return BufferIsValid(buf) && BufferGetBlockNumber(buf) == mapBlock;
}

/*
 *	visibilitymap_set - set bit(s) on a previously pinned page
 *
 * recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
 * or InvalidXLogRecPtr in normal running.  The page LSN is advanced to the
 * one provided; in normal running, we generate a new XLOG record and set the
 * page LSN to that value.  cutoff_xid is the largest xmin on the page being
 * marked all-visible; it is needed for Hot Standby, and can be
 * InvalidTransactionId if the page contains no tuples.  It can also be set
 * to InvalidTransactionId when a page that is already all-visible is being
 * marked all-frozen.
 *
 * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling
 * this function. Except in recovery, caller should also pass the heap
 * buffer. When checksums are enabled and we're not in recovery, we must add
 * the heap buffer to the WAL chain to protect it from being torn.
 *
 * You must pass a buffer containing the correct map page to this function.
 * Call visibilitymap_pin first to pin the right one. This function doesn't do
 * any I/O.
 */
void
visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
				  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid,
				  uint8 flags)
{
	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
	uint8		mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
	Page		page;
	uint8	   *map;

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
#endif

	Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
	Assert(InRecovery || BufferIsValid(heapBuf));
	Assert(flags & VISIBILITYMAP_VALID_BITS);

	/* Check that we have the right heap page pinned, if present */
	if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk)
		elog(ERROR, "wrong heap buffer passed to visibilitymap_set");

	/* Check that we have the right VM page pinned */
	if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock)
		elog(ERROR, "wrong VM buffer passed to visibilitymap_set");

	page = BufferGetPage(vmBuf);
	map = (uint8 *) PageGetContents(page);
	LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE);

	if (flags != (map[mapByte] >> mapOffset & VISIBILITYMAP_VALID_BITS))
	{
		START_CRIT_SECTION();

		map[mapByte] |= (flags << mapOffset);
		MarkBufferDirty(vmBuf);

		if (RelationNeedsWAL(rel))
		{
			if (XLogRecPtrIsInvalid(recptr))
			{
				Assert(!InRecovery);
				recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf,
										  cutoff_xid, flags);

				/*
				 * If data checksums are enabled (or wal_log_hints=on), we
				 * need to protect the heap page from being torn.
				 */
				if (XLogHintBitIsNeeded())
				{
					Page		heapPage = BufferGetPage(heapBuf);

					/* caller is expected to set PD_ALL_VISIBLE first */
					Assert(PageIsAllVisible(heapPage));
					PageSetLSN(heapPage, recptr);
				}
			}
			PageSetLSN(page, recptr);
		}

		END_CRIT_SECTION();
	}

	LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK);
}

/*
 *	visibilitymap_get_status - get status of bits
 *
 * Are all tuples on heapBlk visible to all transactions, or are they all
 * frozen, according to the visibility map?
 *
 * On entry, *buf should be InvalidBuffer or a valid buffer returned by an
 * earlier call to visibilitymap_pin or visibilitymap_get_status on the same
 * relation. On return, *buf is a valid buffer with the map page containing
 * the bit for heapBlk, or InvalidBuffer. The caller is responsible for
 * releasing *buf after it's done testing and setting bits.
 *
 * NOTE: This function is typically called without a lock on the heap page,
 * so somebody else could change the bit just after we look at it.  In fact,
 * since we don't lock the visibility map page either, it's even possible that
 * someone else could have changed the bit just before we look at it, but yet
 * we might see the old value.  It is the caller's responsibility to deal with
 * all concurrency issues!
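 *
 * For example, a caller interested only in all-visibility might test the
 * result like this (a sketch; this is what the VM_ALL_VISIBLE() convenience
 * macro in access/visibilitymap.h wraps up):
 *
 *		if (visibilitymap_get_status(rel, blkno, &vmbuffer) &
 *			VISIBILITYMAP_ALL_VISIBLE)
 *			... the heap page can be skipped ...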
 */
uint8
visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *buf)
{
	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
	uint8		mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
	char	   *map;
	uint8		result;

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_get_status %s %d", RelationGetRelationName(rel), heapBlk);
#endif

	/* Reuse the old pinned buffer if possible */
	if (BufferIsValid(*buf))
	{
		if (BufferGetBlockNumber(*buf) != mapBlock)
		{
			ReleaseBuffer(*buf);
			*buf = InvalidBuffer;
		}
	}

	if (!BufferIsValid(*buf))
	{
		*buf = vm_readbuf(rel, mapBlock, false);
		if (!BufferIsValid(*buf))
			return false;
	}

	map = PageGetContents(BufferGetPage(*buf));

	/*
	 * A single byte read is atomic.  There could be memory-ordering effects
	 * here, but for performance reasons we make it the caller's job to worry
	 * about that.
	 */
	result = ((map[mapByte] >> mapOffset) & VISIBILITYMAP_VALID_BITS);
	return result;
}

/*
 *	visibilitymap_count  - count number of bits set in visibility map
 *
 * Note: we ignore the possibility of race conditions when the table is being
 * extended concurrently with the call.  New pages added to the table aren't
 * going to be marked all-visible or all-frozen, so they won't affect the result.
 */
void
visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen)
{
	BlockNumber mapBlock;

	/* all_visible must be specified */
	Assert(all_visible);

	*all_visible = 0;
	if (all_frozen)
		*all_frozen = 0;

	for (mapBlock = 0;; mapBlock++)
	{
		Buffer		mapBuffer;
		unsigned char *map;
		int			i;

		/*
		 * Read till we fall off the end of the map.  We assume that any extra
		 * bytes in the last page are zeroed, so we don't bother excluding
		 * them from the count.
		 */
		mapBuffer = vm_readbuf(rel, mapBlock, false);
		if (!BufferIsValid(mapBuffer))
			break;

		/*
		 * We choose not to lock the page, since the result is going to be
		 * immediately stale anyway if anyone is concurrently setting or
		 * clearing bits, and we only really need an approximate value.
		 */
		map = (unsigned char *) PageGetContents(BufferGetPage(mapBuffer));

		for (i = 0; i < MAPSIZE; i++)
		{
			*all_visible += number_of_ones_for_visible[map[i]];
			if (all_frozen)
				*all_frozen += number_of_ones_for_frozen[map[i]];
		}

		ReleaseBuffer(mapBuffer);
	}
}

/*
 *	visibilitymap_truncate - truncate the visibility map
 *
 * The caller must hold AccessExclusiveLock on the relation, to ensure that
 * other backends receive the smgr invalidation event that this function sends
 * before they access the VM again.
 *
 * nheapblocks is the new size of the heap.
 */
void
visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
{
	BlockNumber newnblocks;

	/* last remaining block, byte, and bit */
	BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks);
	uint32		truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks);
	uint8		truncOffset = HEAPBLK_TO_OFFSET(nheapblocks);

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks);
#endif

	RelationOpenSmgr(rel);

	/*
	 * If no visibility map has been created yet for this relation, there's
	 * nothing to truncate.
	 */
	if (!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
		return;

	/*
	 * Unless the new size is exactly at a visibility map page boundary, the
	 * tail bits in the last remaining map page, representing truncated heap
	 * blocks, need to be cleared. This is not only tidy, but also necessary
	 * because we don't get a chance to clear the bits if the heap is extended
	 * again.
	 */
	if (truncByte != 0 || truncOffset != 0)
	{
		Buffer		mapBuffer;
		Page		page;
		char	   *map;

		newnblocks = truncBlock + 1;

		mapBuffer = vm_readbuf(rel, truncBlock, false);
		if (!BufferIsValid(mapBuffer))
		{
			/* nothing to do, the file was already smaller */
			return;
		}

		page = BufferGetPage(mapBuffer);
		map = PageGetContents(page);

		LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);

		/* NO EREPORT(ERROR) from here till changes are logged */
		START_CRIT_SECTION();

		/* Clear out the unwanted bytes. */
		MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));

		/*----
		 * Mask out the unwanted bits of the last remaining byte.
		 *
		 * ((1 << 0) - 1) = 00000000
		 * ((1 << 1) - 1) = 00000001
		 * ...
		 * ((1 << 6) - 1) = 00111111
		 * ((1 << 7) - 1) = 01111111
		 *----
		 */
		map[truncByte] &= (1 << truncOffset) - 1;

		/*
		 * Truncation of a relation is WAL-logged at a higher level, and we
		 * will be called again during WAL replay. But if checksums are
		 * enabled, we still need to write a WAL record to protect against a
		 * torn page, in case the page is flushed to disk before the
		 * truncation WAL record. We cannot use MarkBufferDirtyHint here,
		 * because that will not dirty the page during recovery.
		 */
		MarkBufferDirty(mapBuffer);
		if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
			log_newpage_buffer(mapBuffer, false);

		END_CRIT_SECTION();

		UnlockReleaseBuffer(mapBuffer);
	}
	else
		newnblocks = truncBlock;

	if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) <= newnblocks)
	{
		/* nothing to do, the file was already smaller than requested size */
		return;
	}

	/* Truncate the unused VM pages, and send smgr inval message */
	smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks);

	/*
	 * We might as well update the local smgr_vm_nblocks setting. smgrtruncate
	 * sent an smgr cache inval message, which will cause other backends to
	 * invalidate their copy of smgr_vm_nblocks, and this one too at the next
	 * command boundary.  But this ensures it isn't outright wrong until then.
	 */
	if (rel->rd_smgr)
		rel->rd_smgr->smgr_vm_nblocks = newnblocks;
}

/*
 * Read a visibility map page.
 *
 * If the page doesn't exist, InvalidBuffer is returned, unless 'extend' is
 * true, in which case the visibility map file is extended first.
 */
static Buffer
vm_readbuf(Relation rel, BlockNumber blkno, bool extend)
{
	Buffer		buf;

	/*
	 * We might not have opened the relation at the smgr level yet, or we
	 * might have been forced to close it by a sinval message.  The code below
	 * won't necessarily notice relation extension immediately when extend =
	 * false, so we rely on sinval messages to ensure that our ideas about the
	 * size of the map aren't too far out of date.
	 */
	RelationOpenSmgr(rel);

	/*
	 * If we haven't cached the size of the visibility map fork yet, check it
	 * first.
	 */
	if (rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber)
	{
		if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
			rel->rd_smgr->smgr_vm_nblocks = smgrnblocks(rel->rd_smgr,
														VISIBILITYMAP_FORKNUM);
		else
			rel->rd_smgr->smgr_vm_nblocks = 0;
	}

	/* Handle requests beyond EOF */
	if (blkno >= rel->rd_smgr->smgr_vm_nblocks)
	{
		if (extend)
			vm_extend(rel, blkno + 1);
		else
			return InvalidBuffer;
	}

	/*
	 * Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's
	 * always safe to clear bits, so it's better to clear corrupt pages than
	 * error out.
	 *
	 * The initialize-the-page part is trickier than it looks, because of the
	 * possibility of multiple backends doing this concurrently, and our
	 * desire to not uselessly take the buffer lock in the normal path where
	 * the page is OK.  We must take the lock to initialize the page, so
	 * recheck page newness after we have the lock, in case someone else
	 * already did it.  Also, because we initially check PageIsNew with no
	 * lock, it's possible to fall through and return the buffer while someone
	 * else is still initializing the page (i.e., we might see pd_upper as set
	 * but other page header fields are still zeroes).  This is harmless for
	 * callers that will take a buffer lock themselves, but some callers
	 * inspect the page without any lock at all.  The latter is OK only so
	 * long as it doesn't depend on the page header having correct contents.
	 * Current usage is safe because PageGetContents() does not require that.
	 */
	buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno,
							 RBM_ZERO_ON_ERROR, NULL);
	if (PageIsNew(BufferGetPage(buf)))
	{
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		if (PageIsNew(BufferGetPage(buf)))
			PageInit(BufferGetPage(buf), BLCKSZ, 0);
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}
	return buf;
}

/*
 * Ensure that the visibility map fork is at least vm_nblocks long, extending
 * it if necessary with zeroed pages.
 */
static void
vm_extend(Relation rel, BlockNumber vm_nblocks)
{
	BlockNumber vm_nblocks_now;
	PGAlignedBlock pg;

	PageInit((Page) pg.data, BLCKSZ, 0);

	/*
	 * We use the relation extension lock to lock out other backends trying to
	 * extend the visibility map at the same time. It also locks out extension
	 * of the main fork, unnecessarily, but extending the visibility map
	 * happens seldom enough that it doesn't seem worthwhile to have a
	 * separate lock tag type for it.
	 *
	 * Note that another backend might have extended or created the relation
	 * by the time we get the lock.
	 */
	LockRelationForExtension(rel, ExclusiveLock);

	/* Might have to re-open if a cache flush happened */
	RelationOpenSmgr(rel);

	/*
	 * Create the file first if it doesn't exist.  If smgr_vm_nblocks is
	 * positive then it must exist, no need for an smgrexists call.
	 */
	if ((rel->rd_smgr->smgr_vm_nblocks == 0 ||
		 rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber) &&
		!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
		smgrcreate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, false);

	vm_nblocks_now = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM);

	/* Now extend the file */
	while (vm_nblocks_now < vm_nblocks)
	{
		PageSetChecksumInplace((Page) pg.data, vm_nblocks_now);

		smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
				   pg.data, false);
		vm_nblocks_now++;
	}

	/*
	 * Send a shared-inval message to force other backends to close any smgr
	 * references they may have for this rel, which we are about to change.
	 * This is a useful optimization because it means that backends don't have
	 * to keep checking for creation or extension of the file, which happens
	 * infrequently.
	 */
	CacheInvalidateSmgr(rel->rd_smgr->smgr_rnode);

	/* Update local cache with the up-to-date size */
	rel->rd_smgr->smgr_vm_nblocks = vm_nblocks_now;

	UnlockRelationForExtension(rel, ExclusiveLock);
}