1 /*-------------------------------------------------------------------------
2 *
3 * visibilitymap.c
4 * bitmap for tracking visibility of heap tuples
5 *
6 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/access/heap/visibilitymap.c
12 *
13 * INTERFACE ROUTINES
14 * visibilitymap_clear - clear bits for one page in the visibility map
15 * visibilitymap_pin - pin a map page for setting a bit
16 * visibilitymap_pin_ok - check whether correct map page is already pinned
17 * visibilitymap_set - set a bit in a previously pinned page
18 * visibilitymap_get_status - get status of bits
19 * visibilitymap_count - count number of bits set in visibility map
20 * visibilitymap_truncate - truncate the visibility map
21 *
22 * NOTES
23 *
24 * The visibility map is a bitmap with two bits (all-visible and all-frozen)
25 * per heap page. A set all-visible bit means that all tuples on the page are
26 * known visible to all transactions, and therefore the page doesn't need to
27 * be vacuumed. A set all-frozen bit means that all tuples on the page are
28 * completely frozen, and therefore the page doesn't need to be vacuumed even
29 * if whole table scanning vacuum is required (e.g. anti-wraparound vacuum).
30 * The all-frozen bit must be set only when the page is already all-visible.
31 *
32 * The map is conservative in the sense that we make sure that whenever a bit
33 * is set, we know the condition is true, but if a bit is not set, it might or
34 * might not be true.
35 *
36 * Clearing visibility map bits is not separately WAL-logged. The callers
37 * must make sure that whenever a bit is cleared, the bit is cleared on WAL
38 * replay of the updating operation as well.
39 *
40 * When we *set* a visibility map during VACUUM, we must write WAL. This may
41 * seem counterintuitive, since the bit is basically a hint: if it is clear,
42 * it may still be the case that every tuple on the page is visible to all
43 * transactions; we just don't know that for certain. The difficulty is that
44 * there are two bits which are typically set together: the PD_ALL_VISIBLE bit
45 * on the page itself, and the visibility map bit. If a crash occurs after the
46 * visibility map page makes it to disk and before the updated heap page makes
47 * it to disk, redo must set the bit on the heap page. Otherwise, the next
48 * insert, update, or delete on the heap page will fail to realize that the
49 * visibility map bit must be cleared, possibly causing index-only scans to
50 * return wrong answers.
51 *
52 * VACUUM will normally skip pages for which the visibility map bit is set;
53 * such pages can't contain any dead tuples and therefore don't need vacuuming.
54 *
55 * LOCKING
56 *
57 * In heapam.c, whenever a page is modified so that not all tuples on the
58 * page are visible to everyone anymore, the corresponding bit in the
59 * visibility map is cleared. In order to be crash-safe, we need to do this
60 * while still holding a lock on the heap page and in the same critical
61 * section that logs the page modification. However, we don't want to hold
62 * the buffer lock over any I/O that may be required to read in the visibility
63 * map page. To avoid this, we examine the heap page before locking it;
64 * if the page-level PD_ALL_VISIBLE bit is set, we pin the visibility map
65 * bit. Then, we lock the buffer. But this creates a race condition: there
66 * is a possibility that in the time it takes to lock the buffer, the
67 * PD_ALL_VISIBLE bit gets set. If that happens, we have to unlock the
68 * buffer, pin the visibility map page, and relock the buffer. This shouldn't
69 * happen often, because only VACUUM currently sets visibility map bits,
70 * and the race will only occur if VACUUM processes a given page at almost
71 * exactly the same time that someone tries to further modify it.
72 *
73 * To set a bit, you need to hold a lock on the heap page. That prevents
74 * the race condition where VACUUM sees that all tuples on the page are
75 * visible to everyone, but another backend modifies the page before VACUUM
76 * sets the bit in the visibility map.
77 *
78 * When a bit is set, the LSN of the visibility map page is updated to make
79 * sure that the visibility map update doesn't get written to disk before the
80 * WAL record of the changes that made it possible to set the bit is flushed.
81 * But when a bit is cleared, we don't have to do that because it's always
82 * safe to clear a bit in the map from correctness point of view.
83 *
84 *-------------------------------------------------------------------------
85 */
86 #include "postgres.h"
87
88 #include "access/heapam_xlog.h"
89 #include "access/visibilitymap.h"
90 #include "access/xlog.h"
91 #include "miscadmin.h"
92 #include "storage/bufmgr.h"
93 #include "storage/lmgr.h"
94 #include "storage/smgr.h"
95 #include "utils/inval.h"
96
97
98 /*#define TRACE_VISIBILITYMAP */
99
100 /*
101 * Size of the bitmap on each visibility map page, in bytes. There's no
102 * extra headers, so the whole page minus the standard page header is
103 * used for the bitmap.
104 */
105 #define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
106
107 /* Number of heap blocks we can represent in one byte */
108 #define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
109
110 /* Number of heap blocks we can represent in one visibility map page. */
111 #define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
112
113 /* Mapping from heap block number to the right bit in the visibility map */
114 #define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
115 #define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
116 #define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
117
/*
 * Tables for fast counting of set bits for visible and frozen.
 *
 * A map byte packs the 2-bit states of four consecutive heap blocks.
 * Indexed by the raw byte value, number_of_ones_for_visible[b] is the
 * number of 2-bit groups in b whose low (all-visible) bit is set, and
 * number_of_ones_for_frozen[b] is the number of groups whose high
 * (all-frozen) bit is set.  Used by visibilitymap_count() to tally a
 * whole page one byte at a time.
 */
static const uint8 number_of_ones_for_visible[256] = {
	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4
};
static const uint8 number_of_ones_for_frozen[256] = {
	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4
};
155
156 /* prototypes for internal routines */
157 static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
158 static void vm_extend(Relation rel, BlockNumber nvmblocks);
159
160
161 /*
162 * visibilitymap_clear - clear specified bits for one page in visibility map
163 *
164 * You must pass a buffer containing the correct map page to this function.
165 * Call visibilitymap_pin first to pin the right one. This function doesn't do
166 * any I/O. Returns true if any bits have been cleared and false otherwise.
167 */
168 bool
visibilitymap_clear(Relation rel,BlockNumber heapBlk,Buffer buf,uint8 flags)169 visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf, uint8 flags)
170 {
171 BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
172 int mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
173 int mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
174 uint8 mask = flags << mapOffset;
175 char *map;
176 bool cleared = false;
177
178 Assert(flags & VISIBILITYMAP_VALID_BITS);
179
180 #ifdef TRACE_VISIBILITYMAP
181 elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk);
182 #endif
183
184 if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
185 elog(ERROR, "wrong buffer passed to visibilitymap_clear");
186
187 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
188 map = PageGetContents(BufferGetPage(buf));
189
190 if (map[mapByte] & mask)
191 {
192 map[mapByte] &= ~mask;
193
194 MarkBufferDirty(buf);
195 cleared = true;
196 }
197
198 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
199
200 return cleared;
201 }
202
203 /*
204 * visibilitymap_pin - pin a map page for setting a bit
205 *
206 * Setting a bit in the visibility map is a two-phase operation. First, call
207 * visibilitymap_pin, to pin the visibility map page containing the bit for
208 * the heap page. Because that can require I/O to read the map page, you
209 * shouldn't hold a lock on the heap page while doing that. Then, call
210 * visibilitymap_set to actually set the bit.
211 *
212 * On entry, *buf should be InvalidBuffer or a valid buffer returned by
213 * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same
214 * relation. On return, *buf is a valid buffer with the map page containing
215 * the bit for heapBlk.
216 *
217 * If the page doesn't exist in the map file yet, it is extended.
218 */
219 void
visibilitymap_pin(Relation rel,BlockNumber heapBlk,Buffer * buf)220 visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *buf)
221 {
222 BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
223
224 /* Reuse the old pinned buffer if possible */
225 if (BufferIsValid(*buf))
226 {
227 if (BufferGetBlockNumber(*buf) == mapBlock)
228 return;
229
230 ReleaseBuffer(*buf);
231 }
232 *buf = vm_readbuf(rel, mapBlock, true);
233 }
234
235 /*
236 * visibilitymap_pin_ok - do we already have the correct page pinned?
237 *
238 * On entry, buf should be InvalidBuffer or a valid buffer returned by
239 * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same
240 * relation. The return value indicates whether the buffer covers the
241 * given heapBlk.
242 */
243 bool
visibilitymap_pin_ok(BlockNumber heapBlk,Buffer buf)244 visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
245 {
246 BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
247
248 return BufferIsValid(buf) && BufferGetBlockNumber(buf) == mapBlock;
249 }
250
251 /*
252 * visibilitymap_set - set bit(s) on a previously pinned page
253 *
254 * recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
255 * or InvalidXLogRecPtr in normal running. The page LSN is advanced to the
256 * one provided; in normal running, we generate a new XLOG record and set the
257 * page LSN to that value. cutoff_xid is the largest xmin on the page being
258 * marked all-visible; it is needed for Hot Standby, and can be
259 * InvalidTransactionId if the page contains no tuples. It can also be set
260 * to InvalidTransactionId when a page that is already all-visible is being
261 * marked all-frozen.
262 *
263 * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling
264 * this function. Except in recovery, caller should also pass the heap
265 * buffer. When checksums are enabled and we're not in recovery, we must add
266 * the heap buffer to the WAL chain to protect it from being torn.
267 *
268 * You must pass a buffer containing the correct map page to this function.
269 * Call visibilitymap_pin first to pin the right one. This function doesn't do
270 * any I/O.
271 */
272 void
visibilitymap_set(Relation rel,BlockNumber heapBlk,Buffer heapBuf,XLogRecPtr recptr,Buffer vmBuf,TransactionId cutoff_xid,uint8 flags)273 visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
274 XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid,
275 uint8 flags)
276 {
277 BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
278 uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
279 uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
280 Page page;
281 uint8 *map;
282
283 #ifdef TRACE_VISIBILITYMAP
284 elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
285 #endif
286
287 Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
288 Assert(InRecovery || BufferIsValid(heapBuf));
289 Assert(flags & VISIBILITYMAP_VALID_BITS);
290
291 /* Check that we have the right heap page pinned, if present */
292 if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk)
293 elog(ERROR, "wrong heap buffer passed to visibilitymap_set");
294
295 /* Check that we have the right VM page pinned */
296 if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock)
297 elog(ERROR, "wrong VM buffer passed to visibilitymap_set");
298
299 page = BufferGetPage(vmBuf);
300 map = (uint8 *) PageGetContents(page);
301 LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE);
302
303 if (flags != (map[mapByte] >> mapOffset & VISIBILITYMAP_VALID_BITS))
304 {
305 START_CRIT_SECTION();
306
307 map[mapByte] |= (flags << mapOffset);
308 MarkBufferDirty(vmBuf);
309
310 if (RelationNeedsWAL(rel))
311 {
312 if (XLogRecPtrIsInvalid(recptr))
313 {
314 Assert(!InRecovery);
315 recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf,
316 cutoff_xid, flags);
317
318 /*
319 * If data checksums are enabled (or wal_log_hints=on), we
320 * need to protect the heap page from being torn.
321 */
322 if (XLogHintBitIsNeeded())
323 {
324 Page heapPage = BufferGetPage(heapBuf);
325
326 /* caller is expected to set PD_ALL_VISIBLE first */
327 Assert(PageIsAllVisible(heapPage));
328 PageSetLSN(heapPage, recptr);
329 }
330 }
331 PageSetLSN(page, recptr);
332 }
333
334 END_CRIT_SECTION();
335 }
336
337 LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK);
338 }
339
340 /*
341 * visibilitymap_get_status - get status of bits
342 *
343 * Are all tuples on heapBlk visible to all or are marked frozen, according
344 * to the visibility map?
345 *
346 * On entry, *buf should be InvalidBuffer or a valid buffer returned by an
347 * earlier call to visibilitymap_pin or visibilitymap_get_status on the same
348 * relation. On return, *buf is a valid buffer with the map page containing
349 * the bit for heapBlk, or InvalidBuffer. The caller is responsible for
350 * releasing *buf after it's done testing and setting bits.
351 *
352 * NOTE: This function is typically called without a lock on the heap page,
353 * so somebody else could change the bit just after we look at it. In fact,
354 * since we don't lock the visibility map page either, it's even possible that
355 * someone else could have changed the bit just before we look at it, but yet
356 * we might see the old value. It is the caller's responsibility to deal with
357 * all concurrency issues!
358 */
359 uint8
visibilitymap_get_status(Relation rel,BlockNumber heapBlk,Buffer * buf)360 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *buf)
361 {
362 BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
363 uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
364 uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
365 char *map;
366 uint8 result;
367
368 #ifdef TRACE_VISIBILITYMAP
369 elog(DEBUG1, "vm_get_status %s %d", RelationGetRelationName(rel), heapBlk);
370 #endif
371
372 /* Reuse the old pinned buffer if possible */
373 if (BufferIsValid(*buf))
374 {
375 if (BufferGetBlockNumber(*buf) != mapBlock)
376 {
377 ReleaseBuffer(*buf);
378 *buf = InvalidBuffer;
379 }
380 }
381
382 if (!BufferIsValid(*buf))
383 {
384 *buf = vm_readbuf(rel, mapBlock, false);
385 if (!BufferIsValid(*buf))
386 return false;
387 }
388
389 map = PageGetContents(BufferGetPage(*buf));
390
391 /*
392 * A single byte read is atomic. There could be memory-ordering effects
393 * here, but for performance reasons we make it the caller's job to worry
394 * about that.
395 */
396 result = ((map[mapByte] >> mapOffset) & VISIBILITYMAP_VALID_BITS);
397 return result;
398 }
399
400 /*
401 * visibilitymap_count - count number of bits set in visibility map
402 *
403 * Note: we ignore the possibility of race conditions when the table is being
404 * extended concurrently with the call. New pages added to the table aren't
405 * going to be marked all-visible or all-frozen, so they won't affect the result.
406 */
407 void
visibilitymap_count(Relation rel,BlockNumber * all_visible,BlockNumber * all_frozen)408 visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen)
409 {
410 BlockNumber mapBlock;
411
412 /* all_visible must be specified */
413 Assert(all_visible);
414
415 *all_visible = 0;
416 if (all_frozen)
417 *all_frozen = 0;
418
419 for (mapBlock = 0;; mapBlock++)
420 {
421 Buffer mapBuffer;
422 unsigned char *map;
423 int i;
424
425 /*
426 * Read till we fall off the end of the map. We assume that any extra
427 * bytes in the last page are zeroed, so we don't bother excluding
428 * them from the count.
429 */
430 mapBuffer = vm_readbuf(rel, mapBlock, false);
431 if (!BufferIsValid(mapBuffer))
432 break;
433
434 /*
435 * We choose not to lock the page, since the result is going to be
436 * immediately stale anyway if anyone is concurrently setting or
437 * clearing bits, and we only really need an approximate value.
438 */
439 map = (unsigned char *) PageGetContents(BufferGetPage(mapBuffer));
440
441 for (i = 0; i < MAPSIZE; i++)
442 {
443 *all_visible += number_of_ones_for_visible[map[i]];
444 if (all_frozen)
445 *all_frozen += number_of_ones_for_frozen[map[i]];
446 }
447
448 ReleaseBuffer(mapBuffer);
449 }
450 }
451
452 /*
453 * visibilitymap_truncate - truncate the visibility map
454 *
455 * The caller must hold AccessExclusiveLock on the relation, to ensure that
456 * other backends receive the smgr invalidation event that this function sends
457 * before they access the VM again.
458 *
459 * nheapblocks is the new size of the heap.
460 */
461 void
visibilitymap_truncate(Relation rel,BlockNumber nheapblocks)462 visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
463 {
464 BlockNumber newnblocks;
465
466 /* last remaining block, byte, and bit */
467 BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks);
468 uint32 truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks);
469 uint8 truncOffset = HEAPBLK_TO_OFFSET(nheapblocks);
470
471 #ifdef TRACE_VISIBILITYMAP
472 elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks);
473 #endif
474
475 RelationOpenSmgr(rel);
476
477 /*
478 * If no visibility map has been created yet for this relation, there's
479 * nothing to truncate.
480 */
481 if (!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
482 return;
483
484 /*
485 * Unless the new size is exactly at a visibility map page boundary, the
486 * tail bits in the last remaining map page, representing truncated heap
487 * blocks, need to be cleared. This is not only tidy, but also necessary
488 * because we don't get a chance to clear the bits if the heap is extended
489 * again.
490 */
491 if (truncByte != 0 || truncOffset != 0)
492 {
493 Buffer mapBuffer;
494 Page page;
495 char *map;
496
497 newnblocks = truncBlock + 1;
498
499 mapBuffer = vm_readbuf(rel, truncBlock, false);
500 if (!BufferIsValid(mapBuffer))
501 {
502 /* nothing to do, the file was already smaller */
503 return;
504 }
505
506 page = BufferGetPage(mapBuffer);
507 map = PageGetContents(page);
508
509 LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);
510
511 /* NO EREPORT(ERROR) from here till changes are logged */
512 START_CRIT_SECTION();
513
514 /* Clear out the unwanted bytes. */
515 MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));
516
517 /*----
518 * Mask out the unwanted bits of the last remaining byte.
519 *
520 * ((1 << 0) - 1) = 00000000
521 * ((1 << 1) - 1) = 00000001
522 * ...
523 * ((1 << 6) - 1) = 00111111
524 * ((1 << 7) - 1) = 01111111
525 *----
526 */
527 map[truncByte] &= (1 << truncOffset) - 1;
528
529 /*
530 * Truncation of a relation is WAL-logged at a higher-level, and we
531 * will be called at WAL replay. But if checksums are enabled, we need
532 * to still write a WAL record to protect against a torn page, if the
533 * page is flushed to disk before the truncation WAL record. We cannot
534 * use MarkBufferDirtyHint here, because that will not dirty the page
535 * during recovery.
536 */
537 MarkBufferDirty(mapBuffer);
538 if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
539 log_newpage_buffer(mapBuffer, false);
540
541 END_CRIT_SECTION();
542
543 UnlockReleaseBuffer(mapBuffer);
544 }
545 else
546 newnblocks = truncBlock;
547
548 if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) <= newnblocks)
549 {
550 /* nothing to do, the file was already smaller than requested size */
551 return;
552 }
553
554 /* Truncate the unused VM pages, and send smgr inval message */
555 smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks);
556
557 /*
558 * We might as well update the local smgr_vm_nblocks setting. smgrtruncate
559 * sent an smgr cache inval message, which will cause other backends to
560 * invalidate their copy of smgr_vm_nblocks, and this one too at the next
561 * command boundary. But this ensures it isn't outright wrong until then.
562 */
563 if (rel->rd_smgr)
564 rel->rd_smgr->smgr_vm_nblocks = newnblocks;
565 }
566
567 /*
568 * Read a visibility map page.
569 *
570 * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
571 * true, the visibility map file is extended.
572 */
573 static Buffer
vm_readbuf(Relation rel,BlockNumber blkno,bool extend)574 vm_readbuf(Relation rel, BlockNumber blkno, bool extend)
575 {
576 Buffer buf;
577
578 /*
579 * We might not have opened the relation at the smgr level yet, or we
580 * might have been forced to close it by a sinval message. The code below
581 * won't necessarily notice relation extension immediately when extend =
582 * false, so we rely on sinval messages to ensure that our ideas about the
583 * size of the map aren't too far out of date.
584 */
585 RelationOpenSmgr(rel);
586
587 /*
588 * If we haven't cached the size of the visibility map fork yet, check it
589 * first.
590 */
591 if (rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber)
592 {
593 if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
594 rel->rd_smgr->smgr_vm_nblocks = smgrnblocks(rel->rd_smgr,
595 VISIBILITYMAP_FORKNUM);
596 else
597 rel->rd_smgr->smgr_vm_nblocks = 0;
598 }
599
600 /* Handle requests beyond EOF */
601 if (blkno >= rel->rd_smgr->smgr_vm_nblocks)
602 {
603 if (extend)
604 vm_extend(rel, blkno + 1);
605 else
606 return InvalidBuffer;
607 }
608
609 /*
610 * Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's
611 * always safe to clear bits, so it's better to clear corrupt pages than
612 * error out.
613 *
614 * The initialize-the-page part is trickier than it looks, because of the
615 * possibility of multiple backends doing this concurrently, and our
616 * desire to not uselessly take the buffer lock in the normal path where
617 * the page is OK. We must take the lock to initialize the page, so
618 * recheck page newness after we have the lock, in case someone else
619 * already did it. Also, because we initially check PageIsNew with no
620 * lock, it's possible to fall through and return the buffer while someone
621 * else is still initializing the page (i.e., we might see pd_upper as set
622 * but other page header fields are still zeroes). This is harmless for
623 * callers that will take a buffer lock themselves, but some callers
624 * inspect the page without any lock at all. The latter is OK only so
625 * long as it doesn't depend on the page header having correct contents.
626 * Current usage is safe because PageGetContents() does not require that.
627 */
628 buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno,
629 RBM_ZERO_ON_ERROR, NULL);
630 if (PageIsNew(BufferGetPage(buf)))
631 {
632 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
633 if (PageIsNew(BufferGetPage(buf)))
634 PageInit(BufferGetPage(buf), BLCKSZ, 0);
635 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
636 }
637 return buf;
638 }
639
640 /*
641 * Ensure that the visibility map fork is at least vm_nblocks long, extending
642 * it if necessary with zeroed pages.
643 */
644 static void
vm_extend(Relation rel,BlockNumber vm_nblocks)645 vm_extend(Relation rel, BlockNumber vm_nblocks)
646 {
647 BlockNumber vm_nblocks_now;
648 PGAlignedBlock pg;
649
650 PageInit((Page) pg.data, BLCKSZ, 0);
651
652 /*
653 * We use the relation extension lock to lock out other backends trying to
654 * extend the visibility map at the same time. It also locks out extension
655 * of the main fork, unnecessarily, but extending the visibility map
656 * happens seldom enough that it doesn't seem worthwhile to have a
657 * separate lock tag type for it.
658 *
659 * Note that another backend might have extended or created the relation
660 * by the time we get the lock.
661 */
662 LockRelationForExtension(rel, ExclusiveLock);
663
664 /* Might have to re-open if a cache flush happened */
665 RelationOpenSmgr(rel);
666
667 /*
668 * Create the file first if it doesn't exist. If smgr_vm_nblocks is
669 * positive then it must exist, no need for an smgrexists call.
670 */
671 if ((rel->rd_smgr->smgr_vm_nblocks == 0 ||
672 rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber) &&
673 !smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
674 smgrcreate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, false);
675
676 vm_nblocks_now = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
677
678 /* Now extend the file */
679 while (vm_nblocks_now < vm_nblocks)
680 {
681 PageSetChecksumInplace((Page) pg.data, vm_nblocks_now);
682
683 smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
684 pg.data, false);
685 vm_nblocks_now++;
686 }
687
688 /*
689 * Send a shared-inval message to force other backends to close any smgr
690 * references they may have for this rel, which we are about to change.
691 * This is a useful optimization because it means that backends don't have
692 * to keep checking for creation or extension of the file, which happens
693 * infrequently.
694 */
695 CacheInvalidateSmgr(rel->rd_smgr->smgr_rnode);
696
697 /* Update local cache with the up-to-date size */
698 rel->rd_smgr->smgr_vm_nblocks = vm_nblocks_now;
699
700 UnlockRelationForExtension(rel, ExclusiveLock);
701 }
702