/*-------------------------------------------------------------------------
 *
 * nbtpage.c
 *    BTree-specific page management code for the Postgres btree access
 *    method.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/nbtree/nbtpage.c
 *
 * NOTES
 *    Postgres btree pages look like ordinary relation pages.  The opaque
 *    data at high addresses includes pointers to left and right siblings
 *    and flag data describing page state.  The first page in a btree, page
 *    zero, is special -- it stores meta-information describing the tree.
 *    Pages one and higher store the actual tree data.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/procarray.h"
#include "utils/memdebug.h"
#include "utils/memutils.h"
#include "utils/snapmgr.h"

static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
                               FullTransactionId safexid);
static void _bt_delitems_delete(Relation rel, Buffer buf,
                                TransactionId latestRemovedXid,
                                OffsetNumber *deletable, int ndeletable,
                                BTVacuumPosting *updatable, int nupdatable);
static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
                                 OffsetNumber *updatedoffsets,
                                 Size *updatedbuflen, bool needswal);
static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
                                   BTStack stack);
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
                                     BlockNumber scanblkno,
                                     bool *rightsib_empty,
                                     BTVacState *vstate);
static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
                                    BTStack stack,
                                    Buffer *subtreeparent,
                                    OffsetNumber *poffset,
                                    BlockNumber *topparent,
                                    BlockNumber *topparentrightsib);
static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target,
                               FullTransactionId safexid);

/*
 *  _bt_initmetapage() -- Fill a page buffer with a correct metapage image
 */
void
_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
                 bool allequalimage)
{
    BTMetaPageData *metad;
    BTPageOpaque metaopaque;

    _bt_pageinit(page, BLCKSZ);

    metad = BTPageGetMeta(page);
    metad->btm_magic = BTREE_MAGIC;
    metad->btm_version = BTREE_VERSION;
    metad->btm_root = rootbknum;
    metad->btm_level = level;
    metad->btm_fastroot = rootbknum;
    metad->btm_fastlevel = level;
    metad->btm_last_cleanup_num_delpages = 0;
    metad->btm_last_cleanup_num_heap_tuples = -1.0;
    metad->btm_allequalimage = allequalimage;

    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
    metaopaque->btpo_flags = BTP_META;

    /*
     * Set pd_lower just past the end of the metadata.  This is essential,
     * because without doing so, metadata will be lost if xlog.c compresses
     * the page.
     */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}
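
/*
 * Illustrative sketch only (hypothetical helper, not used anywhere in
 * nbtree): shows the invariant that the pd_lower adjustment above is meant
 * to establish.  Full-page images treat the bytes between pd_lower and
 * pd_upper as a "hole" that may be discarded, so pd_lower must be advanced
 * past the metadata for the metadata to survive.
 */
#ifdef NOT_USED
static bool
example_meta_covered_by_pd_lower(Page page)
{
    BTMetaPageData *metad = BTPageGetMeta(page);
    Size        metaend;

    /* byte offset of the first byte past the metapage's metadata */
    metaend = ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;

    /* must hold for every metapage written by _bt_initmetapage() */
    return ((PageHeader) page)->pd_lower >= metaend;
}
#endif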

/*
 *  _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
 *      3, the last version that can be updated without broadly affecting
 *      on-disk compatibility.  (A REINDEX is required to upgrade to v4.)
 *
 *      This routine does purely in-memory image upgrade.  Caller is
 *      responsible for locking, WAL-logging etc.
 */
void
_bt_upgrademetapage(Page page)
{
    BTMetaPageData *metad;
    BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;

    metad = BTPageGetMeta(page);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /* It must really be a meta page of an upgradable version */
    Assert(metaopaque->btpo_flags & BTP_META);
    Assert(metad->btm_version < BTREE_NOVAC_VERSION);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);

    /* Set version number and fill extra fields added into version 3 */
    metad->btm_version = BTREE_NOVAC_VERSION;
    metad->btm_last_cleanup_num_delpages = 0;
    metad->btm_last_cleanup_num_heap_tuples = -1.0;
    /* Only a REINDEX can set this field */
    Assert(!metad->btm_allequalimage);
    metad->btm_allequalimage = false;

    /* Adjust pd_lower (see _bt_initmetapage() for details) */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}

/*
 * Get metadata from share-locked buffer containing metapage, while performing
 * standard sanity checks.
 *
 * Callers that cache data returned here in local cache should note that an
 * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
 * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
 */
static BTMetaPageData *
_bt_getmeta(Relation rel, Buffer metabuf)
{
    Page        metapg;
    BTPageOpaque metaopaque;
    BTMetaPageData *metad;

    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    /* sanity-check the metapage */
    if (!P_ISMETA(metaopaque) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version < BTREE_MIN_VERSION ||
        metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, "
                        "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));

    return metad;
}

/*
 *  _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
 *
 * Called by btvacuumcleanup when btbulkdelete was never called because no
 * index tuples needed to be deleted.
 */
bool
_bt_vacuum_needs_cleanup(Relation rel)
{
    Buffer      metabuf;
    Page        metapg;
    BTMetaPageData *metad;
    uint32      btm_version;
    BlockNumber prev_num_delpages;

    /*
     * Copy details from metapage to local variables quickly.
     *
     * Note that we deliberately avoid using cached version of metapage here.
     */
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metad = BTPageGetMeta(metapg);
    btm_version = metad->btm_version;

    if (btm_version < BTREE_NOVAC_VERSION)
    {
        /*
         * Metapage needs to be dynamically upgraded to store fields that are
         * only present when btm_version >= BTREE_NOVAC_VERSION
         */
        _bt_relbuf(rel, metabuf);
        return true;
    }

    prev_num_delpages = metad->btm_last_cleanup_num_delpages;
    _bt_relbuf(rel, metabuf);

    /*
     * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
     * total size of the index.  We can reasonably expect (though are not
     * guaranteed) to be able to recycle this many pages if we decide to do a
     * btvacuumscan call during the ongoing btvacuumcleanup.  For further
     * details see the nbtree/README section on placing deleted pages in the
     * FSM.
     */
    if (prev_num_delpages > 0 &&
        prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
        return true;

    return false;
}
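
/*
 * Worked example of the 5% threshold above: for an index of 10,000 blocks,
 * RelationGetNumberOfBlocks(rel) / 20 is 500, so a cleanup-only VACUUM
 * triggers a full btvacuumscan only once more than 500 pages were deleted
 * by some earlier VACUUM but possibly never placed in the FSM.
 */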

/*
 *  _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup.
 *
 * Called at the end of btvacuumcleanup, when num_delpages value has been
 * finalized.
 */
void
_bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
{
    Buffer      metabuf;
    Page        metapg;
    BTMetaPageData *metad;

    /*
     * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
     * field started out as a TransactionId field called btm_oldest_btpo_xact.
     * Both "versions" are just uint32 fields.  It was convenient to repurpose
     * the field when we began to use 64-bit XIDs in deleted pages.
     *
     * It's possible that a pg_upgrade'd database will contain an XID value in
     * what is now recognized as the metapage's btm_last_cleanup_num_delpages
     * field.  _bt_vacuum_needs_cleanup() may even believe that this value
     * indicates that there are lots of pages that it needs to recycle, when
     * in reality there are only one or two.  The worst that can happen is
     * that there will be a call to btvacuumscan a little earlier, which will
     * set btm_last_cleanup_num_delpages to a sane value when we're called.
     *
     * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
     * no longer used as of PostgreSQL 14.  We set it to -1.0 on rewrite, just
     * to be consistent.
     */
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metad = BTPageGetMeta(metapg);

    /*
     * Don't take the fast path out when the metapage is still on
     * BTREE_MIN_VERSION: falling through gives us the chance to upgrade the
     * index/metapage here.
     */
    if (metad->btm_version >= BTREE_NOVAC_VERSION &&
        metad->btm_last_cleanup_num_delpages == num_delpages)
    {
        /* Usually means index continues to have num_delpages of 0 */
        _bt_relbuf(rel, metabuf);
        return;
    }

    /* trade in our read lock for a write lock */
    _bt_unlockbuf(rel, metabuf);
    _bt_lockbuf(rel, metabuf, BT_WRITE);

    START_CRIT_SECTION();

    /* upgrade meta-page if needed */
    if (metad->btm_version < BTREE_NOVAC_VERSION)
        _bt_upgrademetapage(metapg);

    /* update cleanup-related information */
    metad->btm_last_cleanup_num_delpages = num_delpages;
    metad->btm_last_cleanup_num_heap_tuples = -1.0;
    MarkBufferDirty(metabuf);

    /* write wal record if needed */
    if (RelationNeedsWAL(rel))
    {
        xl_btree_metadata md;
        XLogRecPtr  recptr;

        XLogBeginInsert();
        XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

        Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
        md.version = metad->btm_version;
        md.root = metad->btm_root;
        md.level = metad->btm_level;
        md.fastroot = metad->btm_fastroot;
        md.fastlevel = metad->btm_fastlevel;
        md.last_cleanup_num_delpages = num_delpages;
        md.allequalimage = metad->btm_allequalimage;

        XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);

        PageSetLSN(metapg, recptr);
    }

    END_CRIT_SECTION();

    _bt_relbuf(rel, metabuf);
}

/*
 *  _bt_getroot() -- Get the root page of the btree.
 *
 *      Since the root page can move around the btree file, we have to read
 *      its location from the metadata page, and then read the root page
 *      itself.  If no root page exists yet, we have to create one.
 *
 *      The access type parameter (BT_READ or BT_WRITE) controls whether
 *      a new root page will be created or not.  If access = BT_READ,
 *      and no root page exists, we just return InvalidBuffer.  For
 *      BT_WRITE, we try to create the root page if it doesn't exist.
 *      NOTE that the returned root page will have only a read lock set
 *      on it even if access = BT_WRITE!
 *
 *      The returned page is not necessarily the true root --- it could be
 *      a "fast root" (a page that is alone in its level due to deletions).
 *      Also, if the root page is split while we are "in flight" to it,
 *      what we will return is the old root, which is now just the leftmost
 *      page on a probably-not-very-wide level.  For most purposes this is
 *      as good as or better than the true root, so we do not bother to
 *      insist on finding the true root.  We do, however, guarantee to
 *      return a live (not deleted or half-dead) page.
 *
 *      On successful return, the root page is pinned and read-locked.
 *      The metadata page is not locked or pinned on exit.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
    Buffer      metabuf;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * Try to use previously-cached metapage data to find the root.  This
     * normally saves one buffer access per index search, which is a very
     * helpful savings in bufmgr traffic and hence contention.
     */
    if (rel->rd_amcache != NULL)
    {
        metad = (BTMetaPageData *) rel->rd_amcache;
        /* We shouldn't have cached it if any of these fail */
        Assert(metad->btm_magic == BTREE_MAGIC);
        Assert(metad->btm_version >= BTREE_MIN_VERSION);
        Assert(metad->btm_version <= BTREE_VERSION);
        Assert(!metad->btm_allequalimage ||
               metad->btm_version > BTREE_NOVAC_VERSION);
        Assert(metad->btm_root != P_NONE);

        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        /*
         * Since the cache might be stale, we check the page more carefully
         * here than normal.  We *must* check that it's not deleted.  If it's
         * not alone on its level, then we reject too --- this may be overly
         * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
         * because that's not set in a "fast root".
         */
        if (!P_IGNORE(rootopaque) &&
            rootopaque->btpo_level == rootlevel &&
            P_LEFTMOST(rootopaque) &&
            P_RIGHTMOST(rootopaque))
        {
            /* OK, accept cached page as the root */
            return rootbuf;
        }
        _bt_relbuf(rel, rootbuf);
        /* Cache is stale, throw it away */
        if (rel->rd_amcache)
            pfree(rel->rd_amcache);
        rel->rd_amcache = NULL;
    }

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metad = _bt_getmeta(rel, metabuf);

    /* if no root page initialized yet, do it */
    if (metad->btm_root == P_NONE)
    {
        Page        metapg;

        /* If access = BT_READ, caller doesn't want us to create root yet */
        if (access == BT_READ)
        {
            _bt_relbuf(rel, metabuf);
            return InvalidBuffer;
        }

        /* trade in our read lock for a write lock */
        _bt_unlockbuf(rel, metabuf);
        _bt_lockbuf(rel, metabuf, BT_WRITE);

        /*
         * Race condition: if someone else initialized the metadata between
         * the time we released the read lock and acquired the write lock, we
         * must avoid doing it again.
         */
        if (metad->btm_root != P_NONE)
        {
            /*
             * Metadata initialized by someone else.  In order to guarantee no
             * deadlocks, we have to release the metadata page and start all
             * over again.  (Is that really true?  But it's hardly worth trying
             * to optimize this case.)
             */
            _bt_relbuf(rel, metabuf);
            return _bt_getroot(rel, access);
        }

        /*
         * Get, initialize, write, and leave a lock of the appropriate type on
         * the new root page.  Since this is the first page in the tree, it's
         * a leaf as well as the root.
         */
        rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
        rootblkno = BufferGetBlockNumber(rootbuf);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
        rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
        rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
        rootopaque->btpo_level = 0;
        rootopaque->btpo_cycleid = 0;
        /* Get raw page pointer for metapage */
        metapg = BufferGetPage(metabuf);

        /* NO ELOG(ERROR) till meta is updated */
        START_CRIT_SECTION();

        /* upgrade metapage if needed */
        if (metad->btm_version < BTREE_NOVAC_VERSION)
            _bt_upgrademetapage(metapg);

        metad->btm_root = rootblkno;
        metad->btm_level = 0;
        metad->btm_fastroot = rootblkno;
        metad->btm_fastlevel = 0;
        metad->btm_last_cleanup_num_delpages = 0;
        metad->btm_last_cleanup_num_heap_tuples = -1.0;

        MarkBufferDirty(rootbuf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_btree_newroot xlrec;
            XLogRecPtr  recptr;
            xl_btree_metadata md;

            XLogBeginInsert();
            XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
            XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

            Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
            md.version = metad->btm_version;
            md.root = rootblkno;
            md.level = 0;
            md.fastroot = rootblkno;
            md.fastlevel = 0;
            md.last_cleanup_num_delpages = 0;
            md.allequalimage = metad->btm_allequalimage;

            XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));

            xlrec.rootblk = rootblkno;
            xlrec.level = 0;

            XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);

            recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);

            PageSetLSN(rootpage, recptr);
            PageSetLSN(metapg, recptr);
        }

        END_CRIT_SECTION();

        /*
         * swap root write lock for read lock.  There is no danger of anyone
         * else accessing the new root page while it's unlocked, since no one
         * else knows where it is yet.
         */
        _bt_unlockbuf(rel, rootbuf);
        _bt_lockbuf(rel, rootbuf, BT_READ);

        /* okay, metadata is correct, release lock on it without caching */
        _bt_relbuf(rel, metabuf);
    }
    else
    {
        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

        /*
         * We are done with the metapage; arrange to release it via first
         * _bt_relandgetbuf call
         */
        rootbuf = metabuf;

        for (;;)
        {
            rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
            rootpage = BufferGetPage(rootbuf);
            rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

            if (!P_IGNORE(rootopaque))
                break;

            /* it's dead, Jim.  step right one page */
            if (P_RIGHTMOST(rootopaque))
                elog(ERROR, "no live root page found in index \"%s\"",
                     RelationGetRelationName(rel));
            rootblkno = rootopaque->btpo_next;
        }

        if (rootopaque->btpo_level != rootlevel)
            elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
                 rootblkno, RelationGetRelationName(rel),
                 rootopaque->btpo_level, rootlevel);
    }

    /*
     * By here, we have a pin and read lock on the root page, and no lock set
     * on the metadata page.  Return the root page's buffer.
     */
    return rootbuf;
}

/*
 *  _bt_gettrueroot() -- Get the true root page of the btree.
 *
 *      This is the same as the BT_READ case of _bt_getroot(), except
 *      we follow the true-root link not the fast-root link.
 *
 * By the time we acquire lock on the root page, it might have been split and
 * not be the true root anymore.  This is okay for the present uses of this
 * routine; we only really need to be able to move up at least one tree level
 * from whatever non-root page we were at.  If we ever do need to lock the
 * one true root page, we could loop here, re-reading the metapage on each
 * failure.  (Note that it wouldn't do to hold the lock on the metapage while
 * moving to the root --- that'd deadlock against any concurrent root split.)
 */
Buffer
_bt_gettrueroot(Relation rel)
{
    Buffer      metabuf;
    Page        metapg;
    BTPageOpaque metaopaque;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * We don't try to use cached metapage data here, since (a) this path is
     * not performance-critical, and (b) if we are here it suggests our cache
     * is out-of-date anyway.  In light of point (b), it's probably safest to
     * actively flush any cached metapage info.
     */
    if (rel->rd_amcache)
        pfree(rel->rd_amcache);
    rel->rd_amcache = NULL;

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    if (!P_ISMETA(metaopaque) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version < BTREE_MIN_VERSION ||
        metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, "
                        "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));

    /* if no root page initialized yet, fail */
    if (metad->btm_root == P_NONE)
    {
        _bt_relbuf(rel, metabuf);
        return InvalidBuffer;
    }

    rootblkno = metad->btm_root;
    rootlevel = metad->btm_level;

    /*
     * We are done with the metapage; arrange to release it via first
     * _bt_relandgetbuf call
     */
    rootbuf = metabuf;

    for (;;)
    {
        rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        if (!P_IGNORE(rootopaque))
            break;

        /* it's dead, Jim.  step right one page */
        if (P_RIGHTMOST(rootopaque))
            elog(ERROR, "no live root page found in index \"%s\"",
                 RelationGetRelationName(rel));
        rootblkno = rootopaque->btpo_next;
    }

    if (rootopaque->btpo_level != rootlevel)
        elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
             rootblkno, RelationGetRelationName(rel),
             rootopaque->btpo_level, rootlevel);

    return rootbuf;
}

/*
 *  _bt_getrootheight() -- Get the height of the btree search tree.
 *
 *      We return the level (counting from zero) of the current fast root.
 *      This represents the number of tree levels we'd have to descend through
 *      to start any btree index search.
 *
 *      This is used by the planner for cost-estimation purposes.  Since it's
 *      only an estimate, slightly-stale data is fine, hence we don't worry
 *      about updating previously cached data.
 */
int
_bt_getrootheight(Relation rel)
{
    BTMetaPageData *metad;

    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metad = _bt_getmeta(rel, metabuf);

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here and report the index height is zero.
         * (XXX perhaps _bt_getroot() should be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            _bt_relbuf(rel, metabuf);
            return 0;
        }

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
        _bt_relbuf(rel, metabuf);
    }

    /* Get cached page */
    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
    Assert(metad->btm_version <= BTREE_VERSION);
    Assert(!metad->btm_allequalimage ||
           metad->btm_version > BTREE_NOVAC_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    return metad->btm_fastlevel;
}

/*
 *  _bt_metaversion() -- Get version/status info from metapage.
 *
 *      Sets caller's *heapkeyspace and *allequalimage arguments using data
 *      from the B-Tree metapage (could be locally-cached version).  This
 *      information needs to be stashed in insertion scankey, so we provide a
 *      single function that fetches both at once.
 *
 *      This is used to determine the rules that must be used to descend a
 *      btree.  Version 4 indexes treat heap TID as a tiebreaker attribute.
 *      pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
 *      performance when inserting a new BTScanInsert-wise duplicate tuple
 *      among many leaf pages already full of such duplicates.
 *
 *      Also sets allequalimage field, which indicates whether or not it is
 *      safe to apply deduplication.  We rely on the assumption that
 *      btm_allequalimage will be zero'ed on heapkeyspace indexes that were
 *      pg_upgrade'd from Postgres 12.
 */
void
_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
{
    BTMetaPageData *metad;

    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metad = _bt_getmeta(rel, metabuf);

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here.  (XXX perhaps _bt_getroot() should
         * be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
            *allequalimage = metad->btm_allequalimage;

            _bt_relbuf(rel, metabuf);
            return;
        }

        /*
         * Cache the metapage data for next time
         *
         * An on-the-fly version upgrade performed by _bt_upgrademetapage()
         * can change the nbtree version for an index without invalidating any
         * local cache.  This is okay because it can only happen when moving
         * from version 2 to version 3, both of which are !heapkeyspace
         * versions.
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
        _bt_relbuf(rel, metabuf);
    }

    /* Get cached page */
    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
    Assert(metad->btm_version <= BTREE_VERSION);
    Assert(!metad->btm_allequalimage ||
           metad->btm_version > BTREE_NOVAC_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
    *allequalimage = metad->btm_allequalimage;
}
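
/*
 * Minimal usage sketch (hypothetical caller, not used anywhere): both status
 * fields are typically stashed in an insertion scankey, roughly the way
 * nbtutils.c's _bt_mkscankey() consumes them.
 */
#ifdef NOT_USED
static void
example_stash_metaversion(Relation rel, BTScanInsert key)
{
    _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
}
#endif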

/*
 *  _bt_checkpage() -- Verify that a freshly-read page looks sane.
 */
void
_bt_checkpage(Relation rel, Buffer buf)
{
    Page        page = BufferGetPage(buf);

    /*
     * ReadBuffer verifies that every newly-read page passes
     * PageHeaderIsValid, which means it either contains a reasonably sane
     * page header or is all-zero.  We have to defend against the all-zero
     * case, however.
     */
    if (PageIsNew(page))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains unexpected zero page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));

    /*
     * Additionally check that the special area looks sane.
     */
    if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains corrupted page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));
}

/*
 * Log the reuse of a page from the FSM.
 */
static void
_bt_log_reuse_page(Relation rel, BlockNumber blkno, FullTransactionId safexid)
{
    xl_btree_reuse_page xlrec_reuse;

    /*
     * Note that we don't register the buffer with the record, because this
     * operation doesn't modify the page.  This record only exists to provide
     * a conflict point for Hot Standby.
     */

    /* XLOG stuff */
    xlrec_reuse.node = rel->rd_node;
    xlrec_reuse.block = blkno;
    xlrec_reuse.latestRemovedFullXid = safexid;

    XLogBeginInsert();
    XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);

    XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
}

/*
 *  _bt_getbuf() -- Get a buffer by block number for read or write.
 *
 *      blkno == P_NEW means to get an unallocated index page.  The page
 *      will be initialized before returning it.
 *
 *      The general rule in nbtree is that it's never okay to access a
 *      page without holding both a buffer pin and a buffer lock on
 *      the page's buffer.
 *
 *      When this routine returns, the appropriate lock is set on the
 *      requested buffer and its reference count has been incremented
 *      (ie, the buffer is "locked and pinned").  Also, we apply
 *      _bt_checkpage to sanity-check the page (except in P_NEW case),
 *      and perform Valgrind client requests that help Valgrind detect
 *      unsafe page accesses.
 *
 *      Note: raw LockBuffer() calls are disallowed in nbtree; all
 *      buffer lock requests need to go through wrapper functions such
 *      as _bt_lockbuf().
 */
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
    Buffer      buf;

    if (blkno != P_NEW)
    {
        /* Read an existing block of the relation */
        buf = ReadBuffer(rel, blkno);
        _bt_lockbuf(rel, buf, access);
        _bt_checkpage(rel, buf);
    }
    else
    {
        bool        needLock;
        Page        page;

        Assert(access == BT_WRITE);

        /*
         * First see if the FSM knows of any free pages.
         *
         * We can't trust the FSM's report unreservedly; we have to check that
         * the page is still free.  (For example, an already-free page could
         * have been re-used between the time the last VACUUM scanned it and
         * the time the VACUUM made its FSM updates.)
         *
         * In fact, it's worse than that: we can't even assume that it's safe
         * to take a lock on the reported page.  If somebody else has a lock
         * on it, or even worse our own caller does, we could deadlock.  (The
         * own-caller scenario is actually not improbable.  Consider an index
         * on a serial or timestamp column.  Nearly all splits will be at the
         * rightmost page, so it's entirely likely that _bt_split will call us
         * while holding a lock on the page most recently acquired from FSM.
         * A VACUUM running concurrently with the previous split could well
         * have placed that page back in FSM.)
         *
         * To get around that, we ask for only a conditional lock on the
         * reported page.  If we fail, then someone else is using the page,
         * and we may reasonably assume it's not free.  (If we happen to be
         * wrong, the worst consequence is the page will be lost to use till
         * the next VACUUM, which is no big problem.)
         */
        for (;;)
        {
            blkno = GetFreeIndexPage(rel);
            if (blkno == InvalidBlockNumber)
                break;
            buf = ReadBuffer(rel, blkno);
            if (_bt_conditionallockbuf(rel, buf))
            {
                page = BufferGetPage(buf);

                /*
                 * It's possible to find an all-zeroes page in an index.  For
                 * example, a backend might successfully extend the relation
                 * one page and then crash before it is able to make a WAL
                 * entry for adding the page.  If we find a zeroed page then
                 * reclaim it immediately.
                 */
                if (PageIsNew(page))
                {
                    /* Okay to use page.  Initialize and return it. */
                    _bt_pageinit(page, BufferGetPageSize(buf));
                    return buf;
                }

                if (BTPageIsRecyclable(page))
                {
                    /*
                     * If we are generating WAL for Hot Standby then create a
                     * WAL record that will allow us to conflict with queries
                     * running on standby, in case they have snapshots older
                     * than safexid value
                     */
                    if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
                        _bt_log_reuse_page(rel, blkno,
                                           BTPageGetDeleteXid(page));

                    /* Okay to use page.  Re-initialize and return it. */
                    _bt_pageinit(page, BufferGetPageSize(buf));
                    return buf;
                }
                elog(DEBUG2, "FSM returned nonrecyclable page");
                _bt_relbuf(rel, buf);
            }
            else
            {
                elog(DEBUG2, "FSM returned nonlockable page");
                /* couldn't get lock, so just drop pin */
                ReleaseBuffer(buf);
            }
        }

        /*
         * Extend the relation by one page.
         *
         * We have to use a lock to ensure no one else is extending the rel at
         * the same time, else we will both try to initialize the same new
         * page.  We can skip locking for new or temp relations, however,
         * since no one else could be accessing them.
         */
        needLock = !RELATION_IS_LOCAL(rel);

        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);

        buf = ReadBuffer(rel, P_NEW);

        /* Acquire buffer lock on new page */
        _bt_lockbuf(rel, buf, BT_WRITE);

        /*
         * Release the file-extension lock; it's now OK for someone else to
         * extend the relation some more.  Note that we cannot release this
         * lock before we have buffer lock on the new page, or we risk a race
         * condition against btvacuumscan --- see comments therein.
         */
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Initialize the new page before returning it */
        page = BufferGetPage(buf);
        Assert(PageIsNew(page));
        _bt_pageinit(page, BufferGetPageSize(buf));
    }

    /* ref count and lock type are correct */
    return buf;
}

/*
 *  _bt_relandgetbuf() -- release a locked buffer and get another one.
 *
 * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
 * exception that blkno may not be P_NEW.  Also, if obuf is InvalidBuffer
 * then it reduces to just _bt_getbuf; allowing this case simplifies some
 * callers.
 *
 * The original motivation for using this was to avoid two entries to the
 * bufmgr when one would do.  However, now it's mainly just a notational
 * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
 * is when the target page is the same one already in the buffer.
 */
Buffer
_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
{
    Buffer      buf;

    Assert(blkno != P_NEW);
    if (BufferIsValid(obuf))
        _bt_unlockbuf(rel, obuf);
    buf = ReleaseAndReadBuffer(obuf, rel, blkno);
    _bt_lockbuf(rel, buf, access);

    _bt_checkpage(rel, buf);
    return buf;
}

/*
 *  _bt_relbuf() -- release a locked buffer.
 *
 * Lock and pin (refcount) are both dropped.
 */
void
_bt_relbuf(Relation rel, Buffer buf)
{
    _bt_unlockbuf(rel, buf);
    ReleaseBuffer(buf);
}
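
/*
 * Minimal sketch of the pin-and-lock discipline described in _bt_getbuf()'s
 * header comment (hypothetical caller, not used anywhere): a page may only
 * be examined while both the pin and the lock from _bt_getbuf() are held,
 * and _bt_relbuf() gives both up together.
 */
#ifdef NOT_USED
static void
example_read_one_page(Relation rel, BlockNumber blkno)
{
    Buffer      buf = _bt_getbuf(rel, blkno, BT_READ);
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /* ... safe to examine page and opaque only up to here ... */
    _bt_relbuf(rel, buf);
    /* page and opaque must not be dereferenced after this point */
}
#endif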

/*
 *  _bt_lockbuf() -- lock a pinned buffer.
 *
 * Lock is acquired without acquiring another pin.  This is like a raw
 * LockBuffer() call, but performs extra steps needed by Valgrind.
 *
 * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
 * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
 */
void
_bt_lockbuf(Relation rel, Buffer buf, int access)
{
    /* LockBuffer() asserts that pin is held by this backend */
    LockBuffer(buf, access);

    /*
     * It doesn't matter that _bt_unlockbuf() won't get called in the event of
     * an nbtree error (e.g. a unique violation error).  That won't cause
     * Valgrind false positives.
     *
     * The nbtree client requests are superimposed on top of the bufmgr.c
     * buffer pin client requests.  In the event of an nbtree error the buffer
     * will certainly get marked as defined when the backend once again
     * acquires its first pin on the buffer.  (Of course, if the backend never
     * touches the buffer again then it doesn't matter that it remains
     * non-accessible to Valgrind.)
     *
     * Note: When an IndexTuple C pointer gets computed using an ItemId read
     * from a page while a lock was held, the C pointer becomes unsafe to
     * dereference forever as soon as the lock is released.  Valgrind can only
     * detect cases where the pointer gets dereferenced with no _current_
     * lock/pin held, though.
     */
    if (!RelationUsesLocalBuffers(rel))
        VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
}

/*
 *  _bt_unlockbuf() -- unlock a pinned buffer.
 */
void
_bt_unlockbuf(Relation rel, Buffer buf)
{
    /*
     * Buffer is pinned and locked, which means that it is expected to be
     * defined and addressable.  Check that proactively.
     */
    VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);

    /* LockBuffer() asserts that pin is held by this backend */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);

    if (!RelationUsesLocalBuffers(rel))
        VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
}

/*
 *  _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
 *  buffer.
 *
 * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
 * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
 */
bool
_bt_conditionallockbuf(Relation rel, Buffer buf)
{
    /* ConditionalLockBuffer() asserts that pin is held by this backend */
    if (!ConditionalLockBuffer(buf))
        return false;

    if (!RelationUsesLocalBuffers(rel))
        VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);

    return true;
}

/*
 *  _bt_upgradelockbufcleanup() -- upgrade lock to super-exclusive/cleanup
 *  lock.
 */
void
_bt_upgradelockbufcleanup(Relation rel, Buffer buf)
{
    /*
     * Buffer is pinned and locked, which means that it is expected to be
     * defined and addressable.  Check that proactively.
     */
    VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);

    /* LockBuffer() asserts that pin is held by this backend */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    LockBufferForCleanup(buf);
}

/*
 *  _bt_pageinit() -- Initialize a new page.
 *
 * On return, the page header is initialized; data space is empty;
 * special space is zeroed out.
 */
void
_bt_pageinit(Page page, Size size)
{
    PageInit(page, size, sizeof(BTPageOpaqueData));
}

/*
 * Delete item(s) from a btree leaf page during VACUUM.
 *
 * This routine assumes that the caller has a super-exclusive write lock on
 * the buffer.  Also, the given deletable and updatable arrays *must* be
 * sorted in ascending order.
 *
 * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
 * in an existing posting list item are to be removed.  This works by
 * updating/overwriting an existing item with caller's new version of the item
 * (a version that lacks the TIDs that are to be deleted).
 *
 * We record VACUUMs and b-tree deletes differently in WAL.  Deletes must
 * generate their own latestRemovedXid by accessing the table directly,
 * whereas VACUUMs rely on the initial VACUUM table scan performing
 * WAL-logging that takes care of the issue for the table's indexes
 * indirectly.  Also, we remove the VACUUM cycle ID from pages, which b-tree
 * deletes don't do.
 */
void
_bt_delitems_vacuum(Relation rel, Buffer buf,
                    OffsetNumber *deletable, int ndeletable,
                    BTVacuumPosting *updatable, int nupdatable)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;
    bool        needswal = RelationNeedsWAL(rel);
    char       *updatedbuf = NULL;
    Size        updatedbuflen = 0;
    OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];

    /* Shouldn't be called unless there's something to do */
    Assert(ndeletable > 0 || nupdatable > 0);

    /* Generate new version of posting lists without deleted TIDs */
    if (nupdatable > 0)
        updatedbuf = _bt_delitems_update(updatable, nupdatable,
                                         updatedoffsets, &updatedbuflen,
                                         needswal);

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /*
     * Handle posting tuple updates.
     *
     * Deliberately do this before handling simple deletes.  If we did it the
     * other way around (i.e. WAL record order -- simple deletes before
     * updates) then we'd have to make compensating changes to the 'updatable'
     * array of offset numbers.
     *
     * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
     * happens to already be set.  It's important that we not interfere with
     * _bt_delitems_delete().
     */
    for (int i = 0; i < nupdatable; i++)
    {
        OffsetNumber updatedoffset = updatedoffsets[i];
        IndexTuple  itup;
        Size        itemsz;

        itup = updatable[i]->itup;
        itemsz = MAXALIGN(IndexTupleSize(itup));
        if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
                                     itemsz))
            elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
                 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    }

    /* Now handle simple deletes of entire tuples */
    if (ndeletable > 0)
        PageIndexMultiDelete(page, deletable, ndeletable);

    /*
     * We can clear the vacuum cycle ID since this page has certainly been
     * processed by the current vacuum scan.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_cycleid = 0;

    /*
     * Clear the BTP_HAS_GARBAGE page flag.
     *
     * This flag indicates the presence of LP_DEAD items on the page (though
     * not reliably).  Note that we only rely on it with pg_upgrade'd
     * !heapkeyspace indexes.  That's why clearing it here won't usually
     * interfere with _bt_delitems_delete().
     */
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (needswal)
    {
        XLogRecPtr  recptr;
        xl_btree_vacuum xlrec_vacuum;

        xlrec_vacuum.ndeleted = ndeletable;
        xlrec_vacuum.nupdated = nupdatable;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);

        if (ndeletable > 0)
            XLogRegisterBufData(0, (char *) deletable,
                                ndeletable * sizeof(OffsetNumber));

        if (nupdatable > 0)
        {
            XLogRegisterBufData(0, (char *) updatedoffsets,
                                nupdatable * sizeof(OffsetNumber));
            XLogRegisterBufData(0, updatedbuf, updatedbuflen);
        }

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();

    /* can't leak memory here */
    if (updatedbuf != NULL)
        pfree(updatedbuf);
    /* free tuples allocated within _bt_delitems_update() */
    for (int i = 0; i < nupdatable; i++)
        pfree(updatable[i]->itup);
}

/*
 * Delete item(s) from a btree leaf page during single-page cleanup.
 *
 * This routine assumes that the caller has pinned and write locked the
 * buffer.  Also, the given deletable and updatable arrays *must* be sorted in
 * ascending order.
 *
 * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
 * in an existing posting list item are to be removed.  This works by
 * updating/overwriting an existing item with caller's new version of the item
 * (a version that lacks the TIDs that are to be deleted).
 *
 * This is nearly the same as _bt_delitems_vacuum as far as what it does to
 * the page, but it needs its own latestRemovedXid from caller (caller gets
 * this from tableam).  This is used by the REDO routine to generate recovery
 * conflicts.  The other difference is that only _bt_delitems_vacuum will
 * clear page's VACUUM cycle ID.
 */
static void
_bt_delitems_delete(Relation rel, Buffer buf, TransactionId latestRemovedXid,
                    OffsetNumber *deletable, int ndeletable,
                    BTVacuumPosting *updatable, int nupdatable)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;
    bool        needswal = RelationNeedsWAL(rel);
    char       *updatedbuf = NULL;
    Size        updatedbuflen = 0;
    OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];

    /* Shouldn't be called unless there's something to do */
    Assert(ndeletable > 0 || nupdatable > 0);

    /* Generate new versions of posting lists without deleted TIDs */
    if (nupdatable > 0)
        updatedbuf = _bt_delitems_update(updatable, nupdatable,
                                         updatedoffsets, &updatedbuflen,
                                         needswal);

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* Handle updates and deletes just like _bt_delitems_vacuum */
    for (int i = 0; i < nupdatable; i++)
    {
        OffsetNumber updatedoffset = updatedoffsets[i];
        IndexTuple  itup;
        Size        itemsz;

        itup = updatable[i]->itup;
        itemsz = MAXALIGN(IndexTupleSize(itup));
        if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
                                     itemsz))
            elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
                 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    }

    if (ndeletable > 0)
        PageIndexMultiDelete(page, deletable, ndeletable);

    /*
     * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at
     * this point.  The VACUUM command alone controls vacuum cycle IDs.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /*
     * Clear the BTP_HAS_GARBAGE page flag.
     *
     * This flag indicates the presence of LP_DEAD items on the page (though
     * not reliably).  Note that we only rely on it with pg_upgrade'd
     * !heapkeyspace indexes.
     */
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (needswal)
    {
        XLogRecPtr  recptr;
        xl_btree_delete xlrec_delete;

        xlrec_delete.latestRemovedXid = latestRemovedXid;
        xlrec_delete.ndeleted = ndeletable;
        xlrec_delete.nupdated = nupdatable;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);

        if (ndeletable > 0)
            XLogRegisterBufData(0, (char *) deletable,
                                ndeletable * sizeof(OffsetNumber));

        if (nupdatable > 0)
        {
            XLogRegisterBufData(0, (char *) updatedoffsets,
                                nupdatable * sizeof(OffsetNumber));
            XLogRegisterBufData(0, updatedbuf, updatedbuflen);
        }

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();

    /* can't leak memory here */
    if (updatedbuf != NULL)
        pfree(updatedbuf);
    /* free tuples allocated within _bt_delitems_update() */
    for (int i = 0; i < nupdatable; i++)
        pfree(updatable[i]->itup);
}

/*
 * Set up state needed to delete TIDs from posting list tuples via "updating"
 * the tuple.  Performs steps common to both _bt_delitems_vacuum and
 * _bt_delitems_delete.  These steps must take place before each function's
 * critical section begins.
 *
 * updatable and nupdatable are inputs, though note that we will use
 * _bt_update_posting() to replace the original itup with a pointer to a final
 * version in palloc()'d memory.  Caller should free the tuples when it's
 * done.
 *
 * The first nupdatable entries from updatedoffsets are set to the page offset
 * number for posting list tuples that caller updates.  This is mostly useful
 * because caller may need to WAL-log the page offsets (though we always do
 * this for caller out of convenience).
 *
 * Returns buffer consisting of an array of xl_btree_update structs that
 * describe the steps we perform here for caller (though only when needswal is
 * true).  Also sets *updatedbuflen to the final size of the buffer.  This
 * buffer is used by caller when WAL logging is required.
 */
static char *
_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
                    OffsetNumber *updatedoffsets, Size *updatedbuflen,
                    bool needswal)
{
    char       *updatedbuf = NULL;
    Size        buflen = 0;

    /* Shouldn't be called unless there's something to do */
    Assert(nupdatable > 0);

    for (int i = 0; i < nupdatable; i++)
    {
        BTVacuumPosting vacposting = updatable[i];
        Size        itemsz;

        /* Replace work area IndexTuple with updated version */
        _bt_update_posting(vacposting);

        /* Keep track of size of xl_btree_update for updatedbuf in passing */
        itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);
        buflen += itemsz;

        /* Build updatedoffsets buffer in passing */
        updatedoffsets[i] = vacposting->updatedoffset;
    }

    /* XLOG stuff */
    if (needswal)
    {
        Size        offset = 0;

        /* Allocate, set final size for caller */
        updatedbuf = palloc(buflen);
        *updatedbuflen = buflen;
        for (int i = 0; i < nupdatable; i++)
        {
            BTVacuumPosting vacposting = updatable[i];
            Size        itemsz;
            xl_btree_update update;

            update.ndeletedtids = vacposting->ndeletedtids;
            memcpy(updatedbuf + offset, &update.ndeletedtids,
                   SizeOfBtreeUpdate);
            offset += SizeOfBtreeUpdate;

            itemsz = update.ndeletedtids * sizeof(uint16);
            memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
            offset += itemsz;
        }
    }

    return updatedbuf;
}
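
/*
 * Layout sketch for the buffer built above, assuming (purely for
 * illustration) two updatable posting list tuples with 3 and 1 deleted TIDs
 * respectively:
 *
 *   xl_btree_update { ndeletedtids = 3 }   (SizeOfBtreeUpdate bytes)
 *   uint16 deletetids[3]
 *   xl_btree_update { ndeletedtids = 1 }
 *   uint16 deletetids[1]
 *
 * The REDO routines walk this array alongside the updatedoffsets array to
 * reconstruct each updated posting list tuple.
 */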

/*
 * Comparator used by _bt_delitems_delete_check() to restore deltids array
 * back to its original leaf-page-wise sort order
 */
static int
_bt_delitems_cmp(const void *a, const void *b)
{
    TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a;
    TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b;

    if (indexdelete1->id > indexdelete2->id)
        return 1;
    if (indexdelete1->id < indexdelete2->id)
        return -1;

    Assert(false);

    return 0;
}
1489
1490 /*
1491 * Try to delete item(s) from a btree leaf page during single-page cleanup.
1492 *
1493 * nbtree interface to table_index_delete_tuples(). Deletes a subset of index
1494 * tuples from caller's deltids array: those whose TIDs are found safe to
1495 * delete by the tableam (or already marked LP_DEAD in index, and so already
1496 * known to be deletable by our simple index deletion caller). We physically
1497 * delete index tuples from buf leaf page last of all (for index tuples where
1498 * that is known to be safe following our table_index_delete_tuples() call).
1499 *
1500 * Simple index deletion caller only includes TIDs from index tuples marked
1501 * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be
1502 * included without increasing the total number of distinct table blocks for
1503 * the deletion operation as a whole. This approach often allows us to delete
1504 * some extra index tuples that were practically free for tableam to check in
1505 * passing (when they actually turn out to be safe to delete). It probably
1506 * only makes sense for the tableam to go ahead with these extra checks when
1507 * it is block-oriented (otherwise the checks probably won't be practically
1508 * free, which we rely on). The tableam interface requires the tableam side
1509 * to handle the problem, though, so this is okay (we as an index AM are free
1510 * to make the simplifying assumption that all tableams must be block-based).
1511 *
1512 * Bottom-up index deletion caller provides all the TIDs from the leaf page,
1513 * without expecting that tableam will check most of them. The tableam has
1514 * considerable discretion around which entries/blocks it checks. Our role in
1515 * costing the bottom-up deletion operation is strictly advisory.
1516 *
1517 * Note: Caller must have added deltids entries (i.e. entries that go in
1518 * delstate's main array) in leaf-page-wise order: page offset number order,
1519 * TID order among entries taken from the same posting list tuple (tiebreak on
1520 * TID). This order is convenient to work with here.
1521 *
1522 * Note: We also rely on the id field of each deltids element "capturing" this
1523 * original leaf-page-wise order. That is, we expect to be able to get back
1524 * to the original leaf-page-wise order just by sorting deltids on the id
1525 * field (tableam will sort deltids for its own reasons, so we'll need to put
1526 * it back in leaf-page-wise order afterwards).
1527 */
1528 void
_bt_delitems_delete_check(Relation rel,Buffer buf,Relation heapRel,TM_IndexDeleteOp * delstate)1529 _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel,
1530 TM_IndexDeleteOp *delstate)
1531 {
1532 Page page = BufferGetPage(buf);
1533 TransactionId latestRemovedXid;
1534 OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1535 int ndeletable = 0,
1536 nupdatable = 0;
1537 OffsetNumber deletable[MaxIndexTuplesPerPage];
1538 BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1539
1540 /* Use tableam interface to determine which tuples to delete first */
1541 latestRemovedXid = table_index_delete_tuples(heapRel, delstate);
1542
1543 /* Should not WAL-log latestRemovedXid unless it's required */
1544 if (!XLogStandbyInfoActive() || !RelationNeedsWAL(rel))
1545 latestRemovedXid = InvalidTransactionId;
1546
1547 /*
1548 * Construct a leaf-page-wise description of what _bt_delitems_delete()
1549 * needs to do to physically delete index tuples from the page.
1550 *
1551 * Must sort deltids array to restore leaf-page-wise order (original order
1552 * before call to tableam). This is the order that the loop expects.
1553 *
1554 * Note that deltids array might be a lot smaller now. It might even have
1555 * no entries at all (with bottom-up deletion caller), in which case there
1556 * is nothing left to do.
1557 */
1558 qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
1559 _bt_delitems_cmp);
1560 if (delstate->ndeltids == 0)
1561 {
1562 Assert(delstate->bottomup);
1563 return;
1564 }
1565
1566 /* We definitely have to delete at least one index tuple (or one TID) */
1567 for (int i = 0; i < delstate->ndeltids; i++)
1568 {
1569 TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1570 OffsetNumber idxoffnum = dstatus->idxoffnum;
1571 ItemId itemid = PageGetItemId(page, idxoffnum);
1572 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
1573 int nestedi,
1574 nitem;
1575 BTVacuumPosting vacposting;
1576
1577 Assert(OffsetNumberIsValid(idxoffnum));
1578
1579 if (idxoffnum == postingidxoffnum)
1580 {
1581 /*
1582 * This deltid entry is a TID from a posting list tuple that has
1583 * already been completely processed
1584 */
1585 Assert(BTreeTupleIsPosting(itup));
1586 Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
1587 &delstate->deltids[i].tid) < 0);
1588 Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
1589 &delstate->deltids[i].tid) >= 0);
1590 continue;
1591 }
1592
1593 if (!BTreeTupleIsPosting(itup))
1594 {
1595 /* Plain non-pivot tuple */
1596 Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1597 if (dstatus->knowndeletable)
1598 deletable[ndeletable++] = idxoffnum;
1599 continue;
1600 }
1601
1602 /*
1603 * itup is a posting list tuple whose lowest deltids entry (which may
1604 * or may not be for the first TID from itup) is considered here now.
1605 * We should process all of the deltids entries for the posting list
1606 * together now, though (not just the lowest). Remember to skip over
1607 * later itup-related entries during later iterations of outermost
1608 * loop.
1609 */
1610 postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1611 nestedi = i; /* Initialize for first itup deltids entry */
1612 vacposting = NULL; /* Describes final action for itup */
1613 nitem = BTreeTupleGetNPosting(itup);
1614 for (int p = 0; p < nitem; p++)
1615 {
1616 ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1617 int ptidcmp = -1;
1618
1619 /*
1620 * This nested loop reuses work across ptid TIDs taken from itup.
1621 * We take advantage of the fact that both itup's TIDs and deltids
1622 * entries (within a single itup/posting list grouping) must both
1623 * be in ascending TID order.
1624 */
1625 for (; nestedi < delstate->ndeltids; nestedi++)
1626 {
1627 TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1628 TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1629
1630 /* Stop once we get past all itup related deltids entries */
1631 Assert(tdstatus->idxoffnum >= idxoffnum);
1632 if (tdstatus->idxoffnum != idxoffnum)
1633 break;
1634
1635 /* Skip past non-deletable itup related entries up front */
1636 if (!tdstatus->knowndeletable)
1637 continue;
1638
1639 /* Entry is first partial ptid match (or an exact match)? */
1640 ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1641 if (ptidcmp >= 0)
1642 {
1643 /* Greater than or equal (partial or exact) match... */
1644 break;
1645 }
1646 }
1647
1648 /* ...exact ptid match to a deletable deltids entry? */
1649 if (ptidcmp != 0)
1650 continue;
1651
1652 /* Exact match for deletable deltids entry -- ptid gets deleted */
1653 if (vacposting == NULL)
1654 {
1655 vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1656 nitem * sizeof(uint16));
1657 vacposting->itup = itup;
1658 vacposting->updatedoffset = idxoffnum;
1659 vacposting->ndeletedtids = 0;
1660 }
1661 vacposting->deletetids[vacposting->ndeletedtids++] = p;
1662 }
1663
1664 /* Final decision on itup, a posting list tuple */
1665
1666 if (vacposting == NULL)
1667 {
1668 /* No TIDs to delete from itup -- do nothing */
1669 }
1670 else if (vacposting->ndeletedtids == nitem)
1671 {
1672 /* Straight delete of itup (to delete all TIDs) */
1673 deletable[ndeletable++] = idxoffnum;
1674 /* Turns out we won't need granular information */
1675 pfree(vacposting);
1676 }
1677 else
1678 {
1679 /* Delete some (but not all) TIDs from itup */
1680 Assert(vacposting->ndeletedtids > 0 &&
1681 vacposting->ndeletedtids < nitem);
1682 updatable[nupdatable++] = vacposting;
1683 }
1684 }
1685
1686 /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1687 _bt_delitems_delete(rel, buf, latestRemovedXid, deletable, ndeletable,
1688 updatable, nupdatable);
1689
1690 /* be tidy */
1691 for (int i = 0; i < nupdatable; i++)
1692 pfree(updatable[i]);
1693 }
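
/*
 * Illustrative sketch only (the comparator actually used above,
 * _bt_delitems_cmp, is defined earlier in this file): a qsort comparator
 * over TM_IndexDelete entries that restores leaf-page-wise order by
 * comparing the id fields that captured it.  Since ids are unique, the
 * equal case is can't-happen:
 *
 *	static int
 *	deltids_id_cmp_sketch(const void *a, const void *b)
 *	{
 *		const TM_IndexDelete *d1 = (const TM_IndexDelete *) a;
 *		const TM_IndexDelete *d2 = (const TM_IndexDelete *) b;
 *
 *		if (d1->id > d2->id)
 *			return 1;
 *		if (d1->id < d2->id)
 *			return -1;
 *		Assert(false);
 *		return 0;
 *	}
 */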
1694
1695 /*
1696 * Check that leftsib page (the btpo_prev of target page) is not marked with
1697 * INCOMPLETE_SPLIT flag. Used during page deletion.
1698 *
1699 * Returning true indicates that the flag is set in leftsib (which is
1700 * definitely still the left sibling of target). When that happens, the
1701 * target doesn't have a downlink in parent, and the page deletion algorithm
1702 * isn't prepared to handle that. Deletion of the target page (or the whole
1703 * subtree that contains the target page) cannot take place.
1704 *
1705 * Caller should not have a lock on the target page itself, since pages on the
1706 * same level must always be locked left to right to avoid deadlocks.
1707 */
1708 static bool
1709 _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
1710 {
1711 Buffer buf;
1712 Page page;
1713 BTPageOpaque opaque;
1714 bool result;
1715
1716 /* Easy case: No left sibling */
1717 if (leftsib == P_NONE)
1718 return false;
1719
1720 buf = _bt_getbuf(rel, leftsib, BT_READ);
1721 page = BufferGetPage(buf);
1722 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1723
1724 /*
1725 * If the left sibling was concurrently split, so that its next-pointer
1726 * doesn't point to the current page anymore, the split that created
1727 * target must be completed. Caller can reasonably expect that there will
1728 * be a downlink to the target page that it can relocate using its stack.
1729 * (We don't allow splitting an incompletely split page again until the
1730 * previous split has been completed.)
1731 */
1732 result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
1733 _bt_relbuf(rel, buf);
1734
1735 return result;
1736 }
1737
1738 /*
1739 * Check that leafrightsib page (the btpo_next of target leaf page) is not
1740 * marked with ISHALFDEAD flag. Used during page deletion.
1741 *
1742 * Returning true indicates that the flag is set in leafrightsib, so page
1743 * deletion cannot go ahead. Our caller is not prepared to deal with the case
1744 * where the parent page does not have a pivot tuple whose downlink points to
1745 * leafrightsib (due to an earlier interrupted VACUUM operation). It doesn't
1746 * seem worth going to the trouble of teaching our caller to deal with it.
1747 * The situation will be resolved after VACUUM finishes the deletion of the
1748 * half-dead page (when a future VACUUM operation reaches the target page
1749 * again).
1750 *
1751 * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
1752 * _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is
1753 * okay because of the restriction on deleting pages that are the rightmost
1754 * page of their parent (i.e. that such deletions can only take place when the
1755 * entire subtree must be deleted). The leaf level check made here will apply
1756 * to a right "cousin" leaf page rather than a simple right sibling leaf page
1757 * in cases where caller actually goes on to attempt deleting pages that are
1758 * above the leaf page. The right cousin leaf page is representative of the
1759 * left edge of the subtree to the right of the to-be-deleted subtree as a
1760 * whole, which is exactly the condition that our caller cares about.
1761 * (Besides, internal pages are never marked half-dead, so it isn't even
1762 * possible to _directly_ assess if an internal page is part of some other
1763 * to-be-deleted subtree.)
1764 */
1765 static bool
1766 _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
1767 {
1768 Buffer buf;
1769 Page page;
1770 BTPageOpaque opaque;
1771 bool result;
1772
1773 Assert(leafrightsib != P_NONE);
1774
1775 buf = _bt_getbuf(rel, leafrightsib, BT_READ);
1776 page = BufferGetPage(buf);
1777 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1778
1779 Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
1780 result = P_ISHALFDEAD(opaque);
1781 _bt_relbuf(rel, buf);
1782
1783 return result;
1784 }
1785
1786 /*
1787 * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
1788 *
1789 * This action unlinks the leaf page from the b-tree structure, removing all
1790 * pointers leading to it --- but not touching its own left and right links.
1791 * The page cannot be physically reclaimed right away, since other processes
1792 * may currently be trying to follow links leading to the page; they have to
1793 * be allowed to use its right-link to recover. See nbtree/README.
1794 *
1795 * On entry, the target buffer must be pinned and locked (either read or write
1796 * lock is OK). The page must be an empty leaf page, which may be half-dead
1797 * already (a half-dead page should only be passed to us when an earlier
1798 * VACUUM operation was interrupted, though). Note in particular that caller
1799 * should never pass a buffer containing an existing deleted page here. The
1800 * lock and pin on caller's buffer will be dropped before we return.
1801 *
1802 * Maintains bulk delete stats for caller, which are taken from vstate. We
1803 * need to cooperate closely with caller here so that whole VACUUM operation
1804 * reliably avoids any double counting of subsidiary-to-leafbuf pages that we
1805 * delete in passing. If such pages happen to be from a block number that is
1806 * ahead of the current scanblkno position, then caller is expected to count
1807 * them directly later on. It's simpler for us to understand caller's
1808 * requirements than it would be for caller to understand when or how a
1809 * deleted page became deleted after the fact.
1810 *
1811 * NOTE: this leaks memory. Rather than trying to clean up everything
1812 * carefully, it's better to run it in a temp context that can be reset
1813 * frequently.
1814 */
1815 void
1816 _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
1817 {
1818 BlockNumber rightsib;
1819 bool rightsib_empty;
1820 Page page;
1821 BTPageOpaque opaque;
1822
1823 /*
1824 * Save original leafbuf block number from caller. Only deleted blocks
1825 * that are <= scanblkno are added to bulk delete stat's pages_deleted
1826 * count.
1827 */
1828 BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
1829
1830 /*
1831 * "stack" is a search stack leading (approximately) to the target page.
1832 * It is initially NULL, but when iterating, we keep it to avoid
1833 * duplicated search effort.
1834 *
1835 * Also, when "stack" is not NULL, we have already checked that the
1836 * current page is not the right half of an incomplete split, i.e. the
1837 * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1838 * when the current target page is to the right of caller's initial page
1839 * (the scanblkno page).
1840 */
1841 BTStack stack = NULL;
1842
1843 for (;;)
1844 {
1845 page = BufferGetPage(leafbuf);
1846 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1847
1848 /*
1849 * Internal pages are never deleted directly, only as part of deleting
1850 * the whole subtree all the way down to leaf level.
1851 *
1852 * Also check for deleted pages here. Caller never passes us a fully
1853 * deleted page. Only VACUUM can delete pages, so there can't have
1854 * been a concurrent deletion. Assume that we reached any deleted
1855 * page encountered here by following a sibling link, and that the
1856 * index is corrupt.
1857 */
1858 Assert(!P_ISDELETED(opaque));
1859 if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
1860 {
1861 /*
1862 * Pre-9.4 page deletion only marked internal pages as half-dead,
1863 * but now we only use that flag on leaf pages. The old algorithm
1864 * was never supposed to leave half-dead pages in the tree; it was
1865 * just a transient state, but it was nevertheless possible in
1866 * error scenarios. We don't know how to deal with them here. They
1867 * are harmless as far as searches are concerned, but inserts
1868 * into the deleted keyspace could add out-of-order downlinks in
1869 * the upper levels. Log a notice; hopefully the admin will notice
1870 * and reindex.
1871 */
1872 if (P_ISHALFDEAD(opaque))
1873 ereport(LOG,
1874 (errcode(ERRCODE_INDEX_CORRUPTED),
1875 errmsg("index \"%s\" contains a half-dead internal page",
1876 RelationGetRelationName(rel)),
1877 errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1878
1879 if (P_ISDELETED(opaque))
1880 ereport(LOG,
1881 (errcode(ERRCODE_INDEX_CORRUPTED),
1882 errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
1883 BufferGetBlockNumber(leafbuf),
1884 scanblkno,
1885 RelationGetRelationName(rel))));
1886
1887 _bt_relbuf(rel, leafbuf);
1888 return;
1889 }
1890
1891 /*
1892 * We can never delete rightmost pages nor root pages. While at it,
1893 * check that page is empty, since it's possible that the leafbuf page
1894 * was empty a moment ago, but has since had some inserts.
1895 *
1896 * To keep the algorithm simple, we also never delete an incompletely
1897 * split page (they should be rare enough that this doesn't make any
1898 * meaningful difference to disk usage):
1899 *
1900 * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1901 * left half of an incomplete split, but ensuring that it's not the
1902 * right half is more complicated. For that, we have to check that
1903 * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1904 * _bt_leftsib_splitflag(). On the first iteration, we temporarily
1905 * release the lock on scanblkno/leafbuf, check the left sibling, and
1906 * construct a search stack to scanblkno. On subsequent iterations,
1907 * we know we stepped right from a page that passed these tests, so
1908 * it's OK.
1909 */
1910 if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
1911 P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1912 P_INCOMPLETE_SPLIT(opaque))
1913 {
1914 /* Should never fail to delete a half-dead page */
1915 Assert(!P_ISHALFDEAD(opaque));
1916
1917 _bt_relbuf(rel, leafbuf);
1918 return;
1919 }
1920
1921 /*
1922 * First, remove downlink pointing to the page (or a parent of the
1923 * page, if we are going to delete a taller subtree), and mark the
1924 * leafbuf page half-dead
1925 */
1926 if (!P_ISHALFDEAD(opaque))
1927 {
1928 /*
1929 * We need an approximate pointer to the page's parent page. We
1930 * use a variant of the standard search mechanism to search for
1931 * the page's high key; this will give us a link to either the
1932 * current parent or someplace to its left (if there are multiple
1933 * equal high keys, which is possible with !heapkeyspace indexes).
1934 *
1935 * Also check if this is the right-half of an incomplete split
1936 * (see comment above).
1937 */
1938 if (!stack)
1939 {
1940 BTScanInsert itup_key;
1941 ItemId itemid;
1942 IndexTuple targetkey;
1943 BlockNumber leftsib,
1944 leafblkno;
1945 Buffer sleafbuf;
1946
1947 itemid = PageGetItemId(page, P_HIKEY);
1948 targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1949
1950 leftsib = opaque->btpo_prev;
1951 leafblkno = BufferGetBlockNumber(leafbuf);
1952
1953 /*
1954 * To avoid deadlocks, we'd better drop the leaf page lock
1955 * before going further.
1956 */
1957 _bt_unlockbuf(rel, leafbuf);
1958
1959 /*
1960 * Check that the left sibling of leafbuf (if any) is not
1961 * marked with INCOMPLETE_SPLIT flag before proceeding
1962 */
1963 Assert(leafblkno == scanblkno);
1964 if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
1965 {
1966 ReleaseBuffer(leafbuf);
1967 return;
1968 }
1969
1970 /* we need an insertion scan key for the search, so build one */
1971 itup_key = _bt_mkscankey(rel, targetkey);
1972 /* find the leftmost leaf page with matching pivot/high key */
1973 itup_key->pivotsearch = true;
1974 stack = _bt_search(rel, itup_key, &sleafbuf, BT_READ, NULL);
1975 /* won't need a second lock or pin on leafbuf */
1976 _bt_relbuf(rel, sleafbuf);
1977
1978 /*
1979 * Re-lock the leaf page, and start over to use our stack
1980 * within _bt_mark_page_halfdead. We must do it that way
1981 * because it's possible that leafbuf can no longer be
1982 * deleted. We need to recheck.
1983 *
1984 * Note: We can't simply hold on to the sleafbuf lock instead,
1985 * because it's barely possible that sleafbuf is not the same
1986 * page as leafbuf. This happens when leafbuf split after our
1987 * original lock was dropped, but before _bt_search finished
1988 * its descent. We rely on the assumption that we'll find
1989 * leafbuf isn't safe to delete anymore in this scenario.
1990 * (Page deletion can cope with the stack being to the left of
1991 * leafbuf, but not to the right of leafbuf.)
1992 */
1993 _bt_lockbuf(rel, leafbuf, BT_WRITE);
1994 continue;
1995 }
1996
1997 /*
1998 * See if it's safe to delete the leaf page, and determine how
1999 * many parent/internal pages above the leaf level will be
2000 * deleted. If it's safe then _bt_mark_page_halfdead will also
2001 * perform the first phase of deletion, which includes marking the
2002 * leafbuf page half-dead.
2003 */
2004 Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
2005 if (!_bt_mark_page_halfdead(rel, leafbuf, stack))
2006 {
2007 _bt_relbuf(rel, leafbuf);
2008 return;
2009 }
2010 }
2011
2012 /*
2013 * Then unlink it from its siblings. Each call to
2014 * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
2015 * making it shallower. Iterate until the leafbuf page is deleted.
2016 */
2017 rightsib_empty = false;
2018 Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
2019 while (P_ISHALFDEAD(opaque))
2020 {
2021 /* Check for interrupts in _bt_unlink_halfdead_page */
2022 if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
2023 &rightsib_empty, vstate))
2024 {
2025 /*
2026 * _bt_unlink_halfdead_page should never fail, since we
2027 * established that deletion is generally safe in
2028 * _bt_mark_page_halfdead -- index must be corrupt.
2029 *
2030 * Note that _bt_unlink_halfdead_page already released the
2031 * lock and pin on leafbuf for us.
2032 */
2033 Assert(false);
2034 return;
2035 }
2036 }
2037
2038 Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
2039
2040 rightsib = opaque->btpo_next;
2041
2042 _bt_relbuf(rel, leafbuf);
2043
2044 /*
2045 * Check here, as calling loops will have locks held, preventing
2046 * interrupts from being processed.
2047 */
2048 CHECK_FOR_INTERRUPTS();
2049
2050 /*
2051 * The page has now been deleted. If its right sibling is completely
2052 * empty, it's possible that the reason we haven't deleted it earlier
2053 * is that it was the rightmost child of the parent. Now that we
2054 * removed the downlink for this page, the right sibling might now be
2055 * the only child of the parent, and could be removed. It would be
2056 * picked up by the next vacuum anyway, but might as well try to
2057 * remove it now, so loop back to process the right sibling.
2058 *
2059 * Note: This relies on the assumption that _bt_getstackbuf() will be
2060 * able to reuse our original descent stack with a different child
2061 * block (provided that the child block is to the right of the
2062 * original leaf page reached by _bt_search()). It will even update
2063 * the descent stack each time we loop around, avoiding repeated work.
2064 */
2065 if (!rightsib_empty)
2066 break;
2067
2068 leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2069 }
2070 }
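
/*
 * Hypothetical caller sketch (the real caller is btvacuumpage() in
 * nbtree.c; details are simplified here): _bt_pagedel() is reached when
 * the VACUUM scan finds an empty leaf page, and it consumes caller's
 * lock and pin either way:
 *
 *	buf = _bt_getbuf(rel, scanblkno, BT_READ);
 *	page = BufferGetPage(buf);
 *	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 *	if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) &&
 *		P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page))
 *		_bt_pagedel(rel, buf, vstate);		<-- releases lock and pin
 *	else
 *		_bt_relbuf(rel, buf);
 */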
2071
2072 /*
2073 * First stage of page deletion.
2074 *
2075 * Establish the height of the to-be-deleted subtree with leafbuf at its
2076 * lowest level, remove the downlink to the subtree, and mark leafbuf
2077 * half-dead. The final to-be-deleted subtree is usually just leafbuf itself,
2078 * but may include additional internal pages (at most one per level of the
2079 * tree below the root).
2080 *
2081 * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
2082 * the rightmost child of its parent (and parent has more than one downlink).
2083 * Returns 'true' when the first stage of page deletion completed
2084 * successfully.
2085 */
2086 static bool
2087 _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
2088 {
2089 BlockNumber leafblkno;
2090 BlockNumber leafrightsib;
2091 BlockNumber topparent;
2092 BlockNumber topparentrightsib;
2093 ItemId itemid;
2094 Page page;
2095 BTPageOpaque opaque;
2096 Buffer subtreeparent;
2097 OffsetNumber poffset;
2098 OffsetNumber nextoffset;
2099 IndexTuple itup;
2100 IndexTupleData trunctuple;
2101
2102 page = BufferGetPage(leafbuf);
2103 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2104
2105 Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
2106 P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
2107 P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2108
2109 /*
2110 * Save info about the leaf page.
2111 */
2112 leafblkno = BufferGetBlockNumber(leafbuf);
2113 leafrightsib = opaque->btpo_next;
2114
2115 /*
2116 * Before attempting to lock the parent page, check that the right sibling
2117 * is not in half-dead state. A half-dead right sibling would have no
2118 * downlink in the parent, which would be highly confusing later when we
2119 * delete the downlink. It would fail the "right sibling of target page
2120 * is also the next child in parent page" cross-check below.
2121 */
2122 if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
2123 {
2124 elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
2125 leafblkno, leafrightsib);
2126 return false;
2127 }
2128
2129 /*
2130 * We cannot delete a page that is the rightmost child of its immediate
2131 * parent, unless it is the only child --- in which case the parent has to
2132 * be deleted too, and the same condition applies recursively to it. We
2133 * have to check this condition all the way up before trying to delete,
2134 * and lock the parent of the root of the to-be-deleted subtree (the
2135 * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent
2136 * for us. We remove the downlink to the "top parent" page (subtree root
2137 * page) from the subtree parent page below.
2138 *
2139 * Initialize topparent to be leafbuf page now. The final to-be-deleted
2140 * subtree is often a degenerate one page subtree consisting only of the
2141 * leafbuf page. When that happens, the leafbuf page is the final subtree
2142 * root page/top parent page.
2143 */
2144 topparent = leafblkno;
2145 topparentrightsib = leafrightsib;
2146 if (!_bt_lock_subtree_parent(rel, leafblkno, stack,
2147 &subtreeparent, &poffset,
2148 &topparent, &topparentrightsib))
2149 return false;
2150
2151 /*
2152 * Check that the parent-page index items we're about to delete/overwrite
2153 * in subtree parent page contain what we expect. This can fail if the
2154 * index has become corrupt for some reason. We want to throw any error
2155 * before entering the critical section --- otherwise it'd be a PANIC.
2156 */
2157 page = BufferGetPage(subtreeparent);
2158 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2159
2160 #ifdef USE_ASSERT_CHECKING
2161
2162 /*
2163 * This is just an assertion because _bt_lock_subtree_parent should have
2164 * guaranteed that the tuple has the expected contents
2165 */
2166 itemid = PageGetItemId(page, poffset);
2167 itup = (IndexTuple) PageGetItem(page, itemid);
2168 Assert(BTreeTupleGetDownLink(itup) == topparent);
2169 #endif
2170
2171 nextoffset = OffsetNumberNext(poffset);
2172 itemid = PageGetItemId(page, nextoffset);
2173 itup = (IndexTuple) PageGetItem(page, itemid);
2174 if (BTreeTupleGetDownLink(itup) != topparentrightsib)
2175 ereport(ERROR,
2176 (errcode(ERRCODE_INDEX_CORRUPTED),
2177 errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
2178 topparentrightsib, topparent,
2179 BTreeTupleGetDownLink(itup),
2180 BufferGetBlockNumber(subtreeparent),
2181 RelationGetRelationName(rel))));
2182
2183 /*
2184 * Any insert which would have gone on the leaf block will now go to its
2185 * right sibling. In other words, the key space moves right.
2186 */
2187 PredicateLockPageCombine(rel, leafblkno, leafrightsib);
2188
2189 /* No ereport(ERROR) until changes are logged */
2190 START_CRIT_SECTION();
2191
2192 /*
2193 * Update parent of subtree. We want to delete the downlink to the top
2194 * parent page/root of the subtree, and the *following* key. Easiest way
2195 * is to copy the right sibling's downlink over the downlink that points
2196 * to top parent page, and then delete the right sibling's original pivot
2197 * tuple.
2198 *
2199 * Lanin and Shasha make the key space move left when deleting a page,
2200 * whereas the key space moves right here. That's why we cannot simply
2201 * delete the pivot tuple with the downlink to the top parent page. See
2202 * nbtree/README.
2203 */
2204 page = BufferGetPage(subtreeparent);
2205 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2206
2207 itemid = PageGetItemId(page, poffset);
2208 itup = (IndexTuple) PageGetItem(page, itemid);
2209 BTreeTupleSetDownLink(itup, topparentrightsib);
2210
2211 nextoffset = OffsetNumberNext(poffset);
2212 PageIndexTupleDelete(page, nextoffset);
2213
2214 /*
2215 * Mark the leaf page as half-dead, and stamp it with a link to the top
2216 * parent page. When the leaf page is also the top parent page, the link
2217 * is set to InvalidBlockNumber.
2218 */
2219 page = BufferGetPage(leafbuf);
2220 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2221 opaque->btpo_flags |= BTP_HALF_DEAD;
2222
2223 Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
2224 MemSet(&trunctuple, 0, sizeof(IndexTupleData));
2225 trunctuple.t_info = sizeof(IndexTupleData);
2226 if (topparent != leafblkno)
2227 BTreeTupleSetTopParent(&trunctuple, topparent);
2228 else
2229 BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
2230
2231 if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
2232 IndexTupleSize(&trunctuple)))
2233 elog(ERROR, "could not overwrite high key in half-dead page");
2234
2235 /* Must mark buffers dirty before XLogInsert */
2236 MarkBufferDirty(subtreeparent);
2237 MarkBufferDirty(leafbuf);
2238
2239 /* XLOG stuff */
2240 if (RelationNeedsWAL(rel))
2241 {
2242 xl_btree_mark_page_halfdead xlrec;
2243 XLogRecPtr recptr;
2244
2245 xlrec.poffset = poffset;
2246 xlrec.leafblk = leafblkno;
2247 if (topparent != leafblkno)
2248 xlrec.topparent = topparent;
2249 else
2250 xlrec.topparent = InvalidBlockNumber;
2251
2252 XLogBeginInsert();
2253 XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
2254 XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
2255
2256 page = BufferGetPage(leafbuf);
2257 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2258 xlrec.leftblk = opaque->btpo_prev;
2259 xlrec.rightblk = opaque->btpo_next;
2260
2261 XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);
2262
2263 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
2264
2265 page = BufferGetPage(subtreeparent);
2266 PageSetLSN(page, recptr);
2267 page = BufferGetPage(leafbuf);
2268 PageSetLSN(page, recptr);
2269 }
2270
2271 END_CRIT_SECTION();
2272
2273 _bt_relbuf(rel, subtreeparent);
2274 return true;
2275 }
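
/*
 * Worked example of the parent-page update performed above (layout is
 * schematic, not an on-disk format).  Suppose the subtree parent holds
 * pivot tuples P1 and P2, where P1's downlink points to the top parent
 * and P2's downlink points to the top parent's right sibling:
 *
 *	before:  ... | P1 -> topparent | P2 -> topparentrightsib | ...
 *
 * BTreeTupleSetDownLink() copies P2's downlink over P1's, and
 * PageIndexTupleDelete() then removes P2:
 *
 *	after:   ... | P1 -> topparentrightsib | ...
 *
 * P1 keeps its original key values, so the keyspace that the deleted
 * subtree used to cover is now covered by the right sibling: the key
 * space moves right, as discussed above.
 */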
2276
2277 /*
2278 * Second stage of page deletion.
2279 *
2280 * Unlinks a single page (in the subtree undergoing deletion) from its
2281 * siblings. Also marks the page deleted.
2282 *
2283 * To get rid of the whole subtree, including the leaf page itself, call here
2284 * until the leaf page is deleted. The original "top parent" established in
2285 * the first stage of deletion is deleted in the first call here, while the
2286 * leaf page is deleted in the last call here. Note that the leaf page itself
2287 * is often the initial top parent page.
2288 *
2289 * Returns 'false' if the page could not be unlinked (shouldn't happen). If
2290 * the right sibling of the current target page is empty, *rightsib_empty is
2291 * set to true, allowing caller to delete the target's right sibling page in
2292 * passing. Note that *rightsib_empty is only actually used by caller when
2293 * target page is leafbuf, following last call here for leafbuf/the subtree
2294 * containing leafbuf. (We always set *rightsib_empty for caller, just to be
2295 * consistent.)
2296 *
2297 * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
2298 * On success exit, we'll be holding pin and write lock. On failure exit,
2299 * we'll release both pin and lock before returning (we define it that way
2300 * to avoid having to reacquire a lock we already released).
2301 */
2302 static bool
2303 _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
2304 bool *rightsib_empty, BTVacState *vstate)
2305 {
2306 BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
2307 IndexBulkDeleteResult *stats = vstate->stats;
2308 BlockNumber leafleftsib;
2309 BlockNumber leafrightsib;
2310 BlockNumber target;
2311 BlockNumber leftsib;
2312 BlockNumber rightsib;
2313 Buffer lbuf = InvalidBuffer;
2314 Buffer buf;
2315 Buffer rbuf;
2316 Buffer metabuf = InvalidBuffer;
2317 Page metapg = NULL;
2318 BTMetaPageData *metad = NULL;
2319 ItemId itemid;
2320 Page page;
2321 BTPageOpaque opaque;
2322 FullTransactionId safexid;
2323 bool rightsib_is_rightmost;
2324 uint32 targetlevel;
2325 IndexTuple leafhikey;
2326 BlockNumber leaftopparent;
2327
2328 page = BufferGetPage(leafbuf);
2329 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2330
2331 Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
2332
2333 /*
2334 * Remember some information about the leaf page.
2335 */
2336 itemid = PageGetItemId(page, P_HIKEY);
2337 leafhikey = (IndexTuple) PageGetItem(page, itemid);
2338 target = BTreeTupleGetTopParent(leafhikey);
2339 leafleftsib = opaque->btpo_prev;
2340 leafrightsib = opaque->btpo_next;
2341
2342 _bt_unlockbuf(rel, leafbuf);
2343
2344 /*
2345 * Check here, as calling loops will have locks held, preventing
2346 * interrupts from being processed.
2347 */
2348 CHECK_FOR_INTERRUPTS();
2349
2350 /* Unlink the current top parent of the subtree */
2351 if (!BlockNumberIsValid(target))
2352 {
2353 /* Target is leaf page (or leaf page is top parent, if you prefer) */
2354 target = leafblkno;
2355
2356 buf = leafbuf;
2357 leftsib = leafleftsib;
2358 targetlevel = 0;
2359 }
2360 else
2361 {
2362 /* Target is the internal page taken from leaf's top parent link */
2363 Assert(target != leafblkno);
2364
2365 /* Fetch the block number of the target's left sibling */
2366 buf = _bt_getbuf(rel, target, BT_READ);
2367 page = BufferGetPage(buf);
2368 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2369 leftsib = opaque->btpo_prev;
2370 targetlevel = opaque->btpo_level;
2371 Assert(targetlevel > 0);
2372
2373 /*
2374 * To avoid deadlocks, we'd better drop the target page lock before
2375 * going further.
2376 */
2377 _bt_unlockbuf(rel, buf);
2378 }
2379
2380 /*
2381 * We have to lock the pages we need to modify in the standard order:
2382 * moving right, then up. Else we will deadlock against other writers.
2383 *
2384 * So, first lock the leaf page, if it's not the target. Then find and
2385 * write-lock the current left sibling of the target page. The sibling
2386 * that was current a moment ago could have split, so we may have to move
2387 * right.
2388 */
2389 if (target != leafblkno)
2390 _bt_lockbuf(rel, leafbuf, BT_WRITE);
2391 if (leftsib != P_NONE)
2392 {
2393 lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2394 page = BufferGetPage(lbuf);
2395 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2396 while (P_ISDELETED(opaque) || opaque->btpo_next != target)
2397 {
2398 bool leftsibvalid = true;
2399
2400 /*
2401 * Before we follow the link from the page that was the left
2402 * sibling mere moments ago, validate its right link. This
2403 * reduces the opportunities for the loop to fail to ever make any
2404 * progress in the presence of index corruption.
2405 *
2406 * Note: we rely on the assumption that there can only be one
2407 * vacuum process running at a time (against the same index).
2408 */
2409 if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||
2410 leftsib == opaque->btpo_next)
2411 leftsibvalid = false;
2412
2413 leftsib = opaque->btpo_next;
2414 _bt_relbuf(rel, lbuf);
2415
2416 if (!leftsibvalid)
2417 {
2418 if (target != leafblkno)
2419 {
2420 /* we have only a pin on target, but pin+lock on leafbuf */
2421 ReleaseBuffer(buf);
2422 _bt_relbuf(rel, leafbuf);
2423 }
2424 else
2425 {
2426 /* we have only a pin on leafbuf */
2427 ReleaseBuffer(leafbuf);
2428 }
2429
2430 ereport(LOG,
2431 (errcode(ERRCODE_INDEX_CORRUPTED),
2432 errmsg_internal("valid left sibling for deletion target could not be located: "
2433 "left sibling %u of target %u with leafblkno %u and scanblkno %u in index \"%s\"",
2434 leftsib, target, leafblkno, scanblkno,
2435 RelationGetRelationName(rel))));
2436
2437 return false;
2438 }
2439
2440 CHECK_FOR_INTERRUPTS();
2441
2442 /* step right one page */
2443 lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2444 page = BufferGetPage(lbuf);
2445 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2446 }
2447 }
2448 else
2449 lbuf = InvalidBuffer;
2450
2451 /* Next write-lock the target page itself */
2452 _bt_lockbuf(rel, buf, BT_WRITE);
2453 page = BufferGetPage(buf);
2454 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2455
2456 /*
2457 * Check page is still empty etc, else abandon deletion. This is just for
2458 * paranoia's sake; a half-dead page cannot resurrect because there can be
2459 * only one vacuum process running at a time.
2460 */
2461 if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
2462 elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"",
2463 target, RelationGetRelationName(rel));
2464
2465 if (opaque->btpo_prev != leftsib)
2466 ereport(ERROR,
2467 (errcode(ERRCODE_INDEX_CORRUPTED),
2468 errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"",
2469 leftsib, opaque->btpo_prev, target,
2470 RelationGetRelationName(rel))));
2471
2472 if (target == leafblkno)
2473 {
2474 if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
2475 !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
2476 elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
2477 target, RelationGetRelationName(rel));
2478
2479 /* Leaf page is also target page: don't set leaftopparent */
2480 leaftopparent = InvalidBlockNumber;
2481 }
2482 else
2483 {
2484 IndexTuple finaldataitem;
2485
2486 if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
2487 P_ISLEAF(opaque))
2488 elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"",
2489 targetlevel, target, RelationGetRelationName(rel));
2490
2491 /* Target is internal: set leaftopparent for next call here... */
2492 itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
2493 finaldataitem = (IndexTuple) PageGetItem(page, itemid);
2494 leaftopparent = BTreeTupleGetDownLink(finaldataitem);
2495 /* ...except when it would be a redundant pointer-to-self */
2496 if (leaftopparent == leafblkno)
2497 leaftopparent = InvalidBlockNumber;
2498 }
2499
2500 /* No leaftopparent for level 0 (leaf page) or level 1 target */
2501 Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1);
2502
2503 /*
2504 * And next write-lock the (current) right sibling.
2505 */
2506 rightsib = opaque->btpo_next;
2507 rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2508 page = BufferGetPage(rbuf);
2509 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2510 if (opaque->btpo_prev != target)
2511 ereport(ERROR,
2512 (errcode(ERRCODE_INDEX_CORRUPTED),
2513 errmsg_internal("right sibling's left-link doesn't match: "
2514 "block %u links to %u instead of expected %u in index \"%s\"",
2515 rightsib, opaque->btpo_prev, target,
2516 RelationGetRelationName(rel))));
2517 rightsib_is_rightmost = P_RIGHTMOST(opaque);
2518 *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2519
2520 /*
2521 * If we are deleting the next-to-last page on the target's level, then
2522 * the rightsib is a candidate to become the new fast root. (In theory, it
2523 * might be possible to push the fast root even further down, but the odds
2524 * of doing so are slim, and the locking considerations daunting.)
2525 *
2526 * We can safely acquire a lock on the metapage here --- see comments for
2527 * _bt_newroot().
2528 */
2529 if (leftsib == P_NONE && rightsib_is_rightmost)
2530 {
2531 page = BufferGetPage(rbuf);
2532 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2533 if (P_RIGHTMOST(opaque))
2534 {
2535 /* rightsib will be the only one left on the level */
2536 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2537 metapg = BufferGetPage(metabuf);
2538 metad = BTPageGetMeta(metapg);
2539
2540 /*
2541 * The expected case here is btm_fastlevel == targetlevel+1; if
2542 * the fastlevel is <= targetlevel, something is wrong, and we
2543 * choose to overwrite it to fix it.
2544 */
2545 if (metad->btm_fastlevel > targetlevel + 1)
2546 {
2547 /* no update wanted */
2548 _bt_relbuf(rel, metabuf);
2549 metabuf = InvalidBuffer;
2550 }
2551 }
2552 }
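
/*
 * Worked example: suppose the target's level currently holds only the
 * target page and rightsib (leftsib == P_NONE, and rightsib is the
 * rightmost page on the level).  Once the target is unlinked, rightsib
 * is alone on its level, so the metapage update below points
 * btm_fastroot at rightsib and sets btm_fastlevel to targetlevel.
 * Future searches then descend directly from rightsib, skipping the
 * single-child levels above it.
 */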
2553
2554 /*
2555 * Here we begin doing the deletion.
2556 */
2557
2558 /* No ereport(ERROR) until changes are logged */
2559 START_CRIT_SECTION();
2560
2561 /*
2562 * Update siblings' side-links. Note the target page's side-links will
2563 * continue to point to the siblings. Asserts here are just rechecking
2564 * things we already verified above.
2565 */
2566 if (BufferIsValid(lbuf))
2567 {
2568 page = BufferGetPage(lbuf);
2569 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2570 Assert(opaque->btpo_next == target);
2571 opaque->btpo_next = rightsib;
2572 }
2573 page = BufferGetPage(rbuf);
2574 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2575 Assert(opaque->btpo_prev == target);
2576 opaque->btpo_prev = leftsib;
2577
2578 /*
2579 * If we deleted a parent of the targeted leaf page, instead of the leaf
2580 * itself, update the leaf to point to the next remaining child in the
2581 * subtree.
2582 *
2583 * Note: We rely on the fact that a buffer pin on the leaf page has been
2584 * held since leafhikey was initialized. This is safe, though only
2585 * because the page was already half-dead at that point. The leaf page
2586 * cannot have been modified by any other backend during the period when
2587 * no lock was held.
2588 */
2589 if (target != leafblkno)
2590 BTreeTupleSetTopParent(leafhikey, leaftopparent);
2591
2592 /*
2593 * Mark the page itself deleted. It can be recycled when all current
2594 * transactions are gone. Storing GetTopTransactionId() would work, but
2595 * we're in VACUUM and would not otherwise have an XID. Having already
2596 * updated links to the target, ReadNextFullTransactionId() suffices as an
2597 * upper bound. Any scan having retained a now-stale link is advertising
2598 * in its PGPROC an xmin less than or equal to the value we read here. It
2599 * will continue to do so, holding back the xmin horizon, for the duration
2600 * of that scan.
2601 */
2602 page = BufferGetPage(buf);
2603 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2604 Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
2605
2606 /*
2607 * Store upper bound XID that's used to determine when deleted page is no
2608 * longer needed as a tombstone
2609 */
2610 safexid = ReadNextFullTransactionId();
2611 BTPageSetDeleted(page, safexid);
2612 opaque->btpo_cycleid = 0;
2613
2614 /* And update the metapage, if needed */
2615 if (BufferIsValid(metabuf))
2616 {
2617 /* upgrade metapage if needed */
2618 if (metad->btm_version < BTREE_NOVAC_VERSION)
2619 _bt_upgrademetapage(metapg);
2620 metad->btm_fastroot = rightsib;
2621 metad->btm_fastlevel = targetlevel;
2622 MarkBufferDirty(metabuf);
2623 }
2624
2625 /* Must mark buffers dirty before XLogInsert */
2626 MarkBufferDirty(rbuf);
2627 MarkBufferDirty(buf);
2628 if (BufferIsValid(lbuf))
2629 MarkBufferDirty(lbuf);
2630 if (target != leafblkno)
2631 MarkBufferDirty(leafbuf);
2632
2633 /* XLOG stuff */
2634 if (RelationNeedsWAL(rel))
2635 {
2636 xl_btree_unlink_page xlrec;
2637 xl_btree_metadata xlmeta;
2638 uint8 xlinfo;
2639 XLogRecPtr recptr;
2640
2641 XLogBeginInsert();
2642
2643 XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
2644 if (BufferIsValid(lbuf))
2645 XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2646 XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
2647 if (target != leafblkno)
2648 XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
2649
2650 /* information stored on the target/to-be-unlinked block */
2651 xlrec.leftsib = leftsib;
2652 xlrec.rightsib = rightsib;
2653 xlrec.level = targetlevel;
2654 xlrec.safexid = safexid;
2655
2656 /* information needed to recreate the leaf block (if not the target) */
2657 xlrec.leafleftsib = leafleftsib;
2658 xlrec.leafrightsib = leafrightsib;
2659 xlrec.leaftopparent = leaftopparent;
2660
2661 XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
2662
2663 if (BufferIsValid(metabuf))
2664 {
2665 XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2666
2667 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2668 xlmeta.version = metad->btm_version;
2669 xlmeta.root = metad->btm_root;
2670 xlmeta.level = metad->btm_level;
2671 xlmeta.fastroot = metad->btm_fastroot;
2672 xlmeta.fastlevel = metad->btm_fastlevel;
2673 xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2674 xlmeta.allequalimage = metad->btm_allequalimage;
2675
2676 XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
2677 xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
2678 }
2679 else
2680 xlinfo = XLOG_BTREE_UNLINK_PAGE;
2681
2682 recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2683
2684 if (BufferIsValid(metabuf))
2685 {
2686 PageSetLSN(metapg, recptr);
2687 }
2688 page = BufferGetPage(rbuf);
2689 PageSetLSN(page, recptr);
2690 page = BufferGetPage(buf);
2691 PageSetLSN(page, recptr);
2692 if (BufferIsValid(lbuf))
2693 {
2694 page = BufferGetPage(lbuf);
2695 PageSetLSN(page, recptr);
2696 }
2697 if (target != leafblkno)
2698 {
2699 page = BufferGetPage(leafbuf);
2700 PageSetLSN(page, recptr);
2701 }
2702 }
2703
2704 END_CRIT_SECTION();
2705
2706 /* release metapage */
2707 if (BufferIsValid(metabuf))
2708 _bt_relbuf(rel, metabuf);
2709
2710 /* release siblings */
2711 if (BufferIsValid(lbuf))
2712 _bt_relbuf(rel, lbuf);
2713 _bt_relbuf(rel, rbuf);
2714
2715 /* If the target is not leafbuf, we're done with it now -- release it */
2716 if (target != leafblkno)
2717 _bt_relbuf(rel, buf);
2718
2719 /*
2720 * Maintain pages_newly_deleted, which is simply the number of pages
2721 * deleted by the ongoing VACUUM operation.
2722 *
2723 * Maintain pages_deleted in a way that takes into account how
2724 * btvacuumpage() will count deleted pages that have yet to become
2725 * scanblkno -- only count page when it's not going to get that treatment
2726 * later on.
2727 */
2728 stats->pages_newly_deleted++;
2729 if (target <= scanblkno)
2730 stats->pages_deleted++;
2731
2732 /*
2733 * Remember information about the target page (now a newly deleted page)
2734 * in dedicated vstate space for later. The page will be considered as a
2735 * candidate to place in the FSM at the end of the current btvacuumscan()
2736 * call.
2737 */
2738 _bt_pendingfsm_add(vstate, target, safexid);
2739
2740 return true;
2741 }
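
/*
 * Sketch of how the safexid stamped above gets consulted later, when
 * deciding whether the deleted page can finally be recycled (compare
 * BTPageIsRecyclable() in nbtree.h; simplified here):
 *
 *	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 *	if (P_ISDELETED(opaque))
 *		return GlobalVisCheckRemovableFullXid(NULL,
 *											  BTPageGetDeleteXid(page));
 *	return false;
 *
 * Recycling only becomes safe once no index scan can still be following
 * a stale sibling link to the deleted page.
 */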
2742
2743 /*
2744 * Establish how tall the to-be-deleted subtree will be during the first stage
2745 * of page deletion.
2746 *
2747 * Caller's child argument is the block number of the page caller wants to
2748 * delete (this is leafbuf's block number, except when we're called
2749 * recursively). stack is a search stack leading to it. Note that we will
2750 * update the stack entry(s) to reflect current downlink positions --- this is
2751 * similar to the corresponding point in page split handling.
2752 *
2753 * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
2754 * false. Returns true on success, in which case caller can use certain
2755 * details established here to perform the first stage of deletion. This
2756 * function is the last point at which page deletion may be deemed unsafe
2757 * (barring index corruption, or unexpected concurrent page deletions).
2758 *
2759 * We write lock the parent of the root of the to-be-deleted subtree for
2760 * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
2761 * caller). Caller will have to remove a downlink from *subtreeparent. We
2762 * also set a *subtreeparent offset number in *poffset, to indicate the
2763 * location of the pivot tuple that contains the relevant downlink.
2764 *
2765 * The root of the to-be-deleted subtree is called the "top parent". Note
2766 * that the leafbuf page is often the final "top parent" page (you can think
2767 * of the leafbuf page as a degenerate single page subtree when that happens).
2768 * Caller should initialize *topparent to the target leafbuf page block number
2769 * (while *topparentrightsib should be set to leafbuf's right sibling block
2770 * number). We will update *topparent (and *topparentrightsib) for caller
2771 * here, though only when it turns out that caller will delete at least one
2772 * internal page (i.e. only when caller needs to store a valid link to the top
2773 * parent block in the leafbuf page using BTreeTupleSetTopParent()).
2774 */
2775 static bool
2776 _bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack,
2777 Buffer *subtreeparent, OffsetNumber *poffset,
2778 BlockNumber *topparent, BlockNumber *topparentrightsib)
2779 {
2780 BlockNumber parent,
2781 leftsibparent;
2782 OffsetNumber parentoffset,
2783 maxoff;
2784 Buffer pbuf;
2785 Page page;
2786 BTPageOpaque opaque;
2787
2788 /*
2789 * Locate the pivot tuple whose downlink points to "child". Write lock
2790 * the parent page itself.
2791 */
2792 pbuf = _bt_getstackbuf(rel, stack, child);
2793 if (pbuf == InvalidBuffer)
2794 {
2795 /*
2796 * Failed to "re-find" a pivot tuple whose downlink matched our child
2797 * block number on the parent level -- the index must be corrupt.
2798 * Don't even try to delete the leafbuf subtree. Just report the
2799 * issue and press on with vacuuming the index.
2800 *
2801 * Note: _bt_getstackbuf() recovers from concurrent page splits that
2802 * take place on the parent level. Its approach is a near-exhaustive
2803 * linear search. This also gives it a surprisingly good chance of
2804 * recovering in the event of a buggy or inconsistent opclass. But we
2805 * don't rely on that here.
2806 */
2807 ereport(LOG,
2808 (errcode(ERRCODE_INDEX_CORRUPTED),
2809 errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
2810 RelationGetRelationName(rel), child)));
2811 return false;
2812 }
2813
2814 parent = stack->bts_blkno;
2815 parentoffset = stack->bts_offset;
2816
2817 page = BufferGetPage(pbuf);
2818 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2819 maxoff = PageGetMaxOffsetNumber(page);
2820 leftsibparent = opaque->btpo_prev;
2821
2822 /*
2823 * _bt_getstackbuf() completes page splits on returned parent buffer when
2824 * required.
2825 *
2826 * In general it's a bad idea for VACUUM to use up more disk space, which
2827 * is why page deletion does not finish incomplete page splits most of the
2828 * time. We allow this limited exception because the risk is much lower,
2829 * and the potential downside of not proceeding is much higher: A single
2830 * internal page with the INCOMPLETE_SPLIT flag set might otherwise
2831 * prevent us from deleting hundreds of empty leaf pages from one level
2832 * down.
2833 */
2834 Assert(!P_INCOMPLETE_SPLIT(opaque));
2835
2836 if (parentoffset < maxoff)
2837 {
2838 /*
2839 * Child is not the rightmost child in parent, so it's safe to delete
2840 * the subtree whose root/topparent is child page
2841 */
2842 *subtreeparent = pbuf;
2843 *poffset = parentoffset;
2844 return true;
2845 }
2846
2847 /*
2848 * Child is the rightmost child of parent.
2849 *
2850 * Since it's the rightmost child of parent, deleting the child (or
2851 * deleting the subtree whose root/topparent is the child page) is only
2852 * safe when it's also possible to delete the parent.
2853 */
2854 Assert(parentoffset == maxoff);
2855 if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
2856 {
2857 /*
2858 * Child isn't parent's only child, or parent is rightmost on its
2859 * entire level. Definitely cannot delete any pages.
2860 */
2861 _bt_relbuf(rel, pbuf);
2862 return false;
2863 }
2864
2865 /*
2866 * Now make sure that the parent deletion is itself safe by examining the
2867 * child's grandparent page. Recurse, passing the parent page as the
2868 * child page (child's grandparent is the parent on the next level up). If
2869 * parent deletion is unsafe, then child deletion must also be unsafe (in
2870 * which case caller cannot delete any pages at all).
2871 */
2872 *topparent = parent;
2873 *topparentrightsib = opaque->btpo_next;
2874
2875 /*
2876 * Release lock on parent before recursing.
2877 *
2878 * It's OK to release page locks on parent before recursive call locks
2879 * grandparent. An internal page can only acquire an entry if the child
2880 * is split, but that cannot happen as long as we still hold a lock on the
2881 * leafbuf page.
2882 */
2883 _bt_relbuf(rel, pbuf);
2884
2885 /*
2886 * Before recursing, check that the left sibling of parent (if any) is not
2887 * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
2888 * parent lock).
2889 *
2890 * Note: We deliberately avoid completing incomplete splits here.
2891 */
2892 if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
2893 return false;
2894
2895 /* Recurse to examine child page's grandparent page */
2896 return _bt_lock_subtree_parent(rel, parent, stack->bts_parent,
2897 subtreeparent, poffset,
2898 topparent, topparentrightsib);
2899 }
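
/*
 * Example of the recursion above: suppose empty leaf L is the rightmost
 * and only child of internal page I1, I1 is in turn the rightmost and
 * only child of I2, and I2 is a non-rightmost child of I3.  The first
 * call (child = L) recurses with child = I1, the second call recurses
 * with child = I2, and the third call finds I2 non-rightmost in I3 and
 * succeeds.  On return, *subtreeparent is I3's buffer and *topparent is
 * I2 -- the root of the to-be-deleted subtree {I2, I1, L}.
 */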
2900
2901 /*
2902 * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize
2903 * optimization.
2904 *
2905 * Called at the start of a btvacuumscan(). Caller's cleanuponly argument
2906 * indicates if ongoing VACUUM has not (and will not) call btbulkdelete().
2907 *
2908 * We expect to allocate memory inside VACUUM's top-level memory context here.
2909 * The working buffer is subject to a limit based on work_mem. Our strategy
2910 * when the array can no longer grow within the bounds of that limit is to
2911 * stop saving additional newly deleted pages, while proceeding as usual with
2912 * the pages that we can fit.
2913 */
2914 void
2915 _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
2916 {
2917 int64 maxbufsize;
2918
2919 /*
2920 * Don't bother with optimization in cleanup-only case -- we don't expect
2921 * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()
2922 * can only take place because this optimization didn't work out during
2923 * the last VACUUM.
2924 */
2925 if (cleanuponly)
2926 return;
2927
2928 /*
2929 * Cap maximum size of array so that we always respect work_mem. Avoid
2930 * int overflow here.
2931 */
2932 vstate->bufsize = 256;
2933 maxbufsize = (work_mem * 1024L) / sizeof(BTPendingFSM);
2934 maxbufsize = Min(maxbufsize, INT_MAX);
2935 maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
2936 /* Stay sane with small work_mem */
2937 maxbufsize = Max(maxbufsize, vstate->bufsize);
2938 vstate->maxbufsize = maxbufsize;
2939
2940 /* Allocate buffer, indicate that there are currently 0 pending pages */
2941 vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize);
2942 vstate->npendingpages = 0;
2943 }
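
/*
 * Worked example of the sizing arithmetic above, assuming
 * sizeof(BTPendingFSM) == 16 bytes (a BlockNumber plus a
 * FullTransactionId, including alignment padding; the exact size can
 * vary by platform): with work_mem set to 4MB (work_mem == 4096, in KB),
 * maxbufsize works out to (4096 * 1024) / 16 == 262144 entries, so up
 * to 262144 newly deleted pages can be remembered before
 * _bt_pendingfsm_add() has to start discarding entries.
 */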
2944
2945 /*
2946 * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during
2947 * the ongoing VACUUM operation) into the free space map -- though only when
2948 * it is actually safe to do so by now.
2949 *
2950 * Called at the end of a btvacuumscan(), just before free space map vacuuming
2951 * takes place.
2952 *
2953 * Frees memory allocated by _bt_pendingfsm_init(), if any.
2954 */
2955 void
2956 _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
2957 {
2958 IndexBulkDeleteResult *stats = vstate->stats;
2959
2960 Assert(stats->pages_newly_deleted >= vstate->npendingpages);
2961
2962 if (vstate->npendingpages == 0)
2963 {
2964 /* Just free memory when nothing to do */
2965 if (vstate->pendingpages)
2966 pfree(vstate->pendingpages);
2967
2968 return;
2969 }
2970
2971 #ifdef DEBUG_BTREE_PENDING_FSM
2972
2973 /*
2974 * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
2975 * placing pending pages in the FSM. Note that the optimization will
2976 * never be effective without some other backend concurrently consuming an
2977 * XID.
2978 */
2979 pg_usleep(5000000L);
2980 #endif
2981
2982 /*
2983 * Recompute VACUUM XID boundaries.
2984 *
2985 * We don't actually care about the oldest non-removable XID. Computing
2986 * the oldest such XID has a useful side-effect that we rely on: it
2987 * forcibly updates the XID horizon state for this backend. This step is
2988 * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
2989 * that it is now safe to recycle newly deleted pages without this step.
2990 */
2991 GetOldestNonRemovableTransactionId(NULL);
2992
2993 for (int i = 0; i < vstate->npendingpages; i++)
2994 {
2995 BlockNumber target = vstate->pendingpages[i].target;
2996 FullTransactionId safexid = vstate->pendingpages[i].safexid;
2997
2998 /*
2999 * Do the equivalent of checking BTPageIsRecyclable(), but without
3000 * accessing the page a second time.
3001 *
3002 * Give up upon finding the first non-recyclable page -- all later pages
3003 * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
3004 * to the array in safexid order.
3005 */
3006 if (!GlobalVisCheckRemovableFullXid(NULL, safexid))
3007 break;
3008
3009 RecordFreeIndexPage(rel, target);
3010 stats->pages_free++;
3011 }
3012
3013 pfree(vstate->pendingpages);
3014 }
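
/*
 * Sketch of the expected call protocol (the real driver is
 * btvacuumscan() in nbtree.c; simplified here):
 *
 *	_bt_pendingfsm_init(rel, &vstate, cleanuponly);
 *	... scan the index; _bt_pagedel() records each deletion
 *	... via _bt_pendingfsm_add() ...
 *	_bt_pendingfsm_finalize(rel, &vstate);
 *	if (stats->pages_free > 0)
 *		IndexFreeSpaceMapVacuum(rel);
 */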
3015
3016 /*
3017 * Maintain array of pages that were deleted during current btvacuumscan()
3018 * call, for use in _bt_pendingfsm_finalize()
3019 */
3020 static void
3021 _bt_pendingfsm_add(BTVacState *vstate,
3022 BlockNumber target,
3023 FullTransactionId safexid)
3024 {
3025 Assert(vstate->npendingpages <= vstate->bufsize);
3026 Assert(vstate->bufsize <= vstate->maxbufsize);
3027
3028 #ifdef USE_ASSERT_CHECKING
3029
3030 /*
3031 * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the
3032 * array will always be in safexid order (since that is the order that we
3033 * save them in here)
3034 */
3035 if (vstate->npendingpages > 0)
3036 {
3037 FullTransactionId lastsafexid =
3038 vstate->pendingpages[vstate->npendingpages - 1].safexid;
3039
3040 Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));
3041 }
3042 #endif
3043
3044 /*
3045 * If temp buffer reaches maxbufsize/work_mem capacity then we discard
3046 * information about this page.
3047 *
3048 * Note that this also covers the case where we opted to not use the
3049 * optimization in _bt_pendingfsm_init().
3050 */
3051 if (vstate->npendingpages == vstate->maxbufsize)
3052 return;
3053
3054 /* Consider enlarging buffer */
3055 if (vstate->npendingpages == vstate->bufsize)
3056 {
3057 int newbufsize = vstate->bufsize * 2;
3058
3059 /* Respect work_mem */
3060 if (newbufsize > vstate->maxbufsize)
3061 newbufsize = vstate->maxbufsize;
3062
3063 vstate->bufsize = newbufsize;
3064 vstate->pendingpages =
3065 repalloc(vstate->pendingpages,
3066 sizeof(BTPendingFSM) * vstate->bufsize);
3067 }
3068
3069 /* Save metadata for newly deleted page */
3070 vstate->pendingpages[vstate->npendingpages].target = target;
3071 vstate->pendingpages[vstate->npendingpages].safexid = safexid;
3072 vstate->npendingpages++;
3073 }
3074