/*-------------------------------------------------------------------------
 *
 * verify_nbtree.c
 *		Verifies the integrity of nbtree indexes based on invariants.
 *
 * For B-Tree indexes, verification includes checking that each page in the
 * target index has items in logical order as reported by an insertion scankey
 * (the insertion scankey sort-wise NULL semantics are needed for
 * verification).
 *
 * When index-to-heap verification is requested, a Bloom filter is used to
 * fingerprint all tuples in the target index, as the index is traversed to
 * verify its structure.  A heap scan later uses Bloom filter probes to verify
 * that every visible heap tuple has a matching index tuple.
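 *
 * Example usage from SQL (illustrative only; "some_index" is a placeholder
 * index name, and the boolean arguments are optional):
 *
 *   SELECT bt_index_check('some_index', true);
 *   SELECT bt_index_parent_check('some_index', true, true);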
 *
 *
 * Copyright (c) 2017-2021, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  contrib/amcheck/verify_nbtree.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/htup_details.h"
#include "access/nbtree.h"
#include "access/table.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xact.h"
#include "catalog/index.h"
#include "catalog/pg_am.h"
#include "commands/tablecmds.h"
#include "lib/bloomfilter.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/memutils.h"
#include "utils/snapmgr.h"


PG_MODULE_MAGIC;

/*
 * A B-Tree cannot possibly have this many levels, since there must be one
 * block per level, which is bound by the range of BlockNumber:
 */
#define InvalidBtreeLevel	((uint32) InvalidBlockNumber)
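/*
 * Number of key attributes to compare for a given tuple: the tuple's own
 * attribute count (which suffix truncation may have reduced for pivot
 * tuples), capped at the index's number of key columns (non-pivot tuples
 * may also carry non-key INCLUDE columns).
 */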
#define BTreeTupleGetNKeyAtts(itup, rel)   \
	Min(IndexRelationGetNumberOfKeyAttributes(rel), BTreeTupleGetNAtts(itup, rel))

/*
 * State associated with verifying a B-Tree index
 *
 * target is the point of reference for a verification operation.
 *
 * Other B-Tree pages may be allocated, but those are always auxiliary (e.g.,
 * they are current target's child pages).  Conceptually, problems are only
 * ever found in the current target page (or for a particular heap tuple during
 * heapallindexed verification).  Each page found by verification's left/right,
 * top/bottom scan becomes the target exactly once.
 */
typedef struct BtreeCheckState
{
	/*
	 * Unchanging state, established at start of verification:
	 */

	/* B-Tree Index Relation and associated heap relation */
	Relation	rel;
	Relation	heaprel;
	/* rel is heapkeyspace index? */
	bool		heapkeyspace;
	/* ShareLock held on heap/index, rather than AccessShareLock? */
	bool		readonly;
	/* Also verifying heap has no unindexed tuples? */
	bool		heapallindexed;
	/* Also making sure non-pivot tuples can be found by new search? */
	bool		rootdescend;
	/* Per-page context */
	MemoryContext targetcontext;
	/* Buffer access strategy */
	BufferAccessStrategy checkstrategy;

	/*
	 * Mutable state, for verification of particular page:
	 */

	/* Current target page */
	Page		target;
	/* Target block number */
	BlockNumber targetblock;
	/* Target page's LSN */
	XLogRecPtr	targetlsn;

	/*
	 * Low key: high key of left sibling of target page.  Used only for child
	 * verification.  So, 'lowkey' is kept only when 'readonly' is set.
	 */
	IndexTuple	lowkey;

	/*
	 * The rightlink and incomplete split flag of the block one level down
	 * from the target page, which was visited last via a downlink from the
	 * target page.  We use this to check for missing downlinks.
	 */
	BlockNumber prevrightlink;
	bool		previncompletesplit;

	/*
	 * Mutable state, for optional heapallindexed verification:
	 */

	/* Bloom filter fingerprints B-Tree index */
	bloom_filter *filter;
	/* Debug counter */
	int64		heaptuplespresent;
} BtreeCheckState;

/*
 * Starting point for verifying an entire B-Tree index level
 */
typedef struct BtreeLevel
{
	/* Level number (0 is leaf page level). */
	uint32		level;

	/* Left most block on level.  Scan of level begins here. */
	BlockNumber leftmost;

	/* Is this level reported as "true" root level by meta page? */
	bool		istruerootlevel;
} BtreeLevel;

PG_FUNCTION_INFO_V1(bt_index_check);
PG_FUNCTION_INFO_V1(bt_index_parent_check);

static void bt_index_check_internal(Oid indrelid, bool parentcheck,
									bool heapallindexed, bool rootdescend);
static inline void btree_index_checkable(Relation rel);
static inline bool btree_index_mainfork_expected(Relation rel);
static void bt_check_every_level(Relation rel, Relation heaprel,
								 bool heapkeyspace, bool readonly, bool heapallindexed,
								 bool rootdescend);
static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
											   BtreeLevel level);
static void bt_recheck_sibling_links(BtreeCheckState *state,
									 BlockNumber btpo_prev_from_target,
									 BlockNumber leftcurrent);
static void bt_target_page_check(BtreeCheckState *state);
static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state);
static void bt_child_check(BtreeCheckState *state, BTScanInsert targetkey,
						   OffsetNumber downlinkoffnum);
static void bt_child_highkey_check(BtreeCheckState *state,
								   OffsetNumber target_downlinkoffnum,
								   Page loaded_child,
								   uint32 target_level);
static void bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
									  BlockNumber targetblock, Page target);
static void bt_tuple_present_callback(Relation index, ItemPointer tid,
									  Datum *values, bool *isnull,
									  bool tupleIsAlive, void *checkstate);
static IndexTuple bt_normalize_tuple(BtreeCheckState *state,
									 IndexTuple itup);
static inline IndexTuple bt_posting_plain_tuple(IndexTuple itup, int n);
static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup);
static inline bool offset_is_negative_infinity(BTPageOpaque opaque,
											   OffsetNumber offset);
static inline bool invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
									  OffsetNumber upperbound);
static inline bool invariant_leq_offset(BtreeCheckState *state,
										BTScanInsert key,
										OffsetNumber upperbound);
static inline bool invariant_g_offset(BtreeCheckState *state, BTScanInsert key,
									  OffsetNumber lowerbound);
static inline bool invariant_l_nontarget_offset(BtreeCheckState *state,
												BTScanInsert key,
												BlockNumber nontargetblock,
												Page nontarget,
												OffsetNumber upperbound);
static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum);
static inline BTScanInsert bt_mkscankey_pivotsearch(Relation rel,
													IndexTuple itup);
static ItemId PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block,
								   Page page, OffsetNumber offset);
static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state,
													  IndexTuple itup, bool nonpivot);
static inline ItemPointer BTreeTupleGetPointsToTID(IndexTuple itup);

/*
 * bt_index_check(index regclass, heapallindexed boolean)
 *
 * Verify integrity of B-Tree index.
 *
 * Acquires AccessShareLock on heap & index relations.  Does not consider
 * invariants that exist between parent/child pages.  Optionally verifies
 * that heap does not contain any unindexed or incorrectly indexed tuples.
 */
Datum
bt_index_check(PG_FUNCTION_ARGS)
{
	Oid			indrelid = PG_GETARG_OID(0);
	bool		heapallindexed = false;

	if (PG_NARGS() == 2)
		heapallindexed = PG_GETARG_BOOL(1);

	bt_index_check_internal(indrelid, false, heapallindexed, false);

	PG_RETURN_VOID();
}

/*
 * bt_index_parent_check(index regclass, heapallindexed boolean)
 *
 * Verify integrity of B-Tree index.
 *
 * Acquires ShareLock on heap & index relations.  Verifies that downlinks in
 * parent pages are valid lower bounds on child pages.  Optionally verifies
 * that heap does not contain any unindexed or incorrectly indexed tuples.
 */
Datum
bt_index_parent_check(PG_FUNCTION_ARGS)
{
	Oid			indrelid = PG_GETARG_OID(0);
	bool		heapallindexed = false;
	bool		rootdescend = false;

	if (PG_NARGS() >= 2)
		heapallindexed = PG_GETARG_BOOL(1);
	if (PG_NARGS() == 3)
		rootdescend = PG_GETARG_BOOL(2);

	bt_index_check_internal(indrelid, true, heapallindexed, rootdescend);

	PG_RETURN_VOID();
}

/*
 * Helper for bt_index_[parent_]check, coordinating the bulk of the work.
 */
static void
bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed,
						bool rootdescend)
{
	Oid			heapid;
	Relation	indrel;
	Relation	heaprel;
	LOCKMODE	lockmode;

	if (parentcheck)
		lockmode = ShareLock;
	else
		lockmode = AccessShareLock;

	/*
	 * We must lock table before index to avoid deadlocks.  However, if the
	 * passed indrelid isn't an index then IndexGetRelation() will fail.
	 * Rather than emitting a not-very-helpful error message, postpone
	 * complaining, expecting that the is-it-an-index test below will fail.
	 *
	 * In hot standby mode this will raise an error when parentcheck is true.
	 */
	heapid = IndexGetRelation(indrelid, true);
	if (OidIsValid(heapid))
		heaprel = table_open(heapid, lockmode);
	else
		heaprel = NULL;

	/*
	 * Open the target index relation separately (like relation_openrv(), but
	 * with heap relation locked first to prevent deadlocking).  In hot
	 * standby mode this will raise an error when parentcheck is true.
	 *
	 * There is no need for the usual indcheckxmin usability horizon test
	 * here, even in the heapallindexed case, because index undergoing
	 * verification only needs to have entries for a new transaction snapshot.
	 * (If this is a parentcheck verification, there is no question about
	 * committed or recently dead heap tuples lacking index entries due to
	 * concurrent activity.)
	 */
	indrel = index_open(indrelid, lockmode);

	/*
	 * Since we did the IndexGetRelation call above without any lock, it's
	 * barely possible that a race against an index drop/recreation could have
	 * netted us the wrong table.
	 */
	if (heaprel == NULL || heapid != IndexGetRelation(indrelid, false))
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_TABLE),
				 errmsg("could not open parent table of index \"%s\"",
						RelationGetRelationName(indrel))));

	/* Relation suitable for checking as B-Tree? */
	btree_index_checkable(indrel);

	if (btree_index_mainfork_expected(indrel))
	{
		bool		heapkeyspace,
					allequalimage;

		RelationOpenSmgr(indrel);
		if (!smgrexists(indrel->rd_smgr, MAIN_FORKNUM))
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" lacks a main relation fork",
							RelationGetRelationName(indrel))));

		/* Extract metadata from metapage, and sanitize it in passing */
		_bt_metaversion(indrel, &heapkeyspace, &allequalimage);
		if (allequalimage && !heapkeyspace)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" metapage has equalimage field set on unsupported nbtree version",
							RelationGetRelationName(indrel))));
		if (allequalimage && !_bt_allequalimage(indrel, false))
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" metapage incorrectly indicates that deduplication is safe",
							RelationGetRelationName(indrel))));

		/* Check index, possibly against table it is an index on */
		bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck,
							 heapallindexed, rootdescend);
	}

	/*
	 * Release locks early. That's ok here because nothing in the called
	 * routines will trigger shared cache invalidations to be sent, so we can
	 * relax the usual pattern of only releasing locks after commit.
	 */
	index_close(indrel, lockmode);
	if (heaprel)
		table_close(heaprel, lockmode);
}

/*
 * Basic checks about the suitability of a relation for checking as a B-Tree
 * index.
 *
 * NB: Intentionally not checking permissions, the function is normally not
 * callable by non-superusers. If granted, it's useful to be able to check a
 * whole cluster.
 */
static inline void
btree_index_checkable(Relation rel)
{
	if (rel->rd_rel->relkind != RELKIND_INDEX ||
		rel->rd_rel->relam != BTREE_AM_OID)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("only B-Tree indexes are supported as targets for verification"),
				 errdetail("Relation \"%s\" is not a B-Tree index.",
						   RelationGetRelationName(rel))));

	if (RELATION_IS_OTHER_TEMP(rel))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot access temporary tables of other sessions"),
				 errdetail("Index \"%s\" is associated with temporary relation.",
						   RelationGetRelationName(rel))));

	if (!rel->rd_index->indisvalid)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot check index \"%s\"",
						RelationGetRelationName(rel)),
				 errdetail("Index is not valid.")));
}

/*
 * Check if B-Tree index relation should have a file for its main relation
 * fork.  Verification uses this to skip unlogged indexes when in hot standby
 * mode, where there is simply nothing to verify.  We behave as if the
 * relation is empty.
 *
 * NB: Caller should call btree_index_checkable() before calling here.
 */
static inline bool
btree_index_mainfork_expected(Relation rel)
{
	if (rel->rd_rel->relpersistence != RELPERSISTENCE_UNLOGGED ||
		!RecoveryInProgress())
		return true;

	ereport(DEBUG1,
			(errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION),
			 errmsg("cannot verify unlogged index \"%s\" during recovery, skipping",
					RelationGetRelationName(rel))));

	return false;
}

/*
 * Main entry point for B-Tree SQL-callable functions. Walks the B-Tree in
 * logical order, verifying invariants as it goes.  Optionally, verification
 * checks if the heap relation contains any tuples that are not represented in
 * the index but should be.
 *
 * It is the caller's responsibility to acquire appropriate heavyweight lock on
 * the index relation, and advise us if extra checks are safe when a ShareLock
 * is held.  (A lock of the same type must also have been acquired on the heap
 * relation.)
 *
 * A ShareLock is generally assumed to prevent any kind of physical
 * modification to the index structure, including modifications that VACUUM may
 * make.  This does not include setting of the LP_DEAD bit by concurrent index
 * scans, although that is just metadata that is not able to directly affect
 * any check performed here.  Any concurrent process that might act on the
 * LP_DEAD bit being set (recycle space) requires a heavyweight lock that
 * cannot be held while we hold a ShareLock.  (Besides, even if that could
 * happen, the ad-hoc recycling when a page might otherwise split is performed
 * per-page, and requires an exclusive buffer lock, which wouldn't cause us
 * trouble.  _bt_delitems_vacuum() may only delete leaf items, and so the extra
 * parent/child check cannot be affected.)
 */
static void
bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
					 bool readonly, bool heapallindexed, bool rootdescend)
{
	BtreeCheckState *state;
	Page		metapage;
	BTMetaPageData *metad;
	uint32		previouslevel;
	BtreeLevel	current;
	Snapshot	snapshot = SnapshotAny;

	if (!readonly)
		elog(DEBUG1, "verifying consistency of tree structure for index \"%s\"",
			 RelationGetRelationName(rel));
	else
		elog(DEBUG1, "verifying consistency of tree structure for index \"%s\" with cross-level checks",
			 RelationGetRelationName(rel));

	/*
	 * This assertion matches the one in index_getnext_tid().  See page
	 * recycling/"visible to everyone" notes in nbtree README.
	 */
	Assert(TransactionIdIsValid(RecentXmin));

	/*
	 * Initialize state for entire verification operation
	 */
	state = palloc0(sizeof(BtreeCheckState));
	state->rel = rel;
	state->heaprel = heaprel;
	state->heapkeyspace = heapkeyspace;
	state->readonly = readonly;
	state->heapallindexed = heapallindexed;
	state->rootdescend = rootdescend;

	if (state->heapallindexed)
	{
		int64		total_pages;
		int64		total_elems;
		uint64		seed;

		/*
		 * Size Bloom filter based on estimated number of tuples in index,
		 * while conservatively assuming that each block must contain at least
		 * MaxTIDsPerBTreePage / 3 "plain" tuples -- see
		 * bt_posting_plain_tuple() for definition, and details of how posting
		 * list tuples are handled.
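		 *
		 * As a rough illustration: assuming the default 8kB block size,
		 * MaxTIDsPerBTreePage works out to 1358, so each block is credited
		 * with at least 1358 / 3 = 452 elements here.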
		 */
		total_pages = RelationGetNumberOfBlocks(rel);
		total_elems = Max(total_pages * (MaxTIDsPerBTreePage / 3),
						  (int64) state->rel->rd_rel->reltuples);
		/* Random seed relies on backend srandom() call to avoid repetition */
		seed = random();
		/* Create Bloom filter to fingerprint index */
		state->filter = bloom_create(total_elems, maintenance_work_mem, seed);
		state->heaptuplespresent = 0;

		/*
		 * Register our own snapshot in !readonly case, rather than asking
		 * table_index_build_scan() to do this for us later.  This needs to
		 * happen before index fingerprinting begins, so we can later be
		 * certain that index fingerprinting should have reached all tuples
		 * returned by table_index_build_scan().
		 */
		if (!state->readonly)
		{
			snapshot = RegisterSnapshot(GetTransactionSnapshot());

			/*
			 * GetTransactionSnapshot() always acquires a new MVCC snapshot in
			 * READ COMMITTED mode.  A new snapshot is guaranteed to have all
			 * the entries it requires in the index.
			 *
			 * We must defend against the possibility that an old xact
			 * snapshot was returned at higher isolation levels when that
			 * snapshot is not safe for index scans of the target index.  This
			 * is possible when the snapshot sees tuples that are before the
			 * index's indcheckxmin horizon.  Throwing an error here should be
			 * very rare.  It doesn't seem worth using a secondary snapshot to
			 * avoid this.
			 */
			if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin &&
				!TransactionIdPrecedes(HeapTupleHeaderGetXmin(rel->rd_indextuple->t_data),
									   snapshot->xmin))
				ereport(ERROR,
						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
						 errmsg("index \"%s\" cannot be verified using transaction snapshot",
								RelationGetRelationName(rel))));
		}
	}

	Assert(!state->rootdescend || state->readonly);
	if (state->rootdescend && !state->heapkeyspace)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot verify that tuples from index \"%s\" can each be found by an independent index search",
						RelationGetRelationName(rel)),
				 errhint("Only B-Tree version 4 indexes support rootdescend verification.")));

	/* Create context for page */
	state->targetcontext = AllocSetContextCreate(CurrentMemoryContext,
												 "amcheck context",
												 ALLOCSET_DEFAULT_SIZES);
	state->checkstrategy = GetAccessStrategy(BAS_BULKREAD);

	/* Get true root block from meta-page */
	metapage = palloc_btree_page(state, BTREE_METAPAGE);
	metad = BTPageGetMeta(metapage);

	/*
	 * Certain deletion patterns can result in "skinny" B-Tree indexes, where
	 * the fast root and true root differ.
	 *
	 * Start from the true root, not the fast root, unlike conventional index
	 * scans.  This approach is more thorough, and removes the risk of
	 * following a stale fast root from the meta page.
	 */
	if (metad->btm_fastroot != metad->btm_root)
		ereport(DEBUG1,
				(errcode(ERRCODE_NO_DATA),
				 errmsg_internal("harmless fast root mismatch in index \"%s\"",
								 RelationGetRelationName(rel)),
				 errdetail_internal("Fast root block %u (level %u) differs from true root block %u (level %u).",
									metad->btm_fastroot, metad->btm_fastlevel,
									metad->btm_root, metad->btm_level)));

	/*
	 * Starting at the root, verify every level.  Move left to right, top to
	 * bottom.  Note that there may be no pages other than the meta page (meta
	 * page can indicate that root is P_NONE when the index is totally empty).
	 */
	previouslevel = InvalidBtreeLevel;
	current.level = metad->btm_level;
	current.leftmost = metad->btm_root;
	current.istruerootlevel = true;
	while (current.leftmost != P_NONE)
	{
		/*
		 * Verify this level, and get left most page for next level down, if
		 * not at leaf level
		 */
		current = bt_check_level_from_leftmost(state, current);

		if (current.leftmost == InvalidBlockNumber)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" has no valid pages on level below %u or first level",
							RelationGetRelationName(rel), previouslevel)));

		previouslevel = current.level;
	}

	/*
	 * * Check whether heap contains unindexed/malformed tuples *
	 */
	if (state->heapallindexed)
	{
		IndexInfo  *indexinfo = BuildIndexInfo(state->rel);
		TableScanDesc scan;

		/*
		 * Create our own scan for table_index_build_scan(), rather than
		 * getting it to do so for us.  This is required so that we can
		 * actually use the MVCC snapshot registered earlier in !readonly
		 * case.
		 *
		 * Note that table_index_build_scan() calls heap_endscan() for us.
		 */
		scan = table_beginscan_strat(state->heaprel,	/* relation */
									 snapshot,	/* snapshot */
									 0, /* number of keys */
									 NULL,	/* scan key */
									 true,	/* buffer access strategy OK */
									 true); /* syncscan OK? */

		/*
		 * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY
		 * behaves in !readonly case.
		 *
		 * It's okay that we don't actually use the same lock strength for the
		 * heap relation as any other ii_Concurrent caller would in !readonly
		 * case.  We have no reason to care about a concurrent VACUUM
		 * operation, since there isn't going to be a second scan of the heap
		 * that needs to be sure that there was no concurrent recycling of
		 * TIDs.
		 */
		indexinfo->ii_Concurrent = !state->readonly;

		/*
		 * Don't wait for uncommitted tuple xact commit/abort when index is a
		 * unique index on a catalog (or an index used by an exclusion
		 * constraint).  This could otherwise happen in the readonly case.
		 */
		indexinfo->ii_Unique = false;
		indexinfo->ii_ExclusionOps = NULL;
		indexinfo->ii_ExclusionProcs = NULL;
		indexinfo->ii_ExclusionStrats = NULL;

		elog(DEBUG1, "verifying that tuples from index \"%s\" are present in \"%s\"",
			 RelationGetRelationName(state->rel),
			 RelationGetRelationName(state->heaprel));

		table_index_build_scan(state->heaprel, state->rel, indexinfo, true, false,
							   bt_tuple_present_callback, (void *) state, scan);

		ereport(DEBUG1,
				(errmsg_internal("finished verifying presence of " INT64_FORMAT " tuples from table \"%s\" with bitset %.2f%% set",
								 state->heaptuplespresent, RelationGetRelationName(heaprel),
								 100.0 * bloom_prop_bits_set(state->filter))));

		if (snapshot != SnapshotAny)
			UnregisterSnapshot(snapshot);

		bloom_free(state->filter);
	}

	/* Be tidy: */
	MemoryContextDelete(state->targetcontext);
}

/*
 * Given a left-most block at some level, move right, verifying each page
 * individually (with more verification across pages for "readonly"
 * callers).  Caller should pass the true root page as the leftmost initially,
 * working their way down by passing what is returned by the last call here,
 * until level 0 (the leaf page level) is reached.
 *
 * Returns state for next call, if any.  This includes left-most block number
 * one level lower that should be passed on next level/call, which is set to
 * P_NONE on last call here (when leaf level is verified).  Level numbers
 * follow the nbtree convention: higher levels have higher numbers, because new
 * levels are added only due to a root page split.  Note that prior to the
 * first root page split, the root is also a leaf page, so there is always a
 * level 0 (leaf level), and it's always the last level processed.
 *
 * Note on memory management:  State's per-page context is reset here, between
 * each call to bt_target_page_check().
 */
static BtreeLevel
bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
{
	/* State to establish early, concerning entire level */
	BTPageOpaque opaque;
	MemoryContext oldcontext;
	BtreeLevel	nextleveldown;

	/* Variables for iterating across level using right links */
	BlockNumber leftcurrent = P_NONE;
	BlockNumber current = level.leftmost;

	/* Initialize return state */
	nextleveldown.leftmost = InvalidBlockNumber;
	nextleveldown.level = InvalidBtreeLevel;
	nextleveldown.istruerootlevel = false;

	/* Use page-level context for duration of this call */
	oldcontext = MemoryContextSwitchTo(state->targetcontext);

	elog(DEBUG1, "verifying level %u%s", level.level,
		 level.istruerootlevel ?
		 " (true root level)" : level.level == 0 ? " (leaf level)" : "");

	state->prevrightlink = InvalidBlockNumber;
	state->previncompletesplit = false;

	do
	{
		/* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */
		CHECK_FOR_INTERRUPTS();

		/* Initialize state for this iteration */
		state->targetblock = current;
		state->target = palloc_btree_page(state, state->targetblock);
		state->targetlsn = PageGetLSN(state->target);

		opaque = (BTPageOpaque) PageGetSpecialPointer(state->target);

		if (P_IGNORE(opaque))
		{
			/*
			 * Since there cannot be a concurrent VACUUM operation in readonly
			 * mode, and since a page has no links within other pages
			 * (siblings and parent) once it is marked fully deleted, it
			 * should be impossible to land on a fully deleted page in
			 * readonly mode. See bt_child_check() for further details.
			 *
			 * The bt_child_check() P_ISDELETED() check is repeated here so
			 * that pages that are only reachable through sibling links get
			 * checked.
			 */
			if (state->readonly && P_ISDELETED(opaque))
				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg("downlink or sibling link points to deleted block in index \"%s\"",
								RelationGetRelationName(state->rel)),
						 errdetail_internal("Block=%u left block=%u left link from block=%u.",
											current, leftcurrent, opaque->btpo_prev)));

			if (P_RIGHTMOST(opaque))
				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg("block %u fell off the end of index \"%s\"",
								current, RelationGetRelationName(state->rel))));
			else
				ereport(DEBUG1,
						(errcode(ERRCODE_NO_DATA),
						 errmsg_internal("block %u of index \"%s\" concurrently deleted",
										 current, RelationGetRelationName(state->rel))));
			goto nextpage;
		}
		else if (nextleveldown.leftmost == InvalidBlockNumber)
		{
			/*
			 * A concurrent page split could make the caller supplied leftmost
			 * block no longer contain the leftmost page, or no longer be the
			 * true root, but where that isn't possible due to heavyweight
			 * locking, check that the first valid page meets caller's
			 * expectations.
			 */
			if (state->readonly)
			{
				if (!P_LEFTMOST(opaque))
					ereport(ERROR,
							(errcode(ERRCODE_INDEX_CORRUPTED),
							 errmsg("block %u is not leftmost in index \"%s\"",
									current, RelationGetRelationName(state->rel))));

				if (level.istruerootlevel && !P_ISROOT(opaque))
					ereport(ERROR,
							(errcode(ERRCODE_INDEX_CORRUPTED),
							 errmsg("block %u is not true root in index \"%s\"",
									current, RelationGetRelationName(state->rel))));
			}

			/*
			 * Before beginning any non-trivial examination of level, prepare
			 * state for the next bt_check_level_from_leftmost() invocation
			 * for the next level down (if any).
			 *
			 * There should be at least one non-ignorable page per level,
			 * unless this is the leaf level, which is assumed by caller to be
			 * final level.
			 */
			if (!P_ISLEAF(opaque))
			{
				IndexTuple	itup;
				ItemId		itemid;

				/* Internal page -- downlink gets leftmost on next level */
				itemid = PageGetItemIdCareful(state, state->targetblock,
											  state->target,
											  P_FIRSTDATAKEY(opaque));
				itup = (IndexTuple) PageGetItem(state->target, itemid);
				nextleveldown.leftmost = BTreeTupleGetDownLink(itup);
				nextleveldown.level = opaque->btpo_level - 1;
			}
			else
			{
				/*
				 * Leaf page -- final level caller must process.
				 *
				 * Note that this could also be the root page, if there has
				 * been no root page split yet.
				 */
				nextleveldown.leftmost = P_NONE;
				nextleveldown.level = InvalidBtreeLevel;
			}

			/*
			 * Finished setting up state for this call/level.  Control will
			 * never end up back here in any future loop iteration for this
			 * level.
			 */
		}

		/* Sibling links should be in mutual agreement */
		if (opaque->btpo_prev != leftcurrent)
			bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent);

		/* Check level */
		if (level.level != opaque->btpo_level)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
										current, level.level, opaque->btpo_level)));

		/* Verify invariants for page */
		bt_target_page_check(state);

nextpage:

		/* Try to detect circular links */
		if (current == leftcurrent || current == opaque->btpo_prev)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("circular link chain found in block %u of index \"%s\"",
							current, RelationGetRelationName(state->rel))));

		leftcurrent = current;
		current = opaque->btpo_next;

		if (state->lowkey)
		{
			Assert(state->readonly);
			pfree(state->lowkey);
			state->lowkey = NULL;
		}

		/*
		 * Copy the current target's high key as the low key of the right
		 * sibling.  Allocate the memory in the upper level context, so that
		 * it survives the reset of the target context.
		 *
		 * We only need the low key in corner cases of checking child high
		 * keys.  We use the high key only when an incomplete split on the
		 * child level falls on the boundary of pages on the target level.
		 * See bt_child_highkey_check() for details.  So, typically we won't
		 * end up doing anything with the low key, but it's simpler for
		 * general case high key verification to always have it available.
		 *
		 * The correctness of managing the low key in the case of concurrent
		 * splits has not been investigated yet.  Thankfully we only need the
		 * low key for readonly verification, where concurrent splits won't
		 * happen.
		 */
		if (state->readonly && !P_RIGHTMOST(opaque))
		{
			IndexTuple	itup;
			ItemId		itemid;

			itemid = PageGetItemIdCareful(state, state->targetblock,
										  state->target, P_HIKEY);
			itup = (IndexTuple) PageGetItem(state->target, itemid);

			state->lowkey = MemoryContextAlloc(oldcontext, IndexTupleSize(itup));
			memcpy(state->lowkey, itup, IndexTupleSize(itup));
		}

		/* Free page and associated memory for this iteration */
		MemoryContextReset(state->targetcontext);
	}
	while (current != P_NONE);

	if (state->lowkey)
	{
		Assert(state->readonly);
		pfree(state->lowkey);
		state->lowkey = NULL;
	}

	/* Don't change context for caller */
	MemoryContextSwitchTo(oldcontext);

	return nextleveldown;
}

/*
 * Raise an error when target page's left link does not point back to the
 * previous target page, called leftcurrent here.  The leftcurrent page's
 * right link was followed to get to the current target page, and we expect
 * mutual agreement among leftcurrent and the current target page.  Make sure
 * that this condition has definitely been violated in the !readonly case,
 * where concurrent page splits are something that we need to deal with.
 *
 * Cross-page inconsistencies involving pages that don't agree about being
 * siblings are known to be a particularly good indicator of corruption
 * involving partial writes/lost updates.  The bt_right_page_check_scankey
 * check also provides a way of detecting cross-page inconsistencies for
 * !readonly callers, but it can only detect sibling pages that have an
 * out-of-order keyspace, which can't catch many of the problems that we
 * expect to catch here.
 *
 * The classic example of the kind of inconsistency that we can only catch
 * with this check (when in !readonly mode) involves three sibling pages that
 * were affected by a faulty page split at some point in the past.  The
 * effects of the split are reflected in the original page and its new right
 * sibling page, with a lack of any accompanying changes for the _original_
 * right sibling page.  The original right sibling page's left link fails to
 * point to the new right sibling page (its left link still points to the
 * original page), even though the first phase of a page split is supposed to
 * work as a single atomic action.  This subtle inconsistency will probably
 * only break backwards scans in practice.
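 *
 * As a rough sketch of that scenario (with hypothetical blocks A, A', and
 * B): a split of A should atomically leave A -> A' -> B, with B's left link
 * pointing back to A'; after the faulty split, B's left link still points
 * to A.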
 *
 * Note that this is the only place where amcheck will "couple" buffer locks
 * (and only for !readonly callers).  In general we prefer to avoid more
 * thorough cross-page checks in !readonly mode, but it seems worth the
 * complexity here.  Also, the performance overhead of performing lock
 * coupling here is negligible in practice.  Control only reaches here with a
 * non-corrupt index when there is a concurrent page split at the instant
 * caller crossed over to target page from leftcurrent page.
 */
static void
bt_recheck_sibling_links(BtreeCheckState *state,
						 BlockNumber btpo_prev_from_target,
						 BlockNumber leftcurrent)
{
	if (!state->readonly)
	{
		Buffer		lbuf;
		Buffer		newtargetbuf;
		Page		page;
		BTPageOpaque opaque;
		BlockNumber newtargetblock;

		/* Couple locks in the usual order for nbtree:  Left to right */
		lbuf = ReadBufferExtended(state->rel, MAIN_FORKNUM, leftcurrent,
								  RBM_NORMAL, state->checkstrategy);
		LockBuffer(lbuf, BT_READ);
		_bt_checkpage(state->rel, lbuf);
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISDELETED(opaque))
		{
			/*
			 * Cannot reason about concurrently deleted page -- the left link
			 * in the page to the right is expected to point to some other
			 * page to the left (not leftcurrent page).
			 *
			 * Note that we deliberately don't give up with a half-dead page.
			 */
			UnlockReleaseBuffer(lbuf);
			return;
		}

		newtargetblock = opaque->btpo_next;
		/* Avoid self-deadlock when newtargetblock == leftcurrent */
		if (newtargetblock != leftcurrent)
		{
			newtargetbuf = ReadBufferExtended(state->rel, MAIN_FORKNUM,
											  newtargetblock, RBM_NORMAL,
											  state->checkstrategy);
			LockBuffer(newtargetbuf, BT_READ);
			_bt_checkpage(state->rel, newtargetbuf);
			page = BufferGetPage(newtargetbuf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			/* btpo_prev_from_target may have changed; update it */
			btpo_prev_from_target = opaque->btpo_prev;
		}
		else
		{
			/*
			 * leftcurrent right sibling points back to leftcurrent block.
			 * Index is corrupt.  Easiest way to handle this is to pretend
			 * that we actually read from a distinct page that has an invalid
			 * block number in its btpo_prev.
			 */
			newtargetbuf = InvalidBuffer;
			btpo_prev_from_target = InvalidBlockNumber;
		}

		/*
		 * No need to check P_ISDELETED here, since new target block cannot be
		 * marked deleted as long as we hold a lock on lbuf
		 */
		if (BufferIsValid(newtargetbuf))
			UnlockReleaseBuffer(newtargetbuf);
		UnlockReleaseBuffer(lbuf);

		if (btpo_prev_from_target == leftcurrent)
		{
			/* Report split in left sibling, not target (or new target) */
			ereport(DEBUG1,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg_internal("harmless concurrent page split detected in index \"%s\"",
									 RelationGetRelationName(state->rel)),
					 errdetail_internal("Block=%u new right sibling=%u original right sibling=%u.",
										leftcurrent, newtargetblock,
										state->targetblock)));
			return;
		}

		/*
		 * Index is corrupt.  Make sure that we report correct target page.
		 *
		 * This could have changed in cases where there was a concurrent page
		 * split, as well as index corruption (at least in theory).  Note that
		 * btpo_prev_from_target was already updated above.
		 */
		state->targetblock = newtargetblock;
	}

	ereport(ERROR,
			(errcode(ERRCODE_INDEX_CORRUPTED),
			 errmsg("left link/right link pair in index \"%s\" not in agreement",
					RelationGetRelationName(state->rel)),
			 errdetail_internal("Block=%u left block=%u left link from block=%u.",
								state->targetblock, leftcurrent,
								btpo_prev_from_target)));
}

/*
 * Function performs the following checks on target page, or pages ancillary to
 * target page:
 *
 * - That every "real" data item is less than or equal to the high key, which
 *	 is an upper bound on the items on the page.  Data items should be
 *	 strictly less than the high key when the page is an internal page.
 *
 * - That within the page, every data item is strictly less than the item
 *	 immediately to its right, if any (i.e., that the items are in order
 *	 within the page, so that the binary searches performed by index scans are
 *	 sane).
 *
 * - That the last data item stored on the page is strictly less than the
 *	 first data item on the page to the right (when such a first item is
 *	 available).
 *
 * - Various checks on the structure of tuples themselves.  For example, check
 *	 that non-pivot tuples have no truncated attributes.
 *
 * Furthermore, when state passed shows ShareLock held, function also checks:
 *
 * - That all child pages respect strict lower bound from parent's pivot
 *	 tuple.
 *
 * - That downlink to block was encountered in parent where that's expected.
 *
 * - That high keys of child pages match corresponding pivot keys in parent.
 *
 * This is also where heapallindexed callers use their Bloom filter to
 * fingerprint IndexTuples for later table_index_build_scan() verification.
 *
 * Note:  Memory allocated in this routine is expected to be released by caller
 * resetting state->targetcontext.
 */
static void
bt_target_page_check(BtreeCheckState *state)
{
	OffsetNumber offset;
	OffsetNumber max;
	BTPageOpaque topaque;

	topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
	max = PageGetMaxOffsetNumber(state->target);

	elog(DEBUG2, "verifying %u items on %s block %u", max,
		 P_ISLEAF(topaque) ? "leaf" : "internal", state->targetblock);

	/*
	 * Check the number of attributes in high key. Note, rightmost page
	 * doesn't contain a high key, so nothing to check
	 */
	if (!P_RIGHTMOST(topaque))
	{
		ItemId		itemid;
		IndexTuple	itup;

		/* Verify line pointer before checking tuple */
		itemid = PageGetItemIdCareful(state, state->targetblock,
									  state->target, P_HIKEY);
		if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target,
							 P_HIKEY))
		{
			itup = (IndexTuple) PageGetItem(state->target, itemid);
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("wrong number of high key index tuple attributes in index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Index block=%u natts=%u block type=%s page lsn=%X/%X.",
										state->targetblock,
										BTreeTupleGetNAtts(itup, state->rel),
										P_ISLEAF(topaque) ? "heap" : "index",
										LSN_FORMAT_ARGS(state->targetlsn))));
		}
	}

	/*
	 * Loop over page items, starting from first non-highkey item, not high
	 * key (if any).  Most tests are not performed for the "negative infinity"
	 * real item (if any).
	 */
	for (offset = P_FIRSTDATAKEY(topaque);
		 offset <= max;
		 offset = OffsetNumberNext(offset))
	{
		ItemId		itemid;
		IndexTuple	itup;
		size_t		tupsize;
		BTScanInsert skey;
		bool		lowersizelimit;
		ItemPointer scantid;

		CHECK_FOR_INTERRUPTS();

		itemid = PageGetItemIdCareful(state, state->targetblock,
									  state->target, offset);
		itup = (IndexTuple) PageGetItem(state->target, itemid);
		tupsize = IndexTupleSize(itup);

		/*
		 * lp_len should match the IndexTuple reported length exactly, since
		 * lp_len is completely redundant in indexes, and both sources of
		 * tuple length are MAXALIGN()'d.  nbtree does not use lp_len all that
		 * frequently, and is surprisingly tolerant of corrupt lp_len fields.
		 */
		if (tupsize != ItemIdGetLength(itemid))
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index tuple size does not equal lp_len in index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Index tid=(%u,%u) tuple size=%zu lp_len=%u page lsn=%X/%X.",
										state->targetblock, offset,
										tupsize, ItemIdGetLength(itemid),
										LSN_FORMAT_ARGS(state->targetlsn)),
					 errhint("This could be a torn page problem.")));

		/* Check the number of index tuple attributes */
		if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target,
							 offset))
		{
			ItemPointer tid;
			char	   *itid,
					   *htid;

			itid = psprintf("(%u,%u)", state->targetblock, offset);
			tid = BTreeTupleGetPointsToTID(itup);
			htid = psprintf("(%u,%u)",
							ItemPointerGetBlockNumberNoCheck(tid),
							ItemPointerGetOffsetNumberNoCheck(tid));

			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("wrong number of index tuple attributes in index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%X.",
										itid,
										BTreeTupleGetNAtts(itup, state->rel),
										P_ISLEAF(topaque) ? "heap" : "index",
										htid,
										LSN_FORMAT_ARGS(state->targetlsn))));
		}

		/*
		 * Don't try to generate scankey using "negative infinity" item on
		 * internal pages. They are always truncated to zero attributes.
		 */
		if (offset_is_negative_infinity(topaque, offset))
		{
			/*
			 * We don't call bt_child_check() for "negative infinity" items.
			 * But if we're performing downlink connectivity check, we do it
			 * for every item including "negative infinity" one.
			 */
			if (!P_ISLEAF(topaque) && state->readonly)
			{
				bt_child_highkey_check(state,
									   offset,
									   NULL,
									   topaque->btpo_level);
			}
			continue;
		}

		/*
		 * Readonly callers may optionally verify that non-pivot tuples can
		 * each be found by an independent search that starts from the root.
		 * Note that we deliberately don't do individual searches for each
		 * TID, since the posting list itself is validated by other checks.
		 */
		if (state->rootdescend && P_ISLEAF(topaque) &&
			!bt_rootdescend(state, itup))
		{
			ItemPointer tid = BTreeTupleGetPointsToTID(itup);
			char	   *itid,
					   *htid;

			itid = psprintf("(%u,%u)", state->targetblock, offset);
			htid = psprintf("(%u,%u)", ItemPointerGetBlockNumber(tid),
							ItemPointerGetOffsetNumber(tid));

			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("could not find tuple using search from root page in index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.",
										itid, htid,
										LSN_FORMAT_ARGS(state->targetlsn))));
		}

		/*
		 * If tuple is a posting list tuple, make sure posting list TIDs are
		 * in order
		 */
		if (BTreeTupleIsPosting(itup))
		{
			ItemPointerData last;
			ItemPointer current;

			ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last);

			for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
			{
				current = BTreeTupleGetPostingN(itup, i);

				if (ItemPointerCompare(current, &last) <= 0)
				{
					char	   *itid = psprintf("(%u,%u)", state->targetblock, offset);

					ereport(ERROR,
							(errcode(ERRCODE_INDEX_CORRUPTED),
							 errmsg_internal("posting list contains misplaced TID in index \"%s\"",
											 RelationGetRelationName(state->rel)),
							 errdetail_internal("Index tid=%s posting list offset=%d page lsn=%X/%X.",
												itid, i,
												LSN_FORMAT_ARGS(state->targetlsn))));
				}

				ItemPointerCopy(current, &last);
			}
		}

		/* Build insertion scankey for current page offset */
		skey = bt_mkscankey_pivotsearch(state->rel, itup);

		/*
		 * Make sure tuple size does not exceed the relevant BTREE_VERSION
		 * specific limit.
		 *
		 * BTREE_VERSION 4 (which introduced heapkeyspace rules) requisitioned
		 * a small amount of space from BTMaxItemSize() in order to ensure
		 * that suffix truncation always has enough space to add an explicit
		 * heap TID back to a tuple -- we pessimistically assume that every
		 * newly inserted tuple will eventually need to have a heap TID
		 * appended during a future leaf page split, when the tuple becomes
		 * the basis of the new high key (pivot tuple) for the leaf page.
		 *
		 * Since the reclaimed space is reserved for that purpose, we must not
		 * enforce the slightly lower limit when the extra space has been used
		 * as intended.  In other words, there is only a cross-version
		 * difference in the limit on tuple size within leaf pages.
		 *
		 * Still, we're particular about the details within BTREE_VERSION 4
		 * internal pages.  Pivot tuples may only use the extra space for their
		 * designated purpose.  Enforce the lower limit for pivot tuples when
		 * an explicit heap TID isn't actually present. (In all other cases
		 * suffix truncation is guaranteed to generate a pivot tuple that's no
		 * larger than the firstright tuple provided to it by its caller.)
		 */
		lowersizelimit = skey->heapkeyspace &&
			(P_ISLEAF(topaque) || BTreeTupleGetHeapTID(itup) == NULL);
		if (tupsize > (lowersizelimit ? BTMaxItemSize(state->target) :
					   BTMaxItemSizeNoHeapTid(state->target)))
		{
			ItemPointer tid = BTreeTupleGetPointsToTID(itup);
			char	   *itid,
					   *htid;

			itid = psprintf("(%u,%u)", state->targetblock, offset);
			htid = psprintf("(%u,%u)",
							ItemPointerGetBlockNumberNoCheck(tid),
							ItemPointerGetOffsetNumberNoCheck(tid));

			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index row size %zu exceeds maximum for index \"%s\"",
							tupsize, RelationGetRelationName(state->rel)),
					 errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.",
										itid,
										P_ISLEAF(topaque) ? "heap" : "index",
										htid,
										LSN_FORMAT_ARGS(state->targetlsn))));
		}

		/* Fingerprint leaf page tuples (those that point to the heap) */
		if (state->heapallindexed && P_ISLEAF(topaque) && !ItemIdIsDead(itemid))
		{
			IndexTuple	norm;

			if (BTreeTupleIsPosting(itup))
			{
				/* Fingerprint all elements as distinct "plain" tuples */
				for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
				{
					IndexTuple	logtuple;

					logtuple = bt_posting_plain_tuple(itup, i);
					norm = bt_normalize_tuple(state, logtuple);
					bloom_add_element(state->filter, (unsigned char *) norm,
									  IndexTupleSize(norm));
					/* Be tidy */
					if (norm != logtuple)
						pfree(norm);
					pfree(logtuple);
				}
			}
			else
			{
				norm = bt_normalize_tuple(state, itup);
				bloom_add_element(state->filter, (unsigned char *) norm,
								  IndexTupleSize(norm));
				/* Be tidy */
				if (norm != itup)
					pfree(norm);
			}
		}

		/*
		 * * High key check *
		 *
		 * If there is a high key (if this is not the rightmost page on its
		 * entire level), check that high key actually is upper bound on all
		 * page items.  If this is a posting list tuple, we'll need to set
		 * scantid to be highest TID in posting list.
		 *
		 * We prefer to check all items against high key rather than checking
		 * just the last and trusting that the operator class obeys the
		 * transitive law (which implies that all previous items also
		 * respected the high key invariant if they pass the item order
		 * check).
		 *
		 * Ideally, we'd compare every item in the index against every other
		 * item in the index, and not trust opclass obedience of the
		 * transitive law to bridge the gap between children and their
		 * grandparents (as well as great-grandparents, and so on).  We don't
		 * go to those lengths because that would be prohibitively expensive,
		 * and probably not markedly more effective in practice.
		 *
		 * On the leaf level, we check that the key is <= the highkey.
		 * However, on non-leaf levels we check that the key is < the highkey,
		 * because the high key is "just another separator" rather than a copy
		 * of some existing key item; we expect it to be unique among all keys
		 * on the same level.  (Suffix truncation will sometimes produce a
		 * leaf highkey that is an untruncated copy of the lastleft item, but
		 * never any other item, which necessitates weakening the leaf level
		 * check to <=.)
		 *
		 * Full explanation for why a highkey is never truly a copy of another
		 * item from the same level on internal levels:
		 *
		 * While the new left page's high key is copied from the first offset
		 * on the right page during an internal page split, that's not the
		 * full story.  In effect, internal pages are split in the middle of
		 * the firstright tuple, not between the would-be lastleft and
		 * firstright tuples: the firstright key ends up on the left side as
		 * left's new highkey, and the firstright downlink ends up on the
		 * right side as right's new "negative infinity" item.  The negative
		 * infinity tuple is truncated to zero attributes, so we're only left
		 * with the downlink.  In other words, the copying is just an
		 * implementation detail of splitting in the middle of a (pivot)
		 * tuple. (See also: "Notes About Data Representation" in the nbtree
		 * README.)
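		 *
		 * As a rough sketch (hypothetical pivot tuple with key K and
		 * downlink D): after an internal page split "in the middle of"
		 * (K, D), K becomes the left page's new high key, while D survives
		 * as the right page's "negative infinity" item, whose key has been
		 * truncated away.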
		 */
		scantid = skey->scantid;
		if (state->heapkeyspace && BTreeTupleIsPosting(itup))
			skey->scantid = BTreeTupleGetMaxHeapTID(itup);

		if (!P_RIGHTMOST(topaque) &&
			!(P_ISLEAF(topaque) ? invariant_leq_offset(state, skey, P_HIKEY) :
			  invariant_l_offset(state, skey, P_HIKEY)))
		{
			ItemPointer tid = BTreeTupleGetPointsToTID(itup);
			char	   *itid,
					   *htid;

			itid = psprintf("(%u,%u)", state->targetblock, offset);
			htid = psprintf("(%u,%u)",
							ItemPointerGetBlockNumberNoCheck(tid),
							ItemPointerGetOffsetNumberNoCheck(tid));

			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("high key invariant violated for index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.",
										itid,
										P_ISLEAF(topaque) ? "heap" : "index",
										htid,
										LSN_FORMAT_ARGS(state->targetlsn))));
		}
		/* Reset, in case scantid was set to (itup) posting tuple's max TID */
		skey->scantid = scantid;

		/*
		 * * Item order check *
		 *
		 * Check that items are stored on page in logical order, by checking
		 * current item is strictly less than next item (if any).
		 */
		if (OffsetNumberNext(offset) <= max &&
			!invariant_l_offset(state, skey, OffsetNumberNext(offset)))
		{
			ItemPointer tid;
			char	   *itid,
					   *htid,
					   *nitid,
					   *nhtid;

			itid = psprintf("(%u,%u)", state->targetblock, offset);
			tid = BTreeTupleGetPointsToTID(itup);
			htid = psprintf("(%u,%u)",
							ItemPointerGetBlockNumberNoCheck(tid),
							ItemPointerGetOffsetNumberNoCheck(tid));
			nitid = psprintf("(%u,%u)", state->targetblock,
							 OffsetNumberNext(offset));

			/* Reuse itup to get pointed-to heap location of second item */
			itemid = PageGetItemIdCareful(state, state->targetblock,
										  state->target,
										  OffsetNumberNext(offset));
			itup = (IndexTuple) PageGetItem(state->target, itemid);
			tid = BTreeTupleGetPointsToTID(itup);
			nhtid = psprintf("(%u,%u)",
							 ItemPointerGetBlockNumberNoCheck(tid),
							 ItemPointerGetOffsetNumberNoCheck(tid));

			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("item order invariant violated for index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Lower index tid=%s (points to %s tid=%s) "
										"higher index tid=%s (points to %s tid=%s) "
										"page lsn=%X/%X.",
										itid,
										P_ISLEAF(topaque) ? "heap" : "index",
										htid,
										nitid,
										P_ISLEAF(topaque) ? "heap" : "index",
										nhtid,
										LSN_FORMAT_ARGS(state->targetlsn))));
		}

		/*
		 * * Last item check *
		 *
1445 		 * Check last item against next/right page's first data item's when
1446 		 * last item on page is reached.  This additional check will detect
1447 		 * transposed pages iff the supposed right sibling page happens to
1448 		 * belong before target in the key space.  (Otherwise, a subsequent
1449 		 * heap verification will probably detect the problem.)
1450 		 *
1451 		 * This check is similar to the item order check that will have
1452 		 * already been performed for every other "real" item on target page
1453 		 * when last item is checked.  The difference is that the next item
1454 		 * (the item that is compared to target's last item) needs to come
1455 		 * from the next/sibling page.  There may not be such an item
1456 		 * available from sibling for various reasons, though (e.g., target is
1457 		 * the rightmost page on level).
1458 		 */
1459 		else if (offset == max)
1460 		{
1461 			BTScanInsert rightkey;
1462 
1463 			/* Get item in next/right page */
1464 			rightkey = bt_right_page_check_scankey(state);
1465 
1466 			if (rightkey &&
1467 				!invariant_g_offset(state, rightkey, max))
1468 			{
1469 				/*
1470 				 * As explained at length in bt_right_page_check_scankey(),
1471 				 * there is a known !readonly race that could account for
1472 				 * apparent violation of invariant, which we must check for
1473 				 * before actually proceeding with raising error.  Our canary
1474 				 * condition is that target page was deleted.
1475 				 */
1476 				if (!state->readonly)
1477 				{
1478 					/* Get fresh copy of target page */
1479 					state->target = palloc_btree_page(state, state->targetblock);
1480 					/* Note that we deliberately do not update target LSN */
1481 					topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
1482 
1483 					/*
1484 					 * All !readonly checks now performed; just return
1485 					 */
1486 					if (P_IGNORE(topaque))
1487 						return;
1488 				}
1489 
1490 				ereport(ERROR,
1491 						(errcode(ERRCODE_INDEX_CORRUPTED),
1492 						 errmsg("cross page item order invariant violated for index \"%s\"",
1493 								RelationGetRelationName(state->rel)),
1494 						 errdetail_internal("Last item on page tid=(%u,%u) page lsn=%X/%X.",
1495 											state->targetblock, offset,
1496 											LSN_FORMAT_ARGS(state->targetlsn))));
1497 			}
1498 		}
1499 
1500 		/*
1501 		 * * Downlink check *
1502 		 *
1503 		 * Additional check of child items iff this is an internal page and
1504 		 * caller holds a ShareLock.  This happens for every downlink (item)
1505 		 * in target excluding the negative-infinity downlink (again, this is
1506 		 * because it has no useful value to compare).
1507 		 */
1508 		if (!P_ISLEAF(topaque) && state->readonly)
1509 			bt_child_check(state, skey, offset);
1510 	}
1511 
1512 	/*
1513 	 * Special case bt_child_highkey_check() call
1514 	 *
1515 	 * We don't pass an real downlink, but we've to finish the level
1516 	 * processing. If condition is satisfied, we've already processed all the
1517 	 * downlinks from the target level.  But there still might be pages to the
1518 	 * right of the child page pointer to by our rightmost downlink.  And they
1519 	 * might have missing downlinks.  This final call checks for them.
1520 	 */
1521 	if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly)
1522 	{
1523 		bt_child_highkey_check(state, InvalidOffsetNumber,
1524 							   NULL, topaque->btpo_level);
1525 	}
1526 }
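
/*
 * For context, a sketch of how the page-level checks above are typically
 * reached from SQL (index name and argument values are illustrative):
 *
 *		-- AccessShareLock only; the "readonly" checks above are skipped
 *		SELECT bt_index_check('my_idx', heapallindexed => true);
 *
 *		-- ShareLock; also performs downlink and cross-level checks
 *		SELECT bt_index_parent_check('my_idx', heapallindexed => true,
 *									 rootdescend => true);
 */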

/*
 * Return a scankey for an item on page to right of current target (or the
 * first non-ignorable page), sufficient to check ordering invariant on last
 * item in current target page.  Returned scankey relies on local memory
 * allocated for the child page, which caller cannot pfree().  Caller's memory
 * context should be reset between calls here.
 *
 * This is the first data item, and so all adjacent items are checked against
 * their immediate sibling item (which may be on a sibling page, or even a
 * "cousin" page at parent boundaries where target's rightlink points to page
 * with different parent page).  If no such valid item is available, return
 * NULL instead.
 *
 * Note that !readonly callers must reverify that target page has not
 * been concurrently deleted.
 */
static BTScanInsert
bt_right_page_check_scankey(BtreeCheckState *state)
{
	BTPageOpaque opaque;
	ItemId		rightitem;
	IndexTuple	firstitup;
	BlockNumber targetnext;
	Page		rightpage;
	OffsetNumber nline;

	/* Determine target's next block number */
	opaque = (BTPageOpaque) PageGetSpecialPointer(state->target);

	/* If target is already rightmost, no right sibling; nothing to do here */
	if (P_RIGHTMOST(opaque))
		return NULL;

	/*
	 * General notes on concurrent page splits and page deletion:
	 *
	 * Routines like _bt_search() don't require *any* page split interlock
	 * when descending the tree, including something very light like a buffer
	 * pin.  That's why it's okay that we don't either.  This avoidance of any
	 * need to "couple" buffer locks is the raison d'etre of the Lehman & Yao
	 * algorithm, in fact.
	 *
	 * That leaves deletion.  A deleted page won't actually be recycled by
	 * VACUUM early enough for us to fail to at least follow its right link
	 * (or left link, or downlink) and find its sibling, because recycling
	 * does not occur until no possible index scan could land on the page.
	 * Index scans can follow links with nothing more than their snapshot as
	 * an interlock and be sure of at least that much.  (See page
	 * recycling/"visible to everyone" notes in nbtree README.)
	 *
	 * Furthermore, it's okay if we follow a rightlink and find a half-dead or
	 * dead (ignorable) page one or more times.  There will either be a
	 * further right link to follow that leads to a live page before too long
	 * (before passing by parent's rightmost child), or we will find the end
	 * of the entire level instead (possible when parent page is itself the
	 * rightmost on its level).
	 */
	targetnext = opaque->btpo_next;
	for (;;)
	{
		CHECK_FOR_INTERRUPTS();

		rightpage = palloc_btree_page(state, targetnext);
		opaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);

		if (!P_IGNORE(opaque) || P_RIGHTMOST(opaque))
			break;

		/*
		 * We landed on a deleted or half-dead sibling page.  Step right until
		 * we locate a live sibling page.
		 */
		ereport(DEBUG2,
				(errcode(ERRCODE_NO_DATA),
				 errmsg_internal("level %u sibling page in block %u of index \"%s\" was found deleted or half dead",
								 opaque->btpo_level, targetnext, RelationGetRelationName(state->rel)),
				 errdetail_internal("Deleted page found when building scankey from right sibling.")));

		targetnext = opaque->btpo_next;

		/* Be slightly more pro-active in freeing this memory, just in case */
		pfree(rightpage);
	}

	/*
	 * No ShareLock held case -- why it's safe to proceed.
	 *
	 * Problem:
	 *
	 * We must avoid false positive reports of corruption when caller treats
	 * item returned here as an upper bound on target's last item.  In
	 * general, false positives are disallowed.  Avoiding them here when
	 * caller is !readonly is subtle.
	 *
	 * A concurrent page deletion by VACUUM of the target page can result in
	 * the insertion of items on to this right sibling page that would
	 * previously have been inserted on our target page.  There might have
	 * been insertions that followed the target's downlink after it was made
	 * to point to right sibling instead of target by page deletion's first
	 * phase.  The inserters insert items that would belong on target page.
	 * This race is very tight, but it's possible.  This is our only problem.
	 *
	 * Non-problems:
	 *
	 * We are not hindered by a concurrent page split of the target; we'll
	 * never land on the second half of the page anyway.  A concurrent split
	 * of the right page will also not matter, because the first data item
	 * remains the same within the left half, which we'll reliably land on.
	 * If we had to skip over ignorable/deleted pages, it cannot matter
	 * because their key space has already been atomically merged with the
	 * first non-ignorable page we eventually find (doesn't matter whether the
	 * page we eventually find is a true sibling or a cousin of target, which
	 * we go into below).
	 *
	 * Solution:
	 *
	 * Caller knows that it should reverify that target is not ignorable
	 * (half-dead or deleted) when cross-page sibling item comparison appears
	 * to indicate corruption (invariant fails).  This detects the single race
	 * condition that exists for caller.  This is correct because the
	 * continued existence of target block as non-ignorable (not half-dead or
	 * deleted) implies that target page was not merged into from the right by
	 * deletion; the key space at or after target never moved left.  Target's
	 * parent either has the same downlink to target as before, or a <
	 * downlink due to deletion at the left of target.  Target either has the
	 * same highkey as before, or a highkey < before when there is a page
	 * split.  (The rightmost concurrently-split-from-target-page page will
	 * still have the same highkey as target was originally found to have,
	 * which for our purposes is equivalent to target's highkey itself never
	 * changing, since we reliably skip over
	 * concurrently-split-from-target-page pages.)
	 *
	 * In simpler terms, we allow that the key space of the target may expand
	 * left (the key space can move left on the left side of target only), but
	 * the target key space cannot expand right and get ahead of us without
	 * our detecting it.  The key space of the target cannot shrink, unless it
	 * shrinks to zero due to the deletion of the original page, our canary
	 * condition.  (To be very precise, we're a bit stricter than that because
	 * it might just have been that the target page split and only the
	 * original target page was deleted.  We can be more strict, just not more
	 * lax.)
	 *
	 * Top level tree walk caller moves on to next page (makes it the new
	 * target) following recovery from this race.  (cf. the rationale for
	 * child/downlink verification needing a ShareLock within
	 * bt_child_check(), where page deletion is also the main source of
	 * trouble.)
	 *
	 * Note that it doesn't matter if right sibling page here is actually a
	 * cousin page, because in order for the key space to be readjusted in a
	 * way that causes us issues in next level up (guiding problematic
	 * concurrent insertions to the cousin from the grandparent rather than to
	 * the sibling from the parent), there'd have to be page deletion of
	 * target's parent page (affecting target's parent's downlink in target's
	 * grandparent page).  Internal page deletion only occurs when there are
	 * no child pages (they were all fully deleted), and caller is checking
	 * that the target's parent has at least one non-deleted (so
	 * non-ignorable) child: the target page.  (Note that the first phase of
	 * deletion atomically marks the page to be deleted half-dead/ignorable at
	 * the same time downlink in its parent is removed, so caller will
	 * definitely not fail to detect that this happened.)
	 *
	 * This trick is inspired by the method backward scans use for dealing
	 * with concurrent page splits; concurrent page deletion is a problem that
	 * similarly receives special consideration sometimes (it's possible that
	 * the backwards scan will re-read its "original" block after failing to
	 * find a right-link to it, having already moved in the opposite direction
	 * (right/"forwards") a few times to try to locate one).  Just like us,
	 * that happens only to determine if there was a concurrent page deletion
	 * of a reference page, and just like us if there was a page deletion of
	 * that reference page it means we can move on from caring about the
	 * reference page.  See the nbtree README for a full description of how
	 * that works.
	 */
	nline = PageGetMaxOffsetNumber(rightpage);

	/*
	 * Get first data item, if any
	 */
	if (P_ISLEAF(opaque) && nline >= P_FIRSTDATAKEY(opaque))
	{
		/* Return first data item (if any) */
		rightitem = PageGetItemIdCareful(state, targetnext, rightpage,
										 P_FIRSTDATAKEY(opaque));
	}
	else if (!P_ISLEAF(opaque) &&
			 nline >= OffsetNumberNext(P_FIRSTDATAKEY(opaque)))
	{
		/*
		 * Return first item after the internal page's "negative infinity"
		 * item
		 */
		rightitem = PageGetItemIdCareful(state, targetnext, rightpage,
										 OffsetNumberNext(P_FIRSTDATAKEY(opaque)));
	}
	else
	{
		/*
		 * No first item.  Page is probably empty leaf page, but it's also
		 * possible that it's an internal page with only a negative infinity
		 * item.
		 */
		ereport(DEBUG2,
				(errcode(ERRCODE_NO_DATA),
				 errmsg_internal("%s block %u of index \"%s\" has no first data item",
								 P_ISLEAF(opaque) ? "leaf" : "internal", targetnext,
								 RelationGetRelationName(state->rel))));
		return NULL;
	}

	/*
	 * Return first real item scankey.  Note that this relies on right page
	 * memory remaining allocated.
	 */
	firstitup = (IndexTuple) PageGetItem(rightpage, rightitem);
	return bt_mkscankey_pivotsearch(state->rel, firstitup);
}

/*
 * Check if two tuples are binary identical except the block number.  So,
 * this function is capable of comparing pivot keys on different levels.
 */
static bool
bt_pivot_tuple_identical(bool heapkeyspace, IndexTuple itup1, IndexTuple itup2)
{
	if (IndexTupleSize(itup1) != IndexTupleSize(itup2))
		return false;

	if (heapkeyspace)
	{
		/*
		 * Offset number will contain important information in heapkeyspace
		 * indexes: the number of attributes left in the pivot tuple following
		 * suffix truncation.  Don't skip over it (compare it too).
		 */
		if (memcmp(&itup1->t_tid.ip_posid, &itup2->t_tid.ip_posid,
				   IndexTupleSize(itup1) -
				   offsetof(ItemPointerData, ip_posid)) != 0)
			return false;
	}
	else
	{
		/*
		 * Cannot rely on offset number field having consistent value across
		 * levels on pg_upgrade'd !heapkeyspace indexes.  Compare contents of
		 * tuple starting from just after item pointer (i.e. after block
		 * number and offset number).
		 */
		if (memcmp(&itup1->t_info, &itup2->t_info,
				   IndexTupleSize(itup1) -
				   offsetof(IndexTupleData, t_info)) != 0)
			return false;
	}

	return true;
}
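
/*
 * For reference, the memcmp() starting points above follow from the on-disk
 * layout of IndexTupleData (see access/itup.h and storage/itemptr.h):
 *
 *		IndexTupleData
 *			t_tid : ItemPointerData
 *				ip_blkid : BlockIdData		(block number; always skipped)
 *				ip_posid : OffsetNumber		(compared only if heapkeyspace)
 *			t_info : unsigned short			(start of !heapkeyspace compare)
 *			... key data follows ...
 */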

/*---
 * Check high keys on the child level.  Traverse rightlinks from previous
 * downlink to the current one.  Check that there are no intermediate pages
 * with missing downlinks.
 *
 * If 'loaded_child' is given, it's assumed to be the page pointed to by the
 * downlink referenced by 'downlinkoffnum' of the target page.
 *
 * Basically this function is called for each target downlink and checks two
 * invariants:
 *
 * 1) You can reach the next child from the previous one via rightlinks;
 * 2) Each child high key has a matching pivot key on target level.
 *
 * Consider the sample tree picture.
 *
 *               1
 *           /       \
 *        2     <->     3
 *      /   \        /     \
 *    4  <>  5  <> 6 <> 7 <> 8
 *
 * This function will be called for blocks 4, 5, 6 and 8.  Consider what is
 * happening for each function call.
 *
 * - The function call for block 4 initializes data structure and matches high
 *   key of block 4 to downlink's pivot key of block 2.
 * - The high key of block 5 is matched to the high key of block 2.
 * - The block 6 has an incomplete split flag set, so its high key isn't
 *   matched to anything.
 * - The function call for block 8 checks that block 8 can be found while
 *   following rightlinks from block 6.  The high key of block 7 will be
 *   matched to downlink's pivot key in block 3.
 *
 * There is also a final call of this function, which checks that there are no
 * missing downlinks for children to the right of the child referenced by the
 * rightmost downlink in target level.
 */
static void
bt_child_highkey_check(BtreeCheckState *state,
					   OffsetNumber target_downlinkoffnum,
					   Page loaded_child,
					   uint32 target_level)
{
	BlockNumber blkno = state->prevrightlink;
	Page		page;
	BTPageOpaque opaque;
	bool		rightsplit = state->previncompletesplit;
	bool		first = true;
	ItemId		itemid;
	IndexTuple	itup;
	BlockNumber downlink;

	if (OffsetNumberIsValid(target_downlinkoffnum))
	{
		itemid = PageGetItemIdCareful(state, state->targetblock,
									  state->target, target_downlinkoffnum);
		itup = (IndexTuple) PageGetItem(state->target, itemid);
		downlink = BTreeTupleGetDownLink(itup);
	}
	else
	{
		downlink = P_NONE;
	}

	/*
	 * If no previous rightlink is memorized for current level just below
	 * target page's level, we are about to start from the leftmost page.  We
	 * can't follow rightlinks from the previous page, because there is no
	 * previous page.  But we still can match the high key.
	 *
	 * So we initialize variables for the loop below as if there were a
	 * previous page referencing the current child.  We also assume that the
	 * previous page does not have the incomplete split flag set, which would
	 * make us require a downlink for the current child.  That's correct,
	 * because the leftmost page on the level should always have a parent
	 * downlink.
	 */
	if (!BlockNumberIsValid(blkno))
	{
		blkno = downlink;
		rightsplit = false;
	}

	/* Move to the right on the child level */
	while (true)
	{
		/*
		 * Did we traverse the whole tree level and this is the check for
		 * pages to the right of the rightmost downlink?
		 */
		if (blkno == P_NONE && downlink == P_NONE)
		{
			state->prevrightlink = InvalidBlockNumber;
			state->previncompletesplit = false;
			return;
		}

		/* Did we traverse the whole tree level without finding next downlink? */
		if (blkno == P_NONE)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("can't traverse from downlink %u to downlink %u of index \"%s\"",
							state->prevrightlink, downlink,
							RelationGetRelationName(state->rel))));

		/* Load page contents */
		if (blkno == downlink && loaded_child)
			page = loaded_child;
		else
			page = palloc_btree_page(state, blkno);

		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/* The first page we visit at the level should be leftmost */
		if (first && !BlockNumberIsValid(state->prevrightlink) && !P_LEFTMOST(opaque))
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
										state->targetblock, blkno,
										LSN_FORMAT_ARGS(state->targetlsn))));

		/* Do level sanity check */
		if ((!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) &&
			opaque->btpo_level != target_level - 1)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("block found while following rightlinks from child of index \"%s\" has invalid level",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
										blkno, target_level - 1, opaque->btpo_level)));

		/* Try to detect circular links */
		if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("circular link chain found in block %u of index \"%s\"",
							blkno, RelationGetRelationName(state->rel))));

		if (blkno != downlink && !P_IGNORE(opaque))
		{
			/* blkno probably has missing parent downlink */
			bt_downlink_missing_check(state, rightsplit, blkno, page);
		}

		rightsplit = P_INCOMPLETE_SPLIT(opaque);

		/*
		 * If we visit a page with a high key, check that it is equal to the
		 * target key next to the corresponding downlink.
		 */
		if (!rightsplit && !P_RIGHTMOST(opaque))
		{
			BTPageOpaque topaque;
			IndexTuple	highkey;
			OffsetNumber pivotkey_offset;

			/* Get high key */
			itemid = PageGetItemIdCareful(state, blkno, page, P_HIKEY);
			highkey = (IndexTuple) PageGetItem(page, itemid);

			/*
			 * There are two situations we may encounter when examining the
			 * high key.  If the current child page is referenced by the given
			 * target downlink, we should look to the next offset number for
			 * the matching key from the target page.
			 *
			 * Alternatively, we're following rightlinks somewhere in the
			 * middle between the page referenced by the previous target
			 * downlink and the page referenced by the current target
			 * downlink.  If the current child page doesn't have the
			 * incomplete split flag set, then its high key should match the
			 * target's key at the current offset number.  This happens when a
			 * previous call here (to bt_child_highkey_check()) found an
			 * incomplete split, and we reach a right sibling page without a
			 * downlink -- the right sibling page's high key still needs to be
			 * matched to a separator key on the parent/target level.
			 *
			 * Don't apply OffsetNumberNext() to target_downlinkoffnum when we
			 * already had to step right on the child level.  Our traversal of
			 * the child level must try to move in perfect lockstep behind (to
			 * the left of) the target/parent level traversal.
			 */
			if (blkno == downlink)
				pivotkey_offset = OffsetNumberNext(target_downlinkoffnum);
			else
				pivotkey_offset = target_downlinkoffnum;

			topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);

			if (!offset_is_negative_infinity(topaque, pivotkey_offset))
			{
				/*
				 * If we're looking for the next pivot tuple in the target
				 * page, but there are no more pivot tuples, then we should
				 * match the high key instead.
				 */
				if (pivotkey_offset > PageGetMaxOffsetNumber(state->target))
				{
					if (P_RIGHTMOST(topaque))
						ereport(ERROR,
								(errcode(ERRCODE_INDEX_CORRUPTED),
								 errmsg("child high key is greater than rightmost pivot key on target level in index \"%s\"",
										RelationGetRelationName(state->rel)),
								 errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
													state->targetblock, blkno,
													LSN_FORMAT_ARGS(state->targetlsn))));
					pivotkey_offset = P_HIKEY;
				}
				itemid = PageGetItemIdCareful(state, state->targetblock,
											  state->target, pivotkey_offset);
				itup = (IndexTuple) PageGetItem(state->target, itemid);
			}
			else
			{
				/*
				 * We cannot try to match child's high key to a negative
				 * infinity key in target, since there is nothing to compare.
				 * However, it's still possible to match child's high key
				 * outside of target page.  The reason why we're here is that
				 * bt_child_highkey_check() was previously called for the
				 * cousin page of 'loaded_child', which had an incomplete
				 * split.  So, now we traverse to the right of that cousin
				 * page, and the current child level page under consideration
				 * still belongs to the subtree of target's left sibling.
				 * Thus, we need to match the child's high key to its left
				 * uncle page's high key.  Thankfully we saved it; it's called
				 * the "low key" of the target page.
				 */
				if (!state->lowkey)
					ereport(ERROR,
							(errcode(ERRCODE_INDEX_CORRUPTED),
							 errmsg("can't find left sibling high key in index \"%s\"",
									RelationGetRelationName(state->rel)),
							 errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
												state->targetblock, blkno,
												LSN_FORMAT_ARGS(state->targetlsn))));
				itup = state->lowkey;
			}

			if (!bt_pivot_tuple_identical(state->heapkeyspace, highkey, itup))
			{
				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg("mismatch between parent key and child high key in index \"%s\"",
								RelationGetRelationName(state->rel)),
						 errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
											state->targetblock, blkno,
											LSN_FORMAT_ARGS(state->targetlsn))));
			}
		}

		/* Exit if we already found next downlink */
		if (blkno == downlink)
		{
			state->prevrightlink = opaque->btpo_next;
			state->previncompletesplit = rightsplit;
			return;
		}

		/* Traverse to the next page using rightlink */
		blkno = opaque->btpo_next;

		/* Free page contents if it's allocated by us */
		if (page != loaded_child)
			pfree(page);
		first = false;
	}
}

/*
 * Checks one of target's downlink against its child page.
 *
 * Conceptually, the target page continues to be what is checked here.  The
 * target block is still blamed in the event of finding an invariant violation.
 * The downlink insertion into the target is probably where any problem raised
 * here arises, and there is no such thing as a parent link, so doing the
 * verification this way around is much more practical.
 *
 * This function visits the child page, and is called sequentially for each
 * downlink of the target page.  Given that, we also check downlink
 * connectivity here in order to save child page visits.
 */
static void
bt_child_check(BtreeCheckState *state, BTScanInsert targetkey,
			   OffsetNumber downlinkoffnum)
{
	ItemId		itemid;
	IndexTuple	itup;
	BlockNumber childblock;
	OffsetNumber offset;
	OffsetNumber maxoffset;
	Page		child;
	BTPageOpaque copaque;
	BTPageOpaque topaque;

	itemid = PageGetItemIdCareful(state, state->targetblock,
								  state->target, downlinkoffnum);
	itup = (IndexTuple) PageGetItem(state->target, itemid);
	childblock = BTreeTupleGetDownLink(itup);

	/*
	 * Caller must have ShareLock on target relation, because of
	 * considerations around page deletion by VACUUM.
	 *
	 * NB: In general, page deletion deletes the right sibling's downlink, not
	 * the downlink of the page being deleted; the deleted page's downlink is
	 * reused for its sibling.  The key space is thereby consolidated between
	 * the deleted page and its right sibling.  (We cannot delete a parent
	 * page's rightmost child unless it is the last child page, and we intend
	 * to also delete the parent itself.)
	 *
	 * If this verification happened without a ShareLock, the following race
	 * condition could cause false positives:
	 *
	 * In general, concurrent page deletion might occur, including deletion of
	 * the left sibling of the child page that is examined here.  If such a
	 * page deletion were to occur, closely followed by an insertion into the
	 * newly expanded key space of the child, a window for the false positive
	 * opens up: the stale parent/target downlink originally followed to get
	 * to the child legitimately ceases to be a lower bound on all items in
	 * the page, since the key space was concurrently expanded "left".
	 * (Insertion followed the "new" downlink for the child, not our now-stale
	 * downlink, which was concurrently physically removed in target/parent as
	 * part of deletion's first phase.)
	 *
	 * While we use various techniques elsewhere to perform cross-page
	 * verification for !readonly callers, a similar trick seems difficult
	 * here.  The tricks used by bt_recheck_sibling_links and by
	 * bt_right_page_check_scankey both involve verification of a same-level,
	 * cross-sibling invariant.  Cross-level invariants are far more squishy,
	 * though.  The nbtree REDO routines do not actually couple buffer locks
	 * across levels during page splits, so making any cross-level check work
	 * reliably in !readonly mode may be impossible.
	 */
	Assert(state->readonly);

	/*
	 * Verify child page has the downlink key from target page (its parent) as
	 * a lower bound; downlink must be strictly less than all keys on the
	 * page.
	 *
	 * Check all items, rather than checking just the first and trusting that
	 * the operator class obeys the transitive law.
	 */
	topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
	child = palloc_btree_page(state, childblock);
	copaque = (BTPageOpaque) PageGetSpecialPointer(child);
	maxoffset = PageGetMaxOffsetNumber(child);

	/*
	 * Since we've already loaded the child block, combine this check with
	 * check for downlink connectivity.
	 */
	bt_child_highkey_check(state, downlinkoffnum,
						   child, topaque->btpo_level);

	/*
	 * Since there cannot be a concurrent VACUUM operation in readonly mode,
	 * and since a page has no links within other pages (siblings and parent)
	 * once it is marked fully deleted, it should be impossible to land on a
	 * fully deleted page.
	 *
	 * It does not quite make sense to enforce that the page cannot even be
	 * half-dead, despite the fact the downlink is modified at the same stage
	 * that the child leaf page is marked half-dead.  Enforcing that would be
	 * incorrect because there may occasionally be multiple downlinks from a
	 * chain of pages undergoing deletion, where multiple successive calls are
	 * made to _bt_unlink_halfdead_page() by VACUUM before it can finally
	 * safely mark the leaf page as fully dead.  While
	 * _bt_mark_page_halfdead() usually removes the downlink to the leaf page
	 * that is marked half-dead, that's not guaranteed, so it's possible we'll
	 * land on a half-dead page with a downlink due to an interrupted
	 * multi-level page deletion.
	 *
	 * We go ahead with our checks if the child page is half-dead.  It's safe
	 * to do so because we do not test the child's high key, so it does not
	 * matter that the original high key will have been replaced by a dummy
	 * truncated high key within _bt_mark_page_halfdead().  All other page
	 * items are left intact on a half-dead page, so there is still something
	 * to test.
	 */
	if (P_ISDELETED(copaque))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("downlink to deleted page found in index \"%s\"",
						RelationGetRelationName(state->rel)),
				 errdetail_internal("Parent block=%u child block=%u parent page lsn=%X/%X.",
									state->targetblock, childblock,
									LSN_FORMAT_ARGS(state->targetlsn))));

	for (offset = P_FIRSTDATAKEY(copaque);
		 offset <= maxoffset;
		 offset = OffsetNumberNext(offset))
	{
		/*
		 * Skip comparison of target page key against "negative infinity"
		 * item, if any.  Checking it would indicate that it's not a strict
		 * lower bound, but that's only because of the hard-coding for
		 * negative infinity items within _bt_compare().
		 *
		 * If nbtree didn't truncate negative infinity tuples during internal
		 * page splits then we'd expect child's negative infinity key to be
		 * equal to the scankey/downlink from target/parent (it would be a
		 * "low key" in this hypothetical scenario, and so it would still need
		 * to be treated as a special case here).
		 *
		 * Negative infinity items can be thought of as a strict lower bound
		 * that works transitively, with the last non-negative-infinity pivot
		 * followed during a descent from the root as its "true" strict lower
		 * bound.  Only a small number of negative infinity items are truly
		 * negative infinity; those that are the first items of leftmost
		 * internal pages.  In more general terms, a negative infinity item is
		 * only negative infinity with respect to the subtree that the page is
		 * at the root of.
		 *
		 * See also: bt_rootdescend(), which can even detect transitive
		 * inconsistencies on cousin leaf pages.
		 */
		if (offset_is_negative_infinity(copaque, offset))
			continue;

		if (!invariant_l_nontarget_offset(state, targetkey, childblock, child,
										  offset))
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("down-link lower bound invariant violated for index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Parent block=%u child index tid=(%u,%u) parent page lsn=%X/%X.",
										state->targetblock, childblock, offset,
										LSN_FORMAT_ARGS(state->targetlsn))));
	}

	pfree(child);
}

/*
 * Checks if page is missing a downlink that it should have.
 *
 * A page that lacks a downlink/parent may indicate corruption.  However, we
 * must account for the fact that a missing downlink can occasionally be
 * encountered in a non-corrupt index.  This can be due to an interrupted page
 * split, or an interrupted multi-level page deletion (i.e. there was a hard
 * crash or an error during a page split, or while VACUUM was deleting a
 * multi-level chain of pages).
 *
 * Note that this can only be called in readonly mode, so there is no need to
 * be concerned about concurrent page splits or page deletions.
 */
static void
bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
						  BlockNumber blkno, Page page)
{
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	ItemId		itemid;
	IndexTuple	itup;
	Page		child;
	BTPageOpaque copaque;
	uint32		level;
	BlockNumber childblk;
	XLogRecPtr	pagelsn;

	Assert(state->readonly);
	Assert(!P_IGNORE(opaque));

	/* No next level up with downlinks to fingerprint from the true root */
	if (P_ISROOT(opaque))
		return;

	pagelsn = PageGetLSN(page);

	/*
	 * Incomplete (interrupted) page splits can account for the lack of a
	 * downlink.  Some inserting transaction should eventually complete the
	 * page split in passing, when it notices that the left sibling page is
	 * P_INCOMPLETE_SPLIT().
	 *
	 * In general, VACUUM is not prepared for there to be no downlink to a
	 * page that it deletes.  This is the main reason why the lack of a
	 * downlink can be reported as corruption here.  It's not obvious that an
	 * invalid missing downlink can result in wrong answers to queries,
	 * though, since index scans that land on the child may end up
	 * consistently moving right. The handling of concurrent page splits (and
	 * page deletions) within _bt_moveright() cannot distinguish
	 * inconsistencies that last for a moment from inconsistencies that are
	 * permanent and irrecoverable.
	 *
	 * VACUUM isn't even prepared to delete pages that have no downlink due to
	 * an incomplete page split, but it can detect and reason about that case
	 * by design, so it shouldn't be taken to indicate corruption.  See
	 * _bt_pagedel() for full details.
	 */
	if (rightsplit)
	{
		ereport(DEBUG1,
				(errcode(ERRCODE_NO_DATA),
				 errmsg_internal("harmless interrupted page split detected in index \"%s\"",
								 RelationGetRelationName(state->rel)),
				 errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
									blkno, opaque->btpo_level,
									opaque->btpo_prev,
									LSN_FORMAT_ARGS(pagelsn))));
		return;
	}

	/*
	 * Page under check is probably the "top parent" of a multi-level page
	 * deletion.  We'll need to descend the subtree to make sure that
	 * descendant pages are consistent with that, though.
	 *
	 * If the page (which must be non-ignorable) is a leaf page, then clearly
	 * it can't be the top parent.  The lack of a downlink is probably a
	 * symptom of a broad problem that could just as easily cause
	 * inconsistencies anywhere else.
	 */
	if (P_ISLEAF(opaque))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("leaf index block lacks downlink in index \"%s\"",
						RelationGetRelationName(state->rel)),
				 errdetail_internal("Block=%u page lsn=%X/%X.",
									blkno,
									LSN_FORMAT_ARGS(pagelsn))));

	/* Descend from the given page, which is an internal page */
	elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
		 RelationGetRelationName(state->rel));

	level = opaque->btpo_level;
	itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque));
	itup = (IndexTuple) PageGetItem(page, itemid);
	childblk = BTreeTupleGetDownLink(itup);
	for (;;)
	{
		CHECK_FOR_INTERRUPTS();

		child = palloc_btree_page(state, childblk);
		copaque = (BTPageOpaque) PageGetSpecialPointer(child);

		if (P_ISLEAF(copaque))
			break;

		/* Do an extra sanity check in passing on internal pages */
		if (copaque->btpo_level != level - 1)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
									 RelationGetRelationName(state->rel)),
					 errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
										blkno, childblk,
										level - 1, copaque->btpo_level)));

		level = copaque->btpo_level;
		itemid = PageGetItemIdCareful(state, childblk, child,
									  P_FIRSTDATAKEY(copaque));
		itup = (IndexTuple) PageGetItem(child, itemid);
		childblk = BTreeTupleGetDownLink(itup);
		/* Be slightly more pro-active in freeing this memory, just in case */
		pfree(child);
	}

	/*
	 * Since there cannot be a concurrent VACUUM operation in readonly mode,
	 * and since a page has no links within other pages (siblings and parent)
	 * once it is marked fully deleted, it should be impossible to land on a
	 * fully deleted page.  See bt_child_check() for further details.
	 *
	 * The bt_child_check() P_ISDELETED() check is repeated here because
	 * bt_child_check() does not visit pages reachable through negative
	 * infinity items.  Besides, bt_child_check() is unwilling to descend
	 * multiple levels.  (The similar bt_child_check() P_ISDELETED() check
	 * within bt_check_level_from_leftmost() won't reach the page either,
	 * since the leaf's live siblings should have their sibling links updated
	 * to bypass the deletion target page when it is marked fully dead.)
	 *
	 * If this error is raised, it might be due to a previous multi-level page
	 * deletion that failed to realize that it wasn't yet safe to mark the
	 * leaf page as fully dead.  A "dangling downlink" will still remain when
	 * this happens.  The fact that the dangling downlink's page (the leaf's
	 * parent/ancestor page) lacked a downlink is incidental.
	 */
	if (P_ISDELETED(copaque))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg_internal("downlink to deleted leaf page found in index \"%s\"",
								 RelationGetRelationName(state->rel)),
				 errdetail_internal("Top parent/target block=%u leaf block=%u top parent/under check lsn=%X/%X.",
									blkno, childblk,
									LSN_FORMAT_ARGS(pagelsn))));

	/*
	 * Iff leaf page is half-dead, its high key top parent link should point
	 * to what VACUUM considered to be the top parent page at the instant it
	 * was interrupted.  Provided the high key link actually points to the
	 * page under check, the missing downlink we detected is consistent with
	 * there having been an interrupted multi-level page deletion.  This means
	 * that the subtree with the page under check at its root (a page deletion
	 * chain) is in a consistent state, enabling VACUUM to resume deleting the
	 * entire chain the next time it encounters the half-dead leaf page.
	 */
	if (P_ISHALFDEAD(copaque) && !P_RIGHTMOST(copaque))
	{
		itemid = PageGetItemIdCareful(state, childblk, child, P_HIKEY);
		itup = (IndexTuple) PageGetItem(child, itemid);
		if (BTreeTupleGetTopParent(itup) == blkno)
			return;
	}

	ereport(ERROR,
			(errcode(ERRCODE_INDEX_CORRUPTED),
			 errmsg("internal index block lacks downlink in index \"%s\"",
					RelationGetRelationName(state->rel)),
			 errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
								blkno, opaque->btpo_level,
								LSN_FORMAT_ARGS(pagelsn))));
}

/*
 * Per-tuple callback from table_index_build_scan, used to determine if index has
 * all the entries that definitely should have been observed in leaf pages of
 * the target index (that is, all IndexTuples that were fingerprinted by our
 * Bloom filter).  All heapallindexed checks occur here.
 *
 * The redundancy between an index and the table it indexes provides a good
 * opportunity to detect corruption, especially corruption within the table.
 * The high level principle behind the verification performed here is that any
 * IndexTuple that should be in an index following a fresh CREATE INDEX (based
 * on the same index definition) should also have been in the original,
 * existing index, which should have used exactly the same representation.
 *
 * Since the overall structure of the index has already been verified, the most
 * likely explanation for error here is a corrupt heap page (could be logical
 * or physical corruption).  Index corruption may still be detected here,
 * though.  Only readonly callers will have verified that left links and right
 * links are in agreement, and so it's possible that a leaf page transposition
 * within index is actually the source of corruption detected here (for
 * !readonly callers).  The checks performed only for readonly callers might
 * more accurately frame the problem as a cross-page invariant issue (this
 * could even be due to recovery not replaying all WAL records).  The !readonly
 * ERROR message raised here includes a HINT about retrying with readonly
 * verification, just in case it's a cross-page invariant issue, though that
 * isn't particularly likely.
 *
 * table_index_build_scan() expects to be able to find the root tuple when a
 * heap-only tuple (the live tuple at the end of some HOT chain) needs to be
 * indexed, in order to replace the actual tuple's TID with the root tuple's
 * TID (which is what we're actually passed back here).  The index build heap
 * scan code will raise an error when a tuple that claims to be the root of the
 * heap-only tuple's HOT chain cannot be located.  This catches cases where the
 * original root item offset/root tuple for a HOT chain indicates (for whatever
 * reason) that the entire HOT chain is dead, despite the fact that the latest
 * heap-only tuple should be indexed.  When this happens, sequential scans may
 * always give correct answers, and all indexes may be considered structurally
 * consistent (i.e. the nbtree structural checks would not detect corruption).
 * It may be the case that only index scans give wrong answers, and yet heap or
 * SLRU corruption is the real culprit.  (While it's true that LP_DEAD bit
 * setting will probably also leave the index in a corrupt state before too
 * long, the problem is nonetheless that there is heap corruption.)
 *
 * Heap-only tuple handling within table_index_build_scan() works in a way that
 * helps us to detect index tuples that contain the wrong values (values that
 * don't match the latest tuple in the HOT chain).  This can happen when there
 * is no superseding index tuple due to a faulty assessment of HOT safety,
 * perhaps during the original CREATE INDEX.  Because the latest tuple's
 * contents are used with the root TID, an error will be raised when a tuple
 * with the same TID but non-matching attribute values is passed back to us.
 * Faulty assessment of HOT-safety was behind at least two distinct CREATE
 * INDEX CONCURRENTLY bugs that made it into stable releases, one of which was
 * undetected for many years.  In short, the same principle that allows a
 * REINDEX to repair corruption when there was an (undetected) broken HOT chain
 * also allows us to detect the corruption in many cases.
 */
static void
bt_tuple_present_callback(Relation index, ItemPointer tid, Datum *values,
						  bool *isnull, bool tupleIsAlive, void *checkstate)
{
	BtreeCheckState *state = (BtreeCheckState *) checkstate;
	IndexTuple	itup,
				norm;

	Assert(state->heapallindexed);

	/* Generate a normalized index tuple for fingerprinting */
	itup = index_form_tuple(RelationGetDescr(index), values, isnull);
	itup->t_tid = *tid;
	norm = bt_normalize_tuple(state, itup);

	/* Probe Bloom filter -- tuple should be present */
	if (bloom_lacks_element(state->filter, (unsigned char *) norm,
							IndexTupleSize(norm)))
		ereport(ERROR,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"",
						ItemPointerGetBlockNumber(&(itup->t_tid)),
						ItemPointerGetOffsetNumber(&(itup->t_tid)),
						RelationGetRelationName(state->heaprel),
						RelationGetRelationName(state->rel)),
				 !state->readonly
				 ? errhint("Retrying verification using the function bt_index_parent_check() might provide a more specific error.")
				 : 0));

	state->heaptuplespresent++;
	pfree(itup);
	/* Cannot leak memory here */
	if (norm != itup)
		pfree(norm);
}
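
/*
 * For context, a sketch of the fingerprinting side that the Bloom filter
 * probe above pairs with (performed earlier, while the index structure
 * itself is walked; the exact call site is elsewhere in this file):
 *
 *		norm = bt_normalize_tuple(state, itup);
 *		bloom_add_element(state->filter, (unsigned char *) norm,
 *						  IndexTupleSize(norm));
 *
 * Bloom filters have no false negatives, so a bloom_lacks_element() hit for
 * a normalized heap-derived tuple proves that no equivalent index tuple was
 * ever fingerprinted.
 */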

/*
 * Normalize an index tuple for fingerprinting.
 *
 * In general, index tuple formation is assumed to be deterministic by
 * heapallindexed verification, and IndexTuples are assumed immutable.  While
 * the LP_DEAD bit is mutable in leaf pages, that's ItemId metadata, which is
 * not fingerprinted.  Normalization is required to compensate for corner
 * cases where the determinism assumption doesn't quite work.
 *
 * There is currently one such case: index_form_tuple() does not try to hide
 * the source TOAST state of input datums.  The executor applies TOAST
 * compression for heap tuples based on different criteria from the
 * compression applied within btinsert()'s call to index_form_tuple(): it
 * sometimes compresses more aggressively, resulting in compressed heap tuple
 * datums but uncompressed corresponding index tuple datums.  A subsequent
 * heapallindexed verification will get a logically equivalent though bitwise
 * unequal tuple from index_form_tuple().  False positive heapallindexed
 * corruption reports could occur without normalizing away the inconsistency.
 *
 * Returned tuple is often caller's own original tuple.  Otherwise, it is a
 * new representation of caller's original index tuple, palloc()'d in caller's
 * memory context.
 *
 * Note: This routine is not concerned with distinctions about the
 * representation of tuples beyond those that might break heapallindexed
 * verification.  In particular, it won't try to normalize opclass-equal
 * datums with potentially distinct representations (e.g., btree/numeric_ops
 * index datums will not get their display scale normalized-away here).
 * Caller does normalization for non-pivot tuples that have a posting list,
 * since dummy CREATE INDEX callback code generates new tuples with the same
 * normalized representation.
 */
static IndexTuple
bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
{
	TupleDesc	tupleDescriptor = RelationGetDescr(state->rel);
	Datum		normalized[INDEX_MAX_KEYS];
	bool		isnull[INDEX_MAX_KEYS];
	bool		toast_free[INDEX_MAX_KEYS];
	bool		formnewtup = false;
	IndexTuple	reformed;
	int			i;

	/* Caller should only pass "logical" non-pivot tuples here */
	Assert(!BTreeTupleIsPosting(itup) && !BTreeTupleIsPivot(itup));

	/* Easy case: It's immediately clear that tuple has no varlena datums */
	if (!IndexTupleHasVarwidths(itup))
		return itup;

	for (i = 0; i < tupleDescriptor->natts; i++)
	{
		Form_pg_attribute att;

		att = TupleDescAttr(tupleDescriptor, i);

		/* Assume untoasted/already normalized datum initially */
		toast_free[i] = false;
		normalized[i] = index_getattr(itup, att->attnum,
									  tupleDescriptor,
									  &isnull[i]);
		if (att->attbyval || att->attlen != -1 || isnull[i])
			continue;

		/*
		 * Callers always pass a tuple that could safely be inserted into the
		 * index without further processing, so an external varlena header
		 * should never be encountered here
		 */
		if (VARATT_IS_EXTERNAL(DatumGetPointer(normalized[i])))
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("external varlena datum in tuple that references heap row (%u,%u) in index \"%s\"",
							ItemPointerGetBlockNumber(&(itup->t_tid)),
							ItemPointerGetOffsetNumber(&(itup->t_tid)),
							RelationGetRelationName(state->rel))));
		else if (VARATT_IS_COMPRESSED(DatumGetPointer(normalized[i])))
		{
			formnewtup = true;
			normalized[i] = PointerGetDatum(PG_DETOAST_DATUM(normalized[i]));
			toast_free[i] = true;
		}
	}

	/* Easier case: Tuple has varlena datums, none of which are compressed */
	if (!formnewtup)
		return itup;

	/*
	 * Hard case: Tuple had compressed varlena datums that necessitate
	 * creating a normalized version of the tuple from uncompressed input
	 * datums (normalized input datums).  This is rather naive, but shouldn't
	 * be necessary too often.
	 *
	 * Note that we rely on deterministic index_form_tuple() TOAST compression
	 * of normalized input.
	 */
	reformed = index_form_tuple(tupleDescriptor, normalized, isnull);
	reformed->t_tid = itup->t_tid;

	/* Cannot leak memory here */
	for (i = 0; i < tupleDescriptor->natts; i++)
		if (toast_free[i])
			pfree(DatumGetPointer(normalized[i]));

	return reformed;
}

/*
 * Produce palloc()'d "plain" tuple for nth posting list entry/TID.
 *
 * In general, deduplication is not supposed to change the logical contents of
 * an index.  Multiple index tuples are merged together into one equivalent
 * posting list index tuple when convenient.
 *
 * heapallindexed verification must normalize-away this variation in
 * representation by converting posting list tuples into two or more "plain"
 * tuples.  Each tuple must be fingerprinted separately -- there must be one
 * tuple for each corresponding Bloom filter probe during the heap scan.
 *
 * Note: Caller still needs to call bt_normalize_tuple() with returned tuple.
 */
static inline IndexTuple
bt_posting_plain_tuple(IndexTuple itup, int n)
{
	Assert(BTreeTupleIsPosting(itup));

	/* Returns non-posting-list tuple */
	return _bt_form_posting(itup, BTreeTupleGetPostingN(itup, n), 1);
}
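
/*
 * Sketch of the expected caller pattern (assuming a posting list tuple being
 * fingerprinted for heapallindexed verification): each TID becomes its own
 * plain tuple, which is then normalized and fingerprinted separately:
 *
 *     for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
 *     {
 *         IndexTuple  logtuple = bt_posting_plain_tuple(itup, i);
 *         IndexTuple  norm = bt_normalize_tuple(state, logtuple);
 *
 *         bloom_add_element(state->filter, (unsigned char *) norm,
 *                           IndexTupleSize(norm));
 *         if (norm != logtuple)
 *             pfree(norm);
 *         pfree(logtuple);
 *     }
 */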

/*
 * Search for itup in index, starting from fast root page.  itup must be a
 * non-pivot tuple.  This is only supported with heapkeyspace indexes, since
 * we rely on having fully unique keys to find a match with only a single
 * visit to a leaf page, barring an interrupted page split, where we may have
 * to move right.  (A concurrent page split is impossible because the caller
 * must be a readonly caller, holding a ShareLock.)
 *
 * This routine can detect very subtle transitive consistency issues across
 * more than one level of the tree.  Leaf pages all have a high key (even the
 * rightmost page has a conceptual positive infinity high key), but not a low
 * key.  Their downlink in parent is a lower bound, which along with the high
 * key is almost enough to detect every possible inconsistency.  A downlink
 * separator key value won't always be available from parent, though, because
 * the first items of internal pages are negative infinity items, truncated
 * down to zero attributes during internal page splits.  While it's true that
 * bt_child_check() and the high key check can detect most imaginable key
 * space problems, there are remaining problems they won't detect with
 * non-pivot tuples in cousin leaf pages.  Starting a search from the root
 * for every existing leaf tuple detects small inconsistencies in upper
 * levels of the tree that cannot be detected any other way.  (Besides all
 * this, this is probably also useful as a direct test of the code used by
 * index scans themselves.)
 */
static bool
bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
{
	BTScanInsert key;
	BTStack		stack;
	Buffer		lbuf;
	bool		exists;

	key = _bt_mkscankey(state->rel, itup);
	Assert(key->heapkeyspace && key->scantid != NULL);

	/*
	 * Search from root.
	 *
	 * Ideally, we would arrange to only move right within _bt_search() when
	 * an interrupted page split is detected (i.e. when the incomplete split
	 * bit is found to be set), but for now we accept the possibility that
	 * that could conceal an inconsistency.
	 */
	Assert(state->readonly && state->rootdescend);
	exists = false;
	stack = _bt_search(state->rel, key, &lbuf, BT_READ, NULL);

	if (BufferIsValid(lbuf))
	{
		BTInsertStateData insertstate;
		OffsetNumber offnum;
		Page		page;

		insertstate.itup = itup;
		insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
		insertstate.itup_key = key;
		insertstate.postingoff = 0;
		insertstate.bounds_valid = false;
		insertstate.buf = lbuf;

		/* Get matching tuple on leaf page */
		offnum = _bt_binsrch_insert(state->rel, &insertstate);
		/* Compare first >= matching item on leaf page, if any */
		page = BufferGetPage(lbuf);
		/* Should match on first heap TID when tuple has a posting list */
		if (offnum <= PageGetMaxOffsetNumber(page) &&
			insertstate.postingoff <= 0 &&
			_bt_compare(state->rel, key, page, offnum) == 0)
			exists = true;
		_bt_relbuf(state->rel, lbuf);
	}

	_bt_freestack(stack);
	pfree(key);

	return exists;
}
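
/*
 * Note (informal): because _bt_mkscankey() sets key->scantid to itup's own
 * heap TID, the insertion scankey describes a fully unique position in a
 * heapkeyspace index's key space, which is why a single descent suffices.
 * A sketch of the expected caller pattern:
 *
 *     if (state->rootdescend && !bt_rootdescend(state, itup))
 *         ...report corruption: tuple cannot be re-found from the root...
 */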

/*
 * Is particular offset within page (whose special state is passed by caller)
 * the page negative-infinity item?
 *
 * As noted in comments above _bt_compare(), there is special handling of the
 * first data item as a "negative infinity" item.  The hard-coding within
 * _bt_compare() makes comparing this item for the purposes of verification
 * pointless at best, since the IndexTuple only contains a valid TID (a
 * downlink to its child page).
 */
static inline bool
offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset)
{
	/*
	 * For internal pages only, the first item after the high key, if any, is
	 * the negative infinity item.  Internal pages always have a negative
	 * infinity item, whereas leaf pages never have one.  This implies that
	 * the negative infinity item is either the first or second line item, or
	 * that there is none within the page.
	 *
	 * Negative infinity items are a special case among pivot tuples.  They
	 * always have zero attributes, while all other pivot tuples always have
	 * nkeyatts attributes.
	 *
	 * Right-most pages don't have a high key, but could be said to
	 * conceptually have a "positive infinity" high key.  Thus, there is a
	 * symmetry between downlink items in parent pages, and high keys in
	 * children.  Together, they represent the part of the key space that
	 * belongs to each page in the index.  For example, all children of the
	 * root page will have negative infinity as a lower bound from the root's
	 * negative infinity downlink, and positive infinity as an upper bound
	 * (implicitly, from the "imaginary" positive infinity high key in root).
	 */
	return !P_ISLEAF(opaque) && offset == P_FIRSTDATAKEY(opaque);
}
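
/*
 * To make the line item arithmetic above concrete, the usual page layouts
 * are:
 *
 *     non-rightmost internal page:  offset 1 = high key (P_HIKEY),
 *                                   offset 2 = negative infinity item
 *     rightmost internal page:      offset 1 = negative infinity item
 *     leaf pages:                   no negative infinity item at all
 *
 * P_FIRSTDATAKEY(opaque) evaluates to 2 when there is a high key, and to 1
 * on rightmost pages, which is what the test above relies on.
 */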

/*
 * Does the invariant hold that the key is strictly less than a given upper
 * bound offset item?
 *
 * Verifies line pointer on behalf of caller.
 *
 * If this function returns false, the convention is that the caller throws
 * an error due to corruption.
 */
static inline bool
invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
				   OffsetNumber upperbound)
{
	ItemId		itemid;
	int32		cmp;

	Assert(key->pivotsearch);

	/* Verify line pointer before checking tuple */
	itemid = PageGetItemIdCareful(state, state->targetblock, state->target,
								  upperbound);
	/* pg_upgrade'd indexes may legally have equal sibling tuples */
	if (!key->heapkeyspace)
		return invariant_leq_offset(state, key, upperbound);

	cmp = _bt_compare(state->rel, key, state->target, upperbound);

	/*
	 * _bt_compare() is capable of determining that a scankey with a
	 * filled-out attribute is greater than pivot tuples where the comparison
	 * is resolved at a truncated attribute (value of attribute in pivot is
	 * minus infinity).  However, it is not capable of determining that a
	 * scankey is _less than_ a tuple on the basis of a comparison resolved at
	 * a _scankey_ minus infinity attribute.  Complete an extra step to
	 * simulate having minus infinity values for omitted scankey attribute(s).
	 */
	if (cmp == 0)
	{
		BTPageOpaque topaque;
		IndexTuple	ritup;
		int			uppnkeyatts;
		ItemPointer rheaptid;
		bool		nonpivot;

		ritup = (IndexTuple) PageGetItem(state->target, itemid);
		topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
		nonpivot = P_ISLEAF(topaque) && upperbound >= P_FIRSTDATAKEY(topaque);

		/* Get number of keys + heap TID for item to the right */
		uppnkeyatts = BTreeTupleGetNKeyAtts(ritup, state->rel);
		rheaptid = BTreeTupleGetHeapTIDCareful(state, ritup, nonpivot);

		/* Heap TID is tiebreaker key attribute */
		if (key->keysz == uppnkeyatts)
			return key->scantid == NULL && rheaptid != NULL;

		return key->keysz < uppnkeyatts;
	}

	return cmp < 0;
}
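
/*
 * A concrete illustration of the cmp == 0 step above (hypothetical two key
 * attribute index): a scankey built from the truncated pivot ("foo") and a
 * non-pivot tuple ("foo", 42) at upperbound compare as equal, because
 * _bt_compare() runs out of scankey attributes first.  Logically the scankey
 * is strictly less, since its omitted attribute acts like minus infinity.
 * Comparing key->keysz against uppnkeyatts (and key->scantid against the
 * tuple's heap TID) recovers that distinction without any extra page access.
 */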

/*
 * Does the invariant hold that the key is less than or equal to a given upper
 * bound offset item?
 *
 * Caller should have verified that upperbound's line pointer is consistent
 * using a PageGetItemIdCareful() call.
 *
 * If this function returns false, the convention is that the caller throws
 * an error due to corruption.
 */
static inline bool
invariant_leq_offset(BtreeCheckState *state, BTScanInsert key,
					 OffsetNumber upperbound)
{
	int32		cmp;

	Assert(key->pivotsearch);

	cmp = _bt_compare(state->rel, key, state->target, upperbound);

	return cmp <= 0;
}

/*
 * Does the invariant hold that the key is strictly greater than a given lower
 * bound offset item?
 *
 * Caller should have verified that lowerbound's line pointer is consistent
 * using a PageGetItemIdCareful() call.
 *
 * If this function returns false, the convention is that the caller throws
 * an error due to corruption.
 */
static inline bool
invariant_g_offset(BtreeCheckState *state, BTScanInsert key,
				   OffsetNumber lowerbound)
{
	int32		cmp;

	Assert(key->pivotsearch);

	cmp = _bt_compare(state->rel, key, state->target, lowerbound);

	/* pg_upgrade'd indexes may legally have equal sibling tuples */
	if (!key->heapkeyspace)
		return cmp >= 0;

	/*
	 * No need to consider the possibility that scankey has attributes that we
	 * need to force to be interpreted as negative infinity.  _bt_compare() is
	 * able to determine that scankey is greater than negative infinity.  The
	 * distinction between "==" and "<" isn't interesting here, since
	 * corruption is indicated either way.
	 */
	return cmp > 0;
}

/*
 * Does the invariant hold that the key is strictly less than a given upper
 * bound offset item, with the offset relating to a caller-supplied page that
 * is not the current target page?
 *
 * Caller's non-target page is a child page of the target, checked as part of
 * checking a property of the target page (i.e. the key comes from the
 * target).  Verifies line pointer on behalf of caller.
 *
 * If this function returns false, the convention is that the caller throws
 * an error due to corruption.
 */
static inline bool
invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key,
							 BlockNumber nontargetblock, Page nontarget,
							 OffsetNumber upperbound)
{
	ItemId		itemid;
	int32		cmp;

	Assert(key->pivotsearch);

	/* Verify line pointer before checking tuple */
	itemid = PageGetItemIdCareful(state, nontargetblock, nontarget,
								  upperbound);
	cmp = _bt_compare(state->rel, key, nontarget, upperbound);

	/* pg_upgrade'd indexes may legally have equal sibling tuples */
	if (!key->heapkeyspace)
		return cmp <= 0;

	/* See invariant_l_offset() for an explanation of this extra step */
	if (cmp == 0)
	{
		IndexTuple	child;
		int			uppnkeyatts;
		ItemPointer childheaptid;
		BTPageOpaque copaque;
		bool		nonpivot;

		child = (IndexTuple) PageGetItem(nontarget, itemid);
		copaque = (BTPageOpaque) PageGetSpecialPointer(nontarget);
		nonpivot = P_ISLEAF(copaque) && upperbound >= P_FIRSTDATAKEY(copaque);

		/* Get number of keys + heap TID for child/non-target item */
		uppnkeyatts = BTreeTupleGetNKeyAtts(child, state->rel);
		childheaptid = BTreeTupleGetHeapTIDCareful(state, child, nonpivot);

		/* Heap TID is tiebreaker key attribute */
		if (key->keysz == uppnkeyatts)
			return key->scantid == NULL && childheaptid != NULL;

		return key->keysz < uppnkeyatts;
	}

	return cmp < 0;
}

/*
 * Given a block number of a B-Tree page, return page in palloc()'d memory.
 * While at it, perform some basic checks of the page.
 *
 * There is never an attempt to get a consistent view of multiple pages using
 * multiple concurrent buffer locks; in general, we only acquire a single pin
 * and buffer lock at a time, which is often all that the nbtree code requires.
 * (Actually, bt_recheck_sibling_links couples buffer locks, which is the only
 * exception to this general rule.)
 *
 * Operating on a copy of the page is useful because it prevents control
 * getting stuck in an uninterruptible state when an underlying operator class
 * misbehaves.
 */
static Page
palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
{
	Buffer		buffer;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber maxoffset;

	page = palloc(BLCKSZ);

	/*
	 * We copy the page into local storage to avoid holding pin on the buffer
	 * longer than we must.
	 */
	buffer = ReadBufferExtended(state->rel, MAIN_FORKNUM, blocknum, RBM_NORMAL,
								state->checkstrategy);
	LockBuffer(buffer, BT_READ);

	/*
	 * Perform the same basic sanity checking that nbtree itself performs for
	 * every page:
	 */
	_bt_checkpage(state->rel, buffer);

	/* Only use copy of page in palloc()'d memory */
	memcpy(page, BufferGetPage(buffer), BLCKSZ);
	UnlockReleaseBuffer(buffer);

	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	if (P_ISMETA(opaque) && blocknum != BTREE_METAPAGE)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("invalid meta page found at block %u in index \"%s\"",
						blocknum, RelationGetRelationName(state->rel))));

	/* Check page from block that ought to be meta page */
	if (blocknum == BTREE_METAPAGE)
	{
		BTMetaPageData *metad = BTPageGetMeta(page);

		if (!P_ISMETA(opaque) ||
			metad->btm_magic != BTREE_MAGIC)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" meta page is corrupt",
							RelationGetRelationName(state->rel))));

		if (metad->btm_version < BTREE_MIN_VERSION ||
			metad->btm_version > BTREE_VERSION)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("version mismatch in index \"%s\": file version %d, "
							"current version %d, minimum supported version %d",
							RelationGetRelationName(state->rel),
							metad->btm_version, BTREE_VERSION,
							BTREE_MIN_VERSION)));

		/* Finished with metapage checks */
		return page;
	}

	/*
	 * Deleted pages that still use the old 32-bit XID representation have no
	 * sane "level" field because they type pun the field, but all other pages
	 * (including pages deleted on Postgres 14+) have a valid value.
	 */
	if (!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque))
	{
		/* Okay, no reason not to trust btpo_level field from page */

		if (P_ISLEAF(opaque) && opaque->btpo_level != 0)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("invalid leaf page level %u for block %u in index \"%s\"",
									 opaque->btpo_level, blocknum,
									 RelationGetRelationName(state->rel))));

		if (!P_ISLEAF(opaque) && opaque->btpo_level == 0)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("invalid internal page level 0 for block %u in index \"%s\"",
									 blocknum,
									 RelationGetRelationName(state->rel))));
	}

	/*
	 * Sanity checks for number of items on page.
	 *
	 * As noted at the beginning of _bt_binsrch(), an internal page must have
	 * children, since there must always be a negative infinity downlink
	 * (there may also be a highkey).  In the case of non-rightmost leaf
	 * pages, there must be at least a highkey.  The exceptions are deleted
	 * pages, which contain no items.
	 *
	 * This is correct when pages are half-dead, since internal pages are
	 * never half-dead, and leaf pages must have a high key when half-dead
	 * (the rightmost page can never be deleted).  It's also correct with
	 * fully deleted pages: _bt_unlink_halfdead_page() doesn't change anything
	 * about the target page other than setting the page as fully dead, and
	 * setting its xact field.  In particular, it doesn't change the sibling
	 * links in the deletion target itself, since they're required when index
	 * scans land on the deletion target, and then need to move right (or need
	 * to move left, in the case of backward index scans).
	 */
	maxoffset = PageGetMaxOffsetNumber(page);
	if (maxoffset > MaxIndexTuplesPerPage)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("number of items on block %u of index \"%s\" exceeds MaxIndexTuplesPerPage (%u)",
						blocknum, RelationGetRelationName(state->rel),
						MaxIndexTuplesPerPage)));

	if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) && maxoffset < P_FIRSTDATAKEY(opaque))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("internal block %u in index \"%s\" lacks high key and/or at least one downlink",
						blocknum, RelationGetRelationName(state->rel))));

	if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && !P_RIGHTMOST(opaque) && maxoffset < P_HIKEY)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("non-rightmost leaf block %u in index \"%s\" lacks high key item",
						blocknum, RelationGetRelationName(state->rel))));

	/*
	 * In general, internal pages are never marked half-dead, except on
	 * versions of Postgres prior to 9.4, where it can be a valid transient
	 * state.  This state is nonetheless treated as corruption by VACUUM from
	 * version 9.4 on, so do the same here.  See _bt_pagedel() for full
	 * details.
	 */
	if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("internal page block %u in index \"%s\" is half-dead",
						blocknum, RelationGetRelationName(state->rel)),
				 errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));

	/*
	 * Check that internal pages have no garbage items, and that no page has
	 * an invalid combination of deletion-related page level flags
	 */
	if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg_internal("internal page block %u in index \"%s\" has garbage items",
								 blocknum, RelationGetRelationName(state->rel))));

	if (P_HAS_FULLXID(opaque) && !P_ISDELETED(opaque))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg_internal("full transaction id page flag appears in non-deleted block %u in index \"%s\"",
								 blocknum, RelationGetRelationName(state->rel))));

	if (P_ISDELETED(opaque) && P_ISHALFDEAD(opaque))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg_internal("deleted page block %u in index \"%s\" is half-dead",
								 blocknum, RelationGetRelationName(state->rel))));

	return page;
}
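
/*
 * Typical usage (a sketch, assuming the per-page context conventions from
 * BtreeCheckState): callers allocate the copy in the short-lived
 * targetcontext and rely on resetting that context between targets, rather
 * than freeing pages one by one:
 *
 *     oldcontext = MemoryContextSwitchTo(state->targetcontext);
 *     state->target = palloc_btree_page(state, state->targetblock);
 *     ...verify target page...
 *     MemoryContextSwitchTo(oldcontext);
 *     MemoryContextReset(state->targetcontext);
 */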

/*
 * _bt_mkscankey() wrapper that automatically prevents insertion scankey from
 * being considered greater than the pivot tuple that its values originated
 * from (or some other identical pivot tuple) in the common case where there
 * are truncated/minus infinity attributes.  Without this extra step, there
 * are forms of corruption that amcheck could theoretically fail to report.
 *
 * For example, invariant_g_offset() might miss a cross-page invariant failure
 * on an internal level if the scankey built from the first item on the
 * target's right sibling page happened to be equal to (not greater than) the
 * last item on target page.  The !pivotsearch tiebreaker in _bt_compare()
 * might otherwise cause amcheck to assume (rather than actually verify) that
 * the scankey is greater.
 */
static inline BTScanInsert
bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup)
{
	BTScanInsert skey;

	skey = _bt_mkscankey(rel, itup);
	skey->pivotsearch = true;

	return skey;
}
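
/*
 * Sketch of how this wrapper is meant to be used: build an insertion scankey
 * from an index tuple whose key values may have originated in a truncated
 * pivot, then feed it to one of the invariant_*_offset() checks, all of
 * which Assert key->pivotsearch:
 *
 *     skey = bt_mkscankey_pivotsearch(state->rel, itup);
 *     if (!invariant_l_offset(state, skey, upperbound))
 *         ...report corruption...
 */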

/*
 * PageGetItemId() wrapper that validates returned line pointer.
 *
 * Buffer page/page item access macros generally trust that line pointers are
 * not corrupt, which might cause problems for verification itself.  For
 * example, there is no bounds checking in PageGetItem().  Passing it a
 * corrupt line pointer can cause it to return a tuple/pointer that is unsafe
 * to dereference.
 *
 * Validating line pointers before tuples avoids undefined behavior and
 * assertion failures with corrupt indexes, making the verification process
 * more robust and predictable.
 */
static ItemId
PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page,
					 OffsetNumber offset)
{
	ItemId		itemid = PageGetItemId(page, offset);

	if (ItemIdGetOffset(itemid) + ItemIdGetLength(itemid) >
		BLCKSZ - MAXALIGN(sizeof(BTPageOpaqueData)))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("line pointer points past end of tuple space in index \"%s\"",
						RelationGetRelationName(state->rel)),
				 errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.",
									block, offset, ItemIdGetOffset(itemid),
									ItemIdGetLength(itemid),
									ItemIdGetFlags(itemid))));

	/*
	 * Verify that line pointer isn't LP_REDIRECT or LP_UNUSED, since nbtree
	 * never uses either.  Verify that line pointer has storage, too, since
	 * even LP_DEAD items should have storage within nbtree.
	 */
	if (ItemIdIsRedirected(itemid) || !ItemIdIsUsed(itemid) ||
		ItemIdGetLength(itemid) == 0)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("invalid line pointer storage in index \"%s\"",
						RelationGetRelationName(state->rel)),
				 errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.",
									block, offset, ItemIdGetOffset(itemid),
									ItemIdGetLength(itemid),
									ItemIdGetFlags(itemid))));

	return itemid;
}
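
/*
 * For the bounds check above, note the arithmetic under default build
 * settings (an assumption; BLCKSZ and alignment are configurable): with 8kB
 * pages and a 16 byte BTPageOpaqueData special area, lp_off + lp_len may not
 * exceed 8192 - MAXALIGN(16) = 8176.  Anything larger points past the end of
 * the tuple space, into (or beyond) the special area.
 */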

/*
 * BTreeTupleGetHeapTID() wrapper that enforces that a heap TID is present in
 * cases where that is mandatory (i.e. for non-pivot tuples)
 */
static inline ItemPointer
BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup,
							bool nonpivot)
{
	ItemPointer htid;

	/*
	 * Caller determines whether this is supposed to be a pivot or non-pivot
	 * tuple using page type and item offset number.  Verify that tuple
	 * metadata agrees with this.
	 */
	Assert(state->heapkeyspace);
	if (BTreeTupleIsPivot(itup) && nonpivot)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected pivot tuple",
								 state->targetblock,
								 RelationGetRelationName(state->rel))));

	if (!BTreeTupleIsPivot(itup) && !nonpivot)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected non-pivot tuple",
								 state->targetblock,
								 RelationGetRelationName(state->rel))));

	htid = BTreeTupleGetHeapTID(itup);
	if (!ItemPointerIsValid(htid) && nonpivot)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID",
						state->targetblock,
						RelationGetRelationName(state->rel))));

	return htid;
}

/*
 * Return the "pointed to" TID for itup, which is used to generate a
 * descriptive error message.  itup must be a "data item" tuple (it wouldn't
 * make much sense to call here with a high key tuple, since there won't be a
 * valid downlink/block number to display).
 *
 * Returns either a heap TID (which will be the first heap TID in posting list
 * if itup is posting list tuple), or a TID that contains downlink block
 * number, plus some encoded metadata (e.g., the number of attributes present
 * in itup).
 */
static inline ItemPointer
BTreeTupleGetPointsToTID(IndexTuple itup)
{
	/*
	 * Rely on the assumption that !heapkeyspace internal page data items will
	 * correctly return a TID with a downlink here -- BTreeTupleGetHeapTID()
	 * won't recognize it as a pivot tuple, but everything still works out
	 * because the t_tid field is still returned.
	 */
	if (!BTreeTupleIsPivot(itup))
		return BTreeTupleGetHeapTID(itup);

	/* Pivot tuple returns TID with downlink block (heapkeyspace variant) */
	return &itup->t_tid;
}