1 /*-------------------------------------------------------------------------
2  *
3  * verify_nbtree.c
4  *		Verifies the integrity of nbtree indexes based on invariants.
5  *
6  * For B-Tree indexes, verification includes checking that each page in the
7  * target index has items in logical order as reported by an insertion scankey
8  * (the insertion scankey sort-wise NULL semantics are needed for
9  * verification).
10  *
11  *
12  * Copyright (c) 2017, PostgreSQL Global Development Group
13  *
14  * IDENTIFICATION
15  *	  contrib/amcheck/verify_nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include "access/nbtree.h"
22 #include "access/transam.h"
23 #include "catalog/index.h"
24 #include "catalog/pg_am.h"
25 #include "commands/tablecmds.h"
26 #include "miscadmin.h"
27 #include "storage/lmgr.h"
28 #include "storage/smgr.h"
29 #include "utils/memutils.h"
30 #include "utils/snapmgr.h"
31 
32 
33 PG_MODULE_MAGIC;
34 
35 /*
36  * A B-Tree cannot possibly have this many levels, since there must be one
37  * block per level, which is bound by the range of BlockNumber:
38  */
39 #define InvalidBtreeLevel	((uint32) InvalidBlockNumber)
40 
41 /*
42  * State associated with verifying a B-Tree index
43  *
44  * target is the point of reference for a verification operation.
45  *
46  * Other B-Tree pages may be allocated, but those are always auxiliary (e.g.,
47  * they are current target's child pages). Conceptually, problems are only
48  * ever found in the current target page. Each page found by verification's
49  * left/right, top/bottom scan becomes the target exactly once.
50  */
51 typedef struct BtreeCheckState
52 {
53 	/*
54 	 * Unchanging state, established at start of verification:
55 	 */
56 
57 	/* B-Tree Index Relation */
58 	Relation	rel;
59 	/* ShareLock held on heap/index, rather than AccessShareLock? */
60 	bool		readonly;
61 	/* Per-page context */
62 	MemoryContext targetcontext;
63 	/* Buffer access strategy */
64 	BufferAccessStrategy checkstrategy;
65 
66 	/*
67 	 * Mutable state, for verification of particular page:
68 	 */
69 
70 	/* Current target page */
71 	Page		target;
72 	/* Target block number */
73 	BlockNumber targetblock;
74 	/* Target page's LSN */
75 	XLogRecPtr	targetlsn;
76 } BtreeCheckState;
77 
78 /*
79  * Starting point for verifying an entire B-Tree index level
80  */
81 typedef struct BtreeLevel
82 {
83 	/* Level number (0 is leaf page level). */
84 	uint32		level;
85 
86 	/* Left most block on level.  Scan of level begins here. */
87 	BlockNumber leftmost;
88 
89 	/* Is this level reported as "true" root level by meta page? */
90 	bool		istruerootlevel;
91 } BtreeLevel;
92 
93 PG_FUNCTION_INFO_V1(bt_index_check);
94 PG_FUNCTION_INFO_V1(bt_index_parent_check);
95 
96 static void bt_index_check_internal(Oid indrelid, bool parentcheck);
97 static inline void btree_index_checkable(Relation rel);
98 static inline bool btree_index_mainfork_expected(Relation rel);
99 static void bt_check_every_level(Relation rel, bool readonly);
100 static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
101 							 BtreeLevel level);
102 static void bt_target_page_check(BtreeCheckState *state);
103 static ScanKey bt_right_page_check_scankey(BtreeCheckState *state);
104 static void bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
105 				  ScanKey targetkey);
106 static inline bool offset_is_negative_infinity(BTPageOpaque opaque,
107 							OffsetNumber offset);
108 static inline bool invariant_leq_offset(BtreeCheckState *state,
109 					 ScanKey key,
110 					 OffsetNumber upperbound);
111 static inline bool invariant_geq_offset(BtreeCheckState *state,
112 					 ScanKey key,
113 					 OffsetNumber lowerbound);
114 static inline bool invariant_leq_nontarget_offset(BtreeCheckState *state,
115 							   Page other,
116 							   ScanKey key,
117 							   OffsetNumber upperbound);
118 static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum);
119 
120 /*
121  * bt_index_check(index regclass)
122  *
123  * Verify integrity of B-Tree index.
124  *
125  * Acquires AccessShareLock on heap & index relations.  Does not consider
126  * invariants that exist between parent/child pages.
127  */
128 Datum
bt_index_check(PG_FUNCTION_ARGS)129 bt_index_check(PG_FUNCTION_ARGS)
130 {
131 	Oid			indrelid = PG_GETARG_OID(0);
132 
133 	bt_index_check_internal(indrelid, false);
134 
135 	PG_RETURN_VOID();
136 }
137 
138 /*
139  * bt_index_parent_check(index regclass)
140  *
141  * Verify integrity of B-Tree index.
142  *
143  * Acquires ShareLock on heap & index relations.  Verifies that downlinks in
144  * parent pages are valid lower bounds on child pages.
145  */
146 Datum
bt_index_parent_check(PG_FUNCTION_ARGS)147 bt_index_parent_check(PG_FUNCTION_ARGS)
148 {
149 	Oid			indrelid = PG_GETARG_OID(0);
150 
151 	bt_index_check_internal(indrelid, true);
152 
153 	PG_RETURN_VOID();
154 }
155 
156 /*
157  * Helper for bt_index_[parent_]check, coordinating the bulk of the work.
158  */
159 static void
bt_index_check_internal(Oid indrelid,bool parentcheck)160 bt_index_check_internal(Oid indrelid, bool parentcheck)
161 {
162 	Oid			heapid;
163 	Relation	indrel;
164 	Relation	heaprel;
165 	LOCKMODE	lockmode;
166 
167 	if (parentcheck)
168 		lockmode = ShareLock;
169 	else
170 		lockmode = AccessShareLock;
171 
172 	/*
173 	 * We must lock table before index to avoid deadlocks.  However, if the
174 	 * passed indrelid isn't an index then IndexGetRelation() will fail.
175 	 * Rather than emitting a not-very-helpful error message, postpone
176 	 * complaining, expecting that the is-it-an-index test below will fail.
177 	 *
178 	 * In hot standby mode this will raise an error when parentcheck is true.
179 	 */
180 	heapid = IndexGetRelation(indrelid, true);
181 	if (OidIsValid(heapid))
182 		heaprel = heap_open(heapid, lockmode);
183 	else
184 		heaprel = NULL;
185 
186 	/*
187 	 * Open the target index relations separately (like relation_openrv(), but
188 	 * with heap relation locked first to prevent deadlocking).  In hot
189 	 * standby mode this will raise an error when parentcheck is true.
190 	 */
191 	indrel = index_open(indrelid, lockmode);
192 
193 	/*
194 	 * Since we did the IndexGetRelation call above without any lock, it's
195 	 * barely possible that a race against an index drop/recreation could have
196 	 * netted us the wrong table.  Although the table itself won't actually be
197 	 * examined during verification currently, a recheck still seems like a
198 	 * good idea.
199 	 */
200 	if (heaprel == NULL || heapid != IndexGetRelation(indrelid, false))
201 		ereport(ERROR,
202 				(errcode(ERRCODE_UNDEFINED_TABLE),
203 				 errmsg("could not open parent table of index %s",
204 						RelationGetRelationName(indrel))));
205 
206 	/* Relation suitable for checking as B-Tree? */
207 	btree_index_checkable(indrel);
208 
209 	if (btree_index_mainfork_expected(indrel))
210 	{
211 		RelationOpenSmgr(indrel);
212 		if (!smgrexists(indrel->rd_smgr, MAIN_FORKNUM))
213 			ereport(ERROR,
214 					(errcode(ERRCODE_INDEX_CORRUPTED),
215 					 errmsg("index \"%s\" lacks a main relation fork",
216 							RelationGetRelationName(indrel))));
217 
218 		/* Check index */
219 		bt_check_every_level(indrel, parentcheck);
220 	}
221 
222 	/*
223 	 * Release locks early. That's ok here because nothing in the called
224 	 * routines will trigger shared cache invalidations to be sent, so we can
225 	 * relax the usual pattern of only releasing locks after commit.
226 	 */
227 	index_close(indrel, lockmode);
228 	if (heaprel)
229 		heap_close(heaprel, lockmode);
230 }
231 
232 /*
233  * Basic checks about the suitability of a relation for checking as a B-Tree
234  * index.
235  *
236  * NB: Intentionally not checking permissions, the function is normally not
237  * callable by non-superusers. If granted, it's useful to be able to check a
238  * whole cluster.
239  */
240 static inline void
btree_index_checkable(Relation rel)241 btree_index_checkable(Relation rel)
242 {
243 	if (rel->rd_rel->relkind != RELKIND_INDEX ||
244 		rel->rd_rel->relam != BTREE_AM_OID)
245 		ereport(ERROR,
246 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
247 				 errmsg("only B-Tree indexes are supported as targets for verification"),
248 				 errdetail("Relation \"%s\" is not a B-Tree index.",
249 						   RelationGetRelationName(rel))));
250 
251 	if (RELATION_IS_OTHER_TEMP(rel))
252 		ereport(ERROR,
253 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
254 				 errmsg("cannot access temporary tables of other sessions"),
255 				 errdetail("Index \"%s\" is associated with temporary relation.",
256 						   RelationGetRelationName(rel))));
257 
258 	if (!IndexIsValid(rel->rd_index))
259 		ereport(ERROR,
260 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
261 				 errmsg("cannot check index \"%s\"",
262 						RelationGetRelationName(rel)),
263 				 errdetail("Index is not valid")));
264 }
265 
266 /*
267  * Check if B-Tree index relation should have a file for its main relation
268  * fork.  Verification uses this to skip unlogged indexes when in hot standby
269  * mode, where there is simply nothing to verify.
270  *
271  * NB: Caller should call btree_index_checkable() before calling here.
272  */
273 static inline bool
btree_index_mainfork_expected(Relation rel)274 btree_index_mainfork_expected(Relation rel)
275 {
276 	if (rel->rd_rel->relpersistence != RELPERSISTENCE_UNLOGGED ||
277 		!RecoveryInProgress())
278 		return true;
279 
280 	ereport(NOTICE,
281 			(errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION),
282 			 errmsg("cannot verify unlogged index \"%s\" during recovery, skipping",
283 					RelationGetRelationName(rel))));
284 
285 	return false;
286 }
287 
288 /*
289  * Main entry point for B-Tree SQL-callable functions. Walks the B-Tree in
290  * logical order, verifying invariants as it goes.
291  *
292  * It is the caller's responsibility to acquire appropriate heavyweight lock on
293  * the index relation, and advise us if extra checks are safe when a ShareLock
294  * is held.
295  *
296  * A ShareLock is generally assumed to prevent any kind of physical
297  * modification to the index structure, including modifications that VACUUM may
298  * make.  This does not include setting of the LP_DEAD bit by concurrent index
299  * scans, although that is just metadata that is not able to directly affect
300  * any check performed here.  Any concurrent process that might act on the
301  * LP_DEAD bit being set (recycle space) requires a heavyweight lock that
302  * cannot be held while we hold a ShareLock.  (Besides, even if that could
303  * happen, the ad-hoc recycling when a page might otherwise split is performed
304  * per-page, and requires an exclusive buffer lock, which wouldn't cause us
305  * trouble.  _bt_delitems_vacuum() may only delete leaf items, and so the extra
306  * parent/child check cannot be affected.)
307  */
static void
bt_check_every_level(Relation rel, bool readonly)
{
	BtreeCheckState *state;
	Page		metapage;
	BTMetaPageData *metad;
	uint32		previouslevel;
	BtreeLevel	current;

	/*
	 * RecentGlobalXmin assertion matches index_getnext_tid().  See note on
	 * RecentGlobalXmin/B-Tree page deletion.
	 */
	Assert(TransactionIdIsValid(RecentGlobalXmin));

	/*
	 * Initialize state for entire verification operation
	 */
	state = palloc(sizeof(BtreeCheckState));
	state->rel = rel;
	state->readonly = readonly;
	/* Per-page context; reset by bt_check_level_from_leftmost() per target */
	state->targetcontext = AllocSetContextCreate(CurrentMemoryContext,
												 "amcheck context",
												 ALLOCSET_DEFAULT_MINSIZE,
												 ALLOCSET_DEFAULT_INITSIZE,
												 ALLOCSET_DEFAULT_MAXSIZE);
	state->checkstrategy = GetAccessStrategy(BAS_BULKREAD);

	/* Get true root block from meta-page */
	metapage = palloc_btree_page(state, BTREE_METAPAGE);
	metad = BTPageGetMeta(metapage);

	/*
	 * Certain deletion patterns can result in "skinny" B-Tree indexes, where
	 * the fast root and true root differ.
	 *
	 * Start from the true root, not the fast root, unlike conventional index
	 * scans.  This approach is more thorough, and removes the risk of
	 * following a stale fast root from the meta page.
	 */
	if (metad->btm_fastroot != metad->btm_root)
		ereport(DEBUG1,
				(errcode(ERRCODE_NO_DATA),
				 errmsg("harmless fast root mismatch in index %s",
						RelationGetRelationName(rel)),
				 errdetail_internal("Fast root block %u (level %u) differs from true root block %u (level %u).",
									metad->btm_fastroot, metad->btm_fastlevel,
									metad->btm_root, metad->btm_level)));

	/*
	 * Starting at the root, verify every level.  Move left to right, top to
	 * bottom.  Note that there may be no pages other than the meta page (meta
	 * page can indicate that root is P_NONE when the index is totally empty).
	 */
	previouslevel = InvalidBtreeLevel;
	current.level = metad->btm_level;
	current.leftmost = metad->btm_root;
	current.istruerootlevel = true;
	while (current.leftmost != P_NONE)
	{
		/*
		 * Verify this level, and get left most page for next level down, if
		 * not at leaf level
		 */
		current = bt_check_level_from_leftmost(state, current);

		/* Leftmost of next level still Invalid => no usable downlink found */
		if (current.leftmost == InvalidBlockNumber)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" has no valid pages on level below %u or first level",
							RelationGetRelationName(rel), previouslevel)));

		previouslevel = current.level;
	}

	/* Be tidy: */
	MemoryContextDelete(state->targetcontext);
}
387 
388 /*
389  * Given a left-most block at some level, move right, verifying each page
390  * individually (with more verification across pages for "readonly"
391  * callers).  Caller should pass the true root page as the leftmost initially,
392  * working their way down by passing what is returned for the last call here
393  * until level 0 (leaf page level) was reached.
394  *
395  * Returns state for next call, if any.  This includes left-most block number
396  * one level lower that should be passed on next level/call, which is set to
397  * P_NONE on last call here (when leaf level is verified).  Level numbers
398  * follow the nbtree convention: higher levels have higher numbers, because new
399  * levels are added only due to a root page split.  Note that prior to the
400  * first root page split, the root is also a leaf page, so there is always a
401  * level 0 (leaf level), and it's always the last level processed.
402  *
403  * Note on memory management:  State's per-page context is reset here, between
404  * each call to bt_target_page_check().
405  */
static BtreeLevel
bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
{
	/* State to establish early, concerning entire level */
	BTPageOpaque opaque;
	MemoryContext oldcontext;
	BtreeLevel	nextleveldown;

	/* Variables for iterating across level using right links */
	BlockNumber leftcurrent = P_NONE;
	BlockNumber current = level.leftmost;

	/* Initialize return state (InvalidBlockNumber means "not yet found") */
	nextleveldown.leftmost = InvalidBlockNumber;
	nextleveldown.level = InvalidBtreeLevel;
	nextleveldown.istruerootlevel = false;

	/* Use page-level context for duration of this call */
	oldcontext = MemoryContextSwitchTo(state->targetcontext);

	elog(DEBUG2, "verifying level %u%s", level.level,
		 level.istruerootlevel ?
		 " (true root level)" : level.level == 0 ? " (leaf level)" : "");

	do
	{
		/* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */
		CHECK_FOR_INTERRUPTS();

		/* Initialize state for this iteration */
		state->targetblock = current;
		state->target = palloc_btree_page(state, state->targetblock);
		state->targetlsn = PageGetLSN(state->target);

		opaque = (BTPageOpaque) PageGetSpecialPointer(state->target);

		if (P_IGNORE(opaque))
		{
			/*
			 * Half-dead or deleted page: skip its checks, but a rightmost
			 * page in that state means the level ends on an ignorable page,
			 * which should be impossible.
			 */
			if (P_RIGHTMOST(opaque))
				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg("block %u fell off the end of index \"%s\"",
								current, RelationGetRelationName(state->rel))));
			else
				ereport(DEBUG1,
						(errcode(ERRCODE_NO_DATA),
						 errmsg("block %u of index \"%s\" ignored",
								current, RelationGetRelationName(state->rel))));
			goto nextpage;
		}
		else if (nextleveldown.leftmost == InvalidBlockNumber)
		{
			/*
			 * First non-ignorable page on this level.
			 *
			 * A concurrent page split could make the caller supplied leftmost
			 * block no longer contain the leftmost page, or no longer be the
			 * true root, but where that isn't possible due to heavyweight
			 * locking, check that the first valid page meets caller's
			 * expectations.
			 */
			if (state->readonly)
			{
				if (!P_LEFTMOST(opaque))
					ereport(ERROR,
							(errcode(ERRCODE_INDEX_CORRUPTED),
							 errmsg("block %u is not leftmost in index \"%s\"",
									current, RelationGetRelationName(state->rel))));

				if (level.istruerootlevel && !P_ISROOT(opaque))
					ereport(ERROR,
							(errcode(ERRCODE_INDEX_CORRUPTED),
							 errmsg("block %u is not true root in index \"%s\"",
									current, RelationGetRelationName(state->rel))));
			}

			/*
			 * Before beginning any non-trivial examination of level, prepare
			 * state for the next bt_check_level_from_leftmost() invocation,
			 * covering the next level down (if any).
			 *
			 * There should be at least one non-ignorable page per level,
			 * unless this is the leaf level, which is assumed by caller to be
			 * final level.
			 */
			if (!P_ISLEAF(opaque))
			{
				IndexTuple	itup;
				ItemId		itemid;

				/* Internal page -- downlink gets leftmost on next level */
				itemid = PageGetItemId(state->target, P_FIRSTDATAKEY(opaque));
				itup = (IndexTuple) PageGetItem(state->target, itemid);
				nextleveldown.leftmost = ItemPointerGetBlockNumber(&(itup->t_tid));
				nextleveldown.level = opaque->btpo.level - 1;
			}
			else
			{
				/*
				 * Leaf page -- final level caller must process.
				 *
				 * Note that this could also be the root page, if there has
				 * been no root page split yet.
				 */
				nextleveldown.leftmost = P_NONE;
				nextleveldown.level = InvalidBtreeLevel;
			}

			/*
			 * Finished setting up state for this call/level.  Control will
			 * never end up back here in any future loop iteration for this
			 * level.
			 */
		}

		/* Sibling links must agree; only reliable with ShareLock held */
		if (state->readonly && opaque->btpo_prev != leftcurrent)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("left link/right link pair in index \"%s\" not in agreement",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Block=%u left block=%u left link from block=%u.",
										current, leftcurrent, opaque->btpo_prev)));

		/* Check level, which must be valid for non-ignorable page */
		if (level.level != opaque->btpo.level)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
										current, level.level, opaque->btpo.level)));

		/* Verify invariants for page -- all important checks occur here */
		bt_target_page_check(state);

nextpage:

		/* Try to detect circular links */
		if (current == leftcurrent || current == opaque->btpo_prev)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("circular link chain found in block %u of index \"%s\"",
							current, RelationGetRelationName(state->rel))));

		/* Advance to the right sibling */
		leftcurrent = current;
		current = opaque->btpo_next;

		/* Free page and associated memory for this iteration */
		MemoryContextReset(state->targetcontext);
	}
	while (current != P_NONE);

	/* Don't change context for caller */
	MemoryContextSwitchTo(oldcontext);

	return nextleveldown;
}
561 
562 /*
563  * Function performs the following checks on target page, or pages ancillary to
564  * target page:
565  *
566  * - That every "real" data item is less than or equal to the high key, which
567  *	 is an upper bound on the items on the pages (where there is a high key at
568  *	 all -- pages that are rightmost lack one).
569  *
570  * - That within the page, every "real" item is less than or equal to the item
571  *	 immediately to its right, if any (i.e., that the items are in order within
572  *	 the page, so that the binary searches performed by index scans are sane).
573  *
574  * - That the last item stored on the page is less than or equal to the first
575  *	 "real" data item on the page to the right (if such a first item is
576  *	 available).
577  *
578  * Furthermore, when state passed shows ShareLock held, and target page is
579  * internal page, function also checks:
580  *
581  * - That all child pages respect downlinks lower bound.
582  *
583  * Note:  Memory allocated in this routine is expected to be released by caller
584  * resetting state->targetcontext.
585  */
static void
bt_target_page_check(BtreeCheckState *state)
{
	OffsetNumber offset;
	OffsetNumber max;
	BTPageOpaque topaque;

	topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
	max = PageGetMaxOffsetNumber(state->target);

	elog(DEBUG2, "verifying %u items on %s block %u", max,
		 P_ISLEAF(topaque) ? "leaf" : "internal", state->targetblock);

	/*
	 * Loop over page items, starting from first non-highkey item, not high
	 * key (if any).  Also, immediately skip "negative infinity" real item (if
	 * any).
	 */
	for (offset = P_FIRSTDATAKEY(topaque);
		 offset <= max;
		 offset = OffsetNumberNext(offset))
	{
		ItemId		itemid;
		IndexTuple	itup;
		ScanKey		skey;

		CHECK_FOR_INTERRUPTS();

		/*
		 * Don't try to generate scankey using "negative infinity" garbage
		 * data
		 */
		if (offset_is_negative_infinity(topaque, offset))
			continue;

		/* Build insertion scankey for current page offset */
		itemid = PageGetItemId(state->target, offset);
		itup = (IndexTuple) PageGetItem(state->target, itemid);
		skey = _bt_mkscankey(state->rel, itup);

		/*
		 * * High key check *
		 *
		 * If there is a high key (if this is not the rightmost page on its
		 * entire level), check that high key actually is upper bound on all
		 * page items.
		 *
		 * We prefer to check all items against high key rather than checking
		 * just the last and trusting that the operator class obeys the
		 * transitive law (which implies that all previous items also
		 * respected the high key invariant if they pass the item order
		 * check).
		 *
		 * Ideally, we'd compare every item in the index against every other
		 * item in the index, and not trust opclass obedience of the
		 * transitive law to bridge the gap between children and their
		 * grandparents (as well as great-grandparents, and so on).  We don't
		 * go to those lengths because that would be prohibitively expensive,
		 * and probably not markedly more effective in practice.
		 */
		if (!P_RIGHTMOST(topaque) &&
			!invariant_leq_offset(state, skey, P_HIKEY))
		{
			char	   *itid,
					   *htid;

			/* Report both the index item and the TID it points to */
			itid = psprintf("(%u,%u)", state->targetblock, offset);
			htid = psprintf("(%u,%u)",
							ItemPointerGetBlockNumber(&(itup->t_tid)),
							ItemPointerGetOffsetNumber(&(itup->t_tid)));

			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("high key invariant violated for index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.",
										itid,
										P_ISLEAF(topaque) ? "heap" : "index",
										htid,
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn)));
		}

		/*
		 * * Item order check *
		 *
		 * Check that items are stored on page in logical order, by checking
		 * current item is less than or equal to next item (if any).
		 */
		if (OffsetNumberNext(offset) <= max &&
			!invariant_leq_offset(state, skey,
								  OffsetNumberNext(offset)))
		{
			char	   *itid,
					   *htid,
					   *nitid,
					   *nhtid;

			itid = psprintf("(%u,%u)", state->targetblock, offset);
			htid = psprintf("(%u,%u)",
							ItemPointerGetBlockNumber(&(itup->t_tid)),
							ItemPointerGetOffsetNumber(&(itup->t_tid)));
			nitid = psprintf("(%u,%u)", state->targetblock,
							 OffsetNumberNext(offset));

			/* Reuse itup to get pointed-to heap location of second item */
			itemid = PageGetItemId(state->target, OffsetNumberNext(offset));
			itup = (IndexTuple) PageGetItem(state->target, itemid);
			nhtid = psprintf("(%u,%u)",
							 ItemPointerGetBlockNumber(&(itup->t_tid)),
							 ItemPointerGetOffsetNumber(&(itup->t_tid)));

			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("item order invariant violated for index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Lower index tid=%s (points to %s tid=%s) "
										"higher index tid=%s (points to %s tid=%s) "
										"page lsn=%X/%X.",
										itid,
										P_ISLEAF(topaque) ? "heap" : "index",
										htid,
										nitid,
										P_ISLEAF(topaque) ? "heap" : "index",
										nhtid,
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn)));
		}

		/*
		 * * Last item check *
		 *
		 * Check last item against next/right page's first data item's when
		 * last item on page is reached.  This additional check can detect
		 * transposed pages.
		 *
		 * This check is similar to the item order check that will have
		 * already been performed for every other "real" item on target page
		 * when last item is checked.  The difference is that the next item
		 * (the item that is compared to target's last item) needs to come
		 * from the next/sibling page.  There may not be such an item
		 * available from sibling for various reasons, though (e.g., target is
		 * the rightmost page on level).
		 */
		else if (offset == max)
		{
			ScanKey		rightkey;

			/* Get item in next/right page */
			rightkey = bt_right_page_check_scankey(state);

			if (rightkey &&
				!invariant_geq_offset(state, rightkey, max))
			{
				/*
				 * As explained at length in bt_right_page_check_scankey(),
				 * there is a known !readonly race that could account for
				 * apparent violation of invariant, which we must check for
				 * before actually proceeding with raising error.  Our canary
				 * condition is that target page was deleted.
				 */
				if (!state->readonly)
				{
					/* Get fresh copy of target page */
					state->target = palloc_btree_page(state, state->targetblock);
					/* Note that we deliberately do not update target LSN */
					topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);

					/*
					 * All !readonly checks now performed; just return
					 */
					if (P_IGNORE(topaque))
						return;
				}

				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg("cross page item order invariant violated for index \"%s\"",
								RelationGetRelationName(state->rel)),
						 errdetail_internal("Last item on page tid=(%u,%u) page lsn=%X/%X.",
											state->targetblock, offset,
											(uint32) (state->targetlsn >> 32),
											(uint32) state->targetlsn)));
			}
		}

		/*
		 * * Downlink check *
		 *
		 * Additional check of child items iff this is an internal page and
		 * caller holds a ShareLock.  This happens for every downlink (item)
		 * in target excluding the negative-infinity downlink (again, this is
		 * because it has no useful value to compare).
		 */
		if (!P_ISLEAF(topaque) && state->readonly)
		{
			BlockNumber childblock = ItemPointerGetBlockNumber(&(itup->t_tid));

			bt_downlink_check(state, childblock, skey);
		}
	}
}
788 
789 /*
790  * Return a scankey for an item on page to right of current target (or the
791  * first non-ignorable page), sufficient to check ordering invariant on last
792  * item in current target page.  Returned scankey relies on local memory
793  * allocated for the child page, which caller cannot pfree().  Caller's memory
794  * context should be reset between calls here.
795  *
796  * This is the first data item, and so all adjacent items are checked against
797  * their immediate sibling item (which may be on a sibling page, or even a
798  * "cousin" page at parent boundaries where target's rightlink points to page
799  * with different parent page).  If no such valid item is available, return
800  * NULL instead.
801  *
802  * Note that !readonly callers must reverify that target page has not
803  * been concurrently deleted.
804  */
static ScanKey
bt_right_page_check_scankey(BtreeCheckState *state)
{
	BTPageOpaque opaque;
	ItemId		rightitem;
	BlockNumber targetnext;
	Page		rightpage;
	OffsetNumber nline;

	/* Determine target's next block number */
	opaque = (BTPageOpaque) PageGetSpecialPointer(state->target);

	/* If target is already rightmost, no right sibling; nothing to do here */
	if (P_RIGHTMOST(opaque))
		return NULL;

	/*
	 * General notes on concurrent page splits and page deletion:
	 *
	 * Routines like _bt_search() don't require *any* page split interlock
	 * when descending the tree, including something very light like a buffer
	 * pin. That's why it's okay that we don't either.  This avoidance of any
	 * need to "couple" buffer locks is the raison d' etre of the Lehman & Yao
	 * algorithm, in fact.
	 *
	 * That leaves deletion.  A deleted page won't actually be recycled by
	 * VACUUM early enough for us to fail to at least follow its right link
	 * (or left link, or downlink) and find its sibling, because recycling
	 * does not occur until no possible index scan could land on the page.
	 * Index scans can follow links with nothing more than their snapshot as
	 * an interlock and be sure of at least that much.  (See page
	 * recycling/RecentGlobalXmin notes in nbtree README.)
	 *
	 * Furthermore, it's okay if we follow a rightlink and find a half-dead or
	 * dead (ignorable) page one or more times.  There will either be a
	 * further right link to follow that leads to a live page before too long
	 * (before passing by parent's rightmost child), or we will find the end
	 * of the entire level instead (possible when parent page is itself the
	 * rightmost on its level).
	 */
	targetnext = opaque->btpo_next;

	/* Walk right from target until a live (non-ignorable) page is found */
	for (;;)
	{
		CHECK_FOR_INTERRUPTS();

		rightpage = palloc_btree_page(state, targetnext);
		opaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);

		if (!P_IGNORE(opaque) || P_RIGHTMOST(opaque))
			break;

		/* We landed on a deleted page, so step right to find a live page */
		targetnext = opaque->btpo_next;
		ereport(DEBUG1,
				(errcode(ERRCODE_NO_DATA),
				 errmsg("level %u leftmost page of index \"%s\" was found deleted or half dead",
						opaque->btpo.level, RelationGetRelationName(state->rel)),
				 errdetail_internal("Deleted page found when building scankey from right sibling.")));

		/*
		 * Be slightly more pro-active in freeing this memory, just in case.
		 * (Note that opaque points into rightpage, so it must not be used
		 * again until reassigned at the top of the next iteration.)
		 */
		pfree(rightpage);
	}

	/*
	 * No ShareLock held case -- why it's safe to proceed.
	 *
	 * Problem:
	 *
	 * We must avoid false positive reports of corruption when caller treats
	 * item returned here as an upper bound on target's last item.  In
	 * general, false positives are disallowed.  Avoiding them here when
	 * caller is !readonly is subtle.
	 *
	 * A concurrent page deletion by VACUUM of the target page can result in
	 * the insertion of items on to this right sibling page that would
	 * previously have been inserted on our target page.  There might have
	 * been insertions that followed the target's downlink after it was made
	 * to point to right sibling instead of target by page deletion's first
	 * phase. The inserters insert items that would belong on target page.
	 * This race is very tight, but it's possible.  This is our only problem.
	 *
	 * Non-problems:
	 *
	 * We are not hindered by a concurrent page split of the target; we'll
	 * never land on the second half of the page anyway.  A concurrent split
	 * of the right page will also not matter, because the first data item
	 * remains the same within the left half, which we'll reliably land on. If
	 * we had to skip over ignorable/deleted pages, it cannot matter because
	 * their key space has already been atomically merged with the first
	 * non-ignorable page we eventually find (doesn't matter whether the page
	 * we eventually find is a true sibling or a cousin of target, which we go
	 * into below).
	 *
	 * Solution:
	 *
	 * Caller knows that it should reverify that target is not ignorable
	 * (half-dead or deleted) when cross-page sibling item comparison appears
	 * to indicate corruption (invariant fails).  This detects the single race
	 * condition that exists for caller.  This is correct because the
	 * continued existence of target block as non-ignorable (not half-dead or
	 * deleted) implies that target page was not merged into from the right by
	 * deletion; the key space at or after target never moved left.  Target's
	 * parent either has the same downlink to target as before, or a <=
	 * downlink due to deletion at the left of target.  Target either has the
	 * same highkey as before, or a highkey <= before when there is a page
	 * split. (The rightmost concurrently-split-from-target-page page will
	 * still have the same highkey as target was originally found to have,
	 * which for our purposes is equivalent to target's highkey itself never
	 * changing, since we reliably skip over
	 * concurrently-split-from-target-page pages.)
	 *
	 * In simpler terms, we allow that the key space of the target may expand
	 * left (the key space can move left on the left side of target only), but
	 * the target key space cannot expand right and get ahead of us without
	 * our detecting it.  The key space of the target cannot shrink, unless it
	 * shrinks to zero due to the deletion of the original page, our canary
	 * condition.  (To be very precise, we're a bit stricter than that because
	 * it might just have been that the target page split and only the
	 * original target page was deleted.  We can be more strict, just not more
	 * lax.)
	 *
	 * Top level tree walk caller moves on to next page (makes it the new
	 * target) following recovery from this race.  (cf.  The rationale for
	 * child/downlink verification needing a ShareLock within
	 * bt_downlink_check(), where page deletion is also the main source of
	 * trouble.)
	 *
	 * Note that it doesn't matter if right sibling page here is actually a
	 * cousin page, because in order for the key space to be readjusted in a
	 * way that causes us issues in next level up (guiding problematic
	 * concurrent insertions to the cousin from the grandparent rather than to
	 * the sibling from the parent), there'd have to be page deletion of
	 * target's parent page (affecting target's parent's downlink in target's
	 * grandparent page).  Internal page deletion only occurs when there are
	 * no child pages (they were all fully deleted), and caller is checking
	 * that the target's parent has at least one non-deleted (so
	 * non-ignorable) child: the target page.  (Note that the first phase of
	 * deletion atomically marks the page to be deleted half-dead/ignorable at
	 * the same time downlink in its parent is removed, so caller will
	 * definitely not fail to detect that this happened.)
	 *
	 * This trick is inspired by the method backward scans use for dealing
	 * with concurrent page splits; concurrent page deletion is a problem that
	 * similarly receives special consideration sometimes (it's possible that
	 * the backwards scan will re-read its "original" block after failing to
	 * find a right-link to it, having already moved in the opposite direction
	 * (right/"forwards") a few times to try to locate one).  Just like us,
	 * that happens only to determine if there was a concurrent page deletion
	 * of a reference page, and just like us if there was a page deletion of
	 * that reference page it means we can move on from caring about the
	 * reference page.  See the nbtree README for a full description of how
	 * that works.
	 */
	/* Number of line pointers on the live right page we settled on */
	nline = PageGetMaxOffsetNumber(rightpage);

	/*
	 * Get first data item, if any
	 */
	if (P_ISLEAF(opaque) && nline >= P_FIRSTDATAKEY(opaque))
	{
		/* Return first data item (if any) */
		rightitem = PageGetItemId(rightpage, P_FIRSTDATAKEY(opaque));
	}
	else if (!P_ISLEAF(opaque) &&
			 nline >= OffsetNumberNext(P_FIRSTDATAKEY(opaque)))
	{
		/*
		 * Return first item after the internal page's "negative infinity"
		 * item
		 */
		rightitem = PageGetItemId(rightpage,
								  OffsetNumberNext(P_FIRSTDATAKEY(opaque)));
	}
	else
	{
		/*
		 * No first item.  Page is probably empty leaf page, but it's also
		 * possible that it's an internal page with only a negative infinity
		 * item.
		 */
		ereport(DEBUG1,
				(errcode(ERRCODE_NO_DATA),
				 errmsg("%s block %u of index \"%s\" has no first data item",
						P_ISLEAF(opaque) ? "leaf" : "internal", targetnext,
						RelationGetRelationName(state->rel))));
		return NULL;
	}

	/*
	 * Return first real item scankey.  Note that this relies on right page
	 * memory remaining allocated.
	 */
	return _bt_mkscankey(state->rel,
						 (IndexTuple) PageGetItem(rightpage, rightitem));
}
1000 
1001 /*
1002  * Checks one of target's downlink against its child page.
1003  *
1004  * Conceptually, the target page continues to be what is checked here.  The
1005  * target block is still blamed in the event of finding an invariant violation.
1006  * The downlink insertion into the target is probably where any problem raised
1007  * here arises, and there is no such thing as a parent link, so doing the
1008  * verification this way around is much more practical.
1009  */
1010 static void
bt_downlink_check(BtreeCheckState * state,BlockNumber childblock,ScanKey targetkey)1011 bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
1012 				  ScanKey targetkey)
1013 {
1014 	OffsetNumber offset;
1015 	OffsetNumber maxoffset;
1016 	Page		child;
1017 	BTPageOpaque copaque;
1018 
1019 	/*
1020 	 * Caller must have ShareLock on target relation, because of
1021 	 * considerations around page deletion by VACUUM.
1022 	 *
1023 	 * NB: In general, page deletion deletes the right sibling's downlink, not
1024 	 * the downlink of the page being deleted; the deleted page's downlink is
1025 	 * reused for its sibling.  The key space is thereby consolidated between
1026 	 * the deleted page and its right sibling.  (We cannot delete a parent
1027 	 * page's rightmost child unless it is the last child page, and we intend
1028 	 * to also delete the parent itself.)
1029 	 *
1030 	 * If this verification happened without a ShareLock, the following race
1031 	 * condition could cause false positives:
1032 	 *
1033 	 * In general, concurrent page deletion might occur, including deletion of
1034 	 * the left sibling of the child page that is examined here.  If such a
1035 	 * page deletion were to occur, closely followed by an insertion into the
1036 	 * newly expanded key space of the child, a window for the false positive
1037 	 * opens up: the stale parent/target downlink originally followed to get
1038 	 * to the child legitimately ceases to be a lower bound on all items in
1039 	 * the page, since the key space was concurrently expanded "left".
1040 	 * (Insertion followed the "new" downlink for the child, not our now-stale
1041 	 * downlink, which was concurrently physically removed in target/parent as
1042 	 * part of deletion's first phase.)
1043 	 *
1044 	 * Note that while the cross-page-same-level last item check uses a trick
1045 	 * that allows it to perform verification for !readonly callers, a similar
1046 	 * trick seems difficult here.  The trick that that other check uses is,
1047 	 * in essence, to lock down race conditions to those that occur due to
1048 	 * concurrent page deletion of the target; that's a race that can be
1049 	 * reliably detected before actually reporting corruption.
1050 	 *
1051 	 * On the other hand, we'd need to lock down race conditions involving
1052 	 * deletion of child's left page, for long enough to read the child page
1053 	 * into memory (in other words, a scheme with concurrently held buffer
1054 	 * locks on both child and left-of-child pages).  That's unacceptable for
1055 	 * amcheck functions on general principle, though.
1056 	 */
1057 	Assert(state->readonly);
1058 
1059 	/*
1060 	 * Verify child page has the downlink key from target page (its parent) as
1061 	 * a lower bound.
1062 	 *
1063 	 * Check all items, rather than checking just the first and trusting that
1064 	 * the operator class obeys the transitive law.
1065 	 */
1066 	child = palloc_btree_page(state, childblock);
1067 	copaque = (BTPageOpaque) PageGetSpecialPointer(child);
1068 	maxoffset = PageGetMaxOffsetNumber(child);
1069 
1070 	for (offset = P_FIRSTDATAKEY(copaque);
1071 		 offset <= maxoffset;
1072 		 offset = OffsetNumberNext(offset))
1073 	{
1074 		/*
1075 		 * Skip comparison of target page key against "negative infinity"
1076 		 * item, if any.  Checking it would indicate that it's not an upper
1077 		 * bound, but that's only because of the hard-coding within
1078 		 * _bt_compare().
1079 		 */
1080 		if (offset_is_negative_infinity(copaque, offset))
1081 			continue;
1082 
1083 		if (!invariant_leq_nontarget_offset(state, child,
1084 											targetkey, offset))
1085 			ereport(ERROR,
1086 					(errcode(ERRCODE_INDEX_CORRUPTED),
1087 					 errmsg("down-link lower bound invariant violated for index \"%s\"",
1088 							RelationGetRelationName(state->rel)),
1089 					 errdetail_internal("Parent block=%u child index tid=(%u,%u) parent page lsn=%X/%X.",
1090 										state->targetblock, childblock, offset,
1091 										(uint32) (state->targetlsn >> 32),
1092 										(uint32) state->targetlsn)));
1093 	}
1094 
1095 	pfree(child);
1096 }
1097 
1098 /*
1099  * Is particular offset within page (whose special state is passed by caller)
1100  * the page negative-infinity item?
1101  *
1102  * As noted in comments above _bt_compare(), there is special handling of the
1103  * first data item as a "negative infinity" item.  The hard-coding within
1104  * _bt_compare() makes comparing this item for the purposes of verification
1105  * pointless at best, since the IndexTuple only contains a valid TID (a
1106  * reference TID to child page).
1107  */
1108 static inline bool
offset_is_negative_infinity(BTPageOpaque opaque,OffsetNumber offset)1109 offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset)
1110 {
1111 	/*
1112 	 * For internal pages only, the first item after high key, if any, is
1113 	 * negative infinity item.  Internal pages always have a negative infinity
1114 	 * item, whereas leaf pages never have one.  This implies that negative
1115 	 * infinity item is either first or second line item, or there is none
1116 	 * within page.
1117 	 *
1118 	 * Right-most pages don't have a high key, but could be said to
1119 	 * conceptually have a "positive infinity" high key.  Thus, there is a
1120 	 * symmetry between down link items in parent pages, and high keys in
1121 	 * children.  Together, they represent the part of the key space that
1122 	 * belongs to each page in the index.  For example, all children of the
1123 	 * root page will have negative infinity as a lower bound from root
1124 	 * negative infinity downlink, and positive infinity as an upper bound
1125 	 * (implicitly, from "imaginary" positive infinity high key in root).
1126 	 */
1127 	return !P_ISLEAF(opaque) && offset == P_FIRSTDATAKEY(opaque);
1128 }
1129 
1130 /*
1131  * Does the invariant hold that the key is less than or equal to a given upper
1132  * bound offset item?
1133  *
1134  * If this function returns false, convention is that caller throws error due
1135  * to corruption.
1136  */
1137 static inline bool
invariant_leq_offset(BtreeCheckState * state,ScanKey key,OffsetNumber upperbound)1138 invariant_leq_offset(BtreeCheckState *state, ScanKey key,
1139 					 OffsetNumber upperbound)
1140 {
1141 	int16		natts = state->rel->rd_rel->relnatts;
1142 	int32		cmp;
1143 
1144 	cmp = _bt_compare(state->rel, natts, key, state->target, upperbound);
1145 
1146 	return cmp <= 0;
1147 }
1148 
1149 /*
1150  * Does the invariant hold that the key is greater than or equal to a given
1151  * lower bound offset item?
1152  *
1153  * If this function returns false, convention is that caller throws error due
1154  * to corruption.
1155  */
1156 static inline bool
invariant_geq_offset(BtreeCheckState * state,ScanKey key,OffsetNumber lowerbound)1157 invariant_geq_offset(BtreeCheckState *state, ScanKey key,
1158 					 OffsetNumber lowerbound)
1159 {
1160 	int16		natts = state->rel->rd_rel->relnatts;
1161 	int32		cmp;
1162 
1163 	cmp = _bt_compare(state->rel, natts, key, state->target, lowerbound);
1164 
1165 	return cmp >= 0;
1166 }
1167 
1168 /*
1169  * Does the invariant hold that the key is less than or equal to a given upper
1170  * bound offset item, with the offset relating to a caller-supplied page that
1171  * is not the current target page? Caller's non-target page is typically a
1172  * child page of the target, checked as part of checking a property of the
1173  * target page (i.e. the key comes from the target).
1174  *
1175  * If this function returns false, convention is that caller throws error due
1176  * to corruption.
1177  */
1178 static inline bool
invariant_leq_nontarget_offset(BtreeCheckState * state,Page nontarget,ScanKey key,OffsetNumber upperbound)1179 invariant_leq_nontarget_offset(BtreeCheckState *state,
1180 							   Page nontarget, ScanKey key,
1181 							   OffsetNumber upperbound)
1182 {
1183 	int16		natts = state->rel->rd_rel->relnatts;
1184 	int32		cmp;
1185 
1186 	cmp = _bt_compare(state->rel, natts, key, nontarget, upperbound);
1187 
1188 	return cmp <= 0;
1189 }
1190 
1191 /*
1192  * Given a block number of a B-Tree page, return page in palloc()'d memory.
1193  * While at it, perform some basic checks of the page.
1194  *
1195  * There is never an attempt to get a consistent view of multiple pages using
1196  * multiple concurrent buffer locks; in general, we only acquire a single pin
1197  * and buffer lock at a time, which is often all that the nbtree code requires.
1198  *
1199  * Operating on a copy of the page is useful because it prevents control
1200  * getting stuck in an uninterruptible state when an underlying operator class
1201  * misbehaves.
1202  */
1203 static Page
palloc_btree_page(BtreeCheckState * state,BlockNumber blocknum)1204 palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
1205 {
1206 	Buffer		buffer;
1207 	Page		page;
1208 	BTPageOpaque opaque;
1209 
1210 	page = palloc(BLCKSZ);
1211 
1212 	/*
1213 	 * We copy the page into local storage to avoid holding pin on the buffer
1214 	 * longer than we must.
1215 	 */
1216 	buffer = ReadBufferExtended(state->rel, MAIN_FORKNUM, blocknum, RBM_NORMAL,
1217 								state->checkstrategy);
1218 	LockBuffer(buffer, BT_READ);
1219 
1220 	/*
1221 	 * Perform the same basic sanity checking that nbtree itself performs for
1222 	 * every page:
1223 	 */
1224 	_bt_checkpage(state->rel, buffer);
1225 
1226 	/* Only use copy of page in palloc()'d memory */
1227 	memcpy(page, BufferGetPage(buffer), BLCKSZ);
1228 	UnlockReleaseBuffer(buffer);
1229 
1230 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1231 
1232 	if (opaque->btpo_flags & BTP_META && blocknum != BTREE_METAPAGE)
1233 		ereport(ERROR,
1234 				(errcode(ERRCODE_INDEX_CORRUPTED),
1235 				 errmsg("invalid meta page found at block %u in index \"%s\"",
1236 						blocknum, RelationGetRelationName(state->rel))));
1237 
1238 	/* Check page from block that ought to be meta page */
1239 	if (blocknum == BTREE_METAPAGE)
1240 	{
1241 		BTMetaPageData *metad = BTPageGetMeta(page);
1242 
1243 		if (!(opaque->btpo_flags & BTP_META) ||
1244 			metad->btm_magic != BTREE_MAGIC)
1245 			ereport(ERROR,
1246 					(errcode(ERRCODE_INDEX_CORRUPTED),
1247 					 errmsg("index \"%s\" meta page is corrupt",
1248 							RelationGetRelationName(state->rel))));
1249 
1250 		if (metad->btm_version != BTREE_VERSION)
1251 			ereport(ERROR,
1252 					(errcode(ERRCODE_INDEX_CORRUPTED),
1253 					 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
1254 							RelationGetRelationName(state->rel),
1255 							metad->btm_version, BTREE_VERSION)));
1256 	}
1257 
1258 	/*
1259 	 * Deleted pages have no sane "level" field, so can only check non-deleted
1260 	 * page level
1261 	 */
1262 	if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && opaque->btpo.level != 0)
1263 		ereport(ERROR,
1264 				(errcode(ERRCODE_INDEX_CORRUPTED),
1265 				 errmsg("invalid leaf page level %u for block %u in index \"%s\"",
1266 						opaque->btpo.level, blocknum, RelationGetRelationName(state->rel))));
1267 
1268 	if (blocknum != BTREE_METAPAGE && !P_ISLEAF(opaque) &&
1269 		!P_ISDELETED(opaque) && opaque->btpo.level == 0)
1270 		ereport(ERROR,
1271 				(errcode(ERRCODE_INDEX_CORRUPTED),
1272 				 errmsg("invalid internal page level 0 for block %u in index \"%s\"",
1273 						opaque->btpo.level, RelationGetRelationName(state->rel))));
1274 
1275 	if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque))
1276 		ereport(ERROR,
1277 				(errcode(ERRCODE_INDEX_CORRUPTED),
1278 				 errmsg("internal page block %u in index \"%s\" has garbage items",
1279 						blocknum, RelationGetRelationName(state->rel))));
1280 
1281 	return page;
1282 }
1283