/*-------------------------------------------------------------------------
 *
 * nbtree.c
 *	  Implementation of Lehman and Yao's btree management algorithm for
 *	  Postgres.
 *
 * NOTES
 *	  This file contains only the public interface routines.
 *
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtree.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/relscan.h"
#include "access/xlog.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"


/* Working state needed by btvacuumpage */
typedef struct
{
	IndexVacuumInfo *info;
	IndexBulkDeleteResult *stats;
	IndexBulkDeleteCallback callback;
	void	   *callback_state;
	BTCycleId	cycleid;
	BlockNumber lastBlockVacuumed;	/* highest blkno actually vacuumed */
	BlockNumber lastBlockLocked;	/* highest blkno we've cleanup-locked */
	BlockNumber totFreePages;	/* true total # of free pages */
	TransactionId oldestBtpoXact;
	MemoryContext pagedelcontext;
} BTVacState;

/*
 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
 *
 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
 * a new page; others must wait.
 *
 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
 * to a new page; some process can start doing that.
 *
 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
 * We reach this state once for every distinct combination of array keys.
 */
typedef enum
{
	BTPARALLEL_NOT_INITIALIZED,
	BTPARALLEL_ADVANCING,
	BTPARALLEL_IDLE,
	BTPARALLEL_DONE
} BTPS_State;
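
/*
 * Roughly, the transitions between these states, as driven by the parallel
 * scan support routines later in this file (error paths can also force
 * BTPARALLEL_DONE):
 *
 *	BTPARALLEL_NOT_INITIALIZED / BTPARALLEL_IDLE
 *		--- _bt_parallel_seize() ---------------> BTPARALLEL_ADVANCING
 *	BTPARALLEL_ADVANCING
 *		--- _bt_parallel_release() -------------> BTPARALLEL_IDLE
 *	BTPARALLEL_ADVANCING
 *		--- _bt_parallel_done() ----------------> BTPARALLEL_DONE
 *	BTPARALLEL_DONE
 *		--- _bt_parallel_advance_array_keys() --> BTPARALLEL_NOT_INITIALIZED
 */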

/*
 * BTParallelScanDescData contains btree specific shared information required
 * for parallel scan.
 */
typedef struct BTParallelScanDescData
{
	BlockNumber btps_scanPage;	/* latest or next page to be scanned */
	BTPS_State	btps_pageStatus;	/* indicates whether next page is
									 * available for scan. see above for
									 * possible states of parallel scan. */
	int			btps_arrayKeyCount; /* count indicating number of array scan
									 * keys processed by parallel scan */
	slock_t		btps_mutex;		/* protects above variables */
	ConditionVariable btps_cv;	/* used to synchronize parallel scan */
}			BTParallelScanDescData;

typedef struct BTParallelScanDescData *BTParallelScanDesc;


static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state,
			 BTCycleId cycleid);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
			 BlockNumber orig_blkno);


/*
 * Btree handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
bthandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	amroutine->amstrategies = BTMaxStrategyNumber;
	amroutine->amsupport = BTNProcs;
	amroutine->amcanorder = true;
	amroutine->amcanorderbyop = false;
	amroutine->amcanbackward = true;
	amroutine->amcanunique = true;
	amroutine->amcanmulticol = true;
	amroutine->amoptionalkey = true;
	amroutine->amsearcharray = true;
	amroutine->amsearchnulls = true;
	amroutine->amstorage = false;
	amroutine->amclusterable = true;
	amroutine->ampredlocks = true;
	amroutine->amcanparallel = true;
	amroutine->amcaninclude = true;
	amroutine->amkeytype = InvalidOid;

	amroutine->ambuild = btbuild;
	amroutine->ambuildempty = btbuildempty;
	amroutine->aminsert = btinsert;
	amroutine->ambulkdelete = btbulkdelete;
	amroutine->amvacuumcleanup = btvacuumcleanup;
	amroutine->amcanreturn = btcanreturn;
	amroutine->amcostestimate = btcostestimate;
	amroutine->amoptions = btoptions;
	amroutine->amproperty = btproperty;
	amroutine->amvalidate = btvalidate;
	amroutine->ambeginscan = btbeginscan;
	amroutine->amrescan = btrescan;
	amroutine->amgettuple = btgettuple;
	amroutine->amgetbitmap = btgetbitmap;
	amroutine->amendscan = btendscan;
	amroutine->ammarkpos = btmarkpos;
	amroutine->amrestrpos = btrestrpos;
	amroutine->amestimateparallelscan = btestimateparallelscan;
	amroutine->aminitparallelscan = btinitparallelscan;
	amroutine->amparallelrescan = btparallelrescan;

	PG_RETURN_POINTER(amroutine);
}

/*
 *	btbuildempty() -- build an empty btree index in the initialization fork
 */
void
btbuildempty(Relation index)
{
	Page		metapage;

	/* Construct metapage. */
	metapage = (Page) palloc(BLCKSZ);
	_bt_initmetapage(metapage, P_NONE, 0);

	/*
	 * Write the page and log it.  It might seem that an immediate sync would
	 * be sufficient to guarantee that the file exists on disk, but recovery
	 * itself might remove it while replaying, for example, an
	 * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
	 * this even when wal_level=minimal.
	 */
	PageSetChecksumInplace(metapage, BTREE_METAPAGE);
	smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
			  (char *) metapage, true);
	log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
				BTREE_METAPAGE, metapage, true);

	/*
	 * An immediate sync is required even if we xlog'd the page, because the
	 * write did not go through shared_buffers and therefore a concurrent
	 * checkpoint may have moved the redo pointer past our xlog record.
	 */
	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
}

/*
 *	btinsert() -- insert an index tuple into a btree.
 *
 *		Descend the tree recursively, find the appropriate location for our
 *		new tuple, and put it there.
 */
bool
btinsert(Relation rel, Datum *values, bool *isnull,
		 ItemPointer ht_ctid, Relation heapRel,
		 IndexUniqueCheck checkUnique,
		 IndexInfo *indexInfo)
{
	bool		result;
	IndexTuple	itup;

	/* generate an index tuple */
	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
	itup->t_tid = *ht_ctid;

	result = _bt_doinsert(rel, itup, checkUnique, heapRel);

	pfree(itup);

	return result;
}

/*
 *	btgettuple() -- Get the next tuple in the scan.
 */
bool
btgettuple(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	bool		res;

	/* btree indexes are never lossy */
	scan->xs_recheck = false;

	/*
	 * If we have any array keys, initialize them during first call for a
	 * scan.  We can't do this in btrescan because we don't know the scan
	 * direction at that time.
	 */
	if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
	{
		/* punt if we have any unsatisfiable array keys */
		if (so->numArrayKeys < 0)
			return false;

		_bt_start_array_keys(scan, dir);
	}

	/* This loop handles advancing to the next array elements, if any */
	do
	{
		/*
		 * If we've already initialized this scan, we can just advance it in
		 * the appropriate direction.  If we haven't done so yet, we call
		 * _bt_first() to get the first item in the scan.
		 */
		if (!BTScanPosIsValid(so->currPos))
			res = _bt_first(scan, dir);
		else
		{
			/*
			 * Check to see if we should kill the previously-fetched tuple.
			 */
			if (scan->kill_prior_tuple)
			{
				/*
				 * Yes, remember it for later. (We'll deal with all such
				 * tuples at once right before leaving the index page.)  The
				 * test for numKilled overrun is not just paranoia: if the
				 * caller reverses direction in the indexscan then the same
				 * item might get entered multiple times. It's not worth
				 * trying to optimize that, so we don't detect it, but instead
				 * just forget any excess entries.
				 */
				if (so->killedItems == NULL)
					so->killedItems = (int *)
						palloc(MaxIndexTuplesPerPage * sizeof(int));
				if (so->numKilled < MaxIndexTuplesPerPage)
					so->killedItems[so->numKilled++] = so->currPos.itemIndex;
			}

			/*
			 * Now continue the scan.
			 */
			res = _bt_next(scan, dir);
		}

		/* If we have a tuple, return it ... */
		if (res)
			break;
		/* ... otherwise see if we have more array keys to deal with */
	} while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));

	return res;
}

/*
 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
 */
int64
btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	int64		ntids = 0;
	ItemPointer heapTid;

	/*
	 * If we have any array keys, initialize them.
	 */
	if (so->numArrayKeys)
	{
		/* punt if we have any unsatisfiable array keys */
		if (so->numArrayKeys < 0)
			return ntids;

		_bt_start_array_keys(scan, ForwardScanDirection);
	}

	/* This loop handles advancing to the next array elements, if any */
	do
	{
		/* Fetch the first page & tuple */
		if (_bt_first(scan, ForwardScanDirection))
		{
			/* Save tuple ID, and continue scanning */
			heapTid = &scan->xs_ctup.t_self;
			tbm_add_tuples(tbm, heapTid, 1, false);
			ntids++;

			for (;;)
			{
				/*
				 * Advance to next tuple within page.  This is the same as the
				 * easy case in _bt_next().
				 */
				if (++so->currPos.itemIndex > so->currPos.lastItem)
				{
					/* let _bt_next do the heavy lifting */
					if (!_bt_next(scan, ForwardScanDirection))
						break;
				}

				/* Save tuple ID, and continue scanning */
				heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
				tbm_add_tuples(tbm, heapTid, 1, false);
				ntids++;
			}
		}
		/* Now see if we have more array keys to deal with */
	} while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));

	return ntids;
}

/*
 *	btbeginscan() -- start a scan on a btree index
 */
IndexScanDesc
btbeginscan(Relation rel, int nkeys, int norderbys)
{
	IndexScanDesc scan;
	BTScanOpaque so;

	/* no order by operators allowed */
	Assert(norderbys == 0);

	/* get the scan */
	scan = RelationGetIndexScan(rel, nkeys, norderbys);

	/* allocate private workspace */
	so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
	BTScanPosInvalidate(so->currPos);
	BTScanPosInvalidate(so->markPos);
	if (scan->numberOfKeys > 0)
		so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
	else
		so->keyData = NULL;

	so->arrayKeyData = NULL;	/* assume no array keys for now */
	so->numArrayKeys = 0;
	so->arrayKeys = NULL;
	so->arrayContext = NULL;

	so->killedItems = NULL;		/* until needed */
	so->numKilled = 0;

	/*
	 * We don't know yet whether the scan will be index-only, so we do not
	 * allocate the tuple workspace arrays until btrescan.  However, we set up
	 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
	 */
	so->currTuples = so->markTuples = NULL;

	scan->xs_itupdesc = RelationGetDescr(rel);

	scan->opaque = so;

	return scan;
}

/*
 *	btrescan() -- rescan an index relation
 */
void
btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		 ScanKey orderbys, int norderbys)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
		BTScanPosInvalidate(so->currPos);
	}

	so->markItemIndex = -1;
	so->arrayKeyCount = 0;
	BTScanPosUnpinIfPinned(so->markPos);
	BTScanPosInvalidate(so->markPos);

	/*
	 * Allocate tuple workspace arrays, if needed for an index-only scan and
	 * not already done in a previous rescan call.  To save on palloc
	 * overhead, both workspaces are allocated as one palloc block; only this
	 * function and btendscan know that.
	 *
	 * NOTE: this data structure also makes it safe to return data from a
	 * "name" column, even though btree name_ops uses an underlying storage
	 * datatype of cstring.  The risk there is that "name" is supposed to be
	 * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
	 * However, since we only return data out of tuples sitting in the
	 * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
	 * data out of the markTuples array --- running off the end of memory for
	 * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
	 * adding special-case treatment for name_ops elsewhere.
	 */
	if (scan->xs_want_itup && so->currTuples == NULL)
	{
		so->currTuples = (char *) palloc(BLCKSZ * 2);
		so->markTuples = so->currTuples + BLCKSZ;
	}

	/*
	 * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
	 * - vadim 05/05/97
	 */
	if (scankey && scan->numberOfKeys > 0)
		memmove(scan->keyData,
				scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */

	/* If any keys are SK_SEARCHARRAY type, set up array-key info */
	_bt_preprocess_array_keys(scan);
}
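
/*
 * Illustrative sketch (a comment only, not part of the btree code proper):
 * the executor drives the routines above through the IndexAmRoutine set up
 * in bthandler(), roughly in this order for a plain forward index scan.  The
 * exact call sites live in the generic index AM layer (indexam.c), not here.
 *
 *	scan = btbeginscan(rel, nkeys, norderbys);		ambeginscan
 *	btrescan(scan, scankeys, nkeys, NULL, 0);		amrescan: keys arrive here
 *	while (btgettuple(scan, ForwardScanDirection))	amgettuple
 *		... use the heap TID returned in the scan descriptor ...
 *	btendscan(scan);								amendscan
 *
 * This is why btbeginscan() only allocates workspace: the scan keys, and the
 * decision about index-only-scan tuple buffers, are not known until
 * btrescan() runs.
 */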

/*
 *	btendscan() -- close down a scan
 */
void
btendscan(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
	}

	so->markItemIndex = -1;
	BTScanPosUnpinIfPinned(so->markPos);

	/* No need to invalidate positions, the RAM is about to be freed. */

	/* Release storage */
	if (so->keyData != NULL)
		pfree(so->keyData);
	/* so->arrayKeyData and so->arrayKeys are in arrayContext */
	if (so->arrayContext != NULL)
		MemoryContextDelete(so->arrayContext);
	if (so->killedItems != NULL)
		pfree(so->killedItems);
	if (so->currTuples != NULL)
		pfree(so->currTuples);
	/* so->markTuples should not be pfree'd, see btrescan */
	pfree(so);
}

/*
 *	btmarkpos() -- save current scan position
 */
void
btmarkpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* There may be an old mark with a pin (but no lock). */
	BTScanPosUnpinIfPinned(so->markPos);

	/*
	 * Just record the current itemIndex.  If we later step to next page
	 * before releasing the marked position, _bt_steppage makes a full copy of
	 * the currPos struct in markPos.  If (as often happens) the mark is moved
	 * before we leave the page, we don't have to do that work.
	 */
	if (BTScanPosIsValid(so->currPos))
		so->markItemIndex = so->currPos.itemIndex;
	else
	{
		BTScanPosInvalidate(so->markPos);
		so->markItemIndex = -1;
	}

	/* Also record the current positions of any array keys */
	if (so->numArrayKeys)
		_bt_mark_array_keys(scan);
}

/*
 *	btrestrpos() -- restore scan to last saved position
 */
void
btrestrpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* Restore the marked positions of any array keys */
	if (so->numArrayKeys)
		_bt_restore_array_keys(scan);

	if (so->markItemIndex >= 0)
	{
		/*
		 * The scan has never moved to a new page since the last mark.  Just
		 * restore the itemIndex.
		 *
		 * NB: In this case we can't count on anything in so->markPos to be
		 * accurate.
		 */
		so->currPos.itemIndex = so->markItemIndex;
	}
	else
	{
		/*
		 * The scan moved to a new page after last mark or restore, and we are
		 * now restoring to the marked page.  We aren't holding any read
		 * locks, but if we're still holding the pin for the current position,
		 * we must drop it.
		 */
		if (BTScanPosIsValid(so->currPos))
		{
			/* Before leaving current page, deal with any killed items */
			if (so->numKilled > 0)
				_bt_killitems(scan);
			BTScanPosUnpinIfPinned(so->currPos);
		}

		if (BTScanPosIsValid(so->markPos))
		{
			/* bump pin on mark buffer for assignment to current buffer */
			if (BTScanPosIsPinned(so->markPos))
				IncrBufferRefCount(so->markPos.buf);
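			/*
			 * The memcpy below copies the fields preceding the items[] array
			 * plus items[0] (which is what offsetof(BTScanPosData, items[1])
			 * covers), and then lastItem further BTScanPosItem slots --- in
			 * total, entries items[0..lastItem].  For example, with
			 * lastItem == 2, three item slots are copied.
			 */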
			memcpy(&so->currPos, &so->markPos,
				   offsetof(BTScanPosData, items[1]) +
				   so->markPos.lastItem * sizeof(BTScanPosItem));
			if (so->currTuples)
				memcpy(so->currTuples, so->markTuples,
					   so->markPos.nextTupleOffset);
		}
		else
			BTScanPosInvalidate(so->currPos);
	}
}

/*
 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
 */
Size
btestimateparallelscan(void)
{
	return sizeof(BTParallelScanDescData);
}

/*
 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
 */
void
btinitparallelscan(void *target)
{
	BTParallelScanDesc bt_target = (BTParallelScanDesc) target;

	SpinLockInit(&bt_target->btps_mutex);
	bt_target->btps_scanPage = InvalidBlockNumber;
	bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	bt_target->btps_arrayKeyCount = 0;
	ConditionVariableInit(&bt_target->btps_cv);
}

/*
 *	btparallelrescan() -- reset parallel scan
 */
void
btparallelrescan(IndexScanDesc scan)
{
	BTParallelScanDesc btscan;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;

	Assert(parallel_scan);

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	/*
	 * In theory, we don't need to acquire the spinlock here, because there
	 * shouldn't be any other workers running at this point, but we do so for
	 * consistency.
	 */
	SpinLockAcquire(&btscan->btps_mutex);
	btscan->btps_scanPage = InvalidBlockNumber;
	btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	btscan->btps_arrayKeyCount = 0;
	SpinLockRelease(&btscan->btps_mutex);
}

/*
 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
 *		page.  Other scans must wait until we call _bt_parallel_release() or
 *		_bt_parallel_done().
 *
 * The return value is true if we successfully seized the scan and false
 * if we did not.  The latter case occurs if no pages remain for the current
 * set of scankeys.
 *
 * If the return value is true, *pageno returns the next or current page
 * of the scan (depending on the scan direction).  An invalid block number
 * means the scan hasn't yet started, and P_NONE means we've reached the end.
 * The first time a participating process reaches the last page, it will return
 * true and set *pageno to P_NONE; after that, further attempts to seize the
 * scan will return false.
 *
 * Callers should ignore the value of pageno if the return value is false.
 */
bool
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BTPS_State	pageStatus;
	bool		exit_loop = false;
	bool		status = true;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	*pageno = P_NONE;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	while (1)
	{
		SpinLockAcquire(&btscan->btps_mutex);
		pageStatus = btscan->btps_pageStatus;

		if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
		{
			/* Parallel scan has already advanced to a new set of scankeys. */
			status = false;
		}
		else if (pageStatus == BTPARALLEL_DONE)
		{
			/*
			 * We're done with this set of scankeys.  This may be the end, or
			 * there could be more sets to try.
			 */
			status = false;
		}
		else if (pageStatus != BTPARALLEL_ADVANCING)
		{
			/*
			 * We have successfully seized control of the scan for the purpose
			 * of advancing it to a new page!
			 */
			btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
			*pageno = btscan->btps_scanPage;
			exit_loop = true;
		}
		SpinLockRelease(&btscan->btps_mutex);
		if (exit_loop || !status)
			break;
		ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
	}
	ConditionVariableCancelSleep();

	return status;
}
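
/*
 * Illustrative sketch (a comment only): the way a scan participant in
 * nbtsearch.c is expected to combine _bt_parallel_seize(),
 * _bt_parallel_release() and _bt_parallel_done().  The "read the page" step
 * stands in for whatever per-page work the caller does; the real callers
 * differ in detail.
 *
 *	BlockNumber blkno;
 *
 *	if (!_bt_parallel_seize(scan, &blkno))
 *		return;							scan already finished these keys
 *	if (blkno == P_NONE)
 *	{
 *		_bt_parallel_done(scan);		no pages left; wake the others
 *		return;
 *	}
 *	if (blkno == InvalidBlockNumber)
 *		... descend from the root to find the starting leaf page ...
 *	... read the page, noting the block number to be scanned next ...
 *	_bt_parallel_release(scan, next_blkno);		let another worker advance
 */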

/*
 * _bt_parallel_release() -- Complete the process of advancing the scan to a
 *		new page.  We now have the new value btps_scanPage; some other backend
 *		can now begin advancing the scan.
 */
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
{
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	SpinLockAcquire(&btscan->btps_mutex);
	btscan->btps_scanPage = scan_page;
	btscan->btps_pageStatus = BTPARALLEL_IDLE;
	SpinLockRelease(&btscan->btps_mutex);
	ConditionVariableSignal(&btscan->btps_cv);
}

/*
 * _bt_parallel_done() -- Mark the parallel scan as complete.
 *
 * When there are no pages left to scan, this function should be called to
 * notify other workers.  Otherwise, they might wait forever for the scan to
 * advance to the next page.
 */
void
_bt_parallel_done(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;
	bool		status_changed = false;

	/* Do nothing, for non-parallel scans */
	if (parallel_scan == NULL)
		return;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	/*
	 * Mark the parallel scan as done for this combination of scan keys,
	 * unless some other process already did so.  See also
	 * _bt_advance_array_keys.
	 */
	SpinLockAcquire(&btscan->btps_mutex);
	if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
		btscan->btps_pageStatus != BTPARALLEL_DONE)
	{
		btscan->btps_pageStatus = BTPARALLEL_DONE;
		status_changed = true;
	}
	SpinLockRelease(&btscan->btps_mutex);

	/* wake up all the workers associated with this parallel scan */
	if (status_changed)
		ConditionVariableBroadcast(&btscan->btps_cv);
}

/*
 * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
 *			keys.
 *
 * Updates the count of array keys processed for both local and parallel
 * scans.
 */
void
_bt_parallel_advance_array_keys(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	so->arrayKeyCount++;
	SpinLockAcquire(&btscan->btps_mutex);
	if (btscan->btps_pageStatus == BTPARALLEL_DONE)
	{
		btscan->btps_scanPage = InvalidBlockNumber;
		btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
		btscan->btps_arrayKeyCount++;
	}
	SpinLockRelease(&btscan->btps_mutex);
}

/*
 * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
 *
 * Called by btvacuumcleanup when btbulkdelete was never called because no
 * tuples need to be deleted.
 *
 * When we return false, VACUUM can even skip the cleanup-only call to
 * btvacuumscan (i.e. there will be no btvacuumscan call for this index at
 * all).  Otherwise, a cleanup-only btvacuumscan call is required.
 */
static bool
_bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
{
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *metad;
	bool		result = false;

	metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metad = BTPageGetMeta(metapg);

	if (metad->btm_version < BTREE_VERSION)
	{
		/*
		 * Do cleanup if metapage needs upgrade, because we don't have
		 * cleanup-related meta-information yet.
		 */
		result = true;
	}
	else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
			 TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
								   RecentGlobalXmin))
	{
		/*
		 * If any oldest btpo.xact from a previously deleted page in the index
		 * is older than RecentGlobalXmin, then at least one deleted page can
		 * be recycled -- don't skip cleanup.
		 */
		result = true;
	}
	else
	{
		StdRdOptions *relopts;
		float8		cleanup_scale_factor;
		float8		prev_num_heap_tuples;

		/*
		 * If the table receives enough insertions and no cleanup was
		 * performed, the index would appear to have stale statistics.  If a
		 * scale factor is set, we avoid that by performing cleanup when the
		 * number of inserted tuples exceeds the
		 * vacuum_cleanup_index_scale_factor fraction of the original tuple
		 * count.
		 */
		relopts = (StdRdOptions *) info->index->rd_options;
		cleanup_scale_factor = (relopts &&
								relopts->vacuum_cleanup_index_scale_factor >= 0)
			? relopts->vacuum_cleanup_index_scale_factor
			: vacuum_cleanup_index_scale_factor;
		prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;

		if (cleanup_scale_factor <= 0 ||
			prev_num_heap_tuples <= 0 ||
			(info->num_heap_tuples - prev_num_heap_tuples) /
			prev_num_heap_tuples >= cleanup_scale_factor)
			result = true;
	}

	_bt_relbuf(info->index, metabuf);
	return result;
}
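
/*
 * A worked example for the threshold test above (illustration only): with
 * cleanup_scale_factor = 0.1 and prev_num_heap_tuples = 1000000, the
 * expression (num_heap_tuples - prev_num_heap_tuples) / prev_num_heap_tuples
 * reaches 0.1 once num_heap_tuples >= 1100000, i.e. cleanup is forced after
 * the table has grown by 10% since the metapage was last updated.  A scale
 * factor of zero, or a missing previous tuple count, forces cleanup
 * unconditionally.
 */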

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	BTCycleId	cycleid;

	/* allocate stats if first time through, else re-use existing struct */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

	/* Establish the vacuum cycle ID to use for this scan */
	/* The ENSURE stuff ensures we clean up shared memory on failure */
	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	{
		cycleid = _bt_start_vacuum(rel);

		btvacuumscan(info, stats, callback, callback_state, cycleid);
	}
	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	_bt_end_vacuum(rel);

	return stats;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	/* No-op in ANALYZE ONLY mode */
	if (info->analyze_only)
		return stats;

	/*
	 * If btbulkdelete was called, we need not do anything, just return the
	 * stats from the latest btbulkdelete call.  If it wasn't called, we might
	 * still need to do a pass over the index, to recycle any newly-recyclable
	 * pages or to obtain index statistics.  _bt_vacuum_needs_cleanup
	 * determines whether either is needed.
	 *
	 * Since we aren't going to actually delete any leaf items, there's no
	 * need to go through all the vacuum-cycle-ID pushups.
	 */
	if (stats == NULL)
	{
		/* Check if we need a cleanup */
		if (!_bt_vacuum_needs_cleanup(info))
			return NULL;

		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
		btvacuumscan(info, stats, NULL, NULL, 0);
	}

	/*
	 * It's quite possible for us to be fooled by concurrent page splits into
	 * double-counting some index tuples, so disbelieve any total that exceeds
	 * the underlying heap's count ... if we know that accurately.  Otherwise
	 * this might just make matters worse.
	 */
	if (!info->estimated_count)
	{
		if (stats->num_index_tuples > info->num_heap_tuples)
			stats->num_index_tuples = info->num_heap_tuples;
	}

	return stats;
}

/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
 * Note that this is also where the metadata used by _bt_vacuum_needs_cleanup
 * is maintained.
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state,
			 BTCycleId cycleid)
{
	Relation	rel = info->index;
	BTVacState	vstate;
	BlockNumber num_pages;
	BlockNumber blkno;
	bool		needLock;

	/*
	 * Reset counts that will be incremented during the scan; needed in case
	 * of multiple scans during a single VACUUM command
	 */
	stats->estimated_count = false;
	stats->num_index_tuples = 0;
	stats->pages_deleted = 0;

	/* Set up info to pass down to btvacuumpage */
	vstate.info = info;
	vstate.stats = stats;
	vstate.callback = callback;
	vstate.callback_state = callback_state;
	vstate.cycleid = cycleid;
	vstate.lastBlockVacuumed = BTREE_METAPAGE;	/* Initialise at first block */
	vstate.lastBlockLocked = BTREE_METAPAGE;
	vstate.totFreePages = 0;
	vstate.oldestBtpoXact = InvalidTransactionId;

	/* Create a temporary memory context to run _bt_pagedel in */
	vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
												  "_bt_pagedel",
												  ALLOCSET_DEFAULT_SIZES);

	/*
	 * The outer loop iterates over all index pages except the metapage, in
	 * physical order (we hope the kernel will cooperate in providing
	 * read-ahead for speed).  It is critical that we visit all leaf pages,
	 * including ones added after we start the scan, else we might fail to
	 * delete some deletable tuples.  Hence, we must repeatedly check the
	 * relation length.  We must acquire the relation-extension lock while
	 * doing so to avoid a race condition: if someone else is extending the
	 * relation, there is a window where bufmgr/smgr have created a new
	 * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
	 * we manage to scan such a page here, we'll improperly assume it can be
	 * recycled.  Taking the lock synchronizes things enough to prevent a
	 * problem: either num_pages won't include the new page, or _bt_getbuf
	 * already has write lock on the buffer and it will be fully initialized
	 * before we can examine it.  (See also vacuumlazy.c, which has the same
	 * issue.)	Also, we need not worry if a page is added immediately after
	 * we look; the page splitting code already has write-lock on the left
	 * page before it adds a right page, so we must already have processed any
	 * tuples due to be moved into such a page.
	 *
	 * We can skip locking for new or temp relations, however, since no one
	 * else could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(rel);

	blkno = BTREE_METAPAGE + 1;
	for (;;)
	{
		/* Get the current relation length */
		if (needLock)
			LockRelationForExtension(rel, ExclusiveLock);
		num_pages = RelationGetNumberOfBlocks(rel);
		if (needLock)
			UnlockRelationForExtension(rel, ExclusiveLock);

		/* Quit if we've scanned the whole relation */
		if (blkno >= num_pages)
			break;
		/* Iterate over pages, then loop back to recheck length */
		for (; blkno < num_pages; blkno++)
		{
			btvacuumpage(&vstate, blkno, blkno);
		}
	}

	/*
	 * Check to see if we need to issue one final WAL record for this index,
	 * which may be needed for correctness on a hot standby node when non-MVCC
	 * index scans could take place.
	 *
	 * If the WAL is replayed in hot standby, the replay process needs to get
	 * cleanup locks on all index leaf pages, just as we've been doing here.
	 * However, we won't issue any WAL records about pages that have no items
	 * to be deleted.  For pages between pages we've vacuumed, the replay code
	 * will take locks under the direction of the lastBlockVacuumed fields in
	 * the XLOG_BTREE_VACUUM WAL records.  To cover pages after the last one
	 * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
	 * against the last leaf page in the index, if that one wasn't vacuumed.
	 */
	if (XLogStandbyInfoActive() &&
		vstate.lastBlockVacuumed < vstate.lastBlockLocked)
	{
		Buffer		buf;

		/*
		 * The page should be valid, but we can't use _bt_getbuf() because we
		 * want to use a nondefault buffer access strategy.  Since we aren't
		 * going to delete any items, getting cleanup lock again is probably
		 * overkill, but for consistency do that anyway.
		 */
		buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
								 RBM_NORMAL, info->strategy);
		LockBufferForCleanup(buf);
		_bt_checkpage(rel, buf);
		_bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
		_bt_relbuf(rel, buf);
	}

	MemoryContextDelete(vstate.pagedelcontext);

	/*
	 * If we found any recyclable pages (and recorded them in the FSM), then
	 * forcibly update the upper-level FSM pages to ensure that searchers can
	 * find them.  It's possible that the pages were also found during
	 * previous scans and so this is a waste of time, but it's cheap enough
	 * relative to scanning the index that it shouldn't matter much, and
	 * making sure that free pages are available sooner not later seems
	 * worthwhile.
	 *
	 * Note that if no recyclable pages exist, we don't bother vacuuming the
	 * FSM at all.
	 */
	if (vstate.totFreePages > 0)
		IndexFreeSpaceMapVacuum(rel);

	/*
	 * Maintain the oldest btpo.xact and a count of the current number of heap
	 * tuples in the metapage (for the benefit of _bt_vacuum_needs_cleanup).
	 *
	 * The page with the oldest btpo.xact is typically a page deleted by this
	 * VACUUM operation, since pages deleted by a previous VACUUM operation
	 * tend to be placed in the FSM (by the current VACUUM operation) -- such
	 * pages are not candidates to be the oldest btpo.xact.  (Note that pages
	 * placed in the FSM are reported as deleted pages in the bulk delete
	 * statistics, despite not counting as deleted pages for the purposes of
	 * determining the oldest btpo.xact.)
	 */
	_bt_update_meta_cleanup_info(rel, vstate.oldestBtpoXact,
								 info->num_heap_tuples);

	/* update statistics */
	stats->num_pages = num_pages;
	stats->pages_free = vstate.totFreePages;
}

/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer btvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	bool		delete_now;
	BlockNumber recurse_to;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque = NULL;

restart:
	delete_now = false;
	recurse_to = P_NONE;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point();

	/*
	 * We can't use _bt_getbuf() here because it always applies
	 * _bt_checkpage(), which will barf on an all-zero page. We want to
	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
	 * buffer access strategy.
	 */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
							 info->strategy);
	LockBuffer(buf, BT_READ);
	page = BufferGetPage(buf);
	if (!PageIsNew(page))
	{
		_bt_checkpage(rel, buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	}

	/*
	 * If we are recursing, the only case we want to do anything with is a
	 * live leaf page having the current vacuum cycle ID.  Any other state
	 * implies we already saw the page (eg, deleted it as being empty).
	 */
	if (blkno != orig_blkno)
	{
		if (_bt_page_recyclable(page) ||
			P_IGNORE(opaque) ||
			!P_ISLEAF(opaque) ||
			opaque->btpo_cycleid != vstate->cycleid)
		{
			_bt_relbuf(rel, buf);
			return;
		}
	}

	/* Page is valid, see what to do with it */
	if (_bt_page_recyclable(page))
	{
		/* Okay to recycle this page (which could be leaf or internal) */
		RecordFreeIndexPage(rel, blkno);
		vstate->totFreePages++;
		stats->pages_deleted++;
	}
	else if (P_ISDELETED(opaque))
	{
		/*
		 * Already deleted page (which could be leaf or internal).  Can't
		 * recycle yet.
		 */
		stats->pages_deleted++;

		/* Maintain the oldest btpo.xact */
		if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
			TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
			vstate->oldestBtpoXact = opaque->btpo.xact;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/*
		 * Half-dead leaf page.  Try to delete now.  Might update
		 * oldestBtpoXact and pages_deleted below.
		 */
		delete_now = true;
	}
	else if (P_ISLEAF(opaque))
	{
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable;
		OffsetNumber offnum,
					minoff,
					maxoff;

		/*
		 * Trade in the initial read lock for a super-exclusive write lock on
		 * this page.  We must get such a lock on every leaf page over the
		 * course of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		LockBufferForCleanup(buf);

		/*
		 * Remember highest leaf page number we've taken cleanup lock on; see
		 * notes in btvacuumscan
		 */
		if (blkno > vstate->lastBlockLocked)
			vstate->lastBlockLocked = blkno;

		/*
		 * Check whether we need to recurse back to earlier pages.  What we
		 * are concerned about is a page split that happened since we started
		 * the vacuum scan.  If the split moved some tuples to a lower page
		 * then we might have missed 'em.  If so, set up for tail recursion.
		 * (Must do this before possibly clearing btpo_cycleid below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < orig_blkno)
			recurse_to = opaque->btpo_next;

		/*
		 * Scan over all items to see which ones need to be deleted according
		 * to the callback function.
		 */
		ndeletable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		if (callback)
		{
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;
				ItemPointer htup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));
				htup = &(itup->t_tid);

				/*
				 * During Hot Standby we currently assume that
				 * XLOG_BTREE_VACUUM records do not produce conflicts. That is
				 * only true as long as the callback function depends only
				 * upon whether the index tuple refers to heap tuples removed
				 * in the initial heap scan. When vacuum starts it derives a
				 * value of OldestXmin. Backends taking later snapshots could
				 * have a RecentGlobalXmin with a later xid than the vacuum's
				 * OldestXmin, so it is possible that row versions deleted
				 * after OldestXmin could be marked as killed by other
				 * backends. The callback function *could* look at the index
				 * tuple state in isolation and decide to delete the index
				 * tuple, though currently it does not. If it ever did, we
				 * would need to reconsider whether XLOG_BTREE_VACUUM records
				 * should cause conflicts. If they did cause conflicts they
				 * would be fairly harsh conflicts, since we haven't yet
				 * worked out a way to pass a useful value for
				 * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
				 * applies to *any* type of index that marks index tuples as
				 * killed.
				 */
				if (callback(htup, callback_state))
					deletable[ndeletable++] = offnum;
			}
		}

		/*
		 * Apply any needed deletes.  We issue just one _bt_delitems_vacuum()
		 * call per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0)
		{
			/*
			 * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
			 * all the information the replay code needs to get a cleanup
			 * lock on all pages between the previous lastBlockVacuumed and
			 * this page.  This ensures that WAL replay locks all leaf pages at
			 * some point, which is important should non-MVCC scans be
			 * requested. This is currently unused on standby, but we record
			 * it anyway, so that the WAL contains the required information.
			 *
			 * Since we can visit leaf pages out-of-order when recursing,
			 * replay might end up locking such pages an extra time, but it
			 * doesn't seem worth the amount of bookkeeping it'd take to avoid
			 * that.
			 */
			_bt_delitems_vacuum(rel, buf, deletable, ndeletable,
								vstate->lastBlockVacuumed);

			/*
			 * Remember highest leaf page number we've issued a
			 * XLOG_BTREE_VACUUM WAL record for.
			 */
			if (blkno > vstate->lastBlockVacuumed)
				vstate->lastBlockVacuumed = blkno;

			stats->tuples_removed += ndeletable;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);
		}
		else
		{
			/*
			 * If the leaf page has been split during this vacuum cycle, it
			 * seems worth expending a write to clear btpo_cycleid even if we
			 * don't have any deletions to do.  (If we do, _bt_delitems_vacuum
			 * takes care of this.)  This ensures we won't process the page
			 * again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				MarkBufferDirtyHint(buf, true);
			}
		}

		/*
		 * If the leaf page is now empty, try to delete it; else count the
		 * live tuples.  We don't delete when recursing, though, to avoid
		 * putting entries into freePages out-of-order (doesn't seem worth any
		 * extra code to handle the case).
		 */
		if (minoff > maxoff)
			delete_now = (blkno == orig_blkno);
		else
			stats->num_index_tuples += maxoff - minoff + 1;
	}

	if (delete_now)
	{
		MemoryContext oldcontext;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		/*
		 * We trust the _bt_pagedel return value because it does not include
		 * any page that a future call here from btvacuumscan is expected to
		 * count.  There will be no double-counting.
		 */
		stats->pages_deleted += _bt_pagedel(rel, buf, &vstate->oldestBtpoXact);

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	/*
	 * This is really tail recursion, but if the compiler is too stupid to
	 * optimize it as such, we'd eat an uncomfortably large amount of stack
	 * space per recursion level (due to the deletable[] array). A failure is
	 * improbable since the number of levels isn't likely to be large ... but
	 * just in case, let's hand-optimize into a loop.
	 */
	if (recurse_to != P_NONE)
	{
		blkno = recurse_to;
		goto restart;
	}
}

/*
 *	btcanreturn() -- Check whether btree indexes support index-only scans.
 *
 * btrees always do, so this is trivial.
 */
bool
btcanreturn(Relation index, int attno)
{
	return true;
}