1 /*-------------------------------------------------------------------------
2  *
3  * nbtree.c
4  *	  Implementation of Lehman and Yao's btree management algorithm for
5  *	  Postgres.
6  *
7  * NOTES
8  *	  This file contains only the public interface routines.
9  *
10  *
11  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * IDENTIFICATION
15  *	  src/backend/access/nbtree/nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include "access/nbtree.h"
22 #include "access/nbtxlog.h"
23 #include "access/relscan.h"
24 #include "access/xlog.h"
25 #include "commands/progress.h"
26 #include "commands/vacuum.h"
27 #include "miscadmin.h"
28 #include "nodes/execnodes.h"
29 #include "pgstat.h"
30 #include "postmaster/autovacuum.h"
31 #include "storage/condition_variable.h"
32 #include "storage/indexfsm.h"
33 #include "storage/ipc.h"
34 #include "storage/lmgr.h"
35 #include "storage/smgr.h"
36 #include "utils/builtins.h"
37 #include "utils/index_selfuncs.h"
38 #include "utils/memutils.h"
39 
40 
41 /*
42  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
43  *
44  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
45  * a new page; others must wait.
46  *
47  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
48  * to a new page; some process can start doing that.
49  *
50  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
51  * We reach this state once for every distinct combination of array keys.
52  */
53 typedef enum
54 {
55 	BTPARALLEL_NOT_INITIALIZED,
56 	BTPARALLEL_ADVANCING,
57 	BTPARALLEL_IDLE,
58 	BTPARALLEL_DONE
59 } BTPS_State;
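
/*
 * Roughly, each distinct set of array keys drives the shared state through
 * the following cycle (see _bt_parallel_seize(), _bt_parallel_release(),
 * _bt_parallel_done() and _bt_parallel_advance_array_keys() below):
 * NOT_INITIALIZED is set at init/rescan time; a backend that seizes the scan
 * moves it to ADVANCING; releasing the scan after reading a page returns it
 * to IDLE, where another backend can seize it; once no pages remain for the
 * current keys the state becomes DONE; and, if further array key sets
 * remain, advancing the array keys resets it to NOT_INITIALIZED.
 */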
60 
61 /*
62  * BTParallelScanDescData contains btree specific shared information required
63  * for parallel scan.
64  */
65 typedef struct BTParallelScanDescData
66 {
67 	BlockNumber btps_scanPage;	/* latest or next page to be scanned */
68 	BTPS_State	btps_pageStatus;	/* indicates whether next page is
69 									 * available for scan. see above for
70 									 * possible states of parallel scan. */
71 	int			btps_arrayKeyCount; /* count indicating number of array scan
72 									 * keys processed by parallel scan */
73 	slock_t		btps_mutex;		/* protects above variables */
74 	ConditionVariable btps_cv;	/* used to synchronize parallel scan */
75 }			BTParallelScanDescData;
76 
77 typedef struct BTParallelScanDescData *BTParallelScanDesc;
78 
79 
80 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
81 						 IndexBulkDeleteCallback callback, void *callback_state,
82 						 BTCycleId cycleid);
83 static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
84 static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
85 										  IndexTuple posting,
86 										  OffsetNumber updatedoffset,
87 										  int *nremaining);
88 
89 
90 /*
91  * Btree handler function: return IndexAmRoutine with access method parameters
92  * and callbacks.
93  */
94 Datum
95 bthandler(PG_FUNCTION_ARGS)
96 {
97 	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
98 
99 	amroutine->amstrategies = BTMaxStrategyNumber;
100 	amroutine->amsupport = BTNProcs;
101 	amroutine->amoptsprocnum = BTOPTIONS_PROC;
102 	amroutine->amcanorder = true;
103 	amroutine->amcanorderbyop = false;
104 	amroutine->amcanbackward = true;
105 	amroutine->amcanunique = true;
106 	amroutine->amcanmulticol = true;
107 	amroutine->amoptionalkey = true;
108 	amroutine->amsearcharray = true;
109 	amroutine->amsearchnulls = true;
110 	amroutine->amstorage = false;
111 	amroutine->amclusterable = true;
112 	amroutine->ampredlocks = true;
113 	amroutine->amcanparallel = true;
114 	amroutine->amcaninclude = true;
115 	amroutine->amusemaintenanceworkmem = false;
116 	amroutine->amparallelvacuumoptions =
117 		VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
118 	amroutine->amkeytype = InvalidOid;
119 
120 	amroutine->ambuild = btbuild;
121 	amroutine->ambuildempty = btbuildempty;
122 	amroutine->aminsert = btinsert;
123 	amroutine->ambulkdelete = btbulkdelete;
124 	amroutine->amvacuumcleanup = btvacuumcleanup;
125 	amroutine->amcanreturn = btcanreturn;
126 	amroutine->amcostestimate = btcostestimate;
127 	amroutine->amoptions = btoptions;
128 	amroutine->amproperty = btproperty;
129 	amroutine->ambuildphasename = btbuildphasename;
130 	amroutine->amvalidate = btvalidate;
131 	amroutine->amadjustmembers = btadjustmembers;
132 	amroutine->ambeginscan = btbeginscan;
133 	amroutine->amrescan = btrescan;
134 	amroutine->amgettuple = btgettuple;
135 	amroutine->amgetbitmap = btgetbitmap;
136 	amroutine->amendscan = btendscan;
137 	amroutine->ammarkpos = btmarkpos;
138 	amroutine->amrestrpos = btrestrpos;
139 	amroutine->amestimateparallelscan = btestimateparallelscan;
140 	amroutine->aminitparallelscan = btinitparallelscan;
141 	amroutine->amparallelrescan = btparallelrescan;
142 
143 	PG_RETURN_POINTER(amroutine);
144 }
145 
146 /*
147  *	btbuildempty() -- build an empty btree index in the initialization fork
148  */
149 void
150 btbuildempty(Relation index)
151 {
152 	Page		metapage;
153 
154 	/* Construct metapage. */
155 	metapage = (Page) palloc(BLCKSZ);
156 	_bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
157 
158 	/*
159 	 * Write the page and log it.  It might seem that an immediate sync would
160 	 * be sufficient to guarantee that the file exists on disk, but recovery
161 	 * itself might remove it while replaying, for example, an
162 	 * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
163 	 * this even when wal_level=minimal.
164 	 */
165 	PageSetChecksumInplace(metapage, BTREE_METAPAGE);
166 	smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
167 			  (char *) metapage, true);
168 	log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
169 				BTREE_METAPAGE, metapage, true);
170 
171 	/*
172 	 * An immediate sync is required even if we xlog'd the page, because the
173 	 * write did not go through shared_buffers and therefore a concurrent
174 	 * checkpoint may have moved the redo pointer past our xlog record.
175 	 */
176 	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
177 }
178 
179 /*
180  *	btinsert() -- insert an index tuple into a btree.
181  *
182  *		Descend the tree recursively, find the appropriate location for our
183  *		new tuple, and put it there.
184  */
185 bool
186 btinsert(Relation rel, Datum *values, bool *isnull,
187 		 ItemPointer ht_ctid, Relation heapRel,
188 		 IndexUniqueCheck checkUnique,
189 		 bool indexUnchanged,
190 		 IndexInfo *indexInfo)
191 {
192 	bool		result;
193 	IndexTuple	itup;
194 
195 	/* generate an index tuple */
196 	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
197 	itup->t_tid = *ht_ctid;
198 
199 	result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
200 
201 	pfree(itup);
202 
203 	return result;
204 }
205 
206 /*
207  *	btgettuple() -- Get the next tuple in the scan.
208  */
209 bool
210 btgettuple(IndexScanDesc scan, ScanDirection dir)
211 {
212 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
213 	bool		res;
214 
215 	/* btree indexes are never lossy */
216 	scan->xs_recheck = false;
217 
218 	/*
219 	 * If we have any array keys, initialize them during first call for a
220 	 * scan.  We can't do this in btrescan because we don't know the scan
221 	 * direction at that time.
222 	 */
223 	if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
224 	{
225 		/* punt if we have any unsatisfiable array keys */
226 		if (so->numArrayKeys < 0)
227 			return false;
228 
229 		_bt_start_array_keys(scan, dir);
230 	}
231 
232 	/* This loop handles advancing to the next array elements, if any */
233 	do
234 	{
235 		/*
236 		 * If we've already initialized this scan, we can just advance it in
237 		 * the appropriate direction.  If we haven't done so yet, we call
238 		 * _bt_first() to get the first item in the scan.
239 		 */
240 		if (!BTScanPosIsValid(so->currPos))
241 			res = _bt_first(scan, dir);
242 		else
243 		{
244 			/*
245 			 * Check to see if we should kill the previously-fetched tuple.
246 			 */
247 			if (scan->kill_prior_tuple)
248 			{
249 				/*
250 				 * Yes, remember it for later. (We'll deal with all such
251 				 * tuples at once right before leaving the index page.)  The
252 				 * test for numKilled overrun is not just paranoia: if the
253 				 * caller reverses direction in the indexscan then the same
254 				 * item might get entered multiple times. It's not worth
255 				 * trying to optimize that, so we don't detect it, but instead
256 				 * just forget any excess entries.
257 				 */
258 				if (so->killedItems == NULL)
259 					so->killedItems = (int *)
260 						palloc(MaxTIDsPerBTreePage * sizeof(int));
261 				if (so->numKilled < MaxTIDsPerBTreePage)
262 					so->killedItems[so->numKilled++] = so->currPos.itemIndex;
263 			}
264 
265 			/*
266 			 * Now continue the scan.
267 			 */
268 			res = _bt_next(scan, dir);
269 		}
270 
271 		/* If we have a tuple, return it ... */
272 		if (res)
273 			break;
274 		/* ... otherwise see if we have more array keys to deal with */
275 	} while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
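
	/*
	 * For example, a qual such as "WHERE x = ANY (ARRAY[1, 3, 5])" arrives
	 * as a single SK_SEARCHARRAY scan key; the loop above then performs a
	 * separate _bt_first()/_bt_next() traversal for x = 1, advancing (via
	 * _bt_advance_array_keys) to x = 3 and then x = 5 only once the previous
	 * element's traversal is exhausted, and returns to the caller as soon as
	 * any traversal produces a matching tuple.
	 */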
276 
277 	return res;
278 }
279 
280 /*
281  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
282  */
283 int64
284 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
285 {
286 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
287 	int64		ntids = 0;
288 	ItemPointer heapTid;
289 
290 	/*
291 	 * If we have any array keys, initialize them.
292 	 */
293 	if (so->numArrayKeys)
294 	{
295 		/* punt if we have any unsatisfiable array keys */
296 		if (so->numArrayKeys < 0)
297 			return ntids;
298 
299 		_bt_start_array_keys(scan, ForwardScanDirection);
300 	}
301 
302 	/* This loop handles advancing to the next array elements, if any */
303 	do
304 	{
305 		/* Fetch the first page & tuple */
306 		if (_bt_first(scan, ForwardScanDirection))
307 		{
308 			/* Save tuple ID, and continue scanning */
309 			heapTid = &scan->xs_heaptid;
310 			tbm_add_tuples(tbm, heapTid, 1, false);
311 			ntids++;
312 
313 			for (;;)
314 			{
315 				/*
316 				 * Advance to next tuple within page.  This is the same as the
317 				 * easy case in _bt_next().
318 				 */
319 				if (++so->currPos.itemIndex > so->currPos.lastItem)
320 				{
321 					/* let _bt_next do the heavy lifting */
322 					if (!_bt_next(scan, ForwardScanDirection))
323 						break;
324 				}
325 
326 				/* Save tuple ID, and continue scanning */
327 				heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
328 				tbm_add_tuples(tbm, heapTid, 1, false);
329 				ntids++;
330 			}
331 		}
332 		/* Now see if we have more array keys to deal with */
333 	} while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
334 
335 	return ntids;
336 }
337 
338 /*
339  *	btbeginscan() -- start a scan on a btree index
340  */
341 IndexScanDesc
342 btbeginscan(Relation rel, int nkeys, int norderbys)
343 {
344 	IndexScanDesc scan;
345 	BTScanOpaque so;
346 
347 	/* no order by operators allowed */
348 	Assert(norderbys == 0);
349 
350 	/* get the scan */
351 	scan = RelationGetIndexScan(rel, nkeys, norderbys);
352 
353 	/* allocate private workspace */
354 	so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
355 	BTScanPosInvalidate(so->currPos);
356 	BTScanPosInvalidate(so->markPos);
357 	if (scan->numberOfKeys > 0)
358 		so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
359 	else
360 		so->keyData = NULL;
361 
362 	so->arrayKeyData = NULL;	/* assume no array keys for now */
363 	so->numArrayKeys = 0;
364 	so->arrayKeys = NULL;
365 	so->arrayContext = NULL;
366 
367 	so->killedItems = NULL;		/* until needed */
368 	so->numKilled = 0;
369 
370 	/*
371 	 * We don't know yet whether the scan will be index-only, so we do not
372 	 * allocate the tuple workspace arrays until btrescan.  However, we set up
373 	 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
374 	 */
375 	so->currTuples = so->markTuples = NULL;
376 
377 	scan->xs_itupdesc = RelationGetDescr(rel);
378 
379 	scan->opaque = so;
380 
381 	return scan;
382 }
383 
384 /*
385  *	btrescan() -- rescan an index relation
386  */
387 void
388 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
389 		 ScanKey orderbys, int norderbys)
390 {
391 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
392 
393 	/* we aren't holding any read locks, but gotta drop the pins */
394 	if (BTScanPosIsValid(so->currPos))
395 	{
396 		/* Before leaving current page, deal with any killed items */
397 		if (so->numKilled > 0)
398 			_bt_killitems(scan);
399 		BTScanPosUnpinIfPinned(so->currPos);
400 		BTScanPosInvalidate(so->currPos);
401 	}
402 
403 	so->markItemIndex = -1;
404 	so->arrayKeyCount = 0;
405 	BTScanPosUnpinIfPinned(so->markPos);
406 	BTScanPosInvalidate(so->markPos);
407 
408 	/*
409 	 * Allocate tuple workspace arrays, if needed for an index-only scan and
410 	 * not already done in a previous rescan call.  To save on palloc
411 	 * overhead, both workspaces are allocated as one palloc block; only this
412 	 * function and btendscan know that.
413 	 *
414 	 * NOTE: this data structure also makes it safe to return data from a
415 	 * "name" column, even though btree name_ops uses an underlying storage
416 	 * datatype of cstring.  The risk there is that "name" is supposed to be
417 	 * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
418 	 * However, since we only return data out of tuples sitting in the
419 	 * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
420 	 * data out of the markTuples array --- running off the end of memory for
421 	 * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
422 	 * adding special-case treatment for name_ops elsewhere.
423 	 */
424 	if (scan->xs_want_itup && so->currTuples == NULL)
425 	{
426 		so->currTuples = (char *) palloc(BLCKSZ * 2);
427 		so->markTuples = so->currTuples + BLCKSZ;
428 	}
429 
430 	/*
431 	 * Reset the scan keys
432 	 */
433 	if (scankey && scan->numberOfKeys > 0)
434 		memmove(scan->keyData,
435 				scankey,
436 				scan->numberOfKeys * sizeof(ScanKeyData));
437 	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */
438 
439 	/* If any keys are SK_SEARCHARRAY type, set up array-key info */
440 	_bt_preprocess_array_keys(scan);
441 }
442 
443 /*
444  *	btendscan() -- close down a scan
445  */
446 void
447 btendscan(IndexScanDesc scan)
448 {
449 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
450 
451 	/* we aren't holding any read locks, but gotta drop the pins */
452 	if (BTScanPosIsValid(so->currPos))
453 	{
454 		/* Before leaving current page, deal with any killed items */
455 		if (so->numKilled > 0)
456 			_bt_killitems(scan);
457 		BTScanPosUnpinIfPinned(so->currPos);
458 	}
459 
460 	so->markItemIndex = -1;
461 	BTScanPosUnpinIfPinned(so->markPos);
462 
463 	/* No need to invalidate positions, the RAM is about to be freed. */
464 
465 	/* Release storage */
466 	if (so->keyData != NULL)
467 		pfree(so->keyData);
468 	/* so->arrayKeyData and so->arrayKeys are in arrayContext */
469 	if (so->arrayContext != NULL)
470 		MemoryContextDelete(so->arrayContext);
471 	if (so->killedItems != NULL)
472 		pfree(so->killedItems);
473 	if (so->currTuples != NULL)
474 		pfree(so->currTuples);
475 	/* so->markTuples should not be pfree'd, see btrescan */
476 	pfree(so);
477 }
478 
479 /*
480  *	btmarkpos() -- save current scan position
481  */
482 void
483 btmarkpos(IndexScanDesc scan)
484 {
485 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
486 
487 	/* There may be an old mark with a pin (but no lock). */
488 	BTScanPosUnpinIfPinned(so->markPos);
489 
490 	/*
491 	 * Just record the current itemIndex.  If we later step to next page
492 	 * before releasing the marked position, _bt_steppage makes a full copy of
493 	 * the currPos struct in markPos.  If (as often happens) the mark is moved
494 	 * before we leave the page, we don't have to do that work.
495 	 */
496 	if (BTScanPosIsValid(so->currPos))
497 		so->markItemIndex = so->currPos.itemIndex;
498 	else
499 	{
500 		BTScanPosInvalidate(so->markPos);
501 		so->markItemIndex = -1;
502 	}
503 
504 	/* Also record the current positions of any array keys */
505 	if (so->numArrayKeys)
506 		_bt_mark_array_keys(scan);
507 }
508 
509 /*
510  *	btrestrpos() -- restore scan to last saved position
511  */
512 void
513 btrestrpos(IndexScanDesc scan)
514 {
515 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
516 
517 	/* Restore the marked positions of any array keys */
518 	if (so->numArrayKeys)
519 		_bt_restore_array_keys(scan);
520 
521 	if (so->markItemIndex >= 0)
522 	{
523 		/*
524 		 * The scan has never moved to a new page since the last mark.  Just
525 		 * restore the itemIndex.
526 		 *
527 		 * NB: In this case we can't count on anything in so->markPos to be
528 		 * accurate.
529 		 */
530 		so->currPos.itemIndex = so->markItemIndex;
531 	}
532 	else
533 	{
534 		/*
535 		 * The scan moved to a new page after last mark or restore, and we are
536 		 * now restoring to the marked page.  We aren't holding any read
537 		 * locks, but if we're still holding the pin for the current position,
538 		 * we must drop it.
539 		 */
540 		if (BTScanPosIsValid(so->currPos))
541 		{
542 			/* Before leaving current page, deal with any killed items */
543 			if (so->numKilled > 0)
544 				_bt_killitems(scan);
545 			BTScanPosUnpinIfPinned(so->currPos);
546 		}
547 
548 		if (BTScanPosIsValid(so->markPos))
549 		{
550 			/* bump pin on mark buffer for assignment to current buffer */
551 			if (BTScanPosIsPinned(so->markPos))
552 				IncrBufferRefCount(so->markPos.buf);
553 			memcpy(&so->currPos, &so->markPos,
554 				   offsetof(BTScanPosData, items[1]) +
555 				   so->markPos.lastItem * sizeof(BTScanPosItem));
556 			if (so->currTuples)
557 				memcpy(so->currTuples, so->markTuples,
558 					   so->markPos.nextTupleOffset);
559 		}
560 		else
561 			BTScanPosInvalidate(so->currPos);
562 	}
563 }
564 
565 /*
566  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
567  */
568 Size
569 btestimateparallelscan(void)
570 {
571 	return sizeof(BTParallelScanDescData);
572 }
573 
574 /*
575  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
576  */
577 void
578 btinitparallelscan(void *target)
579 {
580 	BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
581 
582 	SpinLockInit(&bt_target->btps_mutex);
583 	bt_target->btps_scanPage = InvalidBlockNumber;
584 	bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
585 	bt_target->btps_arrayKeyCount = 0;
586 	ConditionVariableInit(&bt_target->btps_cv);
587 }
588 
589 /*
590  *	btparallelrescan() -- reset parallel scan
591  */
592 void
593 btparallelrescan(IndexScanDesc scan)
594 {
595 	BTParallelScanDesc btscan;
596 	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
597 
598 	Assert(parallel_scan);
599 
600 	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
601 												  parallel_scan->ps_offset);
602 
603 	/*
604 	 * In theory, we don't need to acquire the spinlock here, because there
605 	 * shouldn't be any other workers running at this point, but we do so for
606 	 * consistency.
607 	 */
608 	SpinLockAcquire(&btscan->btps_mutex);
609 	btscan->btps_scanPage = InvalidBlockNumber;
610 	btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
611 	btscan->btps_arrayKeyCount = 0;
612 	SpinLockRelease(&btscan->btps_mutex);
613 }
614 
615 /*
616  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
617  *		page.  Other scans must wait until we call _bt_parallel_release()
618  *		or _bt_parallel_done().
619  *
620  * The return value is true if we successfully seized the scan and false
621  * if we did not.  The latter case occurs if no pages remain for the current
622  * set of scankeys.
623  *
624  * If the return value is true, *pageno returns the next or current page
625  * of the scan (depending on the scan direction).  An invalid block number
626  * means the scan hasn't yet started, and P_NONE means we've reached the end.
627  * The first time a participating process reaches the last page, it will return
628  * true and set *pageno to P_NONE; after that, further attempts to seize the
629  * scan will return false.
630  *
631  * Callers should ignore the value of pageno if the return value is false.
632  */
633 bool
634 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
635 {
636 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
637 	BTPS_State	pageStatus;
638 	bool		exit_loop = false;
639 	bool		status = true;
640 	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
641 	BTParallelScanDesc btscan;
642 
643 	*pageno = P_NONE;
644 
645 	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
646 												  parallel_scan->ps_offset);
647 
648 	while (1)
649 	{
650 		SpinLockAcquire(&btscan->btps_mutex);
651 		pageStatus = btscan->btps_pageStatus;
652 
653 		if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
654 		{
655 			/* Parallel scan has already advanced to a new set of scankeys. */
656 			status = false;
657 		}
658 		else if (pageStatus == BTPARALLEL_DONE)
659 		{
660 			/*
661 			 * We're done with this set of scankeys.  This may be the end, or
662 			 * there could be more sets to try.
663 			 */
664 			status = false;
665 		}
666 		else if (pageStatus != BTPARALLEL_ADVANCING)
667 		{
668 			/*
669 			 * We have successfully seized control of the scan for the purpose
670 			 * of advancing it to a new page!
671 			 */
672 			btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
673 			*pageno = btscan->btps_scanPage;
674 			exit_loop = true;
675 		}
676 		SpinLockRelease(&btscan->btps_mutex);
677 		if (exit_loop || !status)
678 			break;
679 		ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
680 	}
681 	ConditionVariableCancelSleep();
682 
683 	return status;
684 }
685 
686 /*
687  * _bt_parallel_release() -- Complete the process of advancing the scan to a
688  *		new page.  We now have the new value btps_scanPage; some other backend
689  *		can now begin advancing the scan.
690  */
691 void
692 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
693 {
694 	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
695 	BTParallelScanDesc btscan;
696 
697 	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
698 												  parallel_scan->ps_offset);
699 
700 	SpinLockAcquire(&btscan->btps_mutex);
701 	btscan->btps_scanPage = scan_page;
702 	btscan->btps_pageStatus = BTPARALLEL_IDLE;
703 	SpinLockRelease(&btscan->btps_mutex);
704 	ConditionVariableSignal(&btscan->btps_cv);
705 }
706 
707 /*
708  * _bt_parallel_done() -- Mark the parallel scan as complete.
709  *
710  * When there are no pages left to scan, this function should be called to
711  * notify other workers.  Otherwise, they might wait forever for the scan to
712  * advance to the next page.
713  */
714 void
715 _bt_parallel_done(IndexScanDesc scan)
716 {
717 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
718 	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
719 	BTParallelScanDesc btscan;
720 	bool		status_changed = false;
721 
722 	/* Do nothing, for non-parallel scans */
723 	if (parallel_scan == NULL)
724 		return;
725 
726 	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
727 												  parallel_scan->ps_offset);
728 
729 	/*
730 	 * Mark the parallel scan as done for this combination of scan keys,
731 	 * unless some other process already did so.  See also
732 	 * _bt_advance_array_keys.
733 	 */
734 	SpinLockAcquire(&btscan->btps_mutex);
735 	if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
736 		btscan->btps_pageStatus != BTPARALLEL_DONE)
737 	{
738 		btscan->btps_pageStatus = BTPARALLEL_DONE;
739 		status_changed = true;
740 	}
741 	SpinLockRelease(&btscan->btps_mutex);
742 
743 	/* wake up all the workers associated with this parallel scan */
744 	if (status_changed)
745 		ConditionVariableBroadcast(&btscan->btps_cv);
746 }
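
/*
 * Taken together, a participating backend's use of the three routines above
 * is roughly the following (error handling and array-key bookkeeping elided;
 * the real callers live in nbtsearch.c):
 *
 *		if (_bt_parallel_seize(scan, &blkno))
 *		{
 *			if (blkno == P_NONE)
 *				_bt_parallel_done(scan);
 *			else
 *			{
 *				... read page blkno, noting which block to visit next ...
 *				_bt_parallel_release(scan, next_blkno);
 *			}
 *		}
 *
 * where next_blkno stands for whichever block the caller decides to read
 * next.
 */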
747 
748 /*
749  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
750  *			keys.
751  *
752  * Updates the count of array keys processed for both local and parallel
753  * scans.
754  */
755 void
756 _bt_parallel_advance_array_keys(IndexScanDesc scan)
757 {
758 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
759 	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
760 	BTParallelScanDesc btscan;
761 
762 	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
763 												  parallel_scan->ps_offset);
764 
765 	so->arrayKeyCount++;
766 	SpinLockAcquire(&btscan->btps_mutex);
767 	if (btscan->btps_pageStatus == BTPARALLEL_DONE)
768 	{
769 		btscan->btps_scanPage = InvalidBlockNumber;
770 		btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
771 		btscan->btps_arrayKeyCount++;
772 	}
773 	SpinLockRelease(&btscan->btps_mutex);
774 }
775 
776 /*
777  * Bulk deletion of all index entries pointing to a set of heap tuples.
778  * The set of target tuples is specified via a callback routine that tells
779  * whether any given heap tuple (identified by ItemPointer) is being deleted.
780  *
781  * Result: a palloc'd struct containing statistical info for VACUUM displays.
782  */
783 IndexBulkDeleteResult *
784 btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
785 			 IndexBulkDeleteCallback callback, void *callback_state)
786 {
787 	Relation	rel = info->index;
788 	BTCycleId	cycleid;
789 
790 	/* allocate stats if first time through, else re-use existing struct */
791 	if (stats == NULL)
792 		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
793 
794 	/* Establish the vacuum cycle ID to use for this scan */
795 	/* The ENSURE stuff ensures we clean up shared memory on failure */
796 	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
797 	{
798 		cycleid = _bt_start_vacuum(rel);
799 
800 		btvacuumscan(info, stats, callback, callback_state, cycleid);
801 	}
802 	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
803 	_bt_end_vacuum(rel);
804 
805 	return stats;
806 }
807 
808 /*
809  * Post-VACUUM cleanup.
810  *
811  * Result: a palloc'd struct containing statistical info for VACUUM displays.
812  */
813 IndexBulkDeleteResult *
814 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
815 {
816 	BlockNumber num_delpages;
817 
818 	/* No-op in ANALYZE ONLY mode */
819 	if (info->analyze_only)
820 		return stats;
821 
822 	/*
823 	 * If btbulkdelete was called, we need not do anything (we just maintain
824 	 * the information used within _bt_vacuum_needs_cleanup() by calling
825 	 * _bt_set_cleanup_info() below).
826 	 *
827 	 * If btbulkdelete was _not_ called, then we have a choice to make: we
828 	 * must decide whether or not a btvacuumscan() call is needed now (i.e.
829 	 * whether the ongoing VACUUM operation can entirely avoid a physical scan
830 	 * of the index).  A call to _bt_vacuum_needs_cleanup() decides it for us
831 	 * now.
832 	 */
833 	if (stats == NULL)
834 	{
835 		/* Check if VACUUM operation can entirely avoid btvacuumscan() call */
836 		if (!_bt_vacuum_needs_cleanup(info->index))
837 			return NULL;
838 
839 		/*
840 		 * Since we aren't going to actually delete any leaf items, there's no
841 		 * need to go through all the vacuum-cycle-ID pushups here.
842 		 *
843 		 * Posting list tuples are a source of inaccuracy for cleanup-only
844 		 * scans.  btvacuumscan() will assume that the number of index tuples
845 		 * from each page can be used as num_index_tuples, even though
846 		 * num_index_tuples is supposed to represent the number of TIDs in the
847 		 * index.  This naive approach can underestimate the number of tuples
848 		 * in the index significantly.
849 		 *
850 		 * We handle the problem by making num_index_tuples an estimate in
851 		 * cleanup-only case.
852 		 */
853 		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
854 		btvacuumscan(info, stats, NULL, NULL, 0);
855 		stats->estimated_count = true;
856 	}
857 
858 	/*
859 	 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
860 	 *
861 	 * num_delpages is the number of deleted pages now in the index that were
862 	 * not safe to place in the FSM to be recycled just yet.  num_delpages is
863 	 * greater than 0 only when _bt_pagedel() actually deleted pages during
864 	 * our call to btvacuumscan().  Even then, _bt_pendingfsm_finalize() must
865 	 * have failed to place any newly deleted pages in the FSM just moments
866 	 * ago.  (Actually, there are edge cases where recycling of the current
867 	 * VACUUM's newly deleted pages does not even become safe by the time the
868 	 * next VACUUM comes around.  See nbtree/README.)
869 	 */
870 	Assert(stats->pages_deleted >= stats->pages_free);
871 	num_delpages = stats->pages_deleted - stats->pages_free;
872 	_bt_set_cleanup_info(info->index, num_delpages);
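
	/*
	 * As a concrete example: if the scan encountered 10 deleted pages, 7 of
	 * which were already safe to recycle (and so were counted in pages_free
	 * as well as pages_deleted), then num_delpages is 3, and a later call to
	 * _bt_vacuum_needs_cleanup() weighs those 3 still-unrecyclable pages
	 * when deciding whether the next VACUUM needs a full btvacuumscan()
	 * pass.
	 */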
873 
874 	/*
875 	 * It's quite possible for us to be fooled by concurrent page splits into
876 	 * double-counting some index tuples, so disbelieve any total that exceeds
877 	 * the underlying heap's count ... if we know that accurately.  Otherwise
878 	 * this might just make matters worse.
879 	 */
880 	if (!info->estimated_count)
881 	{
882 		if (stats->num_index_tuples > info->num_heap_tuples)
883 			stats->num_index_tuples = info->num_heap_tuples;
884 	}
885 
886 	return stats;
887 }
888 
889 /*
890  * btvacuumscan --- scan the index for VACUUMing purposes
891  *
892  * This combines the functions of looking for leaf tuples that are deletable
893  * according to the vacuum callback, looking for empty pages that can be
894  * deleted, and looking for old deleted pages that can be recycled.  Both
895  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
896  * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
897  *
898  * The caller is responsible for initially allocating/zeroing a stats struct
899  * and for obtaining a vacuum cycle ID if necessary.
900  */
901 static void
902 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
903 			 IndexBulkDeleteCallback callback, void *callback_state,
904 			 BTCycleId cycleid)
905 {
906 	Relation	rel = info->index;
907 	BTVacState	vstate;
908 	BlockNumber num_pages;
909 	BlockNumber scanblkno;
910 	bool		needLock;
911 
912 	/*
913 	 * Reset fields that track information about the entire index now.  This
914 	 * avoids double-counting in the case where a single VACUUM command
915 	 * requires multiple scans of the index.
916 	 *
917 	 * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
918 	 * since they track information about the VACUUM command, and so must last
919 	 * across each call to btvacuumscan().
920 	 *
921 	 * (Note that pages_free is treated as state about the whole index, not
922 	 * the current VACUUM.  This is appropriate because RecordFreeIndexPage()
923 	 * calls are idempotent, and get repeated for the same deleted pages in
924 	 * some scenarios.  The point for us is to track the number of recyclable
925 	 * pages in the index at the end of the VACUUM command.)
926 	 */
927 	stats->num_pages = 0;
928 	stats->num_index_tuples = 0;
929 	stats->pages_deleted = 0;
930 	stats->pages_free = 0;
931 
932 	/* Set up info to pass down to btvacuumpage */
933 	vstate.info = info;
934 	vstate.stats = stats;
935 	vstate.callback = callback;
936 	vstate.callback_state = callback_state;
937 	vstate.cycleid = cycleid;
938 
939 	/* Create a temporary memory context to run _bt_pagedel in */
940 	vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
941 												  "_bt_pagedel",
942 												  ALLOCSET_DEFAULT_SIZES);
943 
944 	/* Initialize vstate fields used by _bt_pendingfsm_finalize */
945 	vstate.bufsize = 0;
946 	vstate.maxbufsize = 0;
947 	vstate.pendingpages = NULL;
948 	vstate.npendingpages = 0;
949 	/* Consider applying _bt_pendingfsm_finalize optimization */
950 	_bt_pendingfsm_init(rel, &vstate, (callback == NULL));
951 
952 	/*
953 	 * The outer loop iterates over all index pages except the metapage, in
954 	 * physical order (we hope the kernel will cooperate in providing
955 	 * read-ahead for speed).  It is critical that we visit all leaf pages,
956 	 * including ones added after we start the scan, else we might fail to
957 	 * delete some deletable tuples.  Hence, we must repeatedly check the
958 	 * relation length.  We must acquire the relation-extension lock while
959 	 * doing so to avoid a race condition: if someone else is extending the
960 	 * relation, there is a window where bufmgr/smgr have created a new
961 	 * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
962 	 * we manage to scan such a page here, we'll improperly assume it can be
963 	 * recycled.  Taking the lock synchronizes things enough to prevent a
964 	 * problem: either num_pages won't include the new page, or _bt_getbuf
965 	 * already has write lock on the buffer and it will be fully initialized
966 	 * before we can examine it.  (See also vacuumlazy.c, which has the same
967 	 * issue.)	Also, we need not worry if a page is added immediately after
968 	 * we look; the page splitting code already has write-lock on the left
969 	 * page before it adds a right page, so we must already have processed any
970 	 * tuples due to be moved into such a page.
971 	 *
972 	 * We can skip locking for new or temp relations, however, since no one
973 	 * else could be accessing them.
974 	 */
975 	needLock = !RELATION_IS_LOCAL(rel);
976 
977 	scanblkno = BTREE_METAPAGE + 1;
978 	for (;;)
979 	{
980 		/* Get the current relation length */
981 		if (needLock)
982 			LockRelationForExtension(rel, ExclusiveLock);
983 		num_pages = RelationGetNumberOfBlocks(rel);
984 		if (needLock)
985 			UnlockRelationForExtension(rel, ExclusiveLock);
986 
987 		if (info->report_progress)
988 			pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
989 										 num_pages);
990 
991 		/* Quit if we've scanned the whole relation */
992 		if (scanblkno >= num_pages)
993 			break;
994 		/* Iterate over pages, then loop back to recheck length */
995 		for (; scanblkno < num_pages; scanblkno++)
996 		{
997 			btvacuumpage(&vstate, scanblkno);
998 			if (info->report_progress)
999 				pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1000 											 scanblkno);
1001 		}
1002 	}
1003 
1004 	/* Set statistics num_pages field to final size of index */
1005 	stats->num_pages = num_pages;
1006 
1007 	MemoryContextDelete(vstate.pagedelcontext);
1008 
1009 	/*
1010 	 * If there were any calls to _bt_pagedel() during scan of the index then
1011 	 * see if any of the resulting pages can be placed in the FSM now.  When
1012 	 * it's not safe we'll have to leave it up to a future VACUUM operation.
1013 	 *
1014 	 * Finally, if we placed any pages in the FSM (either just now or during
1015 	 * the scan), forcibly update the upper-level FSM pages to ensure that
1016 	 * searchers can find them.
1017 	 */
1018 	_bt_pendingfsm_finalize(rel, &vstate);
1019 	if (stats->pages_free > 0)
1020 		IndexFreeSpaceMapVacuum(rel);
1021 }
1022 
1023 /*
1024  * btvacuumpage --- VACUUM one page
1025  *
1026  * This processes a single page for btvacuumscan().  In some cases we must
1027  * backtrack to re-examine and VACUUM pages that were the scanblkno during
1028  * a previous call here.  This is how we handle page splits (that happened
1029  * after our cycleid was acquired) whose right half page happened to reuse
1030  * a block that we might have processed at some point before it was
1031  * recycled (i.e. before the page split).
1032  */
1033 static void
1034 btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1035 {
1036 	IndexVacuumInfo *info = vstate->info;
1037 	IndexBulkDeleteResult *stats = vstate->stats;
1038 	IndexBulkDeleteCallback callback = vstate->callback;
1039 	void	   *callback_state = vstate->callback_state;
1040 	Relation	rel = info->index;
1041 	bool		attempt_pagedel;
1042 	BlockNumber blkno,
1043 				backtrack_to;
1044 	Buffer		buf;
1045 	Page		page;
1046 	BTPageOpaque opaque;
1047 
1048 	blkno = scanblkno;
1049 
1050 backtrack:
1051 
1052 	attempt_pagedel = false;
1053 	backtrack_to = P_NONE;
1054 
1055 	/* call vacuum_delay_point while not holding any buffer lock */
1056 	vacuum_delay_point();
1057 
1058 	/*
1059 	 * We can't use _bt_getbuf() here because it always applies
1060 	 * _bt_checkpage(), which will barf on an all-zero page. We want to
1061 	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
1062 	 * buffer access strategy.
1063 	 */
1064 	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1065 							 info->strategy);
1066 	_bt_lockbuf(rel, buf, BT_READ);
1067 	page = BufferGetPage(buf);
1068 	opaque = NULL;
1069 	if (!PageIsNew(page))
1070 	{
1071 		_bt_checkpage(rel, buf);
1072 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1073 	}
1074 
1075 	Assert(blkno <= scanblkno);
1076 	if (blkno != scanblkno)
1077 	{
1078 		/*
1079 		 * We're backtracking.
1080 		 *
1081 		 * We followed a right link to a sibling leaf page (a page that
1082 		 * happens to be from a block located before scanblkno).  The only
1083 		 * case we want to do anything with is a live leaf page having the
1084 		 * current vacuum cycle ID.
1085 		 *
1086 		 * The page had better be in a state that's consistent with what we
1087 		 * expect.  Check for conditions that imply corruption in passing.  It
1088 		 * can't be half-dead because only an interrupted VACUUM process can
1089 		 * leave pages in that state, so we'd definitely have dealt with it
1090 		 * back when the page was the scanblkno page (half-dead pages are
1091 		 * always marked fully deleted by _bt_pagedel()).  This assumes that
1092 		 * there can be only one vacuum process running at a time.
1093 		 */
1094 		if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1095 		{
1096 			Assert(false);
1097 			ereport(LOG,
1098 					(errcode(ERRCODE_INDEX_CORRUPTED),
1099 					 errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1100 									 blkno, scanblkno, RelationGetRelationName(rel))));
1101 			_bt_relbuf(rel, buf);
1102 			return;
1103 		}
1104 
1105 		/*
1106 		 * We may have already processed the page in an earlier call, when the
1107 		 * page was scanblkno.  This happens when the leaf page split occurred
1108 		 * after the scan began, but before the right sibling page became the
1109 		 * scanblkno.
1110 		 *
1111 		 * Page may also have been deleted by current btvacuumpage() call,
1112 		 * since _bt_pagedel() sometimes deletes the right sibling page of
1113 		 * scanblkno in passing (it does so after we decided where to
1114 		 * backtrack to).  We don't need to process this page as a deleted
1115 		 * page a second time now (in fact, it would be wrong to count it as a
1116 		 * deleted page in the bulk delete statistics a second time).
1117 		 */
1118 		if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1119 		{
1120 			/* Done with current scanblkno (and all lower split pages) */
1121 			_bt_relbuf(rel, buf);
1122 			return;
1123 		}
1124 	}
1125 
1126 	if (!opaque || BTPageIsRecyclable(page))
1127 	{
1128 		/* Okay to recycle this page (which could be leaf or internal) */
1129 		RecordFreeIndexPage(rel, blkno);
1130 		stats->pages_deleted++;
1131 		stats->pages_free++;
1132 	}
1133 	else if (P_ISDELETED(opaque))
1134 	{
1135 		/*
1136 		 * Already deleted page (which could be leaf or internal).  Can't
1137 		 * recycle yet.
1138 		 */
1139 		stats->pages_deleted++;
1140 	}
1141 	else if (P_ISHALFDEAD(opaque))
1142 	{
1143 		/* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1144 		attempt_pagedel = true;
1145 
1146 		/*
1147 		 * _bt_pagedel() will increment both pages_newly_deleted and
1148 		 * pages_deleted stats in all cases (barring corruption)
1149 		 */
1150 	}
1151 	else if (P_ISLEAF(opaque))
1152 	{
1153 		OffsetNumber deletable[MaxIndexTuplesPerPage];
1154 		int			ndeletable;
1155 		BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1156 		int			nupdatable;
1157 		OffsetNumber offnum,
1158 					minoff,
1159 					maxoff;
1160 		int			nhtidsdead,
1161 					nhtidslive;
1162 
1163 		/*
1164 		 * Trade in the initial read lock for a super-exclusive write lock on
1165 		 * this page.  We must get such a lock on every leaf page over the
1166 		 * course of the vacuum scan, whether or not it actually contains any
1167 		 * deletable tuples --- see nbtree/README.
1168 		 */
1169 		_bt_upgradelockbufcleanup(rel, buf);
1170 
1171 		/*
1172 		 * Check whether we need to backtrack to earlier pages.  What we are
1173 		 * concerned about is a page split that happened since we started the
1174 		 * vacuum scan.  If the split moved tuples on the right half of the
1175 		 * split (i.e. the tuples that sort high) to a block that we already
1176 		 * passed over, then we might have missed the tuples.  We need to
1177 		 * backtrack now.  (Must do this before possibly clearing btpo_cycleid
1178 		 * or deleting scanblkno page below!)
1179 		 */
1180 		if (vstate->cycleid != 0 &&
1181 			opaque->btpo_cycleid == vstate->cycleid &&
1182 			!(opaque->btpo_flags & BTP_SPLIT_END) &&
1183 			!P_RIGHTMOST(opaque) &&
1184 			opaque->btpo_next < scanblkno)
1185 			backtrack_to = opaque->btpo_next;
1186 
1187 		/*
1188 		 * When each VACUUM begins, it determines an OldestXmin cutoff value.
1189 		 * Tuples before the cutoff are removed by VACUUM.  Scan over all
1190 		 * items to see which ones need to be deleted according to cutoff
1191 		 * point using callback.
1192 		 */
1193 		ndeletable = 0;
1194 		nupdatable = 0;
1195 		minoff = P_FIRSTDATAKEY(opaque);
1196 		maxoff = PageGetMaxOffsetNumber(page);
1197 		nhtidsdead = 0;
1198 		nhtidslive = 0;
1199 		if (callback)
1200 		{
1201 			for (offnum = minoff;
1202 				 offnum <= maxoff;
1203 				 offnum = OffsetNumberNext(offnum))
1204 			{
1205 				IndexTuple	itup;
1206 
1207 				itup = (IndexTuple) PageGetItem(page,
1208 												PageGetItemId(page, offnum));
1209 
1210 				/*
1211 				 * Hot Standby assumes that it's okay that XLOG_BTREE_VACUUM
1212 				 * records do not produce their own conflicts.  This is safe
1213 				 * as long as the callback function only considers whether the
1214 				 * index tuple refers to pre-cutoff heap tuples that were
1215 				 * certainly already pruned away during VACUUM's initial heap
1216 				 * scan by the time we get here. (heapam's XLOG_HEAP2_PRUNE
1217 				 * records produce conflicts using a latestRemovedXid value
1218 				 * for the pointed-to heap tuples, so there is no need to
1219 				 * produce our own conflict now.)
1220 				 *
1221 				 * Backends with snapshots acquired after a VACUUM starts but
1222 				 * before it finishes could have visibility cutoff with a
1223 				 * later xid than VACUUM's OldestXmin cutoff.  These backends
1224 				 * might happen to opportunistically mark some index tuples
1225 				 * LP_DEAD before we reach them, even though they may be after
1226 				 * our cutoff.  We don't try to kill these "extra" index
1227 				 * tuples in _bt_delitems_vacuum().  This keeps things simple,
1228 				 * and allows us to always avoid generating our own conflicts.
1229 				 */
1230 				Assert(!BTreeTupleIsPivot(itup));
1231 				if (!BTreeTupleIsPosting(itup))
1232 				{
1233 					/* Regular tuple, standard table TID representation */
1234 					if (callback(&itup->t_tid, callback_state))
1235 					{
1236 						deletable[ndeletable++] = offnum;
1237 						nhtidsdead++;
1238 					}
1239 					else
1240 						nhtidslive++;
1241 				}
1242 				else
1243 				{
1244 					BTVacuumPosting vacposting;
1245 					int			nremaining;
1246 
1247 					/* Posting list tuple */
1248 					vacposting = btreevacuumposting(vstate, itup, offnum,
1249 													&nremaining);
1250 					if (vacposting == NULL)
1251 					{
1252 						/*
1253 						 * All table TIDs from the posting tuple remain, so no
1254 						 * delete or update required
1255 						 */
1256 						Assert(nremaining == BTreeTupleGetNPosting(itup));
1257 					}
1258 					else if (nremaining > 0)
1259 					{
1260 
1261 						/*
1262 						 * Store metadata about posting list tuple in
1263 						 * updatable array for entire page.  Existing tuple
1264 						 * will be updated during the later call to
1265 						 * _bt_delitems_vacuum().
1266 						 */
1267 						Assert(nremaining < BTreeTupleGetNPosting(itup));
1268 						updatable[nupdatable++] = vacposting;
1269 						nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1270 					}
1271 					else
1272 					{
1273 						/*
1274 						 * All table TIDs from the posting list must be
1275 						 * deleted.  We'll delete the index tuple completely
1276 						 * (no update required).
1277 						 */
1278 						Assert(nremaining == 0);
1279 						deletable[ndeletable++] = offnum;
1280 						nhtidsdead += BTreeTupleGetNPosting(itup);
1281 						pfree(vacposting);
1282 					}
1283 
1284 					nhtidslive += nremaining;
1285 				}
1286 			}
1287 		}
1288 
1289 		/*
1290 		 * Apply any needed deletes or updates.  We issue just one
1291 		 * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1292 		 */
1293 		if (ndeletable > 0 || nupdatable > 0)
1294 		{
1295 			Assert(nhtidsdead >= ndeletable + nupdatable);
1296 			_bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1297 								nupdatable);
1298 
1299 			stats->tuples_removed += nhtidsdead;
1300 			/* must recompute maxoff */
1301 			maxoff = PageGetMaxOffsetNumber(page);
1302 
1303 			/* can't leak memory here */
1304 			for (int i = 0; i < nupdatable; i++)
1305 				pfree(updatable[i]);
1306 		}
1307 		else
1308 		{
1309 			/*
1310 			 * If the leaf page has been split during this vacuum cycle, it
1311 			 * seems worth expending a write to clear btpo_cycleid even if we
1312 			 * don't have any deletions to do.  (If we do, _bt_delitems_vacuum
1313 			 * takes care of this.)  This ensures we won't process the page
1314 			 * again.
1315 			 *
1316 			 * We treat this like a hint-bit update because there's no need to
1317 			 * WAL-log it.
1318 			 */
1319 			Assert(nhtidsdead == 0);
1320 			if (vstate->cycleid != 0 &&
1321 				opaque->btpo_cycleid == vstate->cycleid)
1322 			{
1323 				opaque->btpo_cycleid = 0;
1324 				MarkBufferDirtyHint(buf, true);
1325 			}
1326 		}
1327 
1328 		/*
1329 		 * If the leaf page is now empty, try to delete it; else count the
1330 		 * live tuples (live table TIDs in posting lists are counted as
1331 		 * separate live tuples).  We don't delete when backtracking, though,
1332 		 * since that would require teaching _bt_pagedel() about backtracking
1333 		 * (doesn't seem worth adding more complexity to deal with that).
1334 		 *
1335 		 * We don't count the number of live TIDs during cleanup-only calls to
1336 		 * btvacuumscan (i.e. when callback is not set).  We count the number
1337 		 * of index tuples directly instead.  This avoids the expense of
1338 		 * directly examining all of the tuples on each page.  VACUUM will
1339 		 * treat num_index_tuples as an estimate in cleanup-only case, so it
1340 		 * doesn't matter that this underestimates num_index_tuples
1341 		 * significantly in some cases.
1342 		 */
1343 		if (minoff > maxoff)
1344 			attempt_pagedel = (blkno == scanblkno);
1345 		else if (callback)
1346 			stats->num_index_tuples += nhtidslive;
1347 		else
1348 			stats->num_index_tuples += maxoff - minoff + 1;
1349 
1350 		Assert(!attempt_pagedel || nhtidslive == 0);
1351 	}
1352 
1353 	if (attempt_pagedel)
1354 	{
1355 		MemoryContext oldcontext;
1356 
1357 		/* Run pagedel in a temp context to avoid memory leakage */
1358 		MemoryContextReset(vstate->pagedelcontext);
1359 		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1360 
1361 		/*
1362 		 * _bt_pagedel maintains the bulk delete stats on our behalf;
1363 		 * pages_newly_deleted and pages_deleted are likely to be incremented
1364 		 * during call
1365 		 */
1366 		Assert(blkno == scanblkno);
1367 		_bt_pagedel(rel, buf, vstate);
1368 
1369 		MemoryContextSwitchTo(oldcontext);
1370 		/* pagedel released buffer, so we shouldn't */
1371 	}
1372 	else
1373 		_bt_relbuf(rel, buf);
1374 
1375 	if (backtrack_to != P_NONE)
1376 	{
1377 		blkno = backtrack_to;
1378 		goto backtrack;
1379 	}
1380 }
1381 
1382 /*
1383  * btreevacuumposting --- determine TIDs still needed in posting list
1384  *
1385  * Returns metadata describing how to build replacement tuple without the TIDs
1386  * that VACUUM needs to delete.  Returned value is NULL in the common case
1387  * where no changes are needed to caller's posting list tuple (we avoid
1388  * allocating memory here as an optimization).
1389  *
1390  * The number of TIDs that should remain in the posting list tuple is set for
1391  * caller in *nremaining.
1392  */
1393 static BTVacuumPosting
1394 btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1395 				   OffsetNumber updatedoffset, int *nremaining)
1396 {
1397 	int			live = 0;
1398 	int			nitem = BTreeTupleGetNPosting(posting);
1399 	ItemPointer items = BTreeTupleGetPosting(posting);
1400 	BTVacuumPosting vacposting = NULL;
1401 
1402 	for (int i = 0; i < nitem; i++)
1403 	{
1404 		if (!vstate->callback(items + i, vstate->callback_state))
1405 		{
1406 			/* Live table TID */
1407 			live++;
1408 		}
1409 		else if (vacposting == NULL)
1410 		{
1411 			/*
1412 			 * First dead table TID encountered.
1413 			 *
1414 			 * It's now clear that we need to delete one or more dead table
1415 			 * TIDs, so start maintaining metadata describing how to update
1416 			 * existing posting list tuple.
1417 			 */
1418 			vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1419 								nitem * sizeof(uint16));
1420 
1421 			vacposting->itup = posting;
1422 			vacposting->updatedoffset = updatedoffset;
1423 			vacposting->ndeletedtids = 0;
1424 			vacposting->deletetids[vacposting->ndeletedtids++] = i;
1425 		}
1426 		else
1427 		{
1428 			/* Second or subsequent dead table TID */
1429 			vacposting->deletetids[vacposting->ndeletedtids++] = i;
1430 		}
1431 	}
1432 
1433 	*nremaining = live;
1434 	return vacposting;
1435 }
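
/*
 * For example, given a posting list tuple whose three TIDs point to heap
 * tuples (0,1), (0,2) and (0,3), where only (0,2) is reported dead by the
 * callback, the result is a BTVacuumPosting with ndeletedtids = 1 and
 * deletetids[0] = 1 (deletetids holds offsets into the original posting
 * list), with *nremaining set to 2; the later _bt_delitems_vacuum() call
 * uses that metadata to build the replacement tuple.
 */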
1436 
1437 /*
1438  *	btcanreturn() -- Check whether btree indexes support index-only scans.
1439  *
1440  * btrees always do, so this is trivial.
1441  */
1442 bool
1443 btcanreturn(Relation index, int attno)
1444 {
1445 	return true;
1446 }
1447