1 /*-------------------------------------------------------------------------
2  *
3  * vacuumlazy.c
4  *	  Concurrent ("lazy") vacuuming.
5  *
6  *
7  * The major space usage for LAZY VACUUM is storage for the array of dead tuple
8  * TIDs.  We want to ensure we can vacuum even the very largest relations with
9  * finite memory space usage.  To do that, we set upper bounds on the number of
10  * tuples we will keep track of at once.
11  *
12  * We are willing to use at most maintenance_work_mem (or perhaps
13  * autovacuum_work_mem) memory space to keep track of dead tuples.  We
14  * initially allocate an array of TIDs of that size, with an upper limit that
15  * depends on table size (this limit ensures we don't allocate a huge area
16  * uselessly for vacuuming small tables).  If the array threatens to overflow,
17  * we suspend the heap scan phase and perform a pass of index cleanup and page
18  * compaction, then resume the heap scan with an empty TID array.
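 *
 * As an illustrative (hypothetical) example of that overflow behavior: if the
 * TID array has room for N dead-tuple TIDs and the heap scan encounters about
 * 3N dead tuples, the scan pauses twice mid-scan for a full round of index
 * vacuuming plus heap vacuuming, and a final round runs once the scan
 * completes -- three sets of index scans in total.  Raising
 * maintenance_work_mem (or autovacuum_work_mem) is therefore the usual way to
 * reduce the number of index passes on large tables.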
19  *
20  * If we're processing a table with no indexes, we can just vacuum each page
21  * as we go; there's no need to save up multiple tuples to minimize the number
22  * of index scans performed.  So we don't use maintenance_work_mem memory for
23  * the TID array, just enough to hold as many heap tuples as fit on one page.
24  *
25  * Lazy vacuum supports parallel execution with parallel worker processes.  In
26  * a parallel vacuum, we perform both index vacuum and index cleanup with
27  * parallel worker processes.  Individual indexes are processed by one vacuum
28  * process.  At the beginning of a lazy vacuum (at lazy_scan_heap) we prepare
29  * the parallel context and initialize the DSM segment that contains shared
30  * information as well as the memory space for storing dead tuples.  When
31  * starting either index vacuum or index cleanup, we launch parallel worker
32  * processes.  Once all indexes are processed the parallel worker processes
33  * exit.  After that, the leader process re-initializes the parallel context
34  * so that it can use the same DSM for multiple passes of index vacuum and
35  * for performing index cleanup.  To update the index statistics we must
36  * update the system catalog, and since such updates are not allowed while
37  * in parallel mode, we update the index statistics after exiting from
38  * parallel mode.
39  *
40  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
41  * Portions Copyright (c) 1994, Regents of the University of California
42  *
43  *
44  * IDENTIFICATION
45  *	  src/backend/access/heap/vacuumlazy.c
46  *
47  *-------------------------------------------------------------------------
48  */
49 #include "postgres.h"
50 
51 #include <math.h>
52 
53 #include "access/amapi.h"
54 #include "access/genam.h"
55 #include "access/heapam.h"
56 #include "access/heapam_xlog.h"
57 #include "access/htup_details.h"
58 #include "access/multixact.h"
59 #include "access/parallel.h"
60 #include "access/transam.h"
61 #include "access/visibilitymap.h"
62 #include "access/xact.h"
63 #include "access/xlog.h"
64 #include "catalog/index.h"
65 #include "catalog/storage.h"
66 #include "commands/dbcommands.h"
67 #include "commands/progress.h"
68 #include "commands/vacuum.h"
69 #include "executor/instrument.h"
70 #include "miscadmin.h"
71 #include "optimizer/paths.h"
72 #include "pgstat.h"
73 #include "portability/instr_time.h"
74 #include "postmaster/autovacuum.h"
75 #include "storage/bufmgr.h"
76 #include "storage/freespace.h"
77 #include "storage/lmgr.h"
78 #include "tcop/tcopprot.h"
79 #include "utils/lsyscache.h"
80 #include "utils/memutils.h"
81 #include "utils/pg_rusage.h"
82 #include "utils/timestamp.h"
83 
84 
85 /*
86  * Space/time tradeoff parameters: do these need to be user-tunable?
87  *
88  * To consider truncating the relation, we want there to be at least
89  * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
90  * is less) potentially-freeable pages.
91  */
92 #define REL_TRUNCATE_MINIMUM	1000
93 #define REL_TRUNCATE_FRACTION	16
94 
95 /*
96  * Timing parameters for truncate locking heuristics.
97  *
98  * These were not exposed as user tunable GUC values because it didn't seem
99  * that the potential for improvement was great enough to merit the cost of
100  * supporting them.
101  */
102 #define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL		20	/* ms */
103 #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL		50	/* ms */
104 #define VACUUM_TRUNCATE_LOCK_TIMEOUT			5000	/* ms */
105 
106 /*
107  * Threshold that controls whether we bypass index vacuuming and heap
108  * vacuuming as an optimization
109  */
110 #define BYPASS_THRESHOLD_PAGES	0.02	/* i.e. 2% of rel_pages */
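
/*
 * Illustrative arithmetic only (the final bypass decision is made in
 * lazy_vacuum() and considers more than this one threshold): for a
 * hypothetical 100,000-page table, index and heap vacuuming can be bypassed
 * only when at most 100,000 * 0.02 = 2,000 pages have LP_DEAD items.
 */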
111 
112 /*
113  * Perform a failsafe check every 4GB during the heap scan, approximately
114  */
115 #define FAILSAFE_EVERY_PAGES \
116 	((BlockNumber) (((uint64) 4 * 1024 * 1024 * 1024) / BLCKSZ))
117 
118 /*
119  * When a table has no indexes, vacuum the FSM after every 8GB, approximately
120  * (it won't be exact because we only vacuum FSM after processing a heap page
121  * that has some removable tuples).  When there are indexes, this is ignored,
122  * and we vacuum FSM after each index/heap cleaning pass.
123  */
124 #define VACUUM_FSM_EVERY_PAGES \
125 	((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ))
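
/*
 * Worked example for the two thresholds above, assuming the default 8192-byte
 * BLCKSZ: FAILSAFE_EVERY_PAGES is 4GB / 8192 = 524,288 blocks, and
 * VACUUM_FSM_EVERY_PAGES is 8GB / 8192 = 1,048,576 blocks.
 */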
126 
127 /*
128  * Guesstimation of number of dead tuples per page.  This is used to
129  * provide an upper limit to memory allocated when vacuuming small
130  * tables.
131  */
132 #define LAZY_ALLOC_TUPLES		MaxHeapTuplesPerPage
133 
134 /*
135  * Before we consider skipping a page that's marked as clean in
136  * visibility map, we must've seen at least this many clean pages.
137  */
138 #define SKIP_PAGES_THRESHOLD	((BlockNumber) 32)
139 
140 /*
141  * Size of the prefetch window for lazy vacuum backwards truncation scan.
142  * Needs to be a power of 2.
143  */
144 #define PREFETCH_SIZE			((BlockNumber) 32)
145 
146 /*
147  * DSM keys for parallel vacuum.  Unlike other parallel execution code, since
148  * we don't need to worry about DSM keys conflicting with plan_node_id we can
149  * use small integers.
150  */
151 #define PARALLEL_VACUUM_KEY_SHARED			1
152 #define PARALLEL_VACUUM_KEY_DEAD_TUPLES		2
153 #define PARALLEL_VACUUM_KEY_QUERY_TEXT		3
154 #define PARALLEL_VACUUM_KEY_BUFFER_USAGE	4
155 #define PARALLEL_VACUUM_KEY_WAL_USAGE		5
156 
157 /*
158  * Macro to check if we are in a parallel vacuum.  If true, we are in
159  * parallel mode and the DSM segment is initialized.
160  */
161 #define ParallelVacuumIsActive(vacrel) ((vacrel)->lps != NULL)
162 
163 /* Phases of vacuum during which we report error context. */
164 typedef enum
165 {
166 	VACUUM_ERRCB_PHASE_UNKNOWN,
167 	VACUUM_ERRCB_PHASE_SCAN_HEAP,
168 	VACUUM_ERRCB_PHASE_VACUUM_INDEX,
169 	VACUUM_ERRCB_PHASE_VACUUM_HEAP,
170 	VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
171 	VACUUM_ERRCB_PHASE_TRUNCATE
172 } VacErrPhase;
173 
174 /*
175  * LVDeadTuples stores the dead tuple TIDs collected during the heap scan.
176  * This is allocated in the DSM segment in parallel mode and in local memory
177  * in non-parallel mode.
178  */
179 typedef struct LVDeadTuples
180 {
181 	int			max_tuples;		/* # slots allocated in array */
182 	int			num_tuples;		/* current # of entries */
183 	/* List of TIDs of tuples we intend to delete */
184 	/* NB: this list is ordered by TID address */
185 	ItemPointerData itemptrs[FLEXIBLE_ARRAY_MEMBER];	/* array of
186 														 * ItemPointerData */
187 } LVDeadTuples;
188 
189 /* The dead tuple space consists of LVDeadTuples and dead tuple TIDs */
190 #define SizeOfDeadTuples(cnt) \
191 	add_size(offsetof(LVDeadTuples, itemptrs), \
192 			 mul_size(sizeof(ItemPointerData), cnt))
193 #define MAXDEADTUPLES(max_size) \
194 		(((max_size) - offsetof(LVDeadTuples, itemptrs)) / sizeof(ItemPointerData))
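
/*
 * Illustrative sizing sketch, assuming a 6-byte ItemPointerData and an 8-byte
 * LVDeadTuples header (typical on mainstream platforms), with a hypothetical
 * 64MB maintenance_work_mem:
 *
 *   MAXDEADTUPLES(64 * 1024 * 1024)
 *     = (67108864 - offsetof(LVDeadTuples, itemptrs)) / sizeof(ItemPointerData)
 *     = (67108864 - 8) / 6
 *     = about 11.2 million TIDs
 *
 * The limit actually used is further clamped by compute_max_dead_tuples(); in
 * particular, small tables never get more than LAZY_ALLOC_TUPLES slots per
 * heap page.
 */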
195 
196 /*
197  * Shared information among parallel workers; hence it is allocated in the
198  * DSM segment.
199  */
200 typedef struct LVShared
201 {
202 	/*
203 	 * Target table relid and log level.  These fields are not modified during
204 	 * the lazy vacuum.
205 	 */
206 	Oid			relid;
207 	int			elevel;
208 
209 	/*
210 	 * Tells parallel vacuum workers whether to perform index vacuum or index
211 	 * cleanup.  first_time is true only if for_cleanup is true and
212 	 * bulk-deletion has not been performed yet.
213 	 */
214 	bool		for_cleanup;
215 	bool		first_time;
216 
217 	/*
218 	 * Fields for both index vacuum and cleanup.
219 	 *
220 	 * reltuples is the total number of input heap tuples.  We set it to the
221 	 * old live tuple count in the index vacuum case, or to the new live
222 	 * tuple count in the index cleanup case.
223 	 *
224 	 * estimated_count is true if reltuples is an estimated value.  (Note that
225 	 * reltuples could be -1 in this case, indicating we have no idea.)
226 	 */
227 	double		reltuples;
228 	bool		estimated_count;
229 
230 	/*
231 	 * A single-process lazy vacuum may use memory for index vacuuming or
232 	 * cleanup in addition to the memory used for heap scanning.  In a
233 	 * parallel vacuum, since each worker could otherwise consume up to
234 	 * maintenance_work_mem, each worker's maintenance_work_mem is scaled
235 	 * down so that the parallel operation as a whole doesn't consume more
236 	 * memory than a single-process lazy vacuum.
237 	 */
238 	int			maintenance_work_mem_worker;
239 
240 	/*
241 	 * Shared vacuum cost balance.  During parallel vacuum,
242 	 * VacuumSharedCostBalance points to this value and it accumulates the
243 	 * balance of each parallel vacuum worker.
244 	 */
245 	pg_atomic_uint32 cost_balance;
246 
247 	/*
248 	 * Number of active parallel workers.  This is used for computing the
249 	 * minimum threshold of the vacuum cost balance before a worker sleeps for
250 	 * cost-based delay.
251 	 */
252 	pg_atomic_uint32 active_nworkers;
253 
254 	/*
255 	 * Variables to control parallel vacuum.  We have a bitmap to indicate
256 	 * which indexes have stats in shared memory.  A set bit in the map
257 	 * indicates that the particular index supports parallel vacuum.
258 	 */
259 	pg_atomic_uint32 idx;		/* counter for vacuuming and clean up */
260 	uint32		offset;			/* sizeof header incl. bitmap */
261 	bits8		bitmap[FLEXIBLE_ARRAY_MEMBER];	/* bit map of NULLs */
262 
263 	/* Shared index statistics data follows at end of struct */
264 } LVShared;
265 
266 #define SizeOfLVShared (offsetof(LVShared, bitmap) + sizeof(bits8))
267 #define GetSharedIndStats(s) \
268 	((LVSharedIndStats *)((char *)(s) + ((LVShared *)(s))->offset))
269 #define IndStatsIsNull(s, i) \
270 	(!(((LVShared *)(s))->bitmap[(i) >> 3] & (1 << ((i) & 0x07))))
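
/*
 * Rough sketch of the DSM layout behind the macros above (exact sizes depend
 * on alignment and on how many indexes participate in the parallel vacuum):
 *
 *   +-------------------------------+
 *   | LVShared fixed fields         |
 *   | bitmap[] (one bit per index)  |
 *   | (padding up to 'offset')      |
 *   +-------------------------------+  <- GetSharedIndStats(s)
 *   | LVSharedIndStats slots, one   |
 *   | per index whose bit is set,   |
 *   | in index order                |
 *   +-------------------------------+
 *
 * IndStatsIsNull() consults the bitmap; a given index's slot is located by
 * skipping the slots of lower-numbered indexes whose bits are set (see
 * parallel_stats_for_idx()).
 */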
271 
272 /*
273  * Struct for an index bulk-deletion statistic used for parallel vacuum.  This
274  * is allocated in the DSM segment.
275  */
276 typedef struct LVSharedIndStats
277 {
278 	bool		updated;		/* are the stats updated? */
279 	IndexBulkDeleteResult istat;
280 } LVSharedIndStats;
281 
282 /* Struct for maintaining a parallel vacuum state. */
283 typedef struct LVParallelState
284 {
285 	ParallelContext *pcxt;
286 
287 	/* Shared information among parallel vacuum workers */
288 	LVShared   *lvshared;
289 
290 	/* Points to buffer usage area in DSM */
291 	BufferUsage *buffer_usage;
292 
293 	/* Points to WAL usage area in DSM */
294 	WalUsage   *wal_usage;
295 
296 	/*
297 	 * The number of indexes that support parallel index bulk-deletion and
298 	 * parallel index cleanup respectively.
299 	 */
300 	int			nindexes_parallel_bulkdel;
301 	int			nindexes_parallel_cleanup;
302 	int			nindexes_parallel_condcleanup;
303 } LVParallelState;
304 
305 typedef struct LVRelState
306 {
307 	/* Target heap relation and its indexes */
308 	Relation	rel;
309 	Relation   *indrels;
310 	int			nindexes;
311 
312 	/* Wraparound failsafe has been triggered? */
313 	bool		failsafe_active;
314 	/* Consider index vacuuming bypass optimization? */
315 	bool		consider_bypass_optimization;
316 
317 	/* Doing index vacuuming, index cleanup, rel truncation? */
318 	bool		do_index_vacuuming;
319 	bool		do_index_cleanup;
320 	bool		do_rel_truncate;
321 
322 	/* Buffer access strategy and parallel state */
323 	BufferAccessStrategy bstrategy;
324 	LVParallelState *lps;
325 
326 	/* Statistics from pg_class when we start out */
327 	BlockNumber old_rel_pages;	/* previous value of pg_class.relpages */
328 	double		old_live_tuples;	/* previous value of pg_class.reltuples */
329 	/* rel's initial relfrozenxid and relminmxid */
330 	TransactionId relfrozenxid;
331 	MultiXactId relminmxid;
332 
333 	/* VACUUM operation's cutoff for pruning */
334 	TransactionId OldestXmin;
335 	/* VACUUM operation's cutoff for freezing XIDs and MultiXactIds */
336 	TransactionId FreezeLimit;
337 	MultiXactId MultiXactCutoff;
338 
339 	/* Error reporting state */
340 	char	   *relnamespace;
341 	char	   *relname;
342 	char	   *indname;
343 	BlockNumber blkno;			/* used only for heap operations */
344 	OffsetNumber offnum;		/* used only for heap operations */
345 	VacErrPhase phase;
346 
347 	/*
348 	 * State managed by lazy_scan_heap() follows
349 	 */
350 	LVDeadTuples *dead_tuples;	/* items to vacuum from indexes */
351 	BlockNumber rel_pages;		/* total number of pages */
352 	BlockNumber scanned_pages;	/* number of pages we examined */
353 	BlockNumber pinskipped_pages;	/* # of pages skipped due to a pin */
354 	BlockNumber frozenskipped_pages;	/* # of frozen pages we skipped */
355 	BlockNumber tupcount_pages; /* pages whose tuples we counted */
356 	BlockNumber pages_removed;	/* pages removed by truncation */
357 	BlockNumber lpdead_item_pages;	/* # pages with LP_DEAD items */
358 	BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
359 
360 	/* Statistics output by us, for table */
361 	double		new_rel_tuples; /* new estimated total # of tuples */
362 	double		new_live_tuples;	/* new estimated total # of live tuples */
363 	/* Statistics output by index AMs */
364 	IndexBulkDeleteResult **indstats;
365 
366 	/* Instrumentation counters */
367 	int			num_index_scans;
368 	int64		tuples_deleted; /* # deleted from table */
369 	int64		lpdead_items;	/* # deleted from indexes */
370 	int64		new_dead_tuples;	/* new estimated total # of dead items in
371 									 * table */
372 	int64		num_tuples;		/* total number of nonremovable tuples */
373 	int64		live_tuples;	/* live tuples (reltuples estimate) */
374 } LVRelState;
375 
376 /*
377  * State returned by lazy_scan_prune()
378  */
379 typedef struct LVPagePruneState
380 {
381 	bool		hastup;			/* Page prevents rel truncation? */
382 	bool		has_lpdead_items;	/* includes existing LP_DEAD items */
383 
384 	/*
385 	 * State describes the proper VM bit states to set for the page following
386 	 * pruning and freezing.  all_visible implies !has_lpdead_items, but don't
387 	 * trust all_frozen result unless all_visible is also set to true.
388 	 */
389 	bool		all_visible;	/* Every item visible to all? */
390 	bool		all_frozen;		/* provided all_visible is also true */
391 	TransactionId visibility_cutoff_xid;	/* For recovery conflicts */
392 } LVPagePruneState;
393 
394 /* Struct for saving and restoring vacuum error information. */
395 typedef struct LVSavedErrInfo
396 {
397 	BlockNumber blkno;
398 	OffsetNumber offnum;
399 	VacErrPhase phase;
400 } LVSavedErrInfo;
401 
402 /* elevel controls whole VACUUM's verbosity */
403 static int	elevel = -1;
404 
405 
406 /* non-export function prototypes */
407 static void lazy_scan_heap(LVRelState *vacrel, VacuumParams *params,
408 						   bool aggressive);
409 static void lazy_scan_prune(LVRelState *vacrel, Buffer buf,
410 							BlockNumber blkno, Page page,
411 							GlobalVisState *vistest,
412 							LVPagePruneState *prunestate);
413 static void lazy_vacuum(LVRelState *vacrel);
414 static bool lazy_vacuum_all_indexes(LVRelState *vacrel);
415 static void lazy_vacuum_heap_rel(LVRelState *vacrel);
416 static int	lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno,
417 								  Buffer buffer, int tupindex, Buffer *vmbuffer);
418 static bool lazy_check_needs_freeze(Buffer buf, bool *hastup,
419 									LVRelState *vacrel);
420 static bool lazy_check_wraparound_failsafe(LVRelState *vacrel);
421 static void do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel);
422 static void do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel);
423 static void do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers);
424 static void do_parallel_processing(LVRelState *vacrel,
425 								   LVShared *lvshared);
426 static void do_serial_processing_for_unsafe_indexes(LVRelState *vacrel,
427 													LVShared *lvshared);
428 static IndexBulkDeleteResult *parallel_process_one_index(Relation indrel,
429 														 IndexBulkDeleteResult *istat,
430 														 LVShared *lvshared,
431 														 LVSharedIndStats *shared_indstats,
432 														 LVRelState *vacrel);
433 static void lazy_cleanup_all_indexes(LVRelState *vacrel);
434 static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel,
435 													IndexBulkDeleteResult *istat,
436 													double reltuples,
437 													LVRelState *vacrel);
438 static IndexBulkDeleteResult *lazy_cleanup_one_index(Relation indrel,
439 													 IndexBulkDeleteResult *istat,
440 													 double reltuples,
441 													 bool estimated_count,
442 													 LVRelState *vacrel);
443 static bool should_attempt_truncation(LVRelState *vacrel);
444 static void lazy_truncate_heap(LVRelState *vacrel);
445 static BlockNumber count_nondeletable_pages(LVRelState *vacrel,
446 											bool *lock_waiter_detected);
447 static long compute_max_dead_tuples(BlockNumber relblocks, bool hasindex);
448 static void lazy_space_alloc(LVRelState *vacrel, int nworkers,
449 							 BlockNumber relblocks);
450 static void lazy_space_free(LVRelState *vacrel);
451 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
452 static int	vac_cmp_itemptr(const void *left, const void *right);
453 static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
454 									 TransactionId *visibility_cutoff_xid, bool *all_frozen);
455 static int	compute_parallel_vacuum_workers(LVRelState *vacrel,
456 											int nrequested,
457 											bool *will_parallel_vacuum);
458 static void update_index_statistics(LVRelState *vacrel);
459 static LVParallelState *begin_parallel_vacuum(LVRelState *vacrel,
460 											  BlockNumber nblocks,
461 											  int nrequested);
462 static void end_parallel_vacuum(LVRelState *vacrel);
463 static LVSharedIndStats *parallel_stats_for_idx(LVShared *lvshared, int getidx);
464 static bool parallel_processing_is_safe(Relation indrel, LVShared *lvshared);
465 static void vacuum_error_callback(void *arg);
466 static void update_vacuum_error_info(LVRelState *vacrel,
467 									 LVSavedErrInfo *saved_vacrel,
468 									 int phase, BlockNumber blkno,
469 									 OffsetNumber offnum);
470 static void restore_vacuum_error_info(LVRelState *vacrel,
471 									  const LVSavedErrInfo *saved_vacrel);
472 
473 
474 /*
475  *	heap_vacuum_rel() -- perform VACUUM for one heap relation
476  *
477  *		This routine vacuums a single heap, cleans out its indexes, and
478  *		updates its relpages and reltuples statistics.
479  *
480  *		At entry, we have already established a transaction and opened
481  *		and locked the relation.
482  */
483 void
484 heap_vacuum_rel(Relation rel, VacuumParams *params,
485 				BufferAccessStrategy bstrategy)
486 {
487 	LVRelState *vacrel;
488 	PGRUsage	ru0;
489 	TimestampTz starttime = 0;
490 	WalUsage	walusage_start = pgWalUsage;
491 	WalUsage	walusage = {0, 0, 0};
492 	long		secs;
493 	int			usecs;
494 	double		read_rate,
495 				write_rate;
496 	bool		aggressive;		/* should we scan all unfrozen pages? */
497 	bool		scanned_all_unfrozen;	/* actually scanned all such pages? */
498 	char	  **indnames = NULL;
499 	TransactionId xidFullScanLimit;
500 	MultiXactId mxactFullScanLimit;
501 	BlockNumber new_rel_pages;
502 	BlockNumber new_rel_allvisible;
503 	double		new_live_tuples;
504 	TransactionId new_frozen_xid;
505 	MultiXactId new_min_multi;
506 	ErrorContextCallback errcallback;
507 	PgStat_Counter startreadtime = 0;
508 	PgStat_Counter startwritetime = 0;
509 	TransactionId OldestXmin;
510 	TransactionId FreezeLimit;
511 	MultiXactId MultiXactCutoff;
512 
513 	/* measure elapsed time iff autovacuum logging requires it */
514 	if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
515 	{
516 		pg_rusage_init(&ru0);
517 		starttime = GetCurrentTimestamp();
518 		if (track_io_timing)
519 		{
520 			startreadtime = pgStatBlockReadTime;
521 			startwritetime = pgStatBlockWriteTime;
522 		}
523 	}
524 
525 	if (params->options & VACOPT_VERBOSE)
526 		elevel = INFO;
527 	else
528 		elevel = DEBUG2;
529 
530 	pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM,
531 								  RelationGetRelid(rel));
532 
533 	vacuum_set_xid_limits(rel,
534 						  params->freeze_min_age,
535 						  params->freeze_table_age,
536 						  params->multixact_freeze_min_age,
537 						  params->multixact_freeze_table_age,
538 						  &OldestXmin, &FreezeLimit, &xidFullScanLimit,
539 						  &MultiXactCutoff, &mxactFullScanLimit);
540 
541 	/*
542 	 * We request an aggressive scan if the table's frozen Xid is now older
543 	 * than or equal to the requested Xid full-table scan limit; or if the
544 	 * table's minimum MultiXactId is older than or equal to the requested
545 	 * mxid full-table scan limit; or if DISABLE_PAGE_SKIPPING was specified.
546 	 */
547 	aggressive = TransactionIdPrecedesOrEquals(rel->rd_rel->relfrozenxid,
548 											   xidFullScanLimit);
549 	aggressive |= MultiXactIdPrecedesOrEquals(rel->rd_rel->relminmxid,
550 											  mxactFullScanLimit);
551 	if (params->options & VACOPT_DISABLE_PAGE_SKIPPING)
552 		aggressive = true;
553 
554 	vacrel = (LVRelState *) palloc0(sizeof(LVRelState));
555 
556 	/* Set up high level stuff about rel */
557 	vacrel->rel = rel;
558 	vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes,
559 					 &vacrel->indrels);
560 	vacrel->failsafe_active = false;
561 	vacrel->consider_bypass_optimization = true;
562 
563 	/*
564 	 * The index_cleanup param either disables index vacuuming and cleanup or
565 	 * forces it to go ahead when we would otherwise apply the index bypass
566 	 * optimization.  The default is 'auto', which leaves the final decision
567 	 * up to lazy_vacuum().
568 	 *
569 	 * The truncate param allows the user to avoid attempting relation truncation,
570 	 * though it can't force truncation to happen.
571 	 */
572 	Assert(params->index_cleanup != VACOPTVALUE_UNSPECIFIED);
573 	Assert(params->truncate != VACOPTVALUE_UNSPECIFIED &&
574 		   params->truncate != VACOPTVALUE_AUTO);
575 	vacrel->do_index_vacuuming = true;
576 	vacrel->do_index_cleanup = true;
577 	vacrel->do_rel_truncate = (params->truncate != VACOPTVALUE_DISABLED);
578 	if (params->index_cleanup == VACOPTVALUE_DISABLED)
579 	{
580 		/* Force disable index vacuuming up-front */
581 		vacrel->do_index_vacuuming = false;
582 		vacrel->do_index_cleanup = false;
583 	}
584 	else if (params->index_cleanup == VACOPTVALUE_ENABLED)
585 	{
586 		/* Force index vacuuming.  Note that failsafe can still bypass. */
587 		vacrel->consider_bypass_optimization = false;
588 	}
589 	else
590 	{
591 		/* Default/auto, make all decisions dynamically */
592 		Assert(params->index_cleanup == VACOPTVALUE_AUTO);
593 	}
594 
595 	vacrel->bstrategy = bstrategy;
596 	vacrel->old_rel_pages = rel->rd_rel->relpages;
597 	vacrel->old_live_tuples = rel->rd_rel->reltuples;
598 	vacrel->relfrozenxid = rel->rd_rel->relfrozenxid;
599 	vacrel->relminmxid = rel->rd_rel->relminmxid;
600 
601 	/* Set cutoffs for entire VACUUM */
602 	vacrel->OldestXmin = OldestXmin;
603 	vacrel->FreezeLimit = FreezeLimit;
604 	vacrel->MultiXactCutoff = MultiXactCutoff;
605 
606 	vacrel->relnamespace = get_namespace_name(RelationGetNamespace(rel));
607 	vacrel->relname = pstrdup(RelationGetRelationName(rel));
608 	vacrel->indname = NULL;
609 	vacrel->phase = VACUUM_ERRCB_PHASE_UNKNOWN;
610 
611 	/* Save index names iff autovacuum logging requires it */
612 	if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0 &&
613 		vacrel->nindexes > 0)
614 	{
615 		indnames = palloc(sizeof(char *) * vacrel->nindexes);
616 		for (int i = 0; i < vacrel->nindexes; i++)
617 			indnames[i] =
618 				pstrdup(RelationGetRelationName(vacrel->indrels[i]));
619 	}
620 
621 	/*
622 	 * Setup error traceback support for ereport().  The idea is to set up an
623 	 * error context callback to display additional information on any error
624 	 * during a vacuum.  During different phases of vacuum (heap scan, heap
625 	 * vacuum, index vacuum, index clean up, heap truncate), we update the
626 	 * error context callback to display appropriate information.
627 	 *
628 	 * Note that the index vacuum and heap vacuum phases may be called
629 	 * multiple times in the middle of the heap scan phase.  So the old phase
630 	 * information is restored at the end of those phases.
631 	 */
632 	errcallback.callback = vacuum_error_callback;
633 	errcallback.arg = vacrel;
634 	errcallback.previous = error_context_stack;
635 	error_context_stack = &errcallback;
636 
637 	/* Do the vacuuming */
638 	lazy_scan_heap(vacrel, params, aggressive);
639 
640 	/* Done with indexes */
641 	vac_close_indexes(vacrel->nindexes, vacrel->indrels, NoLock);
642 
643 	/*
644 	 * Compute whether we actually scanned all of the unfrozen pages. If we did,
645 	 * we can adjust relfrozenxid and relminmxid.
646 	 *
647 	 * NB: We need to check this before truncating the relation, because that
648 	 * will change ->rel_pages.
649 	 */
650 	if ((vacrel->scanned_pages + vacrel->frozenskipped_pages)
651 		< vacrel->rel_pages)
652 	{
653 		Assert(!aggressive);
654 		scanned_all_unfrozen = false;
655 	}
656 	else
657 		scanned_all_unfrozen = true;
658 
659 	/*
660 	 * Optionally truncate the relation.
661 	 */
662 	if (should_attempt_truncation(vacrel))
663 	{
664 		/*
665 		 * Update error traceback information.  This is the last phase during
666 		 * which we add context information to errors, so we don't need to
667 		 * revert to the previous phase.
668 		 */
669 		update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_TRUNCATE,
670 								 vacrel->nonempty_pages,
671 								 InvalidOffsetNumber);
672 		lazy_truncate_heap(vacrel);
673 	}
674 
675 	/* Pop the error context stack */
676 	error_context_stack = errcallback.previous;
677 
678 	/* Report that we are now doing final cleanup */
679 	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
680 								 PROGRESS_VACUUM_PHASE_FINAL_CLEANUP);
681 
682 	/*
683 	 * Update statistics in pg_class.
684 	 *
685 	 * In principle new_live_tuples could be -1 indicating that we (still)
686 	 * don't know the tuple count.  In practice that probably can't happen,
687 	 * since we'd surely have scanned some pages if the table is new and
688 	 * nonempty.
689 	 *
690 	 * For safety, clamp relallvisible to be not more than what we're setting
691 	 * relpages to.
692 	 *
693 	 * Also, don't change relfrozenxid/relminmxid if we skipped any pages,
694 	 * since then we don't know for certain that all tuples have a newer xmin.
695 	 */
696 	new_rel_pages = vacrel->rel_pages;
697 	new_live_tuples = vacrel->new_live_tuples;
698 
699 	visibilitymap_count(rel, &new_rel_allvisible, NULL);
700 	if (new_rel_allvisible > new_rel_pages)
701 		new_rel_allvisible = new_rel_pages;
702 
703 	new_frozen_xid = scanned_all_unfrozen ? FreezeLimit : InvalidTransactionId;
704 	new_min_multi = scanned_all_unfrozen ? MultiXactCutoff : InvalidMultiXactId;
705 
706 	vac_update_relstats(rel,
707 						new_rel_pages,
708 						new_live_tuples,
709 						new_rel_allvisible,
710 						vacrel->nindexes > 0,
711 						new_frozen_xid,
712 						new_min_multi,
713 						false);
714 
715 	/*
716 	 * Report results to the stats collector, too.
717 	 *
718 	 * Deliberately avoid telling the stats collector about LP_DEAD items that
719 	 * remain in the table due to VACUUM bypassing index and heap vacuuming.
720 	 * ANALYZE will consider the remaining LP_DEAD items to be dead tuples. It
721 	 * seems like a good idea to err on the side of not vacuuming again too
722 	 * soon in cases where the failsafe prevented significant amounts of heap
723 	 * vacuuming.
724 	 */
725 	pgstat_report_vacuum(RelationGetRelid(rel),
726 						 rel->rd_rel->relisshared,
727 						 Max(new_live_tuples, 0),
728 						 vacrel->new_dead_tuples);
729 	pgstat_progress_end_command();
730 
731 	/* and log the action if appropriate */
732 	if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
733 	{
734 		TimestampTz endtime = GetCurrentTimestamp();
735 
736 		if (params->log_min_duration == 0 ||
737 			TimestampDifferenceExceeds(starttime, endtime,
738 									   params->log_min_duration))
739 		{
740 			StringInfoData buf;
741 			char	   *msgfmt;
742 			BlockNumber orig_rel_pages;
743 
744 			TimestampDifference(starttime, endtime, &secs, &usecs);
745 
746 			memset(&walusage, 0, sizeof(WalUsage));
747 			WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
748 
749 			read_rate = 0;
750 			write_rate = 0;
751 			if ((secs > 0) || (usecs > 0))
752 			{
753 				read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) /
754 					(secs + usecs / 1000000.0);
755 				write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) /
756 					(secs + usecs / 1000000.0);
757 			}
758 
759 			/*
760 			 * This is pretty messy, but we split it up so that we can skip
761 			 * emitting individual parts of the message when not applicable.
762 			 */
763 			initStringInfo(&buf);
764 			if (params->is_wraparound)
765 			{
766 				/*
767 				 * While it's possible for a VACUUM to be both is_wraparound
768 				 * and !aggressive, that's just a corner-case -- is_wraparound
769 				 * implies aggressive.  Produce distinct output for the corner
770 				 * case all the same, just in case.
771 				 */
772 				if (aggressive)
773 					msgfmt = _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
774 				else
775 					msgfmt = _("automatic vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
776 			}
777 			else
778 			{
779 				if (aggressive)
780 					msgfmt = _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n");
781 				else
782 					msgfmt = _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n");
783 			}
784 			appendStringInfo(&buf, msgfmt,
785 							 get_database_name(MyDatabaseId),
786 							 vacrel->relnamespace,
787 							 vacrel->relname,
788 							 vacrel->num_index_scans);
789 			appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n"),
790 							 vacrel->pages_removed,
791 							 vacrel->rel_pages,
792 							 vacrel->pinskipped_pages,
793 							 vacrel->frozenskipped_pages);
794 			appendStringInfo(&buf,
795 							 _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable, oldest xmin: %u\n"),
796 							 (long long) vacrel->tuples_deleted,
797 							 (long long) vacrel->new_rel_tuples,
798 							 (long long) vacrel->new_dead_tuples,
799 							 OldestXmin);
800 			orig_rel_pages = vacrel->rel_pages + vacrel->pages_removed;
801 			if (orig_rel_pages > 0)
802 			{
803 				if (vacrel->do_index_vacuuming)
804 				{
805 					if (vacrel->nindexes == 0 || vacrel->num_index_scans == 0)
806 						appendStringInfoString(&buf, _("index scan not needed: "));
807 					else
808 						appendStringInfoString(&buf, _("index scan needed: "));
809 
810 					msgfmt = _("%u pages from table (%.2f%% of total) had %lld dead item identifiers removed\n");
811 				}
812 				else
813 				{
814 					if (!vacrel->failsafe_active)
815 						appendStringInfoString(&buf, _("index scan bypassed: "));
816 					else
817 						appendStringInfoString(&buf, _("index scan bypassed by failsafe: "));
818 
819 					msgfmt = _("%u pages from table (%.2f%% of total) have %lld dead item identifiers\n");
820 				}
821 				appendStringInfo(&buf, msgfmt,
822 								 vacrel->lpdead_item_pages,
823 								 100.0 * vacrel->lpdead_item_pages / orig_rel_pages,
824 								 (long long) vacrel->lpdead_items);
825 			}
826 			for (int i = 0; i < vacrel->nindexes; i++)
827 			{
828 				IndexBulkDeleteResult *istat = vacrel->indstats[i];
829 
830 				if (!istat)
831 					continue;
832 
833 				appendStringInfo(&buf,
834 								 _("index \"%s\": pages: %u in total, %u newly deleted, %u currently deleted, %u reusable\n"),
835 								 indnames[i],
836 								 istat->num_pages,
837 								 istat->pages_newly_deleted,
838 								 istat->pages_deleted,
839 								 istat->pages_free);
840 			}
841 			if (track_io_timing)
842 			{
843 				double		read_ms = (double) (pgStatBlockReadTime - startreadtime) / 1000;
844 				double		write_ms = (double) (pgStatBlockWriteTime - startwritetime) / 1000;
845 
846 				appendStringInfo(&buf, _("I/O timings: read: %.3f ms, write: %.3f ms\n"),
847 								 read_ms, write_ms);
848 			}
849 			appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
850 							 read_rate, write_rate);
851 			appendStringInfo(&buf,
852 							 _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"),
853 							 (long long) VacuumPageHit,
854 							 (long long) VacuumPageMiss,
855 							 (long long) VacuumPageDirty);
856 			appendStringInfo(&buf,
857 							 _("WAL usage: %lld records, %lld full page images, %llu bytes\n"),
858 							 (long long) walusage.wal_records,
859 							 (long long) walusage.wal_fpi,
860 							 (unsigned long long) walusage.wal_bytes);
861 			appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0));
862 
863 			ereport(LOG,
864 					(errmsg_internal("%s", buf.data)));
865 			pfree(buf.data);
866 		}
867 	}
868 
869 	/* Cleanup index statistics and index names */
870 	for (int i = 0; i < vacrel->nindexes; i++)
871 	{
872 		if (vacrel->indstats[i])
873 			pfree(vacrel->indstats[i]);
874 
875 		if (indnames && indnames[i])
876 			pfree(indnames[i]);
877 	}
878 }
879 
880 /*
881  *	lazy_scan_heap() -- scan an open heap relation
882  *
883  *		This routine prunes each page in the heap, which will among other
884  *		things truncate dead tuples to dead line pointers, defragment the
885  *		page, and set commit status bits (see heap_page_prune).  It also builds
886  *		lists of dead tuples and pages with free space, calculates statistics
887  *		on the number of live tuples in the heap, and marks pages as
888  *		all-visible if appropriate.  When done, or when we run low on space
889  *		for dead-tuple TIDs, invoke lazy_vacuum to vacuum indexes and to
890  *		vacuum the heap relation during its own second pass over the heap.
891  *
892  *		If the table has at least two indexes, we execute both index vacuum
893  *		and index cleanup with parallel workers unless parallel vacuum is
894  *		disabled.  In a parallel vacuum, we enter parallel mode and then
895  *		create both the parallel context and the DSM segment before starting
896  *		heap scan so that we can record dead tuples to the DSM segment.  All
897  *		parallel workers are launched at beginning of index vacuuming and
898  *		index cleanup and they exit once done with all indexes.  At the end of
899  *		this function we exit from parallel mode.  Index bulk-deletion results
900  *		are stored in the DSM segment and we update index statistics for all
901  *		the indexes after exiting from parallel mode since writes are not
902  *		allowed during parallel mode.
903  *
904  *		If there are no indexes then we can reclaim line pointers on the fly;
905  *		dead line pointers need only be retained until all index pointers that
906  *		reference them have been killed.
907  */
908 static void
909 lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive)
910 {
911 	LVDeadTuples *dead_tuples;
912 	BlockNumber nblocks,
913 				blkno,
914 				next_unskippable_block,
915 				next_failsafe_block,
916 				next_fsm_block_to_vacuum;
917 	PGRUsage	ru0;
918 	Buffer		vmbuffer = InvalidBuffer;
919 	bool		skipping_blocks;
920 	StringInfoData buf;
921 	const int	initprog_index[] = {
922 		PROGRESS_VACUUM_PHASE,
923 		PROGRESS_VACUUM_TOTAL_HEAP_BLKS,
924 		PROGRESS_VACUUM_MAX_DEAD_TUPLES
925 	};
926 	int64		initprog_val[3];
927 	GlobalVisState *vistest;
928 
929 	pg_rusage_init(&ru0);
930 
931 	if (aggressive)
932 		ereport(elevel,
933 				(errmsg("aggressively vacuuming \"%s.%s\"",
934 						vacrel->relnamespace,
935 						vacrel->relname)));
936 	else
937 		ereport(elevel,
938 				(errmsg("vacuuming \"%s.%s\"",
939 						vacrel->relnamespace,
940 						vacrel->relname)));
941 
942 	nblocks = RelationGetNumberOfBlocks(vacrel->rel);
943 	next_unskippable_block = 0;
944 	next_failsafe_block = 0;
945 	next_fsm_block_to_vacuum = 0;
946 	vacrel->rel_pages = nblocks;
947 	vacrel->scanned_pages = 0;
948 	vacrel->pinskipped_pages = 0;
949 	vacrel->frozenskipped_pages = 0;
950 	vacrel->tupcount_pages = 0;
951 	vacrel->pages_removed = 0;
952 	vacrel->lpdead_item_pages = 0;
953 	vacrel->nonempty_pages = 0;
954 
955 	/* Initialize instrumentation counters */
956 	vacrel->num_index_scans = 0;
957 	vacrel->tuples_deleted = 0;
958 	vacrel->lpdead_items = 0;
959 	vacrel->new_dead_tuples = 0;
960 	vacrel->num_tuples = 0;
961 	vacrel->live_tuples = 0;
962 
963 	vistest = GlobalVisTestFor(vacrel->rel);
964 
965 	vacrel->indstats = (IndexBulkDeleteResult **)
966 		palloc0(vacrel->nindexes * sizeof(IndexBulkDeleteResult *));
967 
968 	/*
969 	 * Before beginning scan, check if it's already necessary to apply
970 	 * failsafe
971 	 */
972 	lazy_check_wraparound_failsafe(vacrel);
973 
974 	/*
975 	 * Allocate the space for dead tuples.  Note that this handles parallel
976 	 * VACUUM initialization as part of allocating shared memory space used
977 	 * for dead_tuples.
978 	 */
979 	lazy_space_alloc(vacrel, params->nworkers, nblocks);
980 	dead_tuples = vacrel->dead_tuples;
981 
982 	/* Report that we're scanning the heap, advertising total # of blocks */
983 	initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP;
984 	initprog_val[1] = nblocks;
985 	initprog_val[2] = dead_tuples->max_tuples;
986 	pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
987 
988 	/*
989 	 * Except when aggressive is set, we want to skip pages that are
990 	 * all-visible according to the visibility map, but only when we can skip
991 	 * at least SKIP_PAGES_THRESHOLD consecutive pages.  Since we're reading
992 	 * sequentially, the OS should be doing readahead for us, so there's no
993 	 * gain in skipping a page now and then; that's likely to disable
994 	 * readahead and so be counterproductive. Also, skipping even a single
995 	 * page means that we can't update relfrozenxid, so we only want to do it
996 	 * if we can skip a goodly number of pages.
997 	 *
998 	 * When aggressive is set, we can't skip pages just because they are
999 	 * all-visible, but we can still skip pages that are all-frozen, since
1000 	 * such pages do not need freezing and do not affect the value that we can
1001 	 * safely set for relfrozenxid or relminmxid.
1002 	 *
1003 	 * Before entering the main loop, establish the invariant that
1004 	 * next_unskippable_block is the next block number >= blkno that we can't
1005 	 * skip based on the visibility map, either all-visible for a regular scan
1006 	 * or all-frozen for an aggressive scan.  We set it to nblocks if there's
1007 	 * no such block.  We also set up the skipping_blocks flag correctly at
1008 	 * this stage.
1009 	 *
1010 	 * Note: The value returned by visibilitymap_get_status could be slightly
1011 	 * out-of-date, since we make this test before reading the corresponding
1012 	 * heap page or locking the buffer.  This is OK.  If we mistakenly think
1013 	 * that the page is all-visible or all-frozen when in fact the flag's just
1014 	 * been cleared, we might fail to vacuum the page.  It's easy to see that
1015 	 * skipping a page when aggressive is not set is not a very big deal; we
1016 	 * might leave some dead tuples lying around, but the next vacuum will
1017 	 * find them.  But even when aggressive *is* set, it's still OK if we miss
1018 	 * a page whose all-frozen marking has just been cleared.  Any new XIDs
1019 	 * just added to that page are necessarily newer than the GlobalXmin we
1020 	 * computed, so they'll have no effect on the value to which we can safely
1021 	 * set relfrozenxid.  A similar argument applies for MXIDs and relminmxid.
1022 	 *
1023 	 * We will scan the table's last page, at least to the extent of
1024 	 * determining whether it has tuples or not, even if it should be skipped
1025 	 * according to the above rules; except when we've already determined that
1026 	 * it's not worth trying to truncate the table.  This avoids having
1027 	 * lazy_truncate_heap() take access-exclusive lock on the table to attempt
1028 	 * a truncation that just fails immediately because there are tuples in
1029 	 * the last page.  This is worth avoiding mainly because such a lock must
1030 	 * be replayed on any hot standby, where it can be disruptive.
1031 	 */
1032 	if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
1033 	{
1034 		while (next_unskippable_block < nblocks)
1035 		{
1036 			uint8		vmstatus;
1037 
1038 			vmstatus = visibilitymap_get_status(vacrel->rel,
1039 												next_unskippable_block,
1040 												&vmbuffer);
1041 			if (aggressive)
1042 			{
1043 				if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0)
1044 					break;
1045 			}
1046 			else
1047 			{
1048 				if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
1049 					break;
1050 			}
1051 			vacuum_delay_point();
1052 			next_unskippable_block++;
1053 		}
1054 	}
1055 
1056 	if (next_unskippable_block >= SKIP_PAGES_THRESHOLD)
1057 		skipping_blocks = true;
1058 	else
1059 		skipping_blocks = false;
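
	/*
	 * Illustrative example of the threshold above, with hypothetical numbers:
	 * if the first 40 blocks are all-visible (or all-frozen when aggressive),
	 * next_unskippable_block is 40 >= SKIP_PAGES_THRESHOLD, so we begin in
	 * skipping mode; if only 10 such blocks lead the table, we read them
	 * anyway, preserving OS readahead and the chance to advance relfrozenxid.
	 */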
1060 
1061 	for (blkno = 0; blkno < nblocks; blkno++)
1062 	{
1063 		Buffer		buf;
1064 		Page		page;
1065 		bool		all_visible_according_to_vm = false;
1066 		LVPagePruneState prunestate;
1067 
1068 		/*
1069 		 * Consider need to skip blocks.  See note above about forcing
1070 		 * scanning of last page.
1071 		 */
1072 #define FORCE_CHECK_PAGE() \
1073 		(blkno == nblocks - 1 && should_attempt_truncation(vacrel))
1074 
1075 		pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
1076 
1077 		update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP,
1078 								 blkno, InvalidOffsetNumber);
1079 
1080 		if (blkno == next_unskippable_block)
1081 		{
1082 			/* Time to advance next_unskippable_block */
1083 			next_unskippable_block++;
1084 			if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
1085 			{
1086 				while (next_unskippable_block < nblocks)
1087 				{
1088 					uint8		vmskipflags;
1089 
1090 					vmskipflags = visibilitymap_get_status(vacrel->rel,
1091 														   next_unskippable_block,
1092 														   &vmbuffer);
1093 					if (aggressive)
1094 					{
1095 						if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0)
1096 							break;
1097 					}
1098 					else
1099 					{
1100 						if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0)
1101 							break;
1102 					}
1103 					vacuum_delay_point();
1104 					next_unskippable_block++;
1105 				}
1106 			}
1107 
1108 			/*
1109 			 * We know we can't skip the current block.  But set up
1110 			 * skipping_blocks to do the right thing at the following blocks.
1111 			 */
1112 			if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD)
1113 				skipping_blocks = true;
1114 			else
1115 				skipping_blocks = false;
1116 
1117 			/*
1118 			 * Normally, the fact that we can't skip this block must mean that
1119 			 * it's not all-visible.  But in an aggressive vacuum we know only
1120 			 * that it's not all-frozen, so it might still be all-visible.
1121 			 */
1122 			if (aggressive && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
1123 				all_visible_according_to_vm = true;
1124 		}
1125 		else
1126 		{
1127 			/*
1128 			 * The current block is potentially skippable; if we've seen a
1129 			 * long enough run of skippable blocks to justify skipping it, and
1130 			 * we're not forced to check it, then go ahead and skip.
1131 			 * Otherwise, the page must be at least all-visible if not
1132 			 * all-frozen, so we can set all_visible_according_to_vm = true.
1133 			 */
1134 			if (skipping_blocks && !FORCE_CHECK_PAGE())
1135 			{
1136 				/*
1137 				 * Tricky, tricky.  If this is in aggressive vacuum, the page
1138 				 * must have been all-frozen at the time we checked whether it
1139 				 * was skippable, but it might not be any more.  We must be
1140 				 * careful to count it as a skipped all-frozen page in that
1141 				 * case, or else we'll think we can't update relfrozenxid and
1142 				 * relminmxid.  If it's not an aggressive vacuum, we don't
1143 				 * know whether it was all-frozen, so we have to recheck; but
1144 				 * in this case an approximate answer is OK.
1145 				 */
1146 				if (aggressive || VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
1147 					vacrel->frozenskipped_pages++;
1148 				continue;
1149 			}
1150 			all_visible_according_to_vm = true;
1151 		}
1152 
1153 		vacuum_delay_point();
1154 
1155 		/*
1156 		 * Regularly check if wraparound failsafe should trigger.
1157 		 *
1158 		 * There is a similar check inside lazy_vacuum_all_indexes(), but
1159 		 * relfrozenxid might start to look dangerously old before we reach
1160 		 * that point.  This check also provides failsafe coverage for the
1161 		 * one-pass strategy, and the two-pass strategy with the index_cleanup
1162 		 * param set to 'off'.
1163 		 */
1164 		if (blkno - next_failsafe_block >= FAILSAFE_EVERY_PAGES)
1165 		{
1166 			lazy_check_wraparound_failsafe(vacrel);
1167 			next_failsafe_block = blkno;
1168 		}
1169 
1170 		/*
1171 		 * Consider if we definitely have enough space to process TIDs on page
1172 		 * already.  If we are close to overrunning the available space for
1173 		 * dead-tuple TIDs, pause and do a cycle of vacuuming before we tackle
1174 		 * this page.
1175 		 */
1176 		if ((dead_tuples->max_tuples - dead_tuples->num_tuples) < MaxHeapTuplesPerPage &&
1177 			dead_tuples->num_tuples > 0)
1178 		{
1179 			/*
1180 			 * Before beginning index vacuuming, we release any pin we may
1181 			 * hold on the visibility map page.  This isn't necessary for
1182 			 * correctness, but we do it anyway to avoid holding the pin
1183 			 * across a lengthy, unrelated operation.
1184 			 */
1185 			if (BufferIsValid(vmbuffer))
1186 			{
1187 				ReleaseBuffer(vmbuffer);
1188 				vmbuffer = InvalidBuffer;
1189 			}
1190 
1191 			/* Remove the collected garbage tuples from table and indexes */
1192 			vacrel->consider_bypass_optimization = false;
1193 			lazy_vacuum(vacrel);
1194 
1195 			/*
1196 			 * Vacuum the Free Space Map to make newly-freed space visible on
1197 			 * upper-level FSM pages.  Note we have not yet processed blkno.
1198 			 */
1199 			FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
1200 									blkno);
1201 			next_fsm_block_to_vacuum = blkno;
1202 
1203 			/* Report that we are once again scanning the heap */
1204 			pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1205 										 PROGRESS_VACUUM_PHASE_SCAN_HEAP);
1206 		}
1207 
1208 		/*
1209 		 * Set up visibility map page as needed.
1210 		 *
1211 		 * Pin the visibility map page in case we need to mark the page
1212 		 * all-visible.  In most cases this will be very cheap, because we'll
1213 		 * already have the correct page pinned anyway.  However, it's
1214 		 * possible that (a) next_unskippable_block is covered by a different
1215 		 * VM page than the current block or (b) we released our pin and did a
1216 		 * cycle of index vacuuming.
1217 		 */
1218 		visibilitymap_pin(vacrel->rel, blkno, &vmbuffer);
1219 
1220 		buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno,
1221 								 RBM_NORMAL, vacrel->bstrategy);
1222 
1223 		/*
1224 		 * We need buffer cleanup lock so that we can prune HOT chains and
1225 		 * defragment the page.
1226 		 */
1227 		if (!ConditionalLockBufferForCleanup(buf))
1228 		{
1229 			bool		hastup;
1230 
1231 			/*
1232 			 * If we're not performing an aggressive scan to guard against XID
1233 			 * wraparound, and we don't want to forcibly check the page, then
1234 			 * it's OK to skip vacuuming pages we get a lock conflict on. They
1235 			 * will be dealt with in some future vacuum.
1236 			 */
1237 			if (!aggressive && !FORCE_CHECK_PAGE())
1238 			{
1239 				ReleaseBuffer(buf);
1240 				vacrel->pinskipped_pages++;
1241 				continue;
1242 			}
1243 
1244 			/*
1245 			 * Read the page with share lock to see if any xids on it need to
1246 			 * be frozen.  If not we just skip the page, after updating our
1247 			 * scan statistics.  If there are some, we wait for cleanup lock.
1248 			 *
1249 			 * We could defer the lock request further by remembering the page
1250 			 * and coming back to it later, or we could even register
1251 			 * ourselves for multiple buffers and then service whichever one
1252 			 * is received first.  For now, this seems good enough.
1253 			 *
1254 			 * If we get here with aggressive false, then we're just forcibly
1255 			 * checking the page, and so we don't want to insist on getting
1256 			 * the lock; we only need to know if the page contains tuples, so
1257 			 * that we can update nonempty_pages correctly.  It's convenient
1258 			 * to use lazy_check_needs_freeze() for both situations, though.
1259 			 */
1260 			LockBuffer(buf, BUFFER_LOCK_SHARE);
1261 			if (!lazy_check_needs_freeze(buf, &hastup, vacrel))
1262 			{
1263 				UnlockReleaseBuffer(buf);
1264 				vacrel->scanned_pages++;
1265 				vacrel->pinskipped_pages++;
1266 				if (hastup)
1267 					vacrel->nonempty_pages = blkno + 1;
1268 				continue;
1269 			}
1270 			if (!aggressive)
1271 			{
1272 				/*
1273 				 * Here, we must not advance scanned_pages; that would amount
1274 				 * to claiming that the page contains no freezable tuples.
1275 				 */
1276 				UnlockReleaseBuffer(buf);
1277 				vacrel->pinskipped_pages++;
1278 				if (hastup)
1279 					vacrel->nonempty_pages = blkno + 1;
1280 				continue;
1281 			}
1282 			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1283 			LockBufferForCleanup(buf);
1284 			/* drop through to normal processing */
1285 		}
1286 
1287 		/*
1288 		 * By here we definitely have enough dead_tuples space for whatever
1289 		 * LP_DEAD tids are on this page, we have the visibility map page set
1290 		 * up in case we need to set this page's all_visible/all_frozen bit,
1291 		 * and we have a super-exclusive lock.  Any tuples on this page are
1292 		 * now sure to be "counted" by this VACUUM.
1293 		 *
1294 		 * One last piece of preamble needs to take place before we can prune:
1295 		 * we need to consider new and empty pages.
1296 		 */
1297 		vacrel->scanned_pages++;
1298 		vacrel->tupcount_pages++;
1299 
1300 		page = BufferGetPage(buf);
1301 
1302 		if (PageIsNew(page))
1303 		{
1304 			/*
1305 			 * All-zeroes pages can be left over if either a backend extends
1306 			 * the relation by a single page, but crashes before the newly
1307 			 * initialized page has been written out, or when bulk-extending
1308 			 * the relation (which creates a number of empty pages at the tail
1309 			 * end of the relation, but enters them into the FSM).
1310 			 *
1311 			 * Note we do not enter the page into the visibilitymap. That has
1312 			 * the downside that we repeatedly visit this page in subsequent
1313 			 * vacuums, but otherwise we'll never discover the space on a
1314 			 * promoted standby. The harm of repeated checking ought to
1315 			 * normally not be too bad - the space usually should be used at
1316 			 * some point, otherwise there wouldn't be any regular vacuums.
1317 			 *
1318 			 * Make sure these pages are in the FSM, to ensure they can be
1319 			 * reused. Do that by testing if there's any space recorded for
1320 			 * the page. If not, enter it. We do so after releasing the lock
1321 			 * on the heap page, the FSM is approximate, after all.
1322 			 */
1323 			UnlockReleaseBuffer(buf);
1324 
1325 			if (GetRecordedFreeSpace(vacrel->rel, blkno) == 0)
1326 			{
1327 				Size		freespace = BLCKSZ - SizeOfPageHeaderData;
1328 
1329 				RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1330 			}
1331 			continue;
1332 		}
1333 
1334 		if (PageIsEmpty(page))
1335 		{
1336 			Size		freespace = PageGetHeapFreeSpace(page);
1337 
1338 			/*
1339 			 * Empty pages are always all-visible and all-frozen (note that
1340 			 * the same is currently not true for new pages, see above).
1341 			 */
1342 			if (!PageIsAllVisible(page))
1343 			{
1344 				START_CRIT_SECTION();
1345 
1346 				/* mark buffer dirty before writing a WAL record */
1347 				MarkBufferDirty(buf);
1348 
1349 				/*
1350 				 * It's possible that another backend has extended the heap,
1351 				 * initialized the page, and then failed to WAL-log the page
1352 				 * due to an ERROR.  Since heap extension is not WAL-logged,
1353 				 * recovery might try to replay our record setting the page
1354 				 * all-visible and find that the page isn't initialized, which
1355 				 * will cause a PANIC.  To prevent that, check whether the
1356 				 * page has been previously WAL-logged, and if not, do that
1357 				 * now.
1358 				 */
1359 				if (RelationNeedsWAL(vacrel->rel) &&
1360 					PageGetLSN(page) == InvalidXLogRecPtr)
1361 					log_newpage_buffer(buf, true);
1362 
1363 				PageSetAllVisible(page);
1364 				visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1365 								  vmbuffer, InvalidTransactionId,
1366 								  VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
1367 				END_CRIT_SECTION();
1368 			}
1369 
1370 			UnlockReleaseBuffer(buf);
1371 			RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1372 			continue;
1373 		}
1374 
1375 		/*
1376 		 * Prune and freeze tuples.
1377 		 *
1378 		 * Accumulates details of remaining LP_DEAD line pointers on page in
1379 		 * dead tuple list.  This includes LP_DEAD line pointers that we
1380 		 * pruned ourselves, as well as existing LP_DEAD line pointers that
1381 		 * were pruned some time earlier.  Also considers freezing XIDs in the
1382 		 * tuple headers of remaining items with storage.
1383 		 */
1384 		lazy_scan_prune(vacrel, buf, blkno, page, vistest, &prunestate);
1385 
1386 		Assert(!prunestate.all_visible || !prunestate.has_lpdead_items);
1387 
1388 		/* Remember the location of the last page with nonremovable tuples */
1389 		if (prunestate.hastup)
1390 			vacrel->nonempty_pages = blkno + 1;
1391 
1392 		if (vacrel->nindexes == 0)
1393 		{
1394 			/*
1395 			 * Now consider the need to do page-at-a-time heap vacuuming
1396 			 * when using the one-pass strategy.
1397 			 *
1398 			 * The one-pass strategy will never call lazy_vacuum().  The steps
1399 			 * performed here can be thought of as the one-pass equivalent of
1400 			 * a call to lazy_vacuum().
1401 			 */
1402 			if (prunestate.has_lpdead_items)
1403 			{
1404 				Size		freespace;
1405 
1406 				lazy_vacuum_heap_page(vacrel, blkno, buf, 0, &vmbuffer);
1407 
1408 				/* Forget the now-vacuumed tuples */
1409 				dead_tuples->num_tuples = 0;
1410 
1411 				/*
1412 				 * Periodically perform FSM vacuuming to make newly-freed
1413 				 * space visible on upper FSM pages.  Note we have not yet
1414 				 * performed FSM processing for blkno.
1415 				 */
1416 				if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
1417 				{
1418 					FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
1419 											blkno);
1420 					next_fsm_block_to_vacuum = blkno;
1421 				}
1422 
1423 				/*
1424 				 * Now perform FSM processing for blkno, and move on to next
1425 				 * page.
1426 				 *
1427 				 * Our call to lazy_vacuum_heap_page() will have considered if
1428 				 * it's possible to set all_visible/all_frozen independently
1429 				 * of lazy_scan_prune().  Note that prunestate was invalidated
1430 				 * by lazy_vacuum_heap_page() call.
1431 				 */
1432 				freespace = PageGetHeapFreeSpace(page);
1433 
1434 				UnlockReleaseBuffer(buf);
1435 				RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1436 				continue;
1437 			}
1438 
1439 			/*
1440 			 * There was no call to lazy_vacuum_heap_page() because pruning
1441 			 * didn't encounter/create any LP_DEAD items that needed to be
1442 			 * vacuumed.  Prune state has not been invalidated, so proceed
1443 			 * with prunestate-driven visibility map and FSM steps (just like
1444 			 * the two-pass strategy).
1445 			 */
1446 			Assert(dead_tuples->num_tuples == 0);
1447 		}
1448 
1449 		/*
1450 		 * Handle setting visibility map bit based on what the VM said about
1451 		 * the page before pruning started, and using prunestate
1452 		 */
1453 		if (!all_visible_according_to_vm && prunestate.all_visible)
1454 		{
1455 			uint8		flags = VISIBILITYMAP_ALL_VISIBLE;
1456 
1457 			if (prunestate.all_frozen)
1458 				flags |= VISIBILITYMAP_ALL_FROZEN;
1459 
1460 			/*
1461 			 * It should never be the case that the visibility map bit is set
1462 			 * while the page-level bit is clear, but the reverse is allowed
1463 			 * (if checksums are not enabled).  Regardless, set both bits so
1464 			 * that we get back in sync.
1465 			 *
1466 			 * NB: If the heap page is all-visible but the VM bit is not set,
1467 			 * we don't need to dirty the heap page.  However, if checksums
1468 			 * are enabled, we do need to make sure that the heap page is
1469 			 * dirtied before passing it to visibilitymap_set(), because it
1470 			 * may be logged.  Given that this situation should only happen in
1471 			 * rare cases after a crash, it is not worth optimizing.
1472 			 */
1473 			PageSetAllVisible(page);
1474 			MarkBufferDirty(buf);
1475 			visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1476 							  vmbuffer, prunestate.visibility_cutoff_xid,
1477 							  flags);
1478 		}
1479 
1480 		/*
1481 		 * As of PostgreSQL 9.2, the visibility map bit should never be set if
1482 		 * the page-level bit is clear.  However, it's possible that the bit
1483 		 * got cleared after we checked it and before we took the buffer
1484 		 * content lock, so we must recheck before jumping to the conclusion
1485 		 * that something bad has happened.
1486 		 */
1487 		else if (all_visible_according_to_vm && !PageIsAllVisible(page)
1488 				 && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
1489 		{
1490 			elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
1491 				 vacrel->relname, blkno);
1492 			visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
1493 								VISIBILITYMAP_VALID_BITS);
1494 		}
1495 
1496 		/*
1497 		 * It's possible for the value returned by
1498 		 * GetOldestNonRemovableTransactionId() to move backwards, so it's not
1499 		 * wrong for us to see tuples that appear to not be visible to
1500 		 * everyone yet, while PD_ALL_VISIBLE is already set. The real safe
1501 		 * xmin value never moves backwards, but
1502 		 * GetOldestNonRemovableTransactionId() is conservative and sometimes
1503 		 * returns a value that's unnecessarily small, so if we see that
1504 		 * contradiction it just means that the tuples that we think are not
1505 		 * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag
1506 		 * is correct.
1507 		 *
1508 		 * There should never be dead tuples on a page with PD_ALL_VISIBLE
1509 		 * set, however.
1510 		 */
1511 		else if (prunestate.has_lpdead_items && PageIsAllVisible(page))
1512 		{
1513 			elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
1514 				 vacrel->relname, blkno);
1515 			PageClearAllVisible(page);
1516 			MarkBufferDirty(buf);
1517 			visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
1518 								VISIBILITYMAP_VALID_BITS);
1519 		}
1520 
1521 		/*
1522 		 * If the all-visible page is all-frozen but not marked as such yet,
1523 		 * mark it as all-frozen.  Note that all_frozen is only valid if
1524 		 * all_visible is true, so we must check both.
1525 		 */
1526 		else if (all_visible_according_to_vm && prunestate.all_visible &&
1527 				 prunestate.all_frozen &&
1528 				 !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
1529 		{
1530 			/*
1531 			 * We can pass InvalidTransactionId as the cutoff XID here,
1532 			 * because setting the all-frozen bit doesn't cause recovery
1533 			 * conflicts.
1534 			 */
1535 			visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1536 							  vmbuffer, InvalidTransactionId,
1537 							  VISIBILITYMAP_ALL_FROZEN);
1538 		}
1539 
1540 		/*
1541 		 * Final steps for block: drop super-exclusive lock, record free space
1542 		 * in the FSM
1543 		 */
1544 		if (prunestate.has_lpdead_items && vacrel->do_index_vacuuming)
1545 		{
1546 			/*
1547 			 * Wait until lazy_vacuum_heap_rel() to save free space.  This
1548 			 * doesn't just save us some cycles; it also allows us to record
1549 			 * any additional free space that lazy_vacuum_heap_page() will
1550 			 * make available in cases where it's possible to truncate the
1551 			 * page's line pointer array.
1552 			 *
1553 			 * Note: It's not in fact 100% certain that we really will call
1554 			 * lazy_vacuum_heap_rel() -- lazy_vacuum() might yet opt to skip
1555 			 * index vacuuming (and so must skip heap vacuuming).  This is
1556 			 * deemed okay because it only happens in emergencies, or when
1557 			 * there is very little free space anyway. (Besides, we start
1558 			 * recording free space in the FSM once index vacuuming has been
1559 			 * abandoned.)
1560 			 *
1561 			 * Note: The one-pass (no indexes) case is only supposed to make
1562 			 * it this far when there were no LP_DEAD items during pruning.
1563 			 */
1564 			Assert(vacrel->nindexes > 0);
1565 			UnlockReleaseBuffer(buf);
1566 		}
1567 		else
1568 		{
1569 			Size		freespace = PageGetHeapFreeSpace(page);
1570 
1571 			UnlockReleaseBuffer(buf);
1572 			RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1573 		}
1574 	}
1575 
1576 	/* report that everything is now scanned */
1577 	pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
1578 
1579 	/* Clear the block number information */
1580 	vacrel->blkno = InvalidBlockNumber;
1581 
1582 	/* now we can compute the new value for pg_class.reltuples */
1583 	vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, nblocks,
1584 													 vacrel->tupcount_pages,
1585 													 vacrel->live_tuples);
1586 
1587 	/*
1588 	 * Also compute the total number of surviving heap entries.  In the
1589 	 * (unlikely) scenario that new_live_tuples is -1, take it as zero.
1590 	 */
1591 	vacrel->new_rel_tuples =
1592 		Max(vacrel->new_live_tuples, 0) + vacrel->new_dead_tuples;
1593 
1594 	/*
1595 	 * Release any remaining pin on visibility map page.
1596 	 */
1597 	if (BufferIsValid(vmbuffer))
1598 	{
1599 		ReleaseBuffer(vmbuffer);
1600 		vmbuffer = InvalidBuffer;
1601 	}
1602 
1603 	/* If any tuples need to be deleted, perform final vacuum cycle */
1604 	if (dead_tuples->num_tuples > 0)
1605 		lazy_vacuum(vacrel);
1606 
1607 	/*
1608 	 * Vacuum the remainder of the Free Space Map.  We must do this whether or
1609 	 * not there were indexes, and whether or not we bypassed index vacuuming.
1610 	 */
1611 	if (blkno > next_fsm_block_to_vacuum)
1612 		FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, blkno);
1613 
1614 	/* report all blocks vacuumed */
1615 	pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
1616 
1617 	/* Do post-vacuum cleanup */
1618 	if (vacrel->nindexes > 0 && vacrel->do_index_cleanup)
1619 		lazy_cleanup_all_indexes(vacrel);
1620 
1621 	/*
1622 	 * Free resources managed by lazy_space_alloc().  (We must end parallel
1623 	 * mode/free shared memory before updating index statistics.  We cannot
1624 	 * write while in parallel mode.)
1625 	 */
1626 	lazy_space_free(vacrel);
1627 
1628 	/* Update index statistics */
1629 	if (vacrel->nindexes > 0 && vacrel->do_index_cleanup)
1630 		update_index_statistics(vacrel);
1631 
1632 	/*
1633 	 * When the table has no indexes (i.e. in the one-pass strategy case),
1634 	 * make the log report that lazy_vacuum_heap_rel would've made had there been
1635 	 * indexes.  (As in the two-pass strategy case, only make this report when
1636 	 * there were LP_DEAD line pointers vacuumed in lazy_vacuum_heap_page.)
1637 	 */
1638 	if (vacrel->nindexes == 0 && vacrel->lpdead_item_pages > 0)
1639 		ereport(elevel,
1640 				(errmsg("table \"%s\": removed %lld dead item identifiers in %u pages",
1641 						vacrel->relname, (long long) vacrel->lpdead_items,
1642 						vacrel->lpdead_item_pages)));
1643 
1644 	/*
1645 	 * Make a log report summarizing pruning and freezing.
1646 	 *
1647 	 * The autovacuum specific logging in heap_vacuum_rel summarizes an entire
1648 	 * VACUUM operation, whereas each VACUUM VERBOSE log report generally
1649 	 * summarizes a single round of index/heap vacuuming (or rel truncation).
1650 	 * It wouldn't make sense to report on pruning or freezing while following
1651 	 * that convention, though.  You can think of this log report as a summary
1652 	 * of our first pass over the heap.
1653 	 */
1654 	initStringInfo(&buf);
1655 	appendStringInfo(&buf,
1656 					 _("%lld dead row versions cannot be removed yet, oldest xmin: %u\n"),
1657 					 (long long) vacrel->new_dead_tuples, vacrel->OldestXmin);
1658 	appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins, ",
1659 									"Skipped %u pages due to buffer pins, ",
1660 									vacrel->pinskipped_pages),
1661 					 vacrel->pinskipped_pages);
1662 	appendStringInfo(&buf, ngettext("%u frozen page.\n",
1663 									"%u frozen pages.\n",
1664 									vacrel->frozenskipped_pages),
1665 					 vacrel->frozenskipped_pages);
1666 	appendStringInfo(&buf, _("%s."), pg_rusage_show(&ru0));
1667 
1668 	ereport(elevel,
1669 			(errmsg("table \"%s\": found %lld removable, %lld nonremovable row versions in %u out of %u pages",
1670 					vacrel->relname,
1671 					(long long) vacrel->tuples_deleted,
1672 					(long long) vacrel->num_tuples, vacrel->scanned_pages,
1673 					nblocks),
1674 			 errdetail_internal("%s", buf.data)));
1675 	pfree(buf.data);
1676 }
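
/*
 * A minimal numeric sketch of the reltuples accounting performed at the end
 * of lazy_scan_heap() above.  The figures below are invented purely to make
 * the arithmetic concrete; nothing here is compiled.
 */
#if 0
	/* Say vac_estimate_reltuples() extrapolated 900,000 live tuples ... */
	double		new_live_tuples = 900000;
	/* ... and 5,000 RECENTLY_DEAD tuples could not be removed yet */
	int64		new_dead_tuples = 5000;
	double		new_rel_tuples;

	/* Same formula as above; Max() guards the -1 "unknown" sentinel */
	new_rel_tuples = Max(new_live_tuples, 0) + new_dead_tuples;	/* 905,000 */
#endif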
1677 
1678 /*
1679  *	lazy_scan_prune() -- lazy_scan_heap() pruning and freezing.
1680  *
1681  * Caller must hold pin and buffer cleanup lock on the buffer.
1682  *
1683  * Prior to PostgreSQL 14 there were very rare cases where heap_page_prune()
1684  * was allowed to disagree with our HeapTupleSatisfiesVacuum() call about
1685  * whether or not a tuple should be considered DEAD.  This happened when an
1686  * inserting transaction concurrently aborted (after our heap_page_prune()
1687  * call, before our HeapTupleSatisfiesVacuum() call).  There was rather a lot
1688  * of complexity just so we could deal with tuples that were DEAD to VACUUM,
1689  * but nevertheless were left with storage after pruning.
1690  *
1691  * The approach we take now is to restart pruning when the race condition is
1692  * detected.  This allows heap_page_prune() to prune the tuples inserted by
1693  * the now-aborted transaction.  This is a little crude, but it guarantees
1694  * that any items that make it into the dead_tuples array are simple LP_DEAD
1695  * line pointers, and that every remaining item with tuple storage is
1696  * considered as a candidate for freezing.
1697  */
1698 static void
1699 lazy_scan_prune(LVRelState *vacrel,
1700 				Buffer buf,
1701 				BlockNumber blkno,
1702 				Page page,
1703 				GlobalVisState *vistest,
1704 				LVPagePruneState *prunestate)
1705 {
1706 	Relation	rel = vacrel->rel;
1707 	OffsetNumber offnum,
1708 				maxoff;
1709 	ItemId		itemid;
1710 	HeapTupleData tuple;
1711 	HTSV_Result res;
1712 	int			tuples_deleted,
1713 				lpdead_items,
1714 				new_dead_tuples,
1715 				num_tuples,
1716 				live_tuples;
1717 	int			nfrozen;
1718 	OffsetNumber deadoffsets[MaxHeapTuplesPerPage];
1719 	xl_heap_freeze_tuple frozen[MaxHeapTuplesPerPage];
1720 
1721 	maxoff = PageGetMaxOffsetNumber(page);
1722 
1723 retry:
1724 
1725 	/* Initialize (or reset) page-level counters */
1726 	tuples_deleted = 0;
1727 	lpdead_items = 0;
1728 	new_dead_tuples = 0;
1729 	num_tuples = 0;
1730 	live_tuples = 0;
1731 
1732 	/*
1733 	 * Prune all HOT-update chains in this page.
1734 	 *
1735 	 * We count tuples removed by the pruning step as tuples_deleted.  Its
1736 	 * final value can be thought of as the number of tuples that have been
1737 	 * deleted from the table.  It should not be confused with lpdead_items;
1738 	 * lpdead_items's final value can be thought of as the number of tuples
1739 	 * that were deleted from indexes.
1740 	 */
1741 	tuples_deleted = heap_page_prune(rel, buf, vistest,
1742 									 InvalidTransactionId, 0, false,
1743 									 &vacrel->offnum);
1744 
1745 	/*
1746 	 * Now scan the page to collect LP_DEAD items and check for tuples
1747 	 * requiring freezing among remaining tuples with storage
1748 	 */
1749 	prunestate->hastup = false;
1750 	prunestate->has_lpdead_items = false;
1751 	prunestate->all_visible = true;
1752 	prunestate->all_frozen = true;
1753 	prunestate->visibility_cutoff_xid = InvalidTransactionId;
1754 	nfrozen = 0;
1755 
1756 	for (offnum = FirstOffsetNumber;
1757 		 offnum <= maxoff;
1758 		 offnum = OffsetNumberNext(offnum))
1759 	{
1760 		bool		tuple_totally_frozen;
1761 
1762 		/*
1763 		 * Set the offset number so that we can display it along with any
1764 		 * error that occurred while processing this tuple.
1765 		 */
1766 		vacrel->offnum = offnum;
1767 		itemid = PageGetItemId(page, offnum);
1768 
1769 		if (!ItemIdIsUsed(itemid))
1770 			continue;
1771 
1772 		/* Redirect items mustn't be touched */
1773 		if (ItemIdIsRedirected(itemid))
1774 		{
1775 			prunestate->hastup = true;	/* page won't be truncatable */
1776 			continue;
1777 		}
1778 
1779 		/*
1780 		 * LP_DEAD items are processed outside of the loop.
1781 		 *
1782 		 * Note that we deliberately don't set hastup=true in the case of an
1783 		 * LP_DEAD item here, which is not how lazy_check_needs_freeze() or
1784 		 * count_nondeletable_pages() do it -- they only consider pages empty
1785 		 * when they only have LP_UNUSED items, which is important for
1786 		 * correctness.
1787 		 *
1788 		 * Our assumption is that any LP_DEAD items we encounter here will
1789 		 * become LP_UNUSED inside lazy_vacuum_heap_page() before we actually
1790 		 * call count_nondeletable_pages().  In any case our opinion of
1791 		 * whether or not a page 'hastup' (which is how our caller sets its
1792 		 * vacrel->nonempty_pages value) is inherently race-prone.  It must be
1793 		 * treated as advisory/unreliable, so we might as well be slightly
1794 		 * optimistic.
1795 		 */
1796 		if (ItemIdIsDead(itemid))
1797 		{
1798 			deadoffsets[lpdead_items++] = offnum;
1799 			prunestate->all_visible = false;
1800 			prunestate->has_lpdead_items = true;
1801 			continue;
1802 		}
1803 
1804 		Assert(ItemIdIsNormal(itemid));
1805 
1806 		ItemPointerSet(&(tuple.t_self), blkno, offnum);
1807 		tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1808 		tuple.t_len = ItemIdGetLength(itemid);
1809 		tuple.t_tableOid = RelationGetRelid(rel);
1810 
1811 		/*
1812 		 * DEAD tuples are almost always pruned into LP_DEAD line pointers by
1813 		 * heap_page_prune(), but it's possible that the tuple state changed
1814 		 * since heap_page_prune() looked.  Handle that here by restarting.
1815 		 * (See comments at the top of function for a full explanation.)
1816 		 */
1817 		res = HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf);
1818 
1819 		if (unlikely(res == HEAPTUPLE_DEAD))
1820 			goto retry;
1821 
1822 		/*
1823 		 * The criteria for counting a tuple as live in this block need to
1824 		 * match what analyze.c's acquire_sample_rows() does, otherwise VACUUM
1825 		 * and ANALYZE may produce wildly different reltuples values, e.g.
1826 		 * when there are many recently-dead tuples.
1827 		 *
1828 		 * The logic here is a bit simpler than acquire_sample_rows(), as
1829 		 * VACUUM can't run inside a transaction block, which makes some cases
1830 		 * impossible (e.g. in-progress insert from the same transaction).
1831 		 *
1832 		 * We treat LP_DEAD items a little differently, too -- we don't count
1833 		 * them as dead_tuples at all (we only consider new_dead_tuples).  The
1834 		 * outcome is no different because we assume that any LP_DEAD items we
1835 		 * encounter here will become LP_UNUSED inside lazy_vacuum_heap_page()
1836 		 * before we report anything to the stats collector. (Cases where we
1837 		 * bypass index vacuuming will violate our assumption, but the overall
1838 		 * impact of that should be negligible.)
1839 		 */
1840 		switch (res)
1841 		{
1842 			case HEAPTUPLE_LIVE:
1843 
1844 				/*
1845 				 * Count it as live.  Not only is this natural, but it's also
1846 				 * what acquire_sample_rows() does.
1847 				 */
1848 				live_tuples++;
1849 
1850 				/*
1851 				 * Is the tuple definitely visible to all transactions?
1852 				 *
1853 				 * NB: Like with per-tuple hint bits, we can't set the
1854 				 * PD_ALL_VISIBLE flag if the inserter committed
1855 				 * asynchronously. See SetHintBits for more info. Check that
1856 				 * the tuple is hinted xmin-committed because of that.
1857 				 */
1858 				if (prunestate->all_visible)
1859 				{
1860 					TransactionId xmin;
1861 
1862 					if (!HeapTupleHeaderXminCommitted(tuple.t_data))
1863 					{
1864 						prunestate->all_visible = false;
1865 						break;
1866 					}
1867 
1868 					/*
1869 					 * The inserter definitely committed. But is it old enough
1870 					 * that everyone sees it as committed?
1871 					 */
1872 					xmin = HeapTupleHeaderGetXmin(tuple.t_data);
1873 					if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
1874 					{
1875 						prunestate->all_visible = false;
1876 						break;
1877 					}
1878 
1879 					/* Track newest xmin on page. */
1880 					if (TransactionIdFollows(xmin, prunestate->visibility_cutoff_xid))
1881 						prunestate->visibility_cutoff_xid = xmin;
1882 				}
1883 				break;
1884 			case HEAPTUPLE_RECENTLY_DEAD:
1885 
1886 				/*
1887 				 * If tuple is recently deleted then we must not remove it
1888 				 * from relation.  (We only remove items that are LP_DEAD from
1889 				 * pruning.)
1890 				 */
1891 				new_dead_tuples++;
1892 				prunestate->all_visible = false;
1893 				break;
1894 			case HEAPTUPLE_INSERT_IN_PROGRESS:
1895 
1896 				/*
1897 				 * We do not count these rows as live, because we expect the
1898 				 * inserting transaction to update the counters at commit, and
1899 				 * we assume that will happen only after we report our
1900 				 * results.  This assumption is a bit shaky, but it is what
1901 				 * acquire_sample_rows() does, so be consistent.
1902 				 */
1903 				prunestate->all_visible = false;
1904 				break;
1905 			case HEAPTUPLE_DELETE_IN_PROGRESS:
1906 				/* This is an expected case during concurrent vacuum */
1907 				prunestate->all_visible = false;
1908 
1909 				/*
1910 				 * Count such rows as live.  As above, we assume the deleting
1911 				 * transaction will commit and update the counters after we
1912 				 * report.
1913 				 */
1914 				live_tuples++;
1915 				break;
1916 			default:
1917 				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1918 				break;
1919 		}
1920 
1921 		/*
1922 		 * Non-removable tuple (i.e. tuple with storage).
1923 		 *
1924 		 * Check whether the tuple left behind after pruning needs to be
1925 		 * frozen now.
1926 		 */
1927 		num_tuples++;
1928 		prunestate->hastup = true;
1929 		if (heap_prepare_freeze_tuple(tuple.t_data,
1930 									  vacrel->relfrozenxid,
1931 									  vacrel->relminmxid,
1932 									  vacrel->FreezeLimit,
1933 									  vacrel->MultiXactCutoff,
1934 									  &frozen[nfrozen],
1935 									  &tuple_totally_frozen))
1936 		{
1937 			/* Will execute freeze below */
1938 			frozen[nfrozen++].offset = offnum;
1939 		}
1940 
1941 		/*
1942 		 * If tuple is not frozen (and not about to become frozen) then caller
1943 		 * had better not go on to set this page's VM bit
1944 		 */
1945 		if (!tuple_totally_frozen)
1946 			prunestate->all_frozen = false;
1947 	}
1948 
1949 	/*
1950 	 * We have now divided every item on the page into either an LP_DEAD item
1951 	 * that will need to be vacuumed in indexes later, or a LP_NORMAL tuple
1952 	 * that remains and needs to be considered for freezing now (LP_UNUSED and
1953 	 * LP_REDIRECT items also remain, but are of no further interest to us).
1954 	 */
1955 	vacrel->offnum = InvalidOffsetNumber;
1956 
1957 	/*
1958 	 * Consider the need to freeze any items with tuple storage from the page
1959 	 * first (arbitrary)
1960 	 */
1961 	if (nfrozen > 0)
1962 	{
1963 		Assert(prunestate->hastup);
1964 
1965 		/*
1966 		 * At least one tuple with storage needs to be frozen -- execute that
1967 		 * now.
1968 		 *
1969 		 * If we need to freeze any tuples we'll mark the buffer dirty, and
1970 		 * write a WAL record recording the changes.  We must log the changes
1971 		 * to be crash-safe against future truncation of CLOG.
1972 		 */
1973 		START_CRIT_SECTION();
1974 
1975 		MarkBufferDirty(buf);
1976 
1977 		/* execute collected freezes */
1978 		for (int i = 0; i < nfrozen; i++)
1979 		{
1980 			HeapTupleHeader htup;
1981 
1982 			itemid = PageGetItemId(page, frozen[i].offset);
1983 			htup = (HeapTupleHeader) PageGetItem(page, itemid);
1984 
1985 			heap_execute_freeze_tuple(htup, &frozen[i]);
1986 		}
1987 
1988 		/* Now WAL-log freezing if necessary */
1989 		if (RelationNeedsWAL(vacrel->rel))
1990 		{
1991 			XLogRecPtr	recptr;
1992 
1993 			recptr = log_heap_freeze(vacrel->rel, buf, vacrel->FreezeLimit,
1994 									 frozen, nfrozen);
1995 			PageSetLSN(page, recptr);
1996 		}
1997 
1998 		END_CRIT_SECTION();
1999 	}
2000 
2001 	/*
2002 	 * The second pass over the heap can also set visibility map bits, using
2003 	 * the same approach.  This is important when the table frequently has a
2004 	 * few old LP_DEAD items on each page by the time we get to it (typically
2005 	 * because past opportunistic pruning operations freed some non-HOT
2006 	 * tuples).
2007 	 *
2008 	 * VACUUM will call heap_page_is_all_visible() during the second pass over
2009 	 * the heap to determine all_visible and all_frozen for the page -- this
2010 	 * is a specialized version of the logic from this function.  Now that
2011 	 * we've finished pruning and freezing, make sure that we're in total
2012 	 * agreement with heap_page_is_all_visible() using an assertion.
2013 	 */
2014 #ifdef USE_ASSERT_CHECKING
2015 	/* Note that all_frozen value does not matter when !all_visible */
2016 	if (prunestate->all_visible)
2017 	{
2018 		TransactionId cutoff;
2019 		bool		all_frozen;
2020 
2021 		if (!heap_page_is_all_visible(vacrel, buf, &cutoff, &all_frozen))
2022 			Assert(false);
2023 
2024 		Assert(lpdead_items == 0);
2025 		Assert(prunestate->all_frozen == all_frozen);
2026 
2027 		/*
2028 		 * It's possible that we froze tuples and made the page's XID cutoff
2029 		 * (for recovery conflict purposes) FrozenTransactionId.  This is okay
2030 		 * because visibility_cutoff_xid will be logged by our caller in a
2031 		 * moment.
2032 		 */
2033 		Assert(cutoff == FrozenTransactionId ||
2034 			   cutoff == prunestate->visibility_cutoff_xid);
2035 	}
2036 #endif
2037 
2038 	/*
2039 	 * Now save details of the LP_DEAD items from the page in the dead_tuples
2040 	 * array.  Also record that page has dead items in per-page prunestate.
2041 	 */
2042 	if (lpdead_items > 0)
2043 	{
2044 		LVDeadTuples *dead_tuples = vacrel->dead_tuples;
2045 		ItemPointerData tmp;
2046 
2047 		Assert(!prunestate->all_visible);
2048 		Assert(prunestate->has_lpdead_items);
2049 
2050 		vacrel->lpdead_item_pages++;
2051 
2052 		ItemPointerSetBlockNumber(&tmp, blkno);
2053 
2054 		for (int i = 0; i < lpdead_items; i++)
2055 		{
2056 			ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]);
2057 			dead_tuples->itemptrs[dead_tuples->num_tuples++] = tmp;
2058 		}
2059 
2060 		Assert(dead_tuples->num_tuples <= dead_tuples->max_tuples);
2061 		pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
2062 									 dead_tuples->num_tuples);
2063 	}
2064 
2065 	/* Finally, add page-local counts to whole-VACUUM counts */
2066 	vacrel->tuples_deleted += tuples_deleted;
2067 	vacrel->lpdead_items += lpdead_items;
2068 	vacrel->new_dead_tuples += new_dead_tuples;
2069 	vacrel->num_tuples += num_tuples;
2070 	vacrel->live_tuples += live_tuples;
2071 }
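
/*
 * A worked example of the visibility_cutoff_xid tracking performed above,
 * using made-up XIDs.  All names below are local to the sketch and nothing
 * here is compiled.
 */
#if 0
	TransactionId oldest_xmin = 200;	/* hypothetical vacrel->OldestXmin */
	TransactionId xmins[] = {100, 150, 120};	/* committed xmins on page */
	TransactionId cutoff = InvalidTransactionId;

	for (int i = 0; i < 3; i++)
	{
		/* every xmin precedes OldestXmin, so all_visible remains true */
		Assert(TransactionIdPrecedes(xmins[i], oldest_xmin));
		/* track the newest xmin, exactly as the loop above does */
		if (TransactionIdFollows(xmins[i], cutoff))
			cutoff = xmins[i];
	}
	/* cutoff is now 150 -- the XID later passed to visibilitymap_set() */
#endif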
2072 
2073 /*
2074  * Remove the collected garbage tuples from the table and its indexes.
2075  *
2076  * We may choose to bypass index vacuuming at this point, though only when the
2077  * ongoing VACUUM operation will definitely only have one index scan/round of
2078  * index vacuuming.  Caller indicates whether or not this is such a VACUUM
2079  * operation using 'onecall' argument.
2080  *
2081  * In rare emergencies, the ongoing VACUUM operation can be made to skip both
2082  * index vacuuming and index cleanup at the point we're called.  This avoids
2083  * having the whole system refuse to allocate further XIDs/MultiXactIds due to
2084  * wraparound.
2085  */
2086 static void
2087 lazy_vacuum(LVRelState *vacrel)
2088 {
2089 	bool		bypass;
2090 
2091 	/* Should not end up here with no indexes */
2092 	Assert(vacrel->nindexes > 0);
2093 	Assert(!IsParallelWorker());
2094 	Assert(vacrel->lpdead_item_pages > 0);
2095 
2096 	if (!vacrel->do_index_vacuuming)
2097 	{
2098 		Assert(!vacrel->do_index_cleanup);
2099 		vacrel->dead_tuples->num_tuples = 0;
2100 		return;
2101 	}
2102 
2103 	/*
2104 	 * Consider bypassing index vacuuming (and heap vacuuming) entirely.
2105 	 *
2106 	 * We currently only do this in cases where the number of LP_DEAD items
2107 	 * for the entire VACUUM operation is close to zero.  This avoids sharp
2108 	 * discontinuities in the duration and overhead of successive VACUUM
2109 	 * operations that run against the same table with a fixed workload.
2110 	 * Ideally, successive VACUUM operations will behave as if there are
2111 	 * exactly zero LP_DEAD items in cases where there are close to zero.
2112 	 *
2113 	 * This is likely to be helpful with a table that is continually affected
2114 	 * by UPDATEs that can mostly apply the HOT optimization, but occasionally
2115 	 * have small aberrations that lead to just a few heap pages retaining
2116 	 * only one or two LP_DEAD items.  This is pretty common; even when the
2117 	 * DBA goes out of their way to make UPDATEs use HOT, it is practically
2118 	 * impossible to predict whether HOT will be applied in 100% of cases.
2119 	 * It's far easier to ensure that 99%+ of all UPDATEs against a table use
2120 	 * HOT through careful tuning.
2121 	 */
2122 	bypass = false;
2123 	if (vacrel->consider_bypass_optimization && vacrel->rel_pages > 0)
2124 	{
2125 		BlockNumber threshold;
2126 
2127 		Assert(vacrel->num_index_scans == 0);
2128 		Assert(vacrel->lpdead_items == vacrel->dead_tuples->num_tuples);
2129 		Assert(vacrel->do_index_vacuuming);
2130 		Assert(vacrel->do_index_cleanup);
2131 
2132 		/*
2133 		 * This crossover point at which we'll start to do index vacuuming is
2134 		 * expressed as a percentage of the total number of heap pages in the
2135 		 * table that are known to have at least one LP_DEAD item.  This is
2136 		 * much more important than the total number of LP_DEAD items, since
2137 		 * it's a proxy for the number of heap pages whose visibility map bits
2138 		 * cannot be set on account of bypassing index and heap vacuuming.
2139 		 *
2140 		 * We apply one further precautionary test: the space currently used
2141 		 * to store the TIDs (TIDs that now all point to LP_DEAD items) must
2142 		 * not exceed 32MB.  This limits the risk that we will bypass index
2143 		 * vacuuming again and again until eventually there is a VACUUM whose
2144 		 * dead_tuples space is not CPU cache resident.
2145 		 *
2146 		 * We don't take any special steps to remember the LP_DEAD items (such
2147 		 * as counting them in new_dead_tuples report to the stats collector)
2148 		 * when the optimization is applied.  Though the accounting used in
2149 		 * analyze.c's acquire_sample_rows() will recognize the same LP_DEAD
2150 		 * items as dead rows in its own stats collector report, that's okay.
2151 		 * The discrepancy should be negligible.  If this optimization is ever
2152 		 * expanded to cover more cases then this may need to be reconsidered.
2153 		 */
2154 		threshold = (double) vacrel->rel_pages * BYPASS_THRESHOLD_PAGES;
2155 		bypass = (vacrel->lpdead_item_pages < threshold &&
2156 				  vacrel->lpdead_items < MAXDEADTUPLES(32L * 1024L * 1024L));
2157 	}
2158 
2159 	if (bypass)
2160 	{
2161 		/*
2162 		 * There are almost zero TIDs.  Behave as if there were precisely
2163 		 * zero: bypass index vacuuming, but do index cleanup.
2164 		 *
2165 		 * We expect that the ongoing VACUUM operation will finish very
2166 		 * quickly, so there is no point in considering speeding up as a
2167 		 * failsafe against wraparound failure. (Index cleanup is expected to
2168 		 * finish very quickly in cases where there were no ambulkdelete()
2169 		 * calls.)
2170 		 */
2171 		vacrel->do_index_vacuuming = false;
2172 		ereport(elevel,
2173 				(errmsg("table \"%s\": index scan bypassed: %u pages from table (%.2f%% of total) have %lld dead item identifiers",
2174 						vacrel->relname, vacrel->lpdead_item_pages,
2175 						100.0 * vacrel->lpdead_item_pages / vacrel->rel_pages,
2176 						(long long) vacrel->lpdead_items)));
2177 	}
2178 	else if (lazy_vacuum_all_indexes(vacrel))
2179 	{
2180 		/*
2181 		 * We successfully completed a round of index vacuuming.  Do related
2182 		 * heap vacuuming now.
2183 		 */
2184 		lazy_vacuum_heap_rel(vacrel);
2185 	}
2186 	else
2187 	{
2188 		/*
2189 		 * Failsafe case.
2190 		 *
2191 		 * We attempted index vacuuming, but didn't finish a full round/full
2192 		 * index scan.  This happens when relfrozenxid or relminmxid is too
2193 		 * far in the past.
2194 		 *
2195 		 * From this point on the VACUUM operation will do no further index
2196 		 * vacuuming or heap vacuuming.  This VACUUM operation won't end up
2197 		 * back here again.
2198 		 */
2199 		Assert(vacrel->failsafe_active);
2200 	}
2201 
2202 	/*
2203 	 * Forget the LP_DEAD items that we just vacuumed (or just decided to not
2204 	 * vacuum)
2205 	 */
2206 	vacrel->dead_tuples->num_tuples = 0;
2207 }
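
/*
 * The bypass test above with concrete numbers.  BYPASS_THRESHOLD_PAGES and
 * MAXDEADTUPLES() are defined earlier in this file; the table size here is
 * made up, and the 2% figure is only an assumed value of
 * BYPASS_THRESHOLD_PAGES for the sake of the example.  Nothing here is
 * compiled.
 */
#if 0
	BlockNumber rel_pages = 1000000;	/* hypothetical table size */
	BlockNumber lpdead_item_pages = 120;	/* pages with >= 1 LP_DEAD item */
	int64		lpdead_items = 300;

	/*
	 * With a 2% threshold the page cutoff is 20,000 pages; 120 is far below
	 * that, and 300 TIDs is far below the several million ItemPointers that
	 * fit in 32MB, so this round would bypass index vacuuming.
	 */
	bool		bypass = (lpdead_item_pages < rel_pages * BYPASS_THRESHOLD_PAGES &&
						  lpdead_items < MAXDEADTUPLES(32L * 1024L * 1024L));
#endif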
2208 
2209 /*
2210  *	lazy_vacuum_all_indexes() -- Main entry for index vacuuming
2211  *
2212  * Returns true in the common case when all indexes were successfully
2213  * vacuumed.  Returns false in rare cases where we determined that the ongoing
2214  * VACUUM operation is at risk of taking too long to finish, leading to
2215  * wraparound failure.
2216  */
2217 static bool
2218 lazy_vacuum_all_indexes(LVRelState *vacrel)
2219 {
2220 	bool		allindexes = true;
2221 
2222 	Assert(!IsParallelWorker());
2223 	Assert(vacrel->nindexes > 0);
2224 	Assert(vacrel->do_index_vacuuming);
2225 	Assert(vacrel->do_index_cleanup);
2226 	Assert(TransactionIdIsNormal(vacrel->relfrozenxid));
2227 	Assert(MultiXactIdIsValid(vacrel->relminmxid));
2228 
2229 	/* Precheck for XID wraparound emergencies */
2230 	if (lazy_check_wraparound_failsafe(vacrel))
2231 	{
2232 		/* Wraparound emergency -- don't even start an index scan */
2233 		return false;
2234 	}
2235 
2236 	/* Report that we are now vacuuming indexes */
2237 	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2238 								 PROGRESS_VACUUM_PHASE_VACUUM_INDEX);
2239 
2240 	if (!ParallelVacuumIsActive(vacrel))
2241 	{
2242 		for (int idx = 0; idx < vacrel->nindexes; idx++)
2243 		{
2244 			Relation	indrel = vacrel->indrels[idx];
2245 			IndexBulkDeleteResult *istat = vacrel->indstats[idx];
2246 
2247 			vacrel->indstats[idx] =
2248 				lazy_vacuum_one_index(indrel, istat, vacrel->old_live_tuples,
2249 									  vacrel);
2250 
2251 			if (lazy_check_wraparound_failsafe(vacrel))
2252 			{
2253 				/* Wraparound emergency -- end current index scan */
2254 				allindexes = false;
2255 				break;
2256 			}
2257 		}
2258 	}
2259 	else
2260 	{
2261 		/* Outsource everything to parallel variant */
2262 		do_parallel_lazy_vacuum_all_indexes(vacrel);
2263 
2264 		/*
2265 		 * Do a postcheck to consider applying wraparound failsafe now.  Note
2266 		 * that parallel VACUUM only gets the precheck and this postcheck.
2267 		 */
2268 		if (lazy_check_wraparound_failsafe(vacrel))
2269 			allindexes = false;
2270 	}
2271 
2272 	/*
2273 	 * We delete all LP_DEAD items from the first heap pass in all indexes on
2274 	 * each call here (except calls where we choose to do the failsafe). This
2275 	 * makes the next call to lazy_vacuum_heap_rel() safe (except in the event
2276 	 * of the failsafe triggering, which prevents the next call from taking
2277 	 * place).
2278 	 */
2279 	Assert(vacrel->num_index_scans > 0 ||
2280 		   vacrel->dead_tuples->num_tuples == vacrel->lpdead_items);
2281 	Assert(allindexes || vacrel->failsafe_active);
2282 
2283 	/*
2284 	 * Increase and report the number of index scans.
2285 	 *
2286 	 * We deliberately include the case where we started a round of bulk
2287 	 * deletes that we weren't able to finish due to the failsafe triggering.
2288 	 */
2289 	vacrel->num_index_scans++;
2290 	pgstat_progress_update_param(PROGRESS_VACUUM_NUM_INDEX_VACUUMS,
2291 								 vacrel->num_index_scans);
2292 
2293 	return allindexes;
2294 }
2295 
2296 /*
2297  *	lazy_vacuum_heap_rel() -- second pass over the heap for two pass strategy
2298  *
2299  * This routine marks LP_DEAD items in vacrel->dead_tuples array as LP_UNUSED.
2300  * Pages that never had lazy_scan_prune record LP_DEAD items are not visited
2301  * at all.
2302  *
2303  * We may also be able to truncate the line pointer array of the heap pages we
2304  * visit.  If there is a contiguous group of LP_UNUSED items at the end of the
2305  * array, it can be reclaimed as free space.  These LP_UNUSED items usually
2306  * start out as LP_DEAD items recorded by lazy_scan_prune (we set items from
2307  * each page to LP_UNUSED, and then consider if it's possible to truncate the
2308  * page's line pointer array).
2309  *
2310  * Note: the reason for doing this as a second pass is we cannot remove the
2311  * tuples until we've removed their index entries, and we want to process
2312  * index entry removal in batches as large as possible.
2313  */
2314 static void
2315 lazy_vacuum_heap_rel(LVRelState *vacrel)
2316 {
2317 	int			tupindex;
2318 	BlockNumber vacuumed_pages;
2319 	PGRUsage	ru0;
2320 	Buffer		vmbuffer = InvalidBuffer;
2321 	LVSavedErrInfo saved_err_info;
2322 
2323 	Assert(vacrel->do_index_vacuuming);
2324 	Assert(vacrel->do_index_cleanup);
2325 	Assert(vacrel->num_index_scans > 0);
2326 
2327 	/* Report that we are now vacuuming the heap */
2328 	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2329 								 PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
2330 
2331 	/* Update error traceback information */
2332 	update_vacuum_error_info(vacrel, &saved_err_info,
2333 							 VACUUM_ERRCB_PHASE_VACUUM_HEAP,
2334 							 InvalidBlockNumber, InvalidOffsetNumber);
2335 
2336 	pg_rusage_init(&ru0);
2337 	vacuumed_pages = 0;
2338 
2339 	tupindex = 0;
2340 	while (tupindex < vacrel->dead_tuples->num_tuples)
2341 	{
2342 		BlockNumber tblk;
2343 		Buffer		buf;
2344 		Page		page;
2345 		Size		freespace;
2346 
2347 		vacuum_delay_point();
2348 
2349 		tblk = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[tupindex]);
2350 		vacrel->blkno = tblk;
2351 		buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL,
2352 								 vacrel->bstrategy);
2353 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2354 		tupindex = lazy_vacuum_heap_page(vacrel, tblk, buf, tupindex,
2355 										 &vmbuffer);
2356 
2357 		/* Now that we've vacuumed the page, record its available space */
2358 		page = BufferGetPage(buf);
2359 		freespace = PageGetHeapFreeSpace(page);
2360 
2361 		UnlockReleaseBuffer(buf);
2362 		RecordPageWithFreeSpace(vacrel->rel, tblk, freespace);
2363 		vacuumed_pages++;
2364 	}
2365 
2366 	/* Clear the block number information */
2367 	vacrel->blkno = InvalidBlockNumber;
2368 
2369 	if (BufferIsValid(vmbuffer))
2370 	{
2371 		ReleaseBuffer(vmbuffer);
2372 		vmbuffer = InvalidBuffer;
2373 	}
2374 
2375 	/*
2376 	 * We set all LP_DEAD items from the first heap pass to LP_UNUSED during
2377 	 * the second heap pass.  No more, no less.
2378 	 */
2379 	Assert(tupindex > 0);
2380 	Assert(vacrel->num_index_scans > 1 ||
2381 		   (tupindex == vacrel->lpdead_items &&
2382 			vacuumed_pages == vacrel->lpdead_item_pages));
2383 
2384 	ereport(elevel,
2385 			(errmsg("table \"%s\": removed %lld dead item identifiers in %u pages",
2386 					vacrel->relname, (long long) tupindex, vacuumed_pages),
2387 			 errdetail_internal("%s", pg_rusage_show(&ru0))));
2388 
2389 	/* Revert to the previous phase information for error traceback */
2390 	restore_vacuum_error_info(vacrel, &saved_err_info);
2391 }
2392 
2393 /*
2394  *	lazy_vacuum_heap_page() -- free page's LP_DEAD items listed in the
2395  *						  vacrel->dead_tuples array.
2396  *
2397  * Caller must have an exclusive buffer lock on the buffer (though a
2398  * super-exclusive lock is also acceptable).
2399  *
2400  * tupindex is the index in vacrel->dead_tuples of the first dead tuple for
2401  * this page.  We assume the rest follow sequentially.  The return value is
2402  * the first tupindex after the tuples of this page.
2403  *
2404  * Prior to PostgreSQL 14 there were rare cases where this routine had to set
2405  * tuples with storage to unused.  These days it is strictly responsible for
2406  * marking LP_DEAD stub line pointers as unused.  This only happens for those
2407  * LP_DEAD items on the page that were determined to be LP_DEAD items back
2408  * when the same page was visited by lazy_scan_prune() (i.e. those whose TID
2409  * was recorded in the dead_tuples array).
2410  */
2411 static int
2412 lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
2413 					  int tupindex, Buffer *vmbuffer)
2414 {
2415 	LVDeadTuples *dead_tuples = vacrel->dead_tuples;
2416 	Page		page = BufferGetPage(buffer);
2417 	OffsetNumber unused[MaxHeapTuplesPerPage];
2418 	int			uncnt = 0;
2419 	TransactionId visibility_cutoff_xid;
2420 	bool		all_frozen;
2421 	LVSavedErrInfo saved_err_info;
2422 
2423 	Assert(vacrel->nindexes == 0 || vacrel->do_index_vacuuming);
2424 
2425 	pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
2426 
2427 	/* Update error traceback information */
2428 	update_vacuum_error_info(vacrel, &saved_err_info,
2429 							 VACUUM_ERRCB_PHASE_VACUUM_HEAP, blkno,
2430 							 InvalidOffsetNumber);
2431 
2432 	START_CRIT_SECTION();
2433 
2434 	for (; tupindex < dead_tuples->num_tuples; tupindex++)
2435 	{
2436 		BlockNumber tblk;
2437 		OffsetNumber toff;
2438 		ItemId		itemid;
2439 
2440 		tblk = ItemPointerGetBlockNumber(&dead_tuples->itemptrs[tupindex]);
2441 		if (tblk != blkno)
2442 			break;				/* past end of tuples for this block */
2443 		toff = ItemPointerGetOffsetNumber(&dead_tuples->itemptrs[tupindex]);
2444 		itemid = PageGetItemId(page, toff);
2445 
2446 		Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid));
2447 		ItemIdSetUnused(itemid);
2448 		unused[uncnt++] = toff;
2449 	}
2450 
2451 	Assert(uncnt > 0);
2452 
2453 	/* Attempt to truncate line pointer array now */
2454 	PageTruncateLinePointerArray(page);
2455 
2456 	/*
2457 	 * Mark buffer dirty before we write WAL.
2458 	 */
2459 	MarkBufferDirty(buffer);
2460 
2461 	/* XLOG stuff */
2462 	if (RelationNeedsWAL(vacrel->rel))
2463 	{
2464 		xl_heap_vacuum xlrec;
2465 		XLogRecPtr	recptr;
2466 
2467 		xlrec.nunused = uncnt;
2468 
2469 		XLogBeginInsert();
2470 		XLogRegisterData((char *) &xlrec, SizeOfHeapVacuum);
2471 
2472 		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
2473 		XLogRegisterBufData(0, (char *) unused, uncnt * sizeof(OffsetNumber));
2474 
2475 		recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VACUUM);
2476 
2477 		PageSetLSN(page, recptr);
2478 	}
2479 
2480 	/*
2481 	 * End critical section, so we safely can do visibility tests (which
2482 	 * possibly need to perform IO and allocate memory!). If we crash now the
2483 	 * page (including the corresponding vm bit) might not be marked all
2484 	 * visible, but that's fine. A later vacuum will fix that.
2485 	 */
2486 	END_CRIT_SECTION();
2487 
2488 	/*
2489 	 * Now that we have removed the LP_DEAD items from the page, once again
2490 	 * check if the page has become all-visible.  The page is already marked
2491 	 * dirty, exclusively locked, and, if needed, a full page image has been
2492 	 * emitted.
2493 	 */
2494 	if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid,
2495 								 &all_frozen))
2496 		PageSetAllVisible(page);
2497 
2498 	/*
2499 	 * All the changes to the heap page have been done. If the all-visible
2500 	 * flag is now set, also set the VM all-visible bit (and, if possible, the
2501 	 * all-frozen bit) unless this has already been done previously.
2502 	 */
2503 	if (PageIsAllVisible(page))
2504 	{
2505 		uint8		flags = 0;
2506 		uint8		vm_status = visibilitymap_get_status(vacrel->rel,
2507 														 blkno, vmbuffer);
2508 
2509 		/* Work out which VM bits (all-visible, all-frozen) still need to be set */
2510 		if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
2511 			flags |= VISIBILITYMAP_ALL_VISIBLE;
2512 		if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
2513 			flags |= VISIBILITYMAP_ALL_FROZEN;
2514 
2515 		Assert(BufferIsValid(*vmbuffer));
2516 		if (flags != 0)
2517 			visibilitymap_set(vacrel->rel, blkno, buffer, InvalidXLogRecPtr,
2518 							  *vmbuffer, visibility_cutoff_xid, flags);
2519 	}
2520 
2521 	/* Revert to the previous phase information for error traceback */
2522 	restore_vacuum_error_info(vacrel, &saved_err_info);
2523 	return tupindex;
2524 }
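
/*
 * A concrete example of what line pointer array truncation buys us: if the
 * last 20 item pointers on a page all became LP_UNUSED above,
 * PageTruncateLinePointerArray() can shorten pd_linp by
 * 20 * sizeof(ItemIdData) = 80 bytes, and that space is then reported by the
 * PageGetHeapFreeSpace() call our caller makes right after we return.
 */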
2525 
2526 /*
2527  *	lazy_check_needs_freeze() -- scan page to see if any tuples
2528  *					 need to be cleaned to avoid wraparound
2529  *
2530  * Returns true if the page needs to be vacuumed using cleanup lock.
2531  * Also returns a flag indicating whether the page contains any tuples at all.
2532  */
2533 static bool
2534 lazy_check_needs_freeze(Buffer buf, bool *hastup, LVRelState *vacrel)
2535 {
2536 	Page		page = BufferGetPage(buf);
2537 	OffsetNumber offnum,
2538 				maxoff;
2539 	HeapTupleHeader tupleheader;
2540 
2541 	*hastup = false;
2542 
2543 	/*
2544 	 * New and empty pages, obviously, don't contain tuples. We could make
2545 	 * sure that the page is registered in the FSM, but it doesn't seem worth
2546 	 * waiting for a cleanup lock just for that, especially because it's
2547 	 * likely that the pin holder will do so.
2548 	 */
2549 	if (PageIsNew(page) || PageIsEmpty(page))
2550 		return false;
2551 
2552 	maxoff = PageGetMaxOffsetNumber(page);
2553 	for (offnum = FirstOffsetNumber;
2554 		 offnum <= maxoff;
2555 		 offnum = OffsetNumberNext(offnum))
2556 	{
2557 		ItemId		itemid;
2558 
2559 		/*
2560 		 * Set the offset number so that we can display it along with any
2561 		 * error that occurred while processing this tuple.
2562 		 */
2563 		vacrel->offnum = offnum;
2564 		itemid = PageGetItemId(page, offnum);
2565 
2566 		/* this should match hastup test in count_nondeletable_pages() */
2567 		if (ItemIdIsUsed(itemid))
2568 			*hastup = true;
2569 
2570 		/* dead and redirect items never need freezing */
2571 		if (!ItemIdIsNormal(itemid))
2572 			continue;
2573 
2574 		tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
2575 
2576 		if (heap_tuple_needs_freeze(tupleheader, vacrel->FreezeLimit,
2577 									vacrel->MultiXactCutoff, buf))
2578 			break;
2579 	}							/* scan along page */
2580 
2581 	/* Clear the offset information once we have processed the given page. */
2582 	vacrel->offnum = InvalidOffsetNumber;
2583 
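	/*
	 * If we broke out of the loop early, offnum still points at a tuple that
	 * needs freezing (offnum <= maxoff); if the loop ran to completion,
	 * offnum is now maxoff + 1 and we report that nothing needs freezing.
	 */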
2584 	return (offnum <= maxoff);
2585 }
2586 
2587 /*
2588  * Trigger the failsafe to avoid wraparound failure when vacrel's table has a
2589  * relfrozenxid and/or relminmxid that is dangerously far in the past.
2590  * Triggering the failsafe makes the ongoing VACUUM bypass any further index
2591  * vacuuming and heap vacuuming.  Truncating the heap is also bypassed.
2592  *
2593  * Any remaining work (work that VACUUM cannot just bypass) is typically sped
2594  * up when the failsafe triggers.  VACUUM stops applying any cost-based delay
2595  * that it started out with.
2596  *
2597  * Returns true when failsafe has been triggered.
2598  */
2599 static bool
2600 lazy_check_wraparound_failsafe(LVRelState *vacrel)
2601 {
2602 	/* Don't warn more than once per VACUUM */
2603 	if (vacrel->failsafe_active)
2604 		return true;
2605 
2606 	if (unlikely(vacuum_xid_failsafe_check(vacrel->relfrozenxid,
2607 										   vacrel->relminmxid)))
2608 	{
2609 		vacrel->failsafe_active = true;
2610 
2611 		/* Disable index vacuuming, index cleanup, and heap rel truncation */
2612 		vacrel->do_index_vacuuming = false;
2613 		vacrel->do_index_cleanup = false;
2614 		vacrel->do_rel_truncate = false;
2615 
2616 		ereport(WARNING,
2617 				(errmsg("bypassing nonessential maintenance of table \"%s.%s.%s\" as a failsafe after %d index scans",
2618 						get_database_name(MyDatabaseId),
2619 						vacrel->relnamespace,
2620 						vacrel->relname,
2621 						vacrel->num_index_scans),
2622 				 errdetail("The table's relfrozenxid or relminmxid is too far in the past."),
2623 				 errhint("Consider increasing configuration parameter \"maintenance_work_mem\" or \"autovacuum_work_mem\".\n"
2624 						 "You might also need to consider other ways for VACUUM to keep up with the allocation of transaction IDs.")));
2625 
2626 		/* Stop applying cost limits from this point on */
2627 		VacuumCostActive = false;
2628 		VacuumCostBalance = 0;
2629 
2630 		return true;
2631 	}
2632 
2633 	return false;
2634 }
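
/*
 * A simplified approximation of the age test behind
 * vacuum_xid_failsafe_check(), not the actual implementation.  failsafe_age
 * below is a stand-in for the vacuum_failsafe_age setting, and the local
 * names are invented; nothing here is compiled.
 */
#if 0
	TransactionId nextxid = ReadNextTransactionId();
	TransactionId danger_limit;

	/* roughly: relfrozenxid has fallen more than failsafe_age behind */
	danger_limit = nextxid - failsafe_age;	/* wraparound-aware in reality */
	if (TransactionIdPrecedes(vacrel->relfrozenxid, danger_limit))
	{
		/* failsafe should trigger */
	}
#endif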
2635 
2636 /*
2637  * Perform lazy_vacuum_all_indexes() steps in parallel
2638  */
2639 static void
2640 do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel)
2641 {
2642 	/* Tell parallel workers to do index vacuuming */
2643 	vacrel->lps->lvshared->for_cleanup = false;
2644 	vacrel->lps->lvshared->first_time = false;
2645 
2646 	/*
2647 	 * We can only provide an approximate value of num_heap_tuples, at least
2648 	 * for now.  Matches serial VACUUM case.
2649 	 */
2650 	vacrel->lps->lvshared->reltuples = vacrel->old_live_tuples;
2651 	vacrel->lps->lvshared->estimated_count = true;
2652 
2653 	do_parallel_vacuum_or_cleanup(vacrel,
2654 								  vacrel->lps->nindexes_parallel_bulkdel);
2655 }
2656 
2657 /*
2658  * Perform lazy_cleanup_all_indexes() steps in parallel
2659  */
2660 static void
2661 do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel)
2662 {
2663 	int			nworkers;
2664 
2665 	/*
2666 	 * If parallel vacuum is active we perform index cleanup with parallel
2667 	 * workers.
2668 	 *
2669 	 * Tell parallel workers to do index cleanup.
2670 	 */
2671 	vacrel->lps->lvshared->for_cleanup = true;
2672 	vacrel->lps->lvshared->first_time = (vacrel->num_index_scans == 0);
2673 
2674 	/*
2675 	 * Now we can provide a better estimate of total number of surviving
2676 	 * tuples (we assume indexes are more interested in that than in the
2677 	 * number of nominally live tuples).
2678 	 */
2679 	vacrel->lps->lvshared->reltuples = vacrel->new_rel_tuples;
2680 	vacrel->lps->lvshared->estimated_count =
2681 		(vacrel->tupcount_pages < vacrel->rel_pages);
2682 
2683 	/* Determine the number of parallel workers to launch */
2684 	if (vacrel->lps->lvshared->first_time)
2685 		nworkers = vacrel->lps->nindexes_parallel_cleanup +
2686 			vacrel->lps->nindexes_parallel_condcleanup;
2687 	else
2688 		nworkers = vacrel->lps->nindexes_parallel_cleanup;
2689 
2690 	do_parallel_vacuum_or_cleanup(vacrel, nworkers);
2691 }
2692 
2693 /*
2694  * Perform index vacuum or index cleanup with parallel workers.  This function
2695  * must be used by the parallel vacuum leader process.  The caller must set
2696  * lps->lvshared->for_cleanup to indicate whether to perform vacuum or
2697  * cleanup.
2698  */
2699 static void
2700 do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers)
2701 {
2702 	LVParallelState *lps = vacrel->lps;
2703 
2704 	Assert(!IsParallelWorker());
2705 	Assert(ParallelVacuumIsActive(vacrel));
2706 	Assert(vacrel->nindexes > 0);
2707 
2708 	/* The leader process will participate */
2709 	nworkers--;
2710 
2711 	/*
2712 	 * It is possible that parallel context is initialized with fewer workers
2713 	 * than the number of indexes that need a separate worker in the current
2714 	 * phase, so we need to consider it.  See compute_parallel_vacuum_workers.
2715 	 */
2716 	nworkers = Min(nworkers, lps->pcxt->nworkers);
2717 
2718 	/* Setup the shared cost-based vacuum delay and launch workers */
2719 	if (nworkers > 0)
2720 	{
2721 		if (vacrel->num_index_scans > 0)
2722 		{
2723 			/* Reset the parallel index processing counter */
2724 			pg_atomic_write_u32(&(lps->lvshared->idx), 0);
2725 
2726 			/* Reinitialize the parallel context to relaunch parallel workers */
2727 			ReinitializeParallelDSM(lps->pcxt);
2728 		}
2729 
2730 		/*
2731 		 * Set up shared cost balance and the number of active workers for
2732 		 * vacuum delay.  We need to do this before launching workers as
2733 		 * otherwise, they might not see the updated values for these
2734 		 * parameters.
2735 		 */
2736 		pg_atomic_write_u32(&(lps->lvshared->cost_balance), VacuumCostBalance);
2737 		pg_atomic_write_u32(&(lps->lvshared->active_nworkers), 0);
2738 
2739 		/*
2740 		 * The number of workers can vary between bulkdelete and cleanup
2741 		 * phase.
2742 		 */
2743 		ReinitializeParallelWorkers(lps->pcxt, nworkers);
2744 
2745 		LaunchParallelWorkers(lps->pcxt);
2746 
2747 		if (lps->pcxt->nworkers_launched > 0)
2748 		{
2749 			/*
2750 			 * Reset the leader backend's local cost balance; it has already
2751 			 * been carried into the shared cost balance above.
2752 			 */
2753 			VacuumCostBalance = 0;
2754 			VacuumCostBalanceLocal = 0;
2755 
2756 			/* Enable shared cost balance for leader backend */
2757 			VacuumSharedCostBalance = &(lps->lvshared->cost_balance);
2758 			VacuumActiveNWorkers = &(lps->lvshared->active_nworkers);
2759 		}
2760 
2761 		if (lps->lvshared->for_cleanup)
2762 			ereport(elevel,
2763 					(errmsg(ngettext("launched %d parallel vacuum worker for index cleanup (planned: %d)",
2764 									 "launched %d parallel vacuum workers for index cleanup (planned: %d)",
2765 									 lps->pcxt->nworkers_launched),
2766 							lps->pcxt->nworkers_launched, nworkers)));
2767 		else
2768 			ereport(elevel,
2769 					(errmsg(ngettext("launched %d parallel vacuum worker for index vacuuming (planned: %d)",
2770 									 "launched %d parallel vacuum workers for index vacuuming (planned: %d)",
2771 									 lps->pcxt->nworkers_launched),
2772 							lps->pcxt->nworkers_launched, nworkers)));
2773 	}
2774 
2775 	/* Process the indexes that can be processed by only leader process */
2776 	do_serial_processing_for_unsafe_indexes(vacrel, lps->lvshared);
2777 
2778 	/*
2779 	 * Join as a parallel worker.  The leader process alone processes all the
2780 	 * indexes in the case where no workers are launched.
2781 	 */
2782 	do_parallel_processing(vacrel, lps->lvshared);
2783 
2784 	/*
2785 	 * Next, accumulate buffer and WAL usage.  (This must wait for the workers
2786 	 * to finish, or we might get incomplete data.)
2787 	 */
2788 	if (nworkers > 0)
2789 	{
2790 		/* Wait for all vacuum workers to finish */
2791 		WaitForParallelWorkersToFinish(lps->pcxt);
2792 
2793 		for (int i = 0; i < lps->pcxt->nworkers_launched; i++)
2794 			InstrAccumParallelQuery(&lps->buffer_usage[i], &lps->wal_usage[i]);
2795 	}
2796 
2797 	/*
2798 	 * Carry the shared balance value to heap scan and disable shared costing
2799 	 */
2800 	if (VacuumSharedCostBalance)
2801 	{
2802 		VacuumCostBalance = pg_atomic_read_u32(VacuumSharedCostBalance);
2803 		VacuumSharedCostBalance = NULL;
2804 		VacuumActiveNWorkers = NULL;
2805 	}
2806 }
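
/*
 * A rough, simplified condensation of how a participant consumes the shared
 * cost balance set up above when it reaches a delay point.  The exact
 * thresholds and sleep computation live in vacuum.c; the sketch below only
 * shows the atomic add/compare pattern and is not compiled.
 */
#if 0
	uint32		shared_balance;
	int			nworkers;

	/* publish our locally accumulated cost, then look at the totals */
	shared_balance = pg_atomic_add_fetch_u32(VacuumSharedCostBalance,
											 VacuumCostBalance);
	VacuumCostBalanceLocal += VacuumCostBalance;
	nworkers = pg_atomic_read_u32(VacuumActiveNWorkers);

	if (shared_balance >= VacuumCostLimit &&
		VacuumCostBalanceLocal > 0.5 * ((double) VacuumCostLimit / nworkers))
	{
		/* sleep in proportion to our local share, then give that share back */
		pg_atomic_sub_fetch_u32(VacuumSharedCostBalance, VacuumCostBalanceLocal);
		VacuumCostBalanceLocal = 0;
	}
	VacuumCostBalance = 0;
#endif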
2807 
2808 /*
2809  * Index vacuum/cleanup routine used by the leader process and parallel
2810  * vacuum worker processes to process the indexes in parallel.
2811  */
2812 static void
2813 do_parallel_processing(LVRelState *vacrel, LVShared *lvshared)
2814 {
2815 	/*
2816 	 * Increment the active worker count if we are able to launch any worker.
2817 	 */
2818 	if (VacuumActiveNWorkers)
2819 		pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1);
2820 
2821 	/* Loop until all indexes are vacuumed */
2822 	for (;;)
2823 	{
2824 		int			idx;
2825 		LVSharedIndStats *shared_istat;
2826 		Relation	indrel;
2827 		IndexBulkDeleteResult *istat;
2828 
2829 		/* Get an index number to process */
2830 		idx = pg_atomic_fetch_add_u32(&(lvshared->idx), 1);
2831 
2832 		/* Done for all indexes? */
2833 		if (idx >= vacrel->nindexes)
2834 			break;
2835 
2836 		/* Get the index statistics space from DSM, if any */
2837 		shared_istat = parallel_stats_for_idx(lvshared, idx);
2838 
2839 		/* Skip indexes not participating in parallelism */
2840 		if (shared_istat == NULL)
2841 			continue;
2842 
2843 		indrel = vacrel->indrels[idx];
2844 
2845 		/*
2846 		 * Skip processing indexes that are unsafe for workers (these are
2847 		 * processed in do_serial_processing_for_unsafe_indexes() by leader)
2848 		 */
2849 		if (!parallel_processing_is_safe(indrel, lvshared))
2850 			continue;
2851 
2852 		/* Do vacuum or cleanup of the index */
2853 		istat = (vacrel->indstats[idx]);
2854 		vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
2855 														   lvshared,
2856 														   shared_istat,
2857 														   vacrel);
2858 	}
2859 
2860 	/*
2861 	 * We have completed the index vacuum so decrement the active worker
2862 	 * count.
2863 	 */
2864 	if (VacuumActiveNWorkers)
2865 		pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1);
2866 }
2867 
2868 /*
2869  * Serially process indexes in the leader that parallel workers won't handle.
2870  *
2871  * Handles index vacuuming (or index cleanup) for indexes that are not
2872  * parallel safe.  It's possible that this will vary for a given index, based
2873  * on details like whether we're performing for_cleanup processing right now.
2874  *
2875  * Also performs processing of smaller indexes that fell under the size cutoff
2876  * enforced by compute_parallel_vacuum_workers().  These indexes never get a
2877  * slot for statistics in DSM.
2878  */
2879 static void
2880 do_serial_processing_for_unsafe_indexes(LVRelState *vacrel, LVShared *lvshared)
2881 {
2882 	Assert(!IsParallelWorker());
2883 
2884 	/*
2885 	 * Increment the active worker count if we are able to launch any worker.
2886 	 */
2887 	if (VacuumActiveNWorkers)
2888 		pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1);
2889 
2890 	for (int idx = 0; idx < vacrel->nindexes; idx++)
2891 	{
2892 		LVSharedIndStats *shared_istat;
2893 		Relation	indrel;
2894 		IndexBulkDeleteResult *istat;
2895 
2896 		shared_istat = parallel_stats_for_idx(lvshared, idx);
2897 		indrel = vacrel->indrels[idx];
2898 
2899 		/*
2900 		 * We're only here for the indexes that parallel workers won't
2901 		 * process.  Note that the shared_istat test ensures that we process
2902 		 * indexes that fell under the initial size cutoff.
2903 		 */
2904 		if (shared_istat != NULL &&
2905 			parallel_processing_is_safe(indrel, lvshared))
2906 			continue;
2907 
2908 		/* Do vacuum or cleanup of the index */
2909 		istat = (vacrel->indstats[idx]);
2910 		vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
2911 														   lvshared,
2912 														   shared_istat,
2913 														   vacrel);
2914 	}
2915 
2916 	/*
2917 	 * We have completed the index vacuum so decrement the active worker
2918 	 * count.
2919 	 */
2920 	if (VacuumActiveNWorkers)
2921 		pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1);
2922 }
2923 
2924 /*
2925  * Vacuum or clean up an index, either in the leader process or in one of the
2926  * worker processes.  After processing the index this function copies the index
2927  * statistics returned from ambulkdelete and amvacuumcleanup to the DSM
2928  * segment.
2929  */
2930 static IndexBulkDeleteResult *
2931 parallel_process_one_index(Relation indrel,
2932 						   IndexBulkDeleteResult *istat,
2933 						   LVShared *lvshared,
2934 						   LVSharedIndStats *shared_istat,
2935 						   LVRelState *vacrel)
2936 {
2937 	IndexBulkDeleteResult *istat_res;
2938 
2939 	/*
2940 	 * Update the pointer to the corresponding bulk-deletion result if someone
2941 	 * has already updated it
2942 	 */
2943 	if (shared_istat && shared_istat->updated && istat == NULL)
2944 		istat = &shared_istat->istat;
2945 
2946 	/* Do vacuum or cleanup of the index */
2947 	if (lvshared->for_cleanup)
2948 		istat_res = lazy_cleanup_one_index(indrel, istat, lvshared->reltuples,
2949 										   lvshared->estimated_count, vacrel);
2950 	else
2951 		istat_res = lazy_vacuum_one_index(indrel, istat, lvshared->reltuples,
2952 										  vacrel);
2953 
2954 	/*
2955 	 * Copy the index bulk-deletion result returned from ambulkdelete and
2956 	 * amvacuumcleanup to the DSM segment if it's the first cycle, because the
2957 	 * result is allocated locally and the same index may be vacuumed by a
2958 	 * different vacuum process in the next cycle.  Copying the result normally
2959 	 * happens only the first time an index is vacuumed.  For any additional
2960 	 * vacuum pass, we directly point to the result on the DSM segment and
2961 	 * pass it to vacuum index APIs so that workers can update it directly.
2962 	 *
2963 	 * Since all vacuum workers write the bulk-deletion result at different
2964 	 * slots we can write them without locking.
2965 	 */
2966 	if (shared_istat && !shared_istat->updated && istat_res != NULL)
2967 	{
2968 		memcpy(&shared_istat->istat, istat_res, sizeof(IndexBulkDeleteResult));
2969 		shared_istat->updated = true;
2970 
2971 		/* Free the locally-allocated bulk-deletion result */
2972 		pfree(istat_res);
2973 
2974 		/* return the pointer to the result from shared memory */
2975 		return &shared_istat->istat;
2976 	}
2977 
2978 	return istat_res;
2979 }
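/*
 * To make the copy-once protocol above concrete: on the first pass over an
 * index, ambulkdelete (or amvacuumcleanup) allocates its result locally, so
 * the result is copied into the index's DSM slot and "updated" is set.  On
 * any later pass, possibly performed by a different process, istat is made
 * to point at that DSM slot (either carried over by the leader or picked up
 * via the "updated" flag), so the index AM updates the shared copy in place
 * and no further copying is needed.
 */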
2980 
2981 /*
2982  *	lazy_cleanup_all_indexes() -- cleanup all indexes of relation.
2983  */
2984 static void
2985 lazy_cleanup_all_indexes(LVRelState *vacrel)
2986 {
2987 	Assert(!IsParallelWorker());
2988 	Assert(vacrel->nindexes > 0);
2989 
2990 	/* Report that we are now cleaning up indexes */
2991 	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2992 								 PROGRESS_VACUUM_PHASE_INDEX_CLEANUP);
2993 
2994 	if (!ParallelVacuumIsActive(vacrel))
2995 	{
2996 		double		reltuples = vacrel->new_rel_tuples;
2997 		bool		estimated_count =
2998 		vacrel->tupcount_pages < vacrel->rel_pages;
2999 
3000 		for (int idx = 0; idx < vacrel->nindexes; idx++)
3001 		{
3002 			Relation	indrel = vacrel->indrels[idx];
3003 			IndexBulkDeleteResult *istat = vacrel->indstats[idx];
3004 
3005 			vacrel->indstats[idx] =
3006 				lazy_cleanup_one_index(indrel, istat, reltuples,
3007 									   estimated_count, vacrel);
3008 		}
3009 	}
3010 	else
3011 	{
3012 		/* Outsource everything to parallel variant */
3013 		do_parallel_lazy_cleanup_all_indexes(vacrel);
3014 	}
3015 }
3016 
3017 /*
3018  *	lazy_vacuum_one_index() -- vacuum index relation.
3019  *
3020  *		Delete all the index entries pointing to tuples listed in
3021  *		dead_tuples, and update running statistics.
3022  *
3023  *		reltuples is the number of heap tuples to be passed to the
3024  *		bulkdelete callback.  It's always assumed to be estimated.
3025  *
3026  * Returns bulk delete stats derived from input stats
3027  */
3028 static IndexBulkDeleteResult *
3029 lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat,
3030 					  double reltuples, LVRelState *vacrel)
3031 {
3032 	IndexVacuumInfo ivinfo;
3033 	PGRUsage	ru0;
3034 	LVSavedErrInfo saved_err_info;
3035 
3036 	pg_rusage_init(&ru0);
3037 
3038 	ivinfo.index = indrel;
3039 	ivinfo.analyze_only = false;
3040 	ivinfo.report_progress = false;
3041 	ivinfo.estimated_count = true;
3042 	ivinfo.message_level = elevel;
3043 	ivinfo.num_heap_tuples = reltuples;
3044 	ivinfo.strategy = vacrel->bstrategy;
3045 
3046 	/*
3047 	 * Update error traceback information.
3048 	 *
3049 	 * The index name is saved during this phase and restored immediately
3050 	 * after this phase.  See vacuum_error_callback.
3051 	 */
3052 	Assert(vacrel->indname == NULL);
3053 	vacrel->indname = pstrdup(RelationGetRelationName(indrel));
3054 	update_vacuum_error_info(vacrel, &saved_err_info,
3055 							 VACUUM_ERRCB_PHASE_VACUUM_INDEX,
3056 							 InvalidBlockNumber, InvalidOffsetNumber);
3057 
3058 	/* Do bulk deletion */
3059 	istat = index_bulk_delete(&ivinfo, istat, lazy_tid_reaped,
3060 							  (void *) vacrel->dead_tuples);
3061 
3062 	ereport(elevel,
3063 			(errmsg("scanned index \"%s\" to remove %d row versions",
3064 					vacrel->indname, vacrel->dead_tuples->num_tuples),
3065 			 errdetail_internal("%s", pg_rusage_show(&ru0))));
3066 
3067 	/* Revert to the previous phase information for error traceback */
3068 	restore_vacuum_error_info(vacrel, &saved_err_info);
3069 	pfree(vacrel->indname);
3070 	vacrel->indname = NULL;
3071 
3072 	return istat;
3073 }
3074 
3075 /*
3076  *	lazy_cleanup_one_index() -- do post-vacuum cleanup for index relation.
3077  *
3078  *		reltuples is the number of heap tuples and estimated_count is true
3079  *		if reltuples is an estimated value.
3080  *
3081  * Returns bulk delete stats derived from input stats
3082  */
3083 static IndexBulkDeleteResult *
3084 lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat,
3085 					   double reltuples, bool estimated_count,
3086 					   LVRelState *vacrel)
3087 {
3088 	IndexVacuumInfo ivinfo;
3089 	PGRUsage	ru0;
3090 	LVSavedErrInfo saved_err_info;
3091 
3092 	pg_rusage_init(&ru0);
3093 
3094 	ivinfo.index = indrel;
3095 	ivinfo.analyze_only = false;
3096 	ivinfo.report_progress = false;
3097 	ivinfo.estimated_count = estimated_count;
3098 	ivinfo.message_level = elevel;
3099 
3100 	ivinfo.num_heap_tuples = reltuples;
3101 	ivinfo.strategy = vacrel->bstrategy;
3102 
3103 	/*
3104 	 * Update error traceback information.
3105 	 *
3106 	 * The index name is saved during this phase and restored immediately
3107 	 * after this phase.  See vacuum_error_callback.
3108 	 */
3109 	Assert(vacrel->indname == NULL);
3110 	vacrel->indname = pstrdup(RelationGetRelationName(indrel));
3111 	update_vacuum_error_info(vacrel, &saved_err_info,
3112 							 VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
3113 							 InvalidBlockNumber, InvalidOffsetNumber);
3114 
3115 	istat = index_vacuum_cleanup(&ivinfo, istat);
3116 
3117 	if (istat)
3118 	{
3119 		ereport(elevel,
3120 				(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
3121 						RelationGetRelationName(indrel),
3122 						(istat)->num_index_tuples,
3123 						(istat)->num_pages),
3124 				 errdetail("%.0f index row versions were removed.\n"
3125 						   "%u index pages were newly deleted.\n"
3126 						   "%u index pages are currently deleted, of which %u are currently reusable.\n"
3127 						   "%s.",
3128 						   (istat)->tuples_removed,
3129 						   (istat)->pages_newly_deleted,
3130 						   (istat)->pages_deleted, (istat)->pages_free,
3131 						   pg_rusage_show(&ru0))));
3132 	}
3133 
3134 	/* Revert to the previous phase information for error traceback */
3135 	restore_vacuum_error_info(vacrel, &saved_err_info);
3136 	pfree(vacrel->indname);
3137 	vacrel->indname = NULL;
3138 
3139 	return istat;
3140 }
3141 
3142 /*
3143  * should_attempt_truncation - should we attempt to truncate the heap?
3144  *
3145  * Don't even think about it unless we have a shot at releasing a goodly
3146  * number of pages.  Otherwise, the time taken isn't worth it.
3147  *
3148  * Also don't attempt it if the wraparound failsafe is in effect.  It's hard
3149  * to predict how long lazy_truncate_heap will take.  Don't take any chances.
3150  * There is very little chance of truncation working out when the failsafe is
3151  * in effect in any case: lazy_scan_prune optimistically assumes that any
3152  * LP_DEAD items it encounters will be LP_UNUSED by the time we're called,
3153  * which won't hold once the failsafe makes us skip the heap-vacuuming pass.
3154  *
3155  * Also don't attempt it if we are doing early pruning/vacuuming, because a
3156  * scan which cannot find a truncated heap page cannot determine that the
3157  * snapshot is too old to read that page.
3158  *
3159  * This is split out so that we can test whether truncation is going to be
3160  * called for before we actually do it.  If you change the logic here, be
3161  * careful to depend only on fields that lazy_scan_heap updates on-the-fly.
3162  */
3163 static bool
3164 should_attempt_truncation(LVRelState *vacrel)
3165 {
3166 	BlockNumber possibly_freeable;
3167 
3168 	if (!vacrel->do_rel_truncate || vacrel->failsafe_active)
3169 		return false;
3170 
3171 	possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages;
3172 	if (possibly_freeable > 0 &&
3173 		(possibly_freeable >= REL_TRUNCATE_MINIMUM ||
3174 		 possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION) &&
3175 		old_snapshot_threshold < 0)
3176 		return true;
3177 	else
3178 		return false;
3179 }
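/*
 * A rough worked example, assuming the usual thresholds of 1,000 pages for
 * REL_TRUNCATE_MINIMUM and 1/16 of the table for REL_TRUNCATE_FRACTION: for
 * a 64,000-page table with 5,000 empty pages at the end, truncation is
 * attempted (5,000 >= 1,000); with only 500 trailing empty pages it is
 * skipped, since 500 is below both 1,000 and 64,000 / 16 = 4,000.  Either
 * way, the attempt also requires old_snapshot_threshold < 0 and that the
 * failsafe is not active.
 */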
3180 
3181 /*
3182  * lazy_truncate_heap - try to truncate off any empty pages at the end
3183  */
3184 static void
3185 lazy_truncate_heap(LVRelState *vacrel)
3186 {
3187 	BlockNumber old_rel_pages = vacrel->rel_pages;
3188 	BlockNumber new_rel_pages;
3189 	bool		lock_waiter_detected;
3190 	int			lock_retry;
3191 
3192 	/* Report that we are now truncating */
3193 	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
3194 								 PROGRESS_VACUUM_PHASE_TRUNCATE);
3195 
3196 	/*
3197 	 * Loop until no more truncating can be done.
3198 	 */
3199 	do
3200 	{
3201 		PGRUsage	ru0;
3202 
3203 		pg_rusage_init(&ru0);
3204 
3205 		/*
3206 		 * We need full exclusive lock on the relation in order to do
3207 		 * truncation. If we can't get it, give up rather than waiting --- we
3208 		 * don't want to block other backends, and we don't want to deadlock
3209 		 * (which is quite possible considering we already hold a lower-grade
3210 		 * lock).
3211 		 */
3212 		lock_waiter_detected = false;
3213 		lock_retry = 0;
3214 		while (true)
3215 		{
3216 			if (ConditionalLockRelation(vacrel->rel, AccessExclusiveLock))
3217 				break;
3218 
3219 			/*
3220 			 * Check for interrupts while trying to (re-)acquire the exclusive
3221 			 * lock.
3222 			 */
3223 			CHECK_FOR_INTERRUPTS();
3224 
3225 			if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
3226 								VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
3227 			{
3228 				/*
3229 				 * We failed to establish the lock in the specified number of
3230 				 * retries. This means we give up truncating.
3231 				 */
3232 				ereport(elevel,
3233 						(errmsg("\"%s\": stopping truncate due to conflicting lock request",
3234 								vacrel->relname)));
3235 				return;
3236 			}
3237 
3238 			pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L);
3239 		}
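		/*
		 * As a rough guide, with the usual settings of a 50 ms wait interval
		 * and a 5000 ms overall timeout, the loop above gives up after about
		 * 100 failed attempts, i.e. roughly five seconds spent waiting for
		 * the AccessExclusiveLock.
		 */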
3240 
3241 		/*
3242 		 * Now that we have exclusive lock, look to see if the rel has grown
3243 		 * whilst we were vacuuming with non-exclusive lock.  If so, give up;
3244 		 * the newly added pages presumably contain non-deletable tuples.
3245 		 */
3246 		new_rel_pages = RelationGetNumberOfBlocks(vacrel->rel);
3247 		if (new_rel_pages != old_rel_pages)
3248 		{
3249 			/*
3250 			 * Note: we intentionally don't update vacrel->rel_pages with the
3251 			 * new rel size here.  If we did, it would amount to assuming that
3252 			 * the new pages are empty, which is unlikely. Leaving the numbers
3253 			 * alone amounts to assuming that the new pages have the same
3254 			 * tuple density as existing ones, which is less unlikely.
3255 			 */
3256 			UnlockRelation(vacrel->rel, AccessExclusiveLock);
3257 			return;
3258 		}
3259 
3260 		/*
3261 		 * Scan backwards from the end to verify that the end pages actually
3262 		 * contain no tuples.  This is *necessary*, not optional, because
3263 		 * other backends could have added tuples to these pages whilst we
3264 		 * were vacuuming.
3265 		 */
3266 		new_rel_pages = count_nondeletable_pages(vacrel, &lock_waiter_detected);
3267 		vacrel->blkno = new_rel_pages;
3268 
3269 		if (new_rel_pages >= old_rel_pages)
3270 		{
3271 			/* can't do anything after all */
3272 			UnlockRelation(vacrel->rel, AccessExclusiveLock);
3273 			return;
3274 		}
3275 
3276 		/*
3277 		 * Okay to truncate.
3278 		 */
3279 		RelationTruncate(vacrel->rel, new_rel_pages);
3280 
3281 		/*
3282 		 * We can release the exclusive lock as soon as we have truncated.
3283 		 * Other backends can't safely access the relation until they have
3284 		 * processed the smgr invalidation that smgrtruncate sent out ... but
3285 		 * that should happen as part of standard invalidation processing once
3286 		 * they acquire lock on the relation.
3287 		 */
3288 		UnlockRelation(vacrel->rel, AccessExclusiveLock);
3289 
3290 		/*
3291 		 * Update statistics.  Here, it *is* correct to adjust rel_pages
3292 		 * without also touching reltuples, since the tuple count wasn't
3293 		 * changed by the truncation.
3294 		 */
3295 		vacrel->pages_removed += old_rel_pages - new_rel_pages;
3296 		vacrel->rel_pages = new_rel_pages;
3297 
3298 		ereport(elevel,
3299 				(errmsg("table \"%s\": truncated %u to %u pages",
3300 						vacrel->relname,
3301 						old_rel_pages, new_rel_pages),
3302 				 errdetail_internal("%s",
3303 									pg_rusage_show(&ru0))));
3304 		old_rel_pages = new_rel_pages;
3305 	} while (new_rel_pages > vacrel->nonempty_pages && lock_waiter_detected);
3306 }
3307 
3308 /*
3309  * Rescan end pages to verify that they are (still) empty of tuples.
3310  *
3311  * Returns number of nondeletable pages (last nonempty page + 1).
3312  */
3313 static BlockNumber
3314 count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected)
3315 {
3316 	BlockNumber blkno;
3317 	BlockNumber prefetchedUntil;
3318 	instr_time	starttime;
3319 
3320 	/* Initialize the starttime if we check for conflicting lock requests */
3321 	INSTR_TIME_SET_CURRENT(starttime);
3322 
3323 	/*
3324 	 * Start checking blocks at what we believe relation end to be and move
3325 	 * backwards.  (Strange coding of loop control is needed because blkno is
3326 	 * unsigned.)  To make the scan faster, we prefetch a few blocks at a time
3327 	 * in forward direction, so that OS-level readahead can kick in.
3328 	 */
3329 	blkno = vacrel->rel_pages;
3330 	StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0,
3331 					 "prefetch size must be power of 2");
3332 	prefetchedUntil = InvalidBlockNumber;
3333 	while (blkno > vacrel->nonempty_pages)
3334 	{
3335 		Buffer		buf;
3336 		Page		page;
3337 		OffsetNumber offnum,
3338 					maxoff;
3339 		bool		hastup;
3340 
3341 		/*
3342 		 * Check if another process requests a lock on our relation. We are
3343 		 * holding an AccessExclusiveLock here, so they will be waiting. We
3344 		 * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
3345 		 * only check if that interval has elapsed once every 32 blocks to
3346 		 * keep the number of system calls and actual shared lock table
3347 		 * lookups to a minimum.
3348 		 */
3349 		if ((blkno % 32) == 0)
3350 		{
3351 			instr_time	currenttime;
3352 			instr_time	elapsed;
3353 
3354 			INSTR_TIME_SET_CURRENT(currenttime);
3355 			elapsed = currenttime;
3356 			INSTR_TIME_SUBTRACT(elapsed, starttime);
3357 			if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
3358 				>= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
3359 			{
3360 				if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
3361 				{
3362 					ereport(elevel,
3363 							(errmsg("table \"%s\": suspending truncate due to conflicting lock request",
3364 									vacrel->relname)));
3365 
3366 					*lock_waiter_detected = true;
3367 					return blkno;
3368 				}
3369 				starttime = currenttime;
3370 			}
3371 		}
3372 
3373 		/*
3374 		 * We don't insert a vacuum delay point here, because we have an
3375 		 * exclusive lock on the table which we want to hold for as short a
3376 		 * time as possible.  We still need to check for interrupts however.
3377 		 */
3378 		CHECK_FOR_INTERRUPTS();
3379 
3380 		blkno--;
3381 
3382 		/* If we haven't prefetched this lot yet, do so now. */
3383 		if (prefetchedUntil > blkno)
3384 		{
3385 			BlockNumber prefetchStart;
3386 			BlockNumber pblkno;
3387 
3388 			prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
3389 			for (pblkno = prefetchStart; pblkno <= blkno; pblkno++)
3390 			{
3391 				PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, pblkno);
3392 				CHECK_FOR_INTERRUPTS();
3393 			}
3394 			prefetchedUntil = prefetchStart;
3395 		}
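		/*
		 * The masking above rounds blkno down to a multiple of PREFETCH_SIZE
		 * (which must be a power of two, per the StaticAssertStmt earlier).
		 * For example, with a prefetch size of 32, blkno = 1000 gives
		 * prefetchStart = 1000 & ~31 = 992, so blocks 992..1000 are
		 * prefetched in ascending order to let OS readahead kick in.
		 */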
3396 
3397 		buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
3398 								 vacrel->bstrategy);
3399 
3400 		/* In this phase we only need shared access to the buffer */
3401 		LockBuffer(buf, BUFFER_LOCK_SHARE);
3402 
3403 		page = BufferGetPage(buf);
3404 
3405 		if (PageIsNew(page) || PageIsEmpty(page))
3406 		{
3407 			UnlockReleaseBuffer(buf);
3408 			continue;
3409 		}
3410 
3411 		hastup = false;
3412 		maxoff = PageGetMaxOffsetNumber(page);
3413 		for (offnum = FirstOffsetNumber;
3414 			 offnum <= maxoff;
3415 			 offnum = OffsetNumberNext(offnum))
3416 		{
3417 			ItemId		itemid;
3418 
3419 			itemid = PageGetItemId(page, offnum);
3420 
3421 			/*
3422 			 * Note: any non-unused item should be taken as a reason to keep
3423 			 * this page.  Even an LP_DEAD item makes truncation unsafe, since
3424 			 * we must not have cleaned out its index entries.
3425 			 */
3426 			if (ItemIdIsUsed(itemid))
3427 			{
3428 				hastup = true;
3429 				break;			/* can stop scanning */
3430 			}
3431 		}						/* scan along page */
3432 
3433 		UnlockReleaseBuffer(buf);
3434 
3435 		/* Done scanning if we found a tuple here */
3436 		if (hastup)
3437 			return blkno + 1;
3438 	}
3439 
3440 	/*
3441 	 * If we fall out of the loop, all the previously-thought-to-be-empty
3442 	 * pages still are; we need not bother to look at the last known-nonempty
3443 	 * page.
3444 	 */
3445 	return vacrel->nonempty_pages;
3446 }
3447 
3448 /*
3449  * Return the maximum number of dead tuples we can record.
3450  */
3451 static long
3452 compute_max_dead_tuples(BlockNumber relblocks, bool hasindex)
3453 {
3454 	long		maxtuples;
3455 	int			vac_work_mem = IsAutoVacuumWorkerProcess() &&
3456 	autovacuum_work_mem != -1 ?
3457 	autovacuum_work_mem : maintenance_work_mem;
3458 
3459 	if (hasindex)
3460 	{
3461 		maxtuples = MAXDEADTUPLES(vac_work_mem * 1024L);
3462 		maxtuples = Min(maxtuples, INT_MAX);
3463 		maxtuples = Min(maxtuples, MAXDEADTUPLES(MaxAllocSize));
3464 
3465 		/* curious coding here to ensure the multiplication can't overflow */
3466 		if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
3467 			maxtuples = relblocks * LAZY_ALLOC_TUPLES;
3468 
3469 		/* stay sane if small maintenance_work_mem */
3470 		maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
3471 	}
3472 	else
3473 		maxtuples = MaxHeapTuplesPerPage;
3474 
3475 	return maxtuples;
3476 }
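/*
 * As a rough illustration: each dead tuple TID is a 6-byte ItemPointerData,
 * so 64MB of maintenance_work_mem lets us remember on the order of 11
 * million TIDs per index-vacuum cycle.  For small tables the relblocks clamp
 * dominates instead; e.g. a 100-block table never needs more than
 * 100 * LAZY_ALLOC_TUPLES entries, however large the memory setting is.
 */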
3477 
3478 /*
3479  * lazy_space_alloc - space allocation decisions for lazy vacuum
3480  *
3481  * See the comments at the head of this file for rationale.
3482  */
3483 static void
3484 lazy_space_alloc(LVRelState *vacrel, int nworkers, BlockNumber nblocks)
3485 {
3486 	LVDeadTuples *dead_tuples;
3487 	long		maxtuples;
3488 
3489 	/*
3490 	 * Initialize state for a parallel vacuum.  As of now, only one worker can
3491 	 * be used for an index, so we invoke parallelism only if there are at
3492 	 * least two indexes on a table.
3493 	 */
3494 	if (nworkers >= 0 && vacrel->nindexes > 1 && vacrel->do_index_vacuuming)
3495 	{
3496 		/*
3497 		 * Since parallel workers cannot access data in temporary tables, we
3498 		 * can't perform parallel vacuum on them.
3499 		 */
3500 		if (RelationUsesLocalBuffers(vacrel->rel))
3501 		{
3502 			/*
3503 			 * Give warning only if the user explicitly tries to perform a
3504 			 * parallel vacuum on the temporary table.
3505 			 */
3506 			if (nworkers > 0)
3507 				ereport(WARNING,
3508 						(errmsg("disabling parallel option of vacuum on \"%s\" --- cannot vacuum temporary tables in parallel",
3509 								vacrel->relname)));
3510 		}
3511 		else
3512 			vacrel->lps = begin_parallel_vacuum(vacrel, nblocks, nworkers);
3513 
3514 		/* If parallel mode started, we're done */
3515 		if (ParallelVacuumIsActive(vacrel))
3516 			return;
3517 	}
3518 
3519 	maxtuples = compute_max_dead_tuples(nblocks, vacrel->nindexes > 0);
3520 
3521 	dead_tuples = (LVDeadTuples *) palloc(SizeOfDeadTuples(maxtuples));
3522 	dead_tuples->num_tuples = 0;
3523 	dead_tuples->max_tuples = (int) maxtuples;
3524 
3525 	vacrel->dead_tuples = dead_tuples;
3526 }
3527 
3528 /*
3529  * lazy_space_free - free space allocated in lazy_space_alloc
3530  */
3531 static void
3532 lazy_space_free(LVRelState *vacrel)
3533 {
3534 	if (!ParallelVacuumIsActive(vacrel))
3535 		return;
3536 
3537 	/*
3538 	 * End parallel mode before updating index statistics as we cannot write
3539 	 * during parallel mode.
3540 	 */
3541 	end_parallel_vacuum(vacrel);
3542 }
3543 
3544 /*
3545  *	lazy_tid_reaped() -- is a particular tid deletable?
3546  *
3547  *		This has the right signature to be an IndexBulkDeleteCallback.
3548  *
3549  *		Assumes dead_tuples array is in sorted order.
3550  */
3551 static bool
3552 lazy_tid_reaped(ItemPointer itemptr, void *state)
3553 {
3554 	LVDeadTuples *dead_tuples = (LVDeadTuples *) state;
3555 	int64		litem,
3556 				ritem,
3557 				item;
3558 	ItemPointer res;
3559 
3560 	litem = itemptr_encode(&dead_tuples->itemptrs[0]);
3561 	ritem = itemptr_encode(&dead_tuples->itemptrs[dead_tuples->num_tuples - 1]);
3562 	item = itemptr_encode(itemptr);
3563 
3564 	/*
3565 	 * Doing a simple bound check before bsearch() is useful to avoid the
3566 	 * extra cost of bsearch(), especially if dead tuples on the heap are
3567 	 * concentrated in a certain range.  Since this function is called for
3568 	 * every index tuple, it pays to be really fast.
3569 	 */
3570 	if (item < litem || item > ritem)
3571 		return false;
3572 
3573 	res = (ItemPointer) bsearch((void *) itemptr,
3574 								(void *) dead_tuples->itemptrs,
3575 								dead_tuples->num_tuples,
3576 								sizeof(ItemPointerData),
3577 								vac_cmp_itemptr);
3578 
3579 	return (res != NULL);
3580 }
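/*
 * A note on the bound check above: itemptr_encode() packs a TID into a
 * single int64 (roughly (block << 16) | offset), so integer comparisons on
 * encoded values order TIDs the same way vac_cmp_itemptr() does.  That makes
 * the cheap min/max range test a safe filter before falling back to the full
 * bsearch() of the sorted dead_tuples array.
 */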
3581 
3582 /*
3583  * Comparator routines for use with qsort() and bsearch().
3584  */
3585 static int
3586 vac_cmp_itemptr(const void *left, const void *right)
3587 {
3588 	BlockNumber lblk,
3589 				rblk;
3590 	OffsetNumber loff,
3591 				roff;
3592 
3593 	lblk = ItemPointerGetBlockNumber((ItemPointer) left);
3594 	rblk = ItemPointerGetBlockNumber((ItemPointer) right);
3595 
3596 	if (lblk < rblk)
3597 		return -1;
3598 	if (lblk > rblk)
3599 		return 1;
3600 
3601 	loff = ItemPointerGetOffsetNumber((ItemPointer) left);
3602 	roff = ItemPointerGetOffsetNumber((ItemPointer) right);
3603 
3604 	if (loff < roff)
3605 		return -1;
3606 	if (loff > roff)
3607 		return 1;
3608 
3609 	return 0;
3610 }
3611 
3612 /*
3613  * Check if every tuple in the given page is visible to all current and future
3614  * transactions. Also return the visibility_cutoff_xid which is the highest
3615  * xmin amongst the visible tuples.  Set *all_frozen to true if every tuple
3616  * on this page is frozen.
3617  */
3618 static bool
3619 heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
3620 						 TransactionId *visibility_cutoff_xid,
3621 						 bool *all_frozen)
3622 {
3623 	Page		page = BufferGetPage(buf);
3624 	BlockNumber blockno = BufferGetBlockNumber(buf);
3625 	OffsetNumber offnum,
3626 				maxoff;
3627 	bool		all_visible = true;
3628 
3629 	*visibility_cutoff_xid = InvalidTransactionId;
3630 	*all_frozen = true;
3631 
3632 	/*
3633 	 * This is a stripped down version of the line pointer scan in
3634 	 * lazy_scan_heap(). So if you change anything here, also check that code.
3635 	 */
3636 	maxoff = PageGetMaxOffsetNumber(page);
3637 	for (offnum = FirstOffsetNumber;
3638 		 offnum <= maxoff && all_visible;
3639 		 offnum = OffsetNumberNext(offnum))
3640 	{
3641 		ItemId		itemid;
3642 		HeapTupleData tuple;
3643 
3644 		/*
3645 		 * Set the offset number so that we can display it along with any
3646 		 * error that occurred while processing this tuple.
3647 		 */
3648 		vacrel->offnum = offnum;
3649 		itemid = PageGetItemId(page, offnum);
3650 
3651 		/* Unused or redirect line pointers are of no interest */
3652 		if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
3653 			continue;
3654 
3655 		ItemPointerSet(&(tuple.t_self), blockno, offnum);
3656 
3657 		/*
3658 		 * Dead line pointers can have index pointers pointing to them. So
3659 		 * they can't be treated as visible
3660 		 */
3661 		if (ItemIdIsDead(itemid))
3662 		{
3663 			all_visible = false;
3664 			*all_frozen = false;
3665 			break;
3666 		}
3667 
3668 		Assert(ItemIdIsNormal(itemid));
3669 
3670 		tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
3671 		tuple.t_len = ItemIdGetLength(itemid);
3672 		tuple.t_tableOid = RelationGetRelid(vacrel->rel);
3673 
3674 		switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf))
3675 		{
3676 			case HEAPTUPLE_LIVE:
3677 				{
3678 					TransactionId xmin;
3679 
3680 					/* Check comments in lazy_scan_heap. */
3681 					if (!HeapTupleHeaderXminCommitted(tuple.t_data))
3682 					{
3683 						all_visible = false;
3684 						*all_frozen = false;
3685 						break;
3686 					}
3687 
3688 					/*
3689 					 * The inserter definitely committed. But is it old enough
3690 					 * that everyone sees it as committed?
3691 					 */
3692 					xmin = HeapTupleHeaderGetXmin(tuple.t_data);
3693 					if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
3694 					{
3695 						all_visible = false;
3696 						*all_frozen = false;
3697 						break;
3698 					}
3699 
3700 					/* Track newest xmin on page. */
3701 					if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
3702 						*visibility_cutoff_xid = xmin;
3703 
3704 					/* Check whether this tuple is already frozen or not */
3705 					if (all_visible && *all_frozen &&
3706 						heap_tuple_needs_eventual_freeze(tuple.t_data))
3707 						*all_frozen = false;
3708 				}
3709 				break;
3710 
3711 			case HEAPTUPLE_DEAD:
3712 			case HEAPTUPLE_RECENTLY_DEAD:
3713 			case HEAPTUPLE_INSERT_IN_PROGRESS:
3714 			case HEAPTUPLE_DELETE_IN_PROGRESS:
3715 				{
3716 					all_visible = false;
3717 					*all_frozen = false;
3718 					break;
3719 				}
3720 			default:
3721 				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
3722 				break;
3723 		}
3724 	}							/* scan along page */
3725 
3726 	/* Clear the offset information once we have processed the given page. */
3727 	vacrel->offnum = InvalidOffsetNumber;
3728 
3729 	return all_visible;
3730 }
3731 
3732 /*
3733  * Compute the number of parallel worker processes to request.  Both index
3734  * vacuum and index cleanup can be executed with parallel workers.  The index
3735  * is eligible for parallel vacuum iff its size is greater than
3736  * min_parallel_index_scan_size as invoking workers for very small indexes
3737  * can hurt performance.
3738  *
3739  * nrequested is the number of parallel workers that the user requested.  If
3740  * nrequested is 0, we compute the parallel degree based on nindexes, that is,
3741  * the number of indexes that support parallel vacuum.  This function also
3742  * sets will_parallel_vacuum to remember indexes that participate in parallel
3743  * vacuum.
3744  */
3745 static int
3746 compute_parallel_vacuum_workers(LVRelState *vacrel, int nrequested,
3747 								bool *will_parallel_vacuum)
3748 {
3749 	int			nindexes_parallel = 0;
3750 	int			nindexes_parallel_bulkdel = 0;
3751 	int			nindexes_parallel_cleanup = 0;
3752 	int			parallel_workers;
3753 
3754 	/*
3755 	 * We don't allow performing parallel operation in standalone backend or
3756 	 * when parallelism is disabled.
3757 	 */
3758 	if (!IsUnderPostmaster || max_parallel_maintenance_workers == 0)
3759 		return 0;
3760 
3761 	/*
3762 	 * Compute the number of indexes that can participate in parallel vacuum.
3763 	 */
3764 	for (int idx = 0; idx < vacrel->nindexes; idx++)
3765 	{
3766 		Relation	indrel = vacrel->indrels[idx];
3767 		uint8		vacoptions = indrel->rd_indam->amparallelvacuumoptions;
3768 
3769 		if (vacoptions == VACUUM_OPTION_NO_PARALLEL ||
3770 			RelationGetNumberOfBlocks(indrel) < min_parallel_index_scan_size)
3771 			continue;
3772 
3773 		will_parallel_vacuum[idx] = true;
3774 
3775 		if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
3776 			nindexes_parallel_bulkdel++;
3777 		if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0) ||
3778 			((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
3779 			nindexes_parallel_cleanup++;
3780 	}
3781 
3782 	nindexes_parallel = Max(nindexes_parallel_bulkdel,
3783 							nindexes_parallel_cleanup);
3784 
3785 	/* The leader process takes one index */
3786 	nindexes_parallel--;
3787 
3788 	/* No index supports parallel vacuum */
3789 	if (nindexes_parallel <= 0)
3790 		return 0;
3791 
3792 	/* Compute the parallel degree */
3793 	parallel_workers = (nrequested > 0) ?
3794 		Min(nrequested, nindexes_parallel) : nindexes_parallel;
3795 
3796 	/* Cap by max_parallel_maintenance_workers */
3797 	parallel_workers = Min(parallel_workers, max_parallel_maintenance_workers);
3798 
3799 	return parallel_workers;
3800 }
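/*
 * A worked example: for a table with four indexes where three support
 * parallel bulkdelete and two support (possibly conditional) parallel
 * cleanup, nindexes_parallel = Max(3, 2) = 3; one slot is reserved for the
 * leader, leaving 2.  With nrequested = 0 and the default
 * max_parallel_maintenance_workers of 2, we end up requesting
 * Min(2, 2) = 2 workers.
 */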
3801 
3802 /*
3803  * Update index statistics in pg_class if the statistics are accurate.
3804  */
3805 static void
3806 update_index_statistics(LVRelState *vacrel)
3807 {
3808 	Relation   *indrels = vacrel->indrels;
3809 	int			nindexes = vacrel->nindexes;
3810 	IndexBulkDeleteResult **indstats = vacrel->indstats;
3811 
3812 	Assert(!IsInParallelMode());
3813 
3814 	for (int idx = 0; idx < nindexes; idx++)
3815 	{
3816 		Relation	indrel = indrels[idx];
3817 		IndexBulkDeleteResult *istat = indstats[idx];
3818 
3819 		if (istat == NULL || istat->estimated_count)
3820 			continue;
3821 
3822 		/* Update index statistics */
3823 		vac_update_relstats(indrel,
3824 							istat->num_pages,
3825 							istat->num_index_tuples,
3826 							0,
3827 							false,
3828 							InvalidTransactionId,
3829 							InvalidMultiXactId,
3830 							false);
3831 	}
3832 }
3833 
3834 /*
3835  * This function prepares and returns the parallel vacuum state if we can
3836  * launch even one worker.  It is responsible for entering parallel mode,
3837  * creating a parallel context, and then initializing the DSM segment.
3838  */
3839 static LVParallelState *
3840 begin_parallel_vacuum(LVRelState *vacrel, BlockNumber nblocks,
3841 					  int nrequested)
3842 {
3843 	LVParallelState *lps = NULL;
3844 	Relation   *indrels = vacrel->indrels;
3845 	int			nindexes = vacrel->nindexes;
3846 	ParallelContext *pcxt;
3847 	LVShared   *shared;
3848 	LVDeadTuples *dead_tuples;
3849 	BufferUsage *buffer_usage;
3850 	WalUsage   *wal_usage;
3851 	bool	   *will_parallel_vacuum;
3852 	long		maxtuples;
3853 	Size		est_shared;
3854 	Size		est_deadtuples;
3855 	int			nindexes_mwm = 0;
3856 	int			parallel_workers = 0;
3857 	int			querylen;
3858 
3859 	/*
3860 	 * A parallel vacuum must be requested and there must be indexes on the
3861 	 * relation
3862 	 */
3863 	Assert(nrequested >= 0);
3864 	Assert(nindexes > 0);
3865 
3866 	/*
3867 	 * Compute the number of parallel vacuum workers to launch
3868 	 */
3869 	will_parallel_vacuum = (bool *) palloc0(sizeof(bool) * nindexes);
3870 	parallel_workers = compute_parallel_vacuum_workers(vacrel,
3871 													   nrequested,
3872 													   will_parallel_vacuum);
3873 
3874 	/* Can't perform vacuum in parallel */
3875 	if (parallel_workers <= 0)
3876 	{
3877 		pfree(will_parallel_vacuum);
3878 		return lps;
3879 	}
3880 
3881 	lps = (LVParallelState *) palloc0(sizeof(LVParallelState));
3882 
3883 	EnterParallelMode();
3884 	pcxt = CreateParallelContext("postgres", "parallel_vacuum_main",
3885 								 parallel_workers);
3886 	Assert(pcxt->nworkers > 0);
3887 	lps->pcxt = pcxt;
3888 
3889 	/* Estimate size for shared information -- PARALLEL_VACUUM_KEY_SHARED */
3890 	est_shared = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));
3891 	for (int idx = 0; idx < nindexes; idx++)
3892 	{
3893 		Relation	indrel = indrels[idx];
3894 		uint8		vacoptions = indrel->rd_indam->amparallelvacuumoptions;
3895 
3896 		/*
3897 		 * The cleanup option should be either disabled, always performed in
3898 		 * parallel, or conditionally performed in parallel.
3899 		 */
3900 		Assert(((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) ||
3901 			   ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0));
3902 		Assert(vacoptions <= VACUUM_OPTION_MAX_VALID_VALUE);
3903 
3904 		/* Skip indexes that don't participate in parallel vacuum */
3905 		if (!will_parallel_vacuum[idx])
3906 			continue;
3907 
3908 		if (indrel->rd_indam->amusemaintenanceworkmem)
3909 			nindexes_mwm++;
3910 
3911 		est_shared = add_size(est_shared, sizeof(LVSharedIndStats));
3912 
3913 		/*
3914 		 * Remember the number of indexes that support parallel operation for
3915 		 * each phase.
3916 		 */
3917 		if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
3918 			lps->nindexes_parallel_bulkdel++;
3919 		if ((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0)
3920 			lps->nindexes_parallel_cleanup++;
3921 		if ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0)
3922 			lps->nindexes_parallel_condcleanup++;
3923 	}
3924 	shm_toc_estimate_chunk(&pcxt->estimator, est_shared);
3925 	shm_toc_estimate_keys(&pcxt->estimator, 1);
3926 
3927 	/* Estimate size for dead tuples -- PARALLEL_VACUUM_KEY_DEAD_TUPLES */
3928 	maxtuples = compute_max_dead_tuples(nblocks, true);
3929 	est_deadtuples = MAXALIGN(SizeOfDeadTuples(maxtuples));
3930 	shm_toc_estimate_chunk(&pcxt->estimator, est_deadtuples);
3931 	shm_toc_estimate_keys(&pcxt->estimator, 1);
3932 
3933 	/*
3934 	 * Estimate space for BufferUsage and WalUsage --
3935 	 * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE.
3936 	 *
3937 	 * If there are no extensions loaded that care, we could skip this.  We
3938 	 * have no way of knowing whether anyone's looking at pgBufferUsage or
3939 	 * pgWalUsage, so do it unconditionally.
3940 	 */
3941 	shm_toc_estimate_chunk(&pcxt->estimator,
3942 						   mul_size(sizeof(BufferUsage), pcxt->nworkers));
3943 	shm_toc_estimate_keys(&pcxt->estimator, 1);
3944 	shm_toc_estimate_chunk(&pcxt->estimator,
3945 						   mul_size(sizeof(WalUsage), pcxt->nworkers));
3946 	shm_toc_estimate_keys(&pcxt->estimator, 1);
3947 
3948 	/* Finally, estimate PARALLEL_VACUUM_KEY_QUERY_TEXT space */
3949 	if (debug_query_string)
3950 	{
3951 		querylen = strlen(debug_query_string);
3952 		shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
3953 		shm_toc_estimate_keys(&pcxt->estimator, 1);
3954 	}
3955 	else
3956 		querylen = 0;			/* keep compiler quiet */
3957 
3958 	InitializeParallelDSM(pcxt);
3959 
3960 	/* Prepare shared information */
3961 	shared = (LVShared *) shm_toc_allocate(pcxt->toc, est_shared);
3962 	MemSet(shared, 0, est_shared);
3963 	shared->relid = RelationGetRelid(vacrel->rel);
3964 	shared->elevel = elevel;
3965 	shared->maintenance_work_mem_worker =
3966 		(nindexes_mwm > 0) ?
3967 		maintenance_work_mem / Min(parallel_workers, nindexes_mwm) :
3968 		maintenance_work_mem;
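	/*
	 * For example, with maintenance_work_mem = 64MB, two launched workers,
	 * and three participating indexes whose AMs use maintenance_work_mem,
	 * each worker runs with maintenance_work_mem = 64MB / Min(2, 3) = 32MB;
	 * if no participating index AM uses it, workers keep the full 64MB.
	 */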
3969 
3970 	pg_atomic_init_u32(&(shared->cost_balance), 0);
3971 	pg_atomic_init_u32(&(shared->active_nworkers), 0);
3972 	pg_atomic_init_u32(&(shared->idx), 0);
3973 	shared->offset = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));
3974 
3975 	/*
3976 	 * Initialize variables for shared index statistics, set NULL bitmap and
3977 	 * the size of stats for each index.
3978 	 */
3979 	memset(shared->bitmap, 0x00, BITMAPLEN(nindexes));
3980 	for (int idx = 0; idx < nindexes; idx++)
3981 	{
3982 		if (!will_parallel_vacuum[idx])
3983 			continue;
3984 
3985 		/* Set NOT NULL as this index does support parallelism */
3986 		shared->bitmap[idx >> 3] |= 1 << (idx & 0x07);
3987 	}
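	/*
	 * The bit arithmetic above marks index idx as "stats present" in a packed
	 * bitmap.  For example, idx = 10 sets bit 2 of bitmap byte 1
	 * (10 >> 3 = 1, 10 & 0x07 = 2).  IndStatsIsNull() later tests the same
	 * bit to decide whether an LVSharedIndStats slot exists for the index.
	 */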
3988 
3989 	shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_SHARED, shared);
3990 	lps->lvshared = shared;
3991 
3992 	/* Prepare the dead tuple space */
3993 	dead_tuples = (LVDeadTuples *) shm_toc_allocate(pcxt->toc, est_deadtuples);
3994 	dead_tuples->max_tuples = maxtuples;
3995 	dead_tuples->num_tuples = 0;
3996 	MemSet(dead_tuples->itemptrs, 0, sizeof(ItemPointerData) * maxtuples);
3997 	shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_DEAD_TUPLES, dead_tuples);
3998 	vacrel->dead_tuples = dead_tuples;
3999 
4000 	/*
4001 	 * Allocate space for each worker's BufferUsage and WalUsage; no need to
4002 	 * initialize
4003 	 */
4004 	buffer_usage = shm_toc_allocate(pcxt->toc,
4005 									mul_size(sizeof(BufferUsage), pcxt->nworkers));
4006 	shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, buffer_usage);
4007 	lps->buffer_usage = buffer_usage;
4008 	wal_usage = shm_toc_allocate(pcxt->toc,
4009 								 mul_size(sizeof(WalUsage), pcxt->nworkers));
4010 	shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_WAL_USAGE, wal_usage);
4011 	lps->wal_usage = wal_usage;
4012 
4013 	/* Store query string for workers */
4014 	if (debug_query_string)
4015 	{
4016 		char	   *sharedquery;
4017 
4018 		sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
4019 		memcpy(sharedquery, debug_query_string, querylen + 1);
4020 		sharedquery[querylen] = '\0';
4021 		shm_toc_insert(pcxt->toc,
4022 					   PARALLEL_VACUUM_KEY_QUERY_TEXT, sharedquery);
4023 	}
4024 
4025 	pfree(will_parallel_vacuum);
4026 	return lps;
4027 }
4028 
4029 /*
4030  * Destroy the parallel context, and end parallel mode.
4031  *
4032  * Since writes are not allowed during parallel mode, copy the
4033  * updated index statistics from DSM into local memory and then later use that
4034  * to update the index statistics.  One might think that we can exit from
4035  * parallel mode, update the index statistics and then destroy parallel
4036  * context, but that won't be safe (see ExitParallelMode).
4037  */
4038 static void
4039 end_parallel_vacuum(LVRelState *vacrel)
4040 {
4041 	IndexBulkDeleteResult **indstats = vacrel->indstats;
4042 	LVParallelState *lps = vacrel->lps;
4043 	int			nindexes = vacrel->nindexes;
4044 
4045 	Assert(!IsParallelWorker());
4046 
4047 	/* Copy the updated statistics */
4048 	for (int idx = 0; idx < nindexes; idx++)
4049 	{
4050 		LVSharedIndStats *shared_istat;
4051 
4052 		shared_istat = parallel_stats_for_idx(lps->lvshared, idx);
4053 
4054 		/*
4055 		 * Skip index -- it must have been processed by the leader, from
4056 		 * inside do_serial_processing_for_unsafe_indexes()
4057 		 */
4058 		if (shared_istat == NULL)
4059 			continue;
4060 
4061 		if (shared_istat->updated)
4062 		{
4063 			indstats[idx] = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
4064 			memcpy(indstats[idx], &(shared_istat->istat), sizeof(IndexBulkDeleteResult));
4065 		}
4066 		else
4067 			indstats[idx] = NULL;
4068 	}
4069 
4070 	DestroyParallelContext(lps->pcxt);
4071 	ExitParallelMode();
4072 
4073 	/* Deactivate parallel vacuum */
4074 	pfree(lps);
4075 	vacrel->lps = NULL;
4076 }
4077 
4078 /*
4079  * Return shared memory statistics for index at offset 'getidx', if any
4080  *
4081  * Returning NULL indicates that compute_parallel_vacuum_workers() determined
4082  * that the index is a totally unsuitable target for all parallel processing
4083  * up front.  For example, the index could be smaller than the
4084  * min_parallel_index_scan_size cutoff.
4085  */
4086 static LVSharedIndStats *
4087 parallel_stats_for_idx(LVShared *lvshared, int getidx)
4088 {
4089 	char	   *p;
4090 
4091 	if (IndStatsIsNull(lvshared, getidx))
4092 		return NULL;
4093 
4094 	p = (char *) GetSharedIndStats(lvshared);
4095 	for (int idx = 0; idx < getidx; idx++)
4096 	{
4097 		if (IndStatsIsNull(lvshared, idx))
4098 			continue;
4099 
4100 		p += sizeof(LVSharedIndStats);
4101 	}
4102 
4103 	return (LVSharedIndStats *) p;
4104 }
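/*
 * For illustration: if only indexes 0, 2, and 3 participate in parallel
 * vacuum, the DSM area holds three LVSharedIndStats slots back to back.
 * parallel_stats_for_idx(lvshared, 3) skips index 1 (its bit is clear in the
 * bitmap) and advances past the slots for indexes 0 and 2, returning a
 * pointer to the third slot.
 */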
4105 
4106 /*
4107  * Returns false if the given index can't participate in parallel index
4108  * vacuum or parallel index cleanup.
4109  */
4110 static bool
4111 parallel_processing_is_safe(Relation indrel, LVShared *lvshared)
4112 {
4113 	uint8		vacoptions = indrel->rd_indam->amparallelvacuumoptions;
4114 
4115 	/* first_time must be true only if for_cleanup is true */
4116 	Assert(lvshared->for_cleanup || !lvshared->first_time);
4117 
4118 	if (lvshared->for_cleanup)
4119 	{
4120 		/* Skip, if the index does not support parallel cleanup */
4121 		if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) &&
4122 			((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0))
4123 			return false;
4124 
4125 		/*
4126 		 * Skip, if the index supports parallel cleanup conditionally, but we
4127 		 * have already processed the index (for bulkdelete).  See the
4128 		 * comments for option VACUUM_OPTION_PARALLEL_COND_CLEANUP to know
4129 		 * when indexes support parallel cleanup conditionally.
4130 		 */
4131 		if (!lvshared->first_time &&
4132 			((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
4133 			return false;
4134 	}
4135 	else if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) == 0)
4136 	{
4137 		/* Skip if the index does not support parallel bulk deletion */
4138 		return false;
4139 	}
4140 
4141 	return true;
4142 }
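/*
 * As an example of the conditional case: an index AM that advertises
 * VACUUM_OPTION_PARALLEL_BULKDEL and VACUUM_OPTION_PARALLEL_COND_CLEANUP (as
 * btree does) is parallel safe for bulk deletion, and for cleanup only when
 * no bulkdelete pass has been performed for it in this VACUUM (first_time is
 * true).  Once bulkdelete has already run, the remaining amvacuumcleanup
 * work is typically cheap, so it is left to the leader's serial pass.
 */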
4143 
4144 /*
4145  * Perform work within a launched parallel process.
4146  *
4147  * Since parallel vacuum workers perform only index vacuum or index cleanup,
4148  * we don't need to report progress information.
4149  */
4150 void
4151 parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
4152 {
4153 	Relation	rel;
4154 	Relation   *indrels;
4155 	LVShared   *lvshared;
4156 	LVDeadTuples *dead_tuples;
4157 	BufferUsage *buffer_usage;
4158 	WalUsage   *wal_usage;
4159 	int			nindexes;
4160 	char	   *sharedquery;
4161 	LVRelState	vacrel;
4162 	ErrorContextCallback errcallback;
4163 
4164 	lvshared = (LVShared *) shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_SHARED,
4165 										   false);
4166 	elevel = lvshared->elevel;
4167 
4168 	if (lvshared->for_cleanup)
4169 		elog(DEBUG1, "starting parallel vacuum worker for cleanup");
4170 	else
4171 		elog(DEBUG1, "starting parallel vacuum worker for bulk delete");
4172 
4173 	/* Set debug_query_string for individual workers */
4174 	sharedquery = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_QUERY_TEXT, true);
4175 	debug_query_string = sharedquery;
4176 	pgstat_report_activity(STATE_RUNNING, debug_query_string);
4177 
4178 	/*
4179 	 * Open table.  The lock mode is the same as the one used by the leader
4180 	 * process.  That's okay because the lock modes do not conflict among the
4181 	 * parallel workers.
4182 	 */
4183 	rel = table_open(lvshared->relid, ShareUpdateExclusiveLock);
4184 
4185 	/*
4186 	 * Open all indexes.  indrels are sorted in order by OID, which should
4187 	 * match the order used by the leader.
4188 	 */
4189 	vac_open_indexes(rel, RowExclusiveLock, &nindexes, &indrels);
4190 	Assert(nindexes > 0);
4191 
4192 	/* Set dead tuple space */
4193 	dead_tuples = (LVDeadTuples *) shm_toc_lookup(toc,
4194 												  PARALLEL_VACUUM_KEY_DEAD_TUPLES,
4195 												  false);
4196 
4197 	/* Set cost-based vacuum delay */
4198 	VacuumCostActive = (VacuumCostDelay > 0);
4199 	VacuumCostBalance = 0;
4200 	VacuumPageHit = 0;
4201 	VacuumPageMiss = 0;
4202 	VacuumPageDirty = 0;
4203 	VacuumCostBalanceLocal = 0;
4204 	VacuumSharedCostBalance = &(lvshared->cost_balance);
4205 	VacuumActiveNWorkers = &(lvshared->active_nworkers);
4206 
4207 	vacrel.rel = rel;
4208 	vacrel.indrels = indrels;
4209 	vacrel.nindexes = nindexes;
4210 	/* Each parallel VACUUM worker gets its own access strategy */
4211 	vacrel.bstrategy = GetAccessStrategy(BAS_VACUUM);
4212 	vacrel.indstats = (IndexBulkDeleteResult **)
4213 		palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
4214 
4215 	if (lvshared->maintenance_work_mem_worker > 0)
4216 		maintenance_work_mem = lvshared->maintenance_work_mem_worker;
4217 
4218 	/*
4219 	 * Initialize vacrel for use as error callback arg by parallel worker.
4220 	 */
4221 	vacrel.relnamespace = get_namespace_name(RelationGetNamespace(rel));
4222 	vacrel.relname = pstrdup(RelationGetRelationName(rel));
4223 	vacrel.indname = NULL;
4224 	vacrel.phase = VACUUM_ERRCB_PHASE_UNKNOWN;	/* Not yet processing */
4225 	vacrel.dead_tuples = dead_tuples;
4226 
4227 	/* Setup error traceback support for ereport() */
4228 	errcallback.callback = vacuum_error_callback;
4229 	errcallback.arg = &vacrel;
4230 	errcallback.previous = error_context_stack;
4231 	error_context_stack = &errcallback;
4232 
4233 	/* Prepare to track buffer usage during parallel execution */
4234 	InstrStartParallelQuery();
4235 
4236 	/* Process indexes to perform vacuum/cleanup */
4237 	do_parallel_processing(&vacrel, lvshared);
4238 
4239 	/* Report buffer/WAL usage during parallel execution */
4240 	buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false);
4241 	wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false);
4242 	InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
4243 						  &wal_usage[ParallelWorkerNumber]);
4244 
4245 	/* Pop the error context stack */
4246 	error_context_stack = errcallback.previous;
4247 
4248 	vac_close_indexes(nindexes, indrels, RowExclusiveLock);
4249 	table_close(rel, ShareUpdateExclusiveLock);
4250 	FreeAccessStrategy(vacrel.bstrategy);
4251 	pfree(vacrel.indstats);
4252 }
4253 
4254 /*
4255  * Error context callback for errors occurring during vacuum.
4256  */
4257 static void
4258 vacuum_error_callback(void *arg)
4259 {
4260 	LVRelState *errinfo = arg;
4261 
4262 	switch (errinfo->phase)
4263 	{
4264 		case VACUUM_ERRCB_PHASE_SCAN_HEAP:
4265 			if (BlockNumberIsValid(errinfo->blkno))
4266 			{
4267 				if (OffsetNumberIsValid(errinfo->offnum))
4268 					errcontext("while scanning block %u offset %u of relation \"%s.%s\"",
4269 							   errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
4270 				else
4271 					errcontext("while scanning block %u of relation \"%s.%s\"",
4272 							   errinfo->blkno, errinfo->relnamespace, errinfo->relname);
4273 			}
4274 			else
4275 				errcontext("while scanning relation \"%s.%s\"",
4276 						   errinfo->relnamespace, errinfo->relname);
4277 			break;
4278 
4279 		case VACUUM_ERRCB_PHASE_VACUUM_HEAP:
4280 			if (BlockNumberIsValid(errinfo->blkno))
4281 			{
4282 				if (OffsetNumberIsValid(errinfo->offnum))
4283 					errcontext("while vacuuming block %u offset %u of relation \"%s.%s\"",
4284 							   errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
4285 				else
4286 					errcontext("while vacuuming block %u of relation \"%s.%s\"",
4287 							   errinfo->blkno, errinfo->relnamespace, errinfo->relname);
4288 			}
4289 			else
4290 				errcontext("while vacuuming relation \"%s.%s\"",
4291 						   errinfo->relnamespace, errinfo->relname);
4292 			break;
4293 
4294 		case VACUUM_ERRCB_PHASE_VACUUM_INDEX:
4295 			errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"",
4296 					   errinfo->indname, errinfo->relnamespace, errinfo->relname);
4297 			break;
4298 
4299 		case VACUUM_ERRCB_PHASE_INDEX_CLEANUP:
4300 			errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"",
4301 					   errinfo->indname, errinfo->relnamespace, errinfo->relname);
4302 			break;
4303 
4304 		case VACUUM_ERRCB_PHASE_TRUNCATE:
4305 			if (BlockNumberIsValid(errinfo->blkno))
4306 				errcontext("while truncating relation \"%s.%s\" to %u blocks",
4307 						   errinfo->relnamespace, errinfo->relname, errinfo->blkno);
4308 			break;
4309 
4310 		case VACUUM_ERRCB_PHASE_UNKNOWN:
4311 		default:
4312 			return;				/* do nothing; the errinfo may not be
4313 								 * initialized */
4314 	}
4315 }
4316 
4317 /*
4318  * Updates the information required for the vacuum error callback.  This also
4319  * saves the current information, which can later be restored via restore_vacuum_error_info.
4320  */
4321 static void
4322 update_vacuum_error_info(LVRelState *vacrel, LVSavedErrInfo *saved_vacrel,
4323 						 int phase, BlockNumber blkno, OffsetNumber offnum)
4324 {
4325 	if (saved_vacrel)
4326 	{
4327 		saved_vacrel->offnum = vacrel->offnum;
4328 		saved_vacrel->blkno = vacrel->blkno;
4329 		saved_vacrel->phase = vacrel->phase;
4330 	}
4331 
4332 	vacrel->blkno = blkno;
4333 	vacrel->offnum = offnum;
4334 	vacrel->phase = phase;
4335 }
4336 
4337 /*
4338  * Restores the vacuum information saved via a prior call to update_vacuum_error_info.
4339  */
4340 static void
4341 restore_vacuum_error_info(LVRelState *vacrel,
4342 						  const LVSavedErrInfo *saved_vacrel)
4343 {
4344 	vacrel->blkno = saved_vacrel->blkno;
4345 	vacrel->offnum = saved_vacrel->offnum;
4346 	vacrel->phase = saved_vacrel->phase;
4347 }
4348