/*-------------------------------------------------------------------------
 *
 * vacuumlazy.c
 *	  Concurrent ("lazy") vacuuming.
 *
 *
 * The major space usage for LAZY VACUUM is storage for the array of dead tuple
 * TIDs.  We want to ensure we can vacuum even the very largest relations with
 * finite memory space usage.  To do that, we set upper bounds on the number of
 * tuples we will keep track of at once.
 *
 * We are willing to use at most maintenance_work_mem (or perhaps
 * autovacuum_work_mem) memory space to keep track of dead tuples.  We
 * initially allocate an array of TIDs of that size, with an upper limit that
 * depends on table size (this limit ensures we don't allocate a huge area
 * uselessly for vacuuming small tables).  If the array threatens to overflow,
 * we suspend the heap scan phase and perform a pass of index cleanup and page
 * compaction, then resume the heap scan with an empty TID array.
 *
 * If we're processing a table with no indexes, we can just vacuum each page
 * as we go; there's no need to save up multiple tuples to minimize the number
 * of index scans performed.  So we don't use maintenance_work_mem memory for
 * the TID array, just enough to hold as many heap tuples as fit on one page.
 *
 * Lazy vacuum supports parallel execution with parallel worker processes.  In
 * a parallel vacuum, we perform both index vacuum and index cleanup with
 * parallel worker processes.  Individual indexes are processed by one vacuum
 * process.  At the beginning of a lazy vacuum (at lazy_scan_heap) we prepare
 * the parallel context and initialize the DSM segment that contains shared
 * information as well as the memory space for storing dead tuples.  When
 * starting either index vacuum or index cleanup, we launch parallel worker
 * processes.  Once all indexes are processed the parallel worker processes
 * exit.  After that, the leader process re-initializes the parallel context
 * so that it can use the same DSM for multiple passes of index vacuum and
 * for performing index cleanup.  Updating the index statistics requires
 * writing to a system catalog, and since updates are not allowed during
 * parallel mode, we update the index statistics only after exiting from
 * parallel mode.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/vacuumlazy.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/amapi.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/htup_details.h"
#include "access/multixact.h"
#include "access/parallel.h"
#include "access/transam.h"
#include "access/visibilitymap.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "catalog/index.h"
#include "catalog/storage.h"
#include "commands/dbcommands.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "executor/instrument.h"
#include "miscadmin.h"
#include "optimizer/paths.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/autovacuum.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "tcop/tcopprot.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"
#include "utils/timestamp.h"


/*
 * Space/time tradeoff parameters: do these need to be user-tunable?
 *
 * To consider truncating the relation, we want there to be at least
 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
 * is less) potentially-freeable pages.
 */
#define REL_TRUNCATE_MINIMUM	1000
#define REL_TRUNCATE_FRACTION	16
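
/*
 * Worked example (illustrative only): for a 64,000-page table,
 * relsize / REL_TRUNCATE_FRACTION = 4,000, so the smaller value,
 * REL_TRUNCATE_MINIMUM = 1,000 potentially-freeable pages, is the
 * threshold; for a 4,800-page table the fraction (300 pages) governs.
 */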

/*
 * Timing parameters for truncate locking heuristics.
 *
 * These were not exposed as user tunable GUC values because it didn't seem
 * that the potential for improvement was great enough to merit the cost of
 * supporting them.
 */
#define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL		20	/* ms */
#define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL		50	/* ms */
#define VACUUM_TRUNCATE_LOCK_TIMEOUT			5000	/* ms */
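
/*
 * Roughly speaking, these cooperate in the truncation phase later in this
 * file: the access-exclusive lock is retried at
 * VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL spacing until
 * VACUUM_TRUNCATE_LOCK_TIMEOUT has elapsed, and the backwards scan checks
 * for lock waiters about every VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL.
 */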

/*
 * Threshold that controls whether we bypass index vacuuming and heap
 * vacuuming as an optimization
 */
#define BYPASS_THRESHOLD_PAGES	0.02	/* i.e. 2% of rel_pages */
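
/*
 * Example (illustrative only): for a 100,000-page table, index vacuuming
 * can be bypassed when fewer than 100,000 * 0.02 = 2,000 pages have
 * LP_DEAD items, subject to the other conditions checked in lazy_vacuum().
 */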

/*
 * Perform a failsafe check every 4GB during the heap scan, approximately
 */
#define FAILSAFE_EVERY_PAGES \
	((BlockNumber) (((uint64) 4 * 1024 * 1024 * 1024) / BLCKSZ))
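
/*
 * With the default BLCKSZ of 8192, this works out to 4GB / 8KB = 524,288
 * heap pages between failsafe checks.
 */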

/*
 * When a table has no indexes, vacuum the FSM after every 8GB, approximately
 * (it won't be exact because we only vacuum FSM after processing a heap page
 * that has some removable tuples).  When there are indexes, this is ignored,
 * and we vacuum FSM after each index/heap cleaning pass.
 */
#define VACUUM_FSM_EVERY_PAGES \
	((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ))
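
/*
 * With the default BLCKSZ of 8192, this is 8GB / 8KB = 1,048,576 heap
 * pages between FSM vacuums in the no-index case.
 */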

/*
 * Guesstimation of number of dead tuples per page.  This is used to
 * provide an upper limit to memory allocated when vacuuming small
 * tables.
 */
#define LAZY_ALLOC_TUPLES		MaxHeapTuplesPerPage
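
/*
 * With the default BLCKSZ of 8192, MaxHeapTuplesPerPage is 291, so this
 * caps the array at 291 TIDs per heap page -- about 1.7KB per page at 6
 * bytes per TID (illustrative arithmetic only).
 */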

/*
 * Before we consider skipping a page that's marked as clean in
 * visibility map, we must've seen at least this many clean pages.
 */
#define SKIP_PAGES_THRESHOLD	((BlockNumber) 32)

/*
 * Size of the prefetch window for lazy vacuum backwards truncation scan.
 * Needs to be a power of 2.
 */
#define PREFETCH_SIZE			((BlockNumber) 32)
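
/*
 * The power-of-2 requirement lets window boundaries be computed with a
 * simple mask, e.g. prefetchStart = blkno & ~(PREFETCH_SIZE - 1), as in
 * count_nondeletable_pages().
 */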

/*
 * DSM keys for parallel vacuum.  Unlike other parallel execution code, since
 * we don't need to worry about DSM keys conflicting with plan_node_id we can
 * use small integers.
 */
#define PARALLEL_VACUUM_KEY_SHARED			1
#define PARALLEL_VACUUM_KEY_DEAD_TUPLES		2
#define PARALLEL_VACUUM_KEY_QUERY_TEXT		3
#define PARALLEL_VACUUM_KEY_BUFFER_USAGE	4
#define PARALLEL_VACUUM_KEY_WAL_USAGE		5

/*
 * Macro to check if we are in a parallel vacuum.  If true, we are in the
 * parallel mode and the DSM segment is initialized.
 */
#define ParallelVacuumIsActive(vacrel) ((vacrel)->lps != NULL)

/* Phases of vacuum during which we report error context. */
typedef enum
{
	VACUUM_ERRCB_PHASE_UNKNOWN,
	VACUUM_ERRCB_PHASE_SCAN_HEAP,
	VACUUM_ERRCB_PHASE_VACUUM_INDEX,
	VACUUM_ERRCB_PHASE_VACUUM_HEAP,
	VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
	VACUUM_ERRCB_PHASE_TRUNCATE
} VacErrPhase;

/*
 * LVDeadTuples stores the dead tuple TIDs collected during the heap scan.
 * This is allocated in the DSM segment in parallel mode and in local memory
 * in non-parallel mode.
 */
typedef struct LVDeadTuples
{
	int			max_tuples;		/* # slots allocated in array */
	int			num_tuples;		/* current # of entries */
	/* List of TIDs of tuples we intend to delete */
	/* NB: this list is ordered by TID address */
	ItemPointerData itemptrs[FLEXIBLE_ARRAY_MEMBER];	/* array of
														 * ItemPointerData */
} LVDeadTuples;

/* The dead tuple space consists of LVDeadTuples and dead tuple TIDs */
#define SizeOfDeadTuples(cnt) \
	add_size(offsetof(LVDeadTuples, itemptrs), \
			 mul_size(sizeof(ItemPointerData), cnt))
#define MAXDEADTUPLES(max_size) \
	(((max_size) - offsetof(LVDeadTuples, itemptrs)) / sizeof(ItemPointerData))
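
/*
 * Worked example (illustrative only): ItemPointerData is 6 bytes, so with
 * maintenance_work_mem set to 64MB, MAXDEADTUPLES permits roughly
 * (64 * 1024 * 1024) / 6, or about 11 million TIDs, before the heap scan
 * must pause for a round of index vacuuming.
 */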

/*
 * Shared information among parallel workers, so it is allocated in the DSM
 * segment.
 */
typedef struct LVShared
{
	/*
	 * Target table relid and log level.  These fields are not modified during
	 * the lazy vacuum.
	 */
	Oid			relid;
	int			elevel;

	/*
	 * An indication for vacuum workers to perform either index vacuum or
	 * index cleanup.  first_time is true only if for_cleanup is true and
	 * bulk-deletion has not yet been performed.
	 */
	bool		for_cleanup;
	bool		first_time;

	/*
	 * Fields for both index vacuum and cleanup.
	 *
	 * reltuples is the total number of input heap tuples.  We set it to the
	 * old live tuples in the index vacuum case and to the new live tuples in
	 * the index cleanup case.
	 *
	 * estimated_count is true if reltuples is an estimated value.  (Note that
	 * reltuples could be -1 in this case, indicating we have no idea.)
	 */
	double		reltuples;
	bool		estimated_count;

	/*
	 * In a single-process lazy vacuum, memory may be consumed during index
	 * vacuuming or cleanup in addition to the memory used for the heap scan.
	 * In a parallel vacuum, since individual vacuum workers can each consume
	 * memory equal to maintenance_work_mem, the new maintenance_work_mem for
	 * each worker is set such that the parallel operation doesn't consume
	 * more memory than a single-process lazy vacuum.
	 */
	int			maintenance_work_mem_worker;
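
	/*
	 * For example (illustrative only): with maintenance_work_mem = 256MB
	 * and two workers running memory-hungry index AMs concurrently, each
	 * worker would be limited to 128MB, keeping the parallel total within
	 * the single-process budget.
	 */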

	/*
	 * Shared vacuum cost balance.  During parallel vacuum,
	 * VacuumSharedCostBalance points to this value and it accumulates the
	 * balance of each parallel vacuum worker.
	 */
	pg_atomic_uint32 cost_balance;

	/*
	 * Number of active parallel workers.  This is used for computing the
	 * minimum threshold of the vacuum cost balance before a worker sleeps for
	 * cost-based delay.
	 */
	pg_atomic_uint32 active_nworkers;

	/*
	 * Variables to control parallel vacuum.  We have a bitmap indicating
	 * which indexes have stats in shared memory; a set bit means that the
	 * particular index supports a parallel vacuum.
	 */
	pg_atomic_uint32 idx;		/* counter for vacuuming and clean up */
	uint32		offset;			/* sizeof header incl. bitmap */
	bits8		bitmap[FLEXIBLE_ARRAY_MEMBER];	/* bit map of NULLs */

	/* Shared index statistics data follows at end of struct */
} LVShared;

#define SizeOfLVShared (offsetof(LVShared, bitmap) + sizeof(bits8))
#define GetSharedIndStats(s) \
	((LVSharedIndStats *)((char *)(s) + ((LVShared *)(s))->offset))
#define IndStatsIsNull(s, i) \
	(!(((LVShared *)(s))->bitmap[(i) >> 3] & (1 << ((i) & 0x07))))
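
/*
 * Example (illustrative only): for index number 10, IndStatsIsNull tests
 * bit (10 & 0x07) = 2 of bitmap[10 >> 3] = bitmap[1]; a clear bit means
 * that index has no stats area in the DSM segment.
 */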

/*
 * Struct for an index bulk-deletion statistic used for parallel vacuum.  This
 * is allocated in the DSM segment.
 */
typedef struct LVSharedIndStats
{
	bool		updated;		/* are the stats updated? */
	IndexBulkDeleteResult istat;
} LVSharedIndStats;

/* Struct for maintaining a parallel vacuum state. */
typedef struct LVParallelState
{
	ParallelContext *pcxt;

	/* Shared information among parallel vacuum workers */
	LVShared   *lvshared;

	/* Points to buffer usage area in DSM */
	BufferUsage *buffer_usage;

	/* Points to WAL usage area in DSM */
	WalUsage   *wal_usage;

	/*
	 * The number of indexes that support parallel index bulk-deletion and
	 * parallel index cleanup respectively.
	 */
	int			nindexes_parallel_bulkdel;
	int			nindexes_parallel_cleanup;
	int			nindexes_parallel_condcleanup;
} LVParallelState;

typedef struct LVRelState
{
	/* Target heap relation and its indexes */
	Relation	rel;
	Relation   *indrels;
	int			nindexes;

	/* Wraparound failsafe has been triggered? */
	bool		failsafe_active;
	/* Consider index vacuuming bypass optimization? */
	bool		consider_bypass_optimization;

	/* Doing index vacuuming, index cleanup, rel truncation? */
	bool		do_index_vacuuming;
	bool		do_index_cleanup;
	bool		do_rel_truncate;

	/* Buffer access strategy and parallel state */
	BufferAccessStrategy bstrategy;
	LVParallelState *lps;

	/* Statistics from pg_class when we start out */
	BlockNumber old_rel_pages;	/* previous value of pg_class.relpages */
	double		old_live_tuples;	/* previous value of pg_class.reltuples */
	/* rel's initial relfrozenxid and relminmxid */
	TransactionId relfrozenxid;
	MultiXactId relminmxid;

	/* VACUUM operation's cutoff for pruning */
	TransactionId OldestXmin;
	/* VACUUM operation's cutoff for freezing XIDs and MultiXactIds */
	TransactionId FreezeLimit;
	MultiXactId MultiXactCutoff;

	/* Error reporting state */
	char	   *relnamespace;
	char	   *relname;
	char	   *indname;
	BlockNumber blkno;			/* used only for heap operations */
	OffsetNumber offnum;		/* used only for heap operations */
	VacErrPhase phase;

	/*
	 * State managed by lazy_scan_heap() follows
	 */
	LVDeadTuples *dead_tuples;	/* items to vacuum from indexes */
	BlockNumber rel_pages;		/* total number of pages */
	BlockNumber scanned_pages;	/* number of pages we examined */
	BlockNumber pinskipped_pages;	/* # of pages skipped due to a pin */
	BlockNumber frozenskipped_pages;	/* # of frozen pages we skipped */
	BlockNumber tupcount_pages; /* pages whose tuples we counted */
	BlockNumber pages_removed;	/* pages removed by truncation */
	BlockNumber lpdead_item_pages;	/* # pages with LP_DEAD items */
	BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */

	/* Statistics output by us, for table */
	double		new_rel_tuples; /* new estimated total # of tuples */
	double		new_live_tuples;	/* new estimated total # of live tuples */
	/* Statistics output by index AMs */
	IndexBulkDeleteResult **indstats;

	/* Instrumentation counters */
	int			num_index_scans;
	int64		tuples_deleted; /* # deleted from table */
	int64		lpdead_items;	/* # deleted from indexes */
	int64		new_dead_tuples;	/* new estimated total # of dead items in
									 * table */
	int64		num_tuples;		/* total number of nonremovable tuples */
	int64		live_tuples;	/* live tuples (reltuples estimate) */
} LVRelState;

/*
 * State returned by lazy_scan_prune()
 */
typedef struct LVPagePruneState
{
	bool		hastup;			/* Page prevents rel truncation? */
	bool		has_lpdead_items;	/* includes existing LP_DEAD items */

	/*
	 * State describes the proper VM bit states to set for the page following
	 * pruning and freezing.  all_visible implies !has_lpdead_items, but don't
	 * trust all_frozen result unless all_visible is also set to true.
	 */
	bool		all_visible;	/* Every item visible to all? */
	bool		all_frozen;		/* provided all_visible is also true */
	TransactionId visibility_cutoff_xid;	/* For recovery conflicts */
} LVPagePruneState;

/* Struct for saving and restoring vacuum error information. */
typedef struct LVSavedErrInfo
{
	BlockNumber blkno;
	OffsetNumber offnum;
	VacErrPhase phase;
} LVSavedErrInfo;

/* elevel controls whole VACUUM's verbosity */
static int	elevel = -1;


/* non-export function prototypes */
static void lazy_scan_heap(LVRelState *vacrel, VacuumParams *params,
						   bool aggressive);
static void lazy_scan_prune(LVRelState *vacrel, Buffer buf,
							BlockNumber blkno, Page page,
							GlobalVisState *vistest,
							LVPagePruneState *prunestate);
static void lazy_vacuum(LVRelState *vacrel);
static bool lazy_vacuum_all_indexes(LVRelState *vacrel);
static void lazy_vacuum_heap_rel(LVRelState *vacrel);
static int	lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno,
								  Buffer buffer, int tupindex, Buffer *vmbuffer);
static bool lazy_check_needs_freeze(Buffer buf, bool *hastup,
									LVRelState *vacrel);
static bool lazy_check_wraparound_failsafe(LVRelState *vacrel);
static void do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel);
static void do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel);
static void do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers);
static void do_parallel_processing(LVRelState *vacrel,
								   LVShared *lvshared);
static void do_serial_processing_for_unsafe_indexes(LVRelState *vacrel,
													LVShared *lvshared);
static IndexBulkDeleteResult *parallel_process_one_index(Relation indrel,
														 IndexBulkDeleteResult *istat,
														 LVShared *lvshared,
														 LVSharedIndStats *shared_indstats,
														 LVRelState *vacrel);
static void lazy_cleanup_all_indexes(LVRelState *vacrel);
static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel,
													IndexBulkDeleteResult *istat,
													double reltuples,
													LVRelState *vacrel);
static IndexBulkDeleteResult *lazy_cleanup_one_index(Relation indrel,
													 IndexBulkDeleteResult *istat,
													 double reltuples,
													 bool estimated_count,
													 LVRelState *vacrel);
static bool should_attempt_truncation(LVRelState *vacrel);
static void lazy_truncate_heap(LVRelState *vacrel);
static BlockNumber count_nondeletable_pages(LVRelState *vacrel,
											bool *lock_waiter_detected);
static long compute_max_dead_tuples(BlockNumber relblocks, bool hasindex);
static void lazy_space_alloc(LVRelState *vacrel, int nworkers,
							 BlockNumber relblocks);
static void lazy_space_free(LVRelState *vacrel);
static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
static int	vac_cmp_itemptr(const void *left, const void *right);
static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
									 TransactionId *visibility_cutoff_xid, bool *all_frozen);
static int	compute_parallel_vacuum_workers(LVRelState *vacrel,
											int nrequested,
											bool *will_parallel_vacuum);
static void update_index_statistics(LVRelState *vacrel);
static LVParallelState *begin_parallel_vacuum(LVRelState *vacrel,
											  BlockNumber nblocks,
											  int nrequested);
static void end_parallel_vacuum(LVRelState *vacrel);
static LVSharedIndStats *parallel_stats_for_idx(LVShared *lvshared, int getidx);
static bool parallel_processing_is_safe(Relation indrel, LVShared *lvshared);
static void vacuum_error_callback(void *arg);
static void update_vacuum_error_info(LVRelState *vacrel,
									 LVSavedErrInfo *saved_vacrel,
									 int phase, BlockNumber blkno,
									 OffsetNumber offnum);
static void restore_vacuum_error_info(LVRelState *vacrel,
									  const LVSavedErrInfo *saved_vacrel);


/*
 *	heap_vacuum_rel() -- perform VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its relpages and reltuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
void
heap_vacuum_rel(Relation rel, VacuumParams *params,
				BufferAccessStrategy bstrategy)
{
	LVRelState *vacrel;
	PGRUsage	ru0;
	TimestampTz starttime = 0;
	WalUsage	walusage_start = pgWalUsage;
	WalUsage	walusage = {0, 0, 0};
	long		secs;
	int			usecs;
	double		read_rate,
				write_rate;
	bool		aggressive;		/* should we scan all unfrozen pages? */
	bool		scanned_all_unfrozen;	/* actually scanned all such pages? */
	char	  **indnames = NULL;
	TransactionId xidFullScanLimit;
	MultiXactId mxactFullScanLimit;
	BlockNumber new_rel_pages;
	BlockNumber new_rel_allvisible;
	double		new_live_tuples;
	TransactionId new_frozen_xid;
	MultiXactId new_min_multi;
	ErrorContextCallback errcallback;
	PgStat_Counter startreadtime = 0;
	PgStat_Counter startwritetime = 0;
	TransactionId OldestXmin;
	TransactionId FreezeLimit;
	MultiXactId MultiXactCutoff;

	/* measure elapsed time iff autovacuum logging requires it */
	if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
	{
		pg_rusage_init(&ru0);
		starttime = GetCurrentTimestamp();
		if (track_io_timing)
		{
			startreadtime = pgStatBlockReadTime;
			startwritetime = pgStatBlockWriteTime;
		}
	}

	if (params->options & VACOPT_VERBOSE)
		elevel = INFO;
	else
		elevel = DEBUG2;

	pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM,
								  RelationGetRelid(rel));

	vacuum_set_xid_limits(rel,
						  params->freeze_min_age,
						  params->freeze_table_age,
						  params->multixact_freeze_min_age,
						  params->multixact_freeze_table_age,
						  &OldestXmin, &FreezeLimit, &xidFullScanLimit,
						  &MultiXactCutoff, &mxactFullScanLimit);

	/*
	 * We request an aggressive scan if the table's frozen Xid is now older
	 * than or equal to the requested Xid full-table scan limit; or if the
	 * table's minimum MultiXactId is older than or equal to the requested
	 * mxid full-table scan limit; or if DISABLE_PAGE_SKIPPING was specified.
	 */
	aggressive = TransactionIdPrecedesOrEquals(rel->rd_rel->relfrozenxid,
											   xidFullScanLimit);
	aggressive |= MultiXactIdPrecedesOrEquals(rel->rd_rel->relminmxid,
											  mxactFullScanLimit);
	if (params->options & VACOPT_DISABLE_PAGE_SKIPPING)
		aggressive = true;

	vacrel = (LVRelState *) palloc0(sizeof(LVRelState));

	/* Set up high level stuff about rel */
	vacrel->rel = rel;
	vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes,
					 &vacrel->indrels);
	vacrel->failsafe_active = false;
	vacrel->consider_bypass_optimization = true;

	/*
	 * The index_cleanup param either disables index vacuuming and cleanup or
	 * forces it to go ahead when we would otherwise apply the index bypass
	 * optimization.  The default is 'auto', which leaves the final decision
	 * up to lazy_vacuum().
	 *
	 * The truncate param allows the user to avoid attempting relation
	 * truncation, though it can't force truncation to happen.
	 */
	Assert(params->index_cleanup != VACOPTVALUE_UNSPECIFIED);
	Assert(params->truncate != VACOPTVALUE_UNSPECIFIED &&
		   params->truncate != VACOPTVALUE_AUTO);
	vacrel->do_index_vacuuming = true;
	vacrel->do_index_cleanup = true;
	vacrel->do_rel_truncate = (params->truncate != VACOPTVALUE_DISABLED);
	if (params->index_cleanup == VACOPTVALUE_DISABLED)
	{
		/* Force disable index vacuuming up-front */
		vacrel->do_index_vacuuming = false;
		vacrel->do_index_cleanup = false;
	}
	else if (params->index_cleanup == VACOPTVALUE_ENABLED)
	{
		/* Force index vacuuming.  Note that failsafe can still bypass. */
		vacrel->consider_bypass_optimization = false;
	}
	else
	{
		/* Default/auto, make all decisions dynamically */
		Assert(params->index_cleanup == VACOPTVALUE_AUTO);
	}

	vacrel->bstrategy = bstrategy;
	vacrel->old_rel_pages = rel->rd_rel->relpages;
	vacrel->old_live_tuples = rel->rd_rel->reltuples;
	vacrel->relfrozenxid = rel->rd_rel->relfrozenxid;
	vacrel->relminmxid = rel->rd_rel->relminmxid;

	/* Set cutoffs for entire VACUUM */
	vacrel->OldestXmin = OldestXmin;
	vacrel->FreezeLimit = FreezeLimit;
	vacrel->MultiXactCutoff = MultiXactCutoff;

	vacrel->relnamespace = get_namespace_name(RelationGetNamespace(rel));
	vacrel->relname = pstrdup(RelationGetRelationName(rel));
	vacrel->indname = NULL;
	vacrel->phase = VACUUM_ERRCB_PHASE_UNKNOWN;

	/* Save index names iff autovacuum logging requires it */
	if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0 &&
		vacrel->nindexes > 0)
	{
		indnames = palloc(sizeof(char *) * vacrel->nindexes);
		for (int i = 0; i < vacrel->nindexes; i++)
			indnames[i] =
				pstrdup(RelationGetRelationName(vacrel->indrels[i]));
	}

	/*
	 * Setup error traceback support for ereport().  The idea is to set up an
	 * error context callback to display additional information on any error
	 * during a vacuum.  During different phases of vacuum (heap scan, heap
	 * vacuum, index vacuum, index clean up, heap truncate), we update the
	 * error context callback to display appropriate information.
	 *
	 * Note that the index vacuum and heap vacuum phases may be called
	 * multiple times in the middle of the heap scan phase.  So the old phase
	 * information is restored at the end of those phases.
	 */
	errcallback.callback = vacuum_error_callback;
	errcallback.arg = vacrel;
	errcallback.previous = error_context_stack;
	error_context_stack = &errcallback;

	/* Do the vacuuming */
	lazy_scan_heap(vacrel, params, aggressive);

	/* Done with indexes */
	vac_close_indexes(vacrel->nindexes, vacrel->indrels, NoLock);

	/*
	 * Compute whether we actually scanned all of the unfrozen pages. If we
	 * did, we can adjust relfrozenxid and relminmxid.
	 *
	 * NB: We need to check this before truncating the relation, because that
	 * will change ->rel_pages.
	 */
	if ((vacrel->scanned_pages + vacrel->frozenskipped_pages)
		< vacrel->rel_pages)
	{
		Assert(!aggressive);
		scanned_all_unfrozen = false;
	}
	else
		scanned_all_unfrozen = true;

	/*
	 * Optionally truncate the relation.
	 */
	if (should_attempt_truncation(vacrel))
	{
		/*
		 * Update error traceback information.  This is the last phase during
		 * which we add context information to errors, so we don't need to
		 * revert to the previous phase.
		 */
		update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_TRUNCATE,
								 vacrel->nonempty_pages,
								 InvalidOffsetNumber);
		lazy_truncate_heap(vacrel);
	}

	/* Pop the error context stack */
	error_context_stack = errcallback.previous;

	/* Report that we are now doing final cleanup */
	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
								 PROGRESS_VACUUM_PHASE_FINAL_CLEANUP);

	/*
	 * Update statistics in pg_class.
	 *
	 * In principle new_live_tuples could be -1 indicating that we (still)
	 * don't know the tuple count.  In practice that probably can't happen,
	 * since we'd surely have scanned some pages if the table is new and
	 * nonempty.
	 *
	 * For safety, clamp relallvisible to be not more than what we're setting
	 * relpages to.
	 *
	 * Also, don't change relfrozenxid/relminmxid if we skipped any pages,
	 * since then we don't know for certain that all tuples have a newer xmin.
	 */
	new_rel_pages = vacrel->rel_pages;
	new_live_tuples = vacrel->new_live_tuples;

	visibilitymap_count(rel, &new_rel_allvisible, NULL);
	if (new_rel_allvisible > new_rel_pages)
		new_rel_allvisible = new_rel_pages;

	new_frozen_xid = scanned_all_unfrozen ? FreezeLimit : InvalidTransactionId;
	new_min_multi = scanned_all_unfrozen ? MultiXactCutoff : InvalidMultiXactId;

	vac_update_relstats(rel,
						new_rel_pages,
						new_live_tuples,
						new_rel_allvisible,
						vacrel->nindexes > 0,
						new_frozen_xid,
						new_min_multi,
						false);

	/*
	 * Report results to the stats collector, too.
	 *
	 * Deliberately avoid telling the stats collector about LP_DEAD items that
	 * remain in the table due to VACUUM bypassing index and heap vacuuming.
	 * ANALYZE will consider the remaining LP_DEAD items to be dead tuples. It
	 * seems like a good idea to err on the side of not vacuuming again too
	 * soon in cases where the failsafe prevented significant amounts of heap
	 * vacuuming.
	 */
	pgstat_report_vacuum(RelationGetRelid(rel),
						 rel->rd_rel->relisshared,
						 Max(new_live_tuples, 0),
						 vacrel->new_dead_tuples);
	pgstat_progress_end_command();

	/* and log the action if appropriate */
	if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
	{
		TimestampTz endtime = GetCurrentTimestamp();

		if (params->log_min_duration == 0 ||
			TimestampDifferenceExceeds(starttime, endtime,
									   params->log_min_duration))
		{
			StringInfoData buf;
			char	   *msgfmt;
			BlockNumber orig_rel_pages;

			TimestampDifference(starttime, endtime, &secs, &usecs);

			memset(&walusage, 0, sizeof(WalUsage));
			WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);

			read_rate = 0;
			write_rate = 0;
			if ((secs > 0) || (usecs > 0))
			{
				read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) /
					(secs + usecs / 1000000.0);
				write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) /
					(secs + usecs / 1000000.0);
			}

			/*
			 * This is pretty messy, but we split it up so that we can skip
			 * emitting individual parts of the message when not applicable.
			 */
			initStringInfo(&buf);
			if (params->is_wraparound)
			{
				/*
				 * While it's possible for a VACUUM to be both is_wraparound
				 * and !aggressive, that's just a corner-case -- is_wraparound
				 * implies aggressive.  Produce distinct output for the corner
				 * case all the same, just in case.
				 */
				if (aggressive)
					msgfmt = _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
				else
					msgfmt = _("automatic vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
			}
			else
			{
				if (aggressive)
					msgfmt = _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n");
				else
					msgfmt = _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n");
			}
			appendStringInfo(&buf, msgfmt,
							 get_database_name(MyDatabaseId),
							 vacrel->relnamespace,
							 vacrel->relname,
							 vacrel->num_index_scans);
			appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n"),
							 vacrel->pages_removed,
							 vacrel->rel_pages,
							 vacrel->pinskipped_pages,
							 vacrel->frozenskipped_pages);
			appendStringInfo(&buf,
							 _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable, oldest xmin: %u\n"),
							 (long long) vacrel->tuples_deleted,
							 (long long) vacrel->new_rel_tuples,
							 (long long) vacrel->new_dead_tuples,
							 OldestXmin);
			orig_rel_pages = vacrel->rel_pages + vacrel->pages_removed;
			if (orig_rel_pages > 0)
			{
				if (vacrel->do_index_vacuuming)
				{
					if (vacrel->nindexes == 0 || vacrel->num_index_scans == 0)
						appendStringInfoString(&buf, _("index scan not needed: "));
					else
						appendStringInfoString(&buf, _("index scan needed: "));

					msgfmt = _("%u pages from table (%.2f%% of total) had %lld dead item identifiers removed\n");
				}
				else
				{
					if (!vacrel->failsafe_active)
						appendStringInfoString(&buf, _("index scan bypassed: "));
					else
						appendStringInfoString(&buf, _("index scan bypassed by failsafe: "));

					msgfmt = _("%u pages from table (%.2f%% of total) have %lld dead item identifiers\n");
				}
				appendStringInfo(&buf, msgfmt,
								 vacrel->lpdead_item_pages,
								 100.0 * vacrel->lpdead_item_pages / orig_rel_pages,
								 (long long) vacrel->lpdead_items);
			}
			for (int i = 0; i < vacrel->nindexes; i++)
			{
				IndexBulkDeleteResult *istat = vacrel->indstats[i];

				if (!istat)
					continue;

				appendStringInfo(&buf,
								 _("index \"%s\": pages: %u in total, %u newly deleted, %u currently deleted, %u reusable\n"),
								 indnames[i],
								 istat->num_pages,
								 istat->pages_newly_deleted,
								 istat->pages_deleted,
								 istat->pages_free);
			}
			if (track_io_timing)
			{
				double		read_ms = (double) (pgStatBlockReadTime - startreadtime) / 1000;
				double		write_ms = (double) (pgStatBlockWriteTime - startwritetime) / 1000;

				appendStringInfo(&buf, _("I/O timings: read: %.3f ms, write: %.3f ms\n"),
								 read_ms, write_ms);
			}
			appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
							 read_rate, write_rate);
			appendStringInfo(&buf,
							 _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"),
							 (long long) VacuumPageHit,
							 (long long) VacuumPageMiss,
							 (long long) VacuumPageDirty);
			appendStringInfo(&buf,
							 _("WAL usage: %lld records, %lld full page images, %llu bytes\n"),
							 (long long) walusage.wal_records,
							 (long long) walusage.wal_fpi,
							 (unsigned long long) walusage.wal_bytes);
			appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0));

			ereport(LOG,
					(errmsg_internal("%s", buf.data)));
			pfree(buf.data);
		}
	}

	/* Cleanup index statistics and index names */
	for (int i = 0; i < vacrel->nindexes; i++)
	{
		if (vacrel->indstats[i])
			pfree(vacrel->indstats[i]);

		if (indnames && indnames[i])
			pfree(indnames[i]);
	}
}

/*
 *	lazy_scan_heap() -- scan an open heap relation
 *
 *		This routine prunes each page in the heap, which will among other
 *		things truncate dead tuples to dead line pointers, defragment the
 *		page, and set commit status bits (see heap_page_prune).  It also builds
 *		lists of dead tuples and pages with free space, calculates statistics
 *		on the number of live tuples in the heap, and marks pages as
 *		all-visible if appropriate.  When done, or when we run low on space
 *		for dead-tuple TIDs, invoke lazy_vacuum to vacuum indexes and vacuum
 *		heap relation during its own second pass over the heap.
 *
 *		If the table has at least two indexes, we execute both index vacuum
 *		and index cleanup with parallel workers unless parallel vacuum is
 *		disabled.  In a parallel vacuum, we enter parallel mode and then
 *		create both the parallel context and the DSM segment before starting
 *		heap scan so that we can record dead tuples to the DSM segment.  All
 *		parallel workers are launched at beginning of index vacuuming and
 *		index cleanup and they exit once done with all indexes.  At the end of
 *		this function we exit from parallel mode.  Index bulk-deletion results
 *		are stored in the DSM segment and we update index statistics for all
 *		the indexes after exiting from parallel mode since writes are not
 *		allowed during parallel mode.
 *
 *		If there are no indexes then we can reclaim line pointers on the fly;
 *		dead line pointers need only be retained until all index pointers that
 *		reference them have been killed.
 */
static void
lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive)
{
	LVDeadTuples *dead_tuples;
	BlockNumber nblocks,
				blkno,
				next_unskippable_block,
				next_failsafe_block,
				next_fsm_block_to_vacuum;
	PGRUsage	ru0;
	Buffer		vmbuffer = InvalidBuffer;
	bool		skipping_blocks;
	StringInfoData buf;
	const int	initprog_index[] = {
		PROGRESS_VACUUM_PHASE,
		PROGRESS_VACUUM_TOTAL_HEAP_BLKS,
		PROGRESS_VACUUM_MAX_DEAD_TUPLES
	};
	int64		initprog_val[3];
	GlobalVisState *vistest;

	pg_rusage_init(&ru0);

	if (aggressive)
		ereport(elevel,
				(errmsg("aggressively vacuuming \"%s.%s\"",
						vacrel->relnamespace,
						vacrel->relname)));
	else
		ereport(elevel,
				(errmsg("vacuuming \"%s.%s\"",
						vacrel->relnamespace,
						vacrel->relname)));

	nblocks = RelationGetNumberOfBlocks(vacrel->rel);
	next_unskippable_block = 0;
	next_failsafe_block = 0;
	next_fsm_block_to_vacuum = 0;
	vacrel->rel_pages = nblocks;
	vacrel->scanned_pages = 0;
	vacrel->pinskipped_pages = 0;
	vacrel->frozenskipped_pages = 0;
	vacrel->tupcount_pages = 0;
	vacrel->pages_removed = 0;
	vacrel->lpdead_item_pages = 0;
	vacrel->nonempty_pages = 0;

	/* Initialize instrumentation counters */
	vacrel->num_index_scans = 0;
	vacrel->tuples_deleted = 0;
	vacrel->lpdead_items = 0;
	vacrel->new_dead_tuples = 0;
	vacrel->num_tuples = 0;
	vacrel->live_tuples = 0;

	vistest = GlobalVisTestFor(vacrel->rel);

	vacrel->indstats = (IndexBulkDeleteResult **)
		palloc0(vacrel->nindexes * sizeof(IndexBulkDeleteResult *));

	/*
	 * Before beginning scan, check if it's already necessary to apply
	 * failsafe
	 */
	lazy_check_wraparound_failsafe(vacrel);

	/*
	 * Allocate the space for dead tuples.  Note that this handles parallel
	 * VACUUM initialization as part of allocating shared memory space used
	 * for dead_tuples.
	 */
	lazy_space_alloc(vacrel, params->nworkers, nblocks);
	dead_tuples = vacrel->dead_tuples;

	/* Report that we're scanning the heap, advertising total # of blocks */
	initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP;
	initprog_val[1] = nblocks;
	initprog_val[2] = dead_tuples->max_tuples;
	pgstat_progress_update_multi_param(3, initprog_index, initprog_val);

	/*
	 * Except when aggressive is set, we want to skip pages that are
	 * all-visible according to the visibility map, but only when we can skip
	 * at least SKIP_PAGES_THRESHOLD consecutive pages.  Since we're reading
	 * sequentially, the OS should be doing readahead for us, so there's no
	 * gain in skipping a page now and then; that's likely to disable
	 * readahead and so be counterproductive.  Also, skipping even a single
	 * page means that we can't update relfrozenxid, so we only want to do it
	 * if we can skip a goodly number of pages.
	 *
	 * When aggressive is set, we can't skip pages just because they are
	 * all-visible, but we can still skip pages that are all-frozen, since
	 * such pages do not need freezing and do not affect the value that we can
	 * safely set for relfrozenxid or relminmxid.
	 *
	 * Before entering the main loop, establish the invariant that
	 * next_unskippable_block is the next block number >= blkno that we can't
	 * skip based on the visibility map, either all-visible for a regular scan
	 * or all-frozen for an aggressive scan.  We set it to nblocks if there's
	 * no such block.  We also set up the skipping_blocks flag correctly at
	 * this stage.
	 *
	 * Note: The value returned by visibilitymap_get_status could be slightly
	 * out-of-date, since we make this test before reading the corresponding
	 * heap page or locking the buffer.  This is OK.  If we mistakenly think
	 * that the page is all-visible or all-frozen when in fact the flag's just
	 * been cleared, we might fail to vacuum the page.  It's easy to see that
	 * skipping a page when aggressive is not set is not a very big deal; we
	 * might leave some dead tuples lying around, but the next vacuum will
	 * find them.  But even when aggressive *is* set, it's still OK if we miss
	 * a page whose all-frozen marking has just been cleared.  Any new XIDs
	 * just added to that page are necessarily newer than the GlobalXmin we
	 * computed, so they'll have no effect on the value to which we can safely
	 * set relfrozenxid.  A similar argument applies for MXIDs and relminmxid.
	 *
	 * We will scan the table's last page, at least to the extent of
	 * determining whether it has tuples or not, even if it should be skipped
	 * according to the above rules; except when we've already determined that
	 * it's not worth trying to truncate the table.  This avoids having
	 * lazy_truncate_heap() take access-exclusive lock on the table to attempt
	 * a truncation that just fails immediately because there are tuples in
	 * the last page.  This is worth avoiding mainly because such a lock must
	 * be replayed on any hot standby, where it can be disruptive.
	 */
	if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
	{
		while (next_unskippable_block < nblocks)
		{
			uint8		vmstatus;

			vmstatus = visibilitymap_get_status(vacrel->rel,
												next_unskippable_block,
												&vmbuffer);
			if (aggressive)
			{
				if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0)
					break;
			}
			else
			{
				if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
					break;
			}
			vacuum_delay_point();
			next_unskippable_block++;
		}
	}

	if (next_unskippable_block >= SKIP_PAGES_THRESHOLD)
		skipping_blocks = true;
	else
		skipping_blocks = false;
	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;
		Page		page;
		bool		all_visible_according_to_vm = false;
		LVPagePruneState prunestate;

		/*
		 * Consider need to skip blocks.  See note above about forcing
		 * scanning of last page.
		 */
#define FORCE_CHECK_PAGE() \
		(blkno == nblocks - 1 && should_attempt_truncation(vacrel))

		pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);

		update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP,
								 blkno, InvalidOffsetNumber);

		if (blkno == next_unskippable_block)
		{
			/* Time to advance next_unskippable_block */
			next_unskippable_block++;
			if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
			{
				while (next_unskippable_block < nblocks)
				{
					uint8		vmskipflags;

					vmskipflags = visibilitymap_get_status(vacrel->rel,
														   next_unskippable_block,
														   &vmbuffer);
					if (aggressive)
					{
						if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0)
							break;
					}
					else
					{
						if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0)
							break;
					}
					vacuum_delay_point();
					next_unskippable_block++;
				}
			}

			/*
			 * We know we can't skip the current block.  But set up
			 * skipping_blocks to do the right thing at the following blocks.
			 */
			if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD)
				skipping_blocks = true;
			else
				skipping_blocks = false;

			/*
			 * Normally, the fact that we can't skip this block must mean that
			 * it's not all-visible.  But in an aggressive vacuum we know only
			 * that it's not all-frozen, so it might still be all-visible.
			 */
			if (aggressive && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
				all_visible_according_to_vm = true;
		}
		else
		{
			/*
			 * The current block is potentially skippable; if we've seen a
			 * long enough run of skippable blocks to justify skipping it, and
			 * we're not forced to check it, then go ahead and skip.
			 * Otherwise, the page must be at least all-visible if not
			 * all-frozen, so we can set all_visible_according_to_vm = true.
			 */
			if (skipping_blocks && !FORCE_CHECK_PAGE())
			{
				/*
				 * Tricky, tricky.  If this is in aggressive vacuum, the page
				 * must have been all-frozen at the time we checked whether it
				 * was skippable, but it might not be any more.  We must be
				 * careful to count it as a skipped all-frozen page in that
				 * case, or else we'll think we can't update relfrozenxid and
				 * relminmxid.  If it's not an aggressive vacuum, we don't
				 * know whether it was all-frozen, so we have to recheck; but
				 * in this case an approximate answer is OK.
				 */
				if (aggressive || VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
					vacrel->frozenskipped_pages++;
				continue;
			}
			all_visible_according_to_vm = true;
		}

		vacuum_delay_point();

		/*
		 * Regularly check if wraparound failsafe should trigger.
		 *
		 * There is a similar check inside lazy_vacuum_all_indexes(), but
		 * relfrozenxid might start to look dangerously old before we reach
		 * that point.  This check also provides failsafe coverage for the
		 * one-pass strategy, and the two-pass strategy with the index_cleanup
		 * param set to 'off'.
		 */
		if (blkno - next_failsafe_block >= FAILSAFE_EVERY_PAGES)
		{
			lazy_check_wraparound_failsafe(vacrel);
			next_failsafe_block = blkno;
		}

		/*
		 * Consider if we definitely have enough space to process TIDs on page
		 * already.  If we are close to overrunning the available space for
		 * dead-tuple TIDs, pause and do a cycle of vacuuming before we tackle
		 * this page.
		 */
		if ((dead_tuples->max_tuples - dead_tuples->num_tuples) < MaxHeapTuplesPerPage &&
			dead_tuples->num_tuples > 0)
		{
			/*
			 * Before beginning index vacuuming, we release any pin we may
			 * hold on the visibility map page.  This isn't necessary for
			 * correctness, but we do it anyway to avoid holding the pin
			 * across a lengthy, unrelated operation.
			 */
			if (BufferIsValid(vmbuffer))
			{
				ReleaseBuffer(vmbuffer);
				vmbuffer = InvalidBuffer;
			}

			/* Remove the collected garbage tuples from table and indexes */
			vacrel->consider_bypass_optimization = false;
			lazy_vacuum(vacrel);

			/*
			 * Vacuum the Free Space Map to make newly-freed space visible on
			 * upper-level FSM pages.  Note we have not yet processed blkno.
			 */
			FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
									blkno);
			next_fsm_block_to_vacuum = blkno;

			/* Report that we are once again scanning the heap */
			pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
										 PROGRESS_VACUUM_PHASE_SCAN_HEAP);
		}

		/*
		 * Set up visibility map page as needed.
		 *
		 * Pin the visibility map page in case we need to mark the page
		 * all-visible.  In most cases this will be very cheap, because we'll
		 * already have the correct page pinned anyway.  However, it's
		 * possible that (a) next_unskippable_block is covered by a different
		 * VM page than the current block or (b) we released our pin and did a
		 * cycle of index vacuuming.
		 */
		visibilitymap_pin(vacrel->rel, blkno, &vmbuffer);

		buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno,
								 RBM_NORMAL, vacrel->bstrategy);

		/*
		 * We need buffer cleanup lock so that we can prune HOT chains and
		 * defragment the page.
		 */
		if (!ConditionalLockBufferForCleanup(buf))
		{
			bool		hastup;

			/*
			 * If we're not performing an aggressive scan to guard against XID
			 * wraparound, and we don't want to forcibly check the page, then
			 * it's OK to skip vacuuming pages we get a lock conflict on. They
			 * will be dealt with in some future vacuum.
			 */
			if (!aggressive && !FORCE_CHECK_PAGE())
			{
				ReleaseBuffer(buf);
				vacrel->pinskipped_pages++;
				continue;
			}

			/*
			 * Read the page with share lock to see if any xids on it need to
			 * be frozen.  If not we just skip the page, after updating our
			 * scan statistics.  If there are some, we wait for cleanup lock.
			 *
			 * We could defer the lock request further by remembering the page
			 * and coming back to it later, or we could even register
			 * ourselves for multiple buffers and then service whichever one
			 * is received first.  For now, this seems good enough.
			 *
			 * If we get here with aggressive false, then we're just forcibly
			 * checking the page, and so we don't want to insist on getting
			 * the lock; we only need to know if the page contains tuples, so
			 * that we can update nonempty_pages correctly.  It's convenient
			 * to use lazy_check_needs_freeze() for both situations, though.
			 */
			LockBuffer(buf, BUFFER_LOCK_SHARE);
			if (!lazy_check_needs_freeze(buf, &hastup, vacrel))
			{
				UnlockReleaseBuffer(buf);
				vacrel->scanned_pages++;
				vacrel->pinskipped_pages++;
				if (hastup)
					vacrel->nonempty_pages = blkno + 1;
				continue;
			}
			if (!aggressive)
			{
				/*
				 * Here, we must not advance scanned_pages; that would amount
				 * to claiming that the page contains no freezable tuples.
				 */
				UnlockReleaseBuffer(buf);
				vacrel->pinskipped_pages++;
				if (hastup)
					vacrel->nonempty_pages = blkno + 1;
				continue;
			}
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			LockBufferForCleanup(buf);
			/* drop through to normal processing */
		}

		/*
		 * By here we definitely have enough dead_tuples space for whatever
		 * LP_DEAD tids are on this page, we have the visibility map page set
		 * up in case we need to set this page's all_visible/all_frozen bit,
		 * and we have a super-exclusive lock.  Any tuples on this page are
		 * now sure to be "counted" by this VACUUM.
		 *
		 * One last piece of preamble needs to take place before we can prune:
		 * we need to consider new and empty pages.
		 */
		vacrel->scanned_pages++;
		vacrel->tupcount_pages++;

		page = BufferGetPage(buf);

		if (PageIsNew(page))
		{
			/*
			 * All-zeroes pages can be left over if either a backend extends
			 * the relation by a single page, but crashes before the newly
			 * initialized page has been written out, or when bulk-extending
			 * the relation (which creates a number of empty pages at the tail
			 * end of the relation, but enters them into the FSM).
			 *
			 * Note we do not enter the page into the visibilitymap.  That has
			 * the downside that we repeatedly visit this page in subsequent
			 * vacuums, but otherwise we'll never discover the space on a
			 * promoted standby.  The harm of repeated checking ought to
			 * normally not be too bad - the space usually should be used at
			 * some point, otherwise there wouldn't be any regular vacuums.
			 *
			 * Make sure these pages are in the FSM, to ensure they can be
			 * reused.  Do that by testing if there's any space recorded for
			 * the page.  If not, enter it.  We do so after releasing the lock
			 * on the heap page, the FSM is approximate, after all.
			 */
			UnlockReleaseBuffer(buf);

			if (GetRecordedFreeSpace(vacrel->rel, blkno) == 0)
			{
				Size		freespace = BLCKSZ - SizeOfPageHeaderData;

				RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
			}
			continue;
		}

		if (PageIsEmpty(page))
		{
			Size		freespace = PageGetHeapFreeSpace(page);

			/*
			 * Empty pages are always all-visible and all-frozen (note that
			 * the same is currently not true for new pages, see above).
			 */
			if (!PageIsAllVisible(page))
			{
				START_CRIT_SECTION();

				/* mark buffer dirty before writing a WAL record */
				MarkBufferDirty(buf);

				/*
				 * It's possible that another backend has extended the heap,
				 * initialized the page, and then failed to WAL-log the page
				 * due to an ERROR.  Since heap extension is not WAL-logged,
				 * recovery might try to replay our record setting the page
				 * all-visible and find that the page isn't initialized, which
				 * will cause a PANIC.  To prevent that, check whether the
				 * page has been previously WAL-logged, and if not, do that
				 * now.
				 */
				if (RelationNeedsWAL(vacrel->rel) &&
					PageGetLSN(page) == InvalidXLogRecPtr)
					log_newpage_buffer(buf, true);

				PageSetAllVisible(page);
				visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
								  vmbuffer, InvalidTransactionId,
								  VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
				END_CRIT_SECTION();
			}

			UnlockReleaseBuffer(buf);
			RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
			continue;
		}

		/*
		 * Prune and freeze tuples.
		 *
		 * Accumulates details of remaining LP_DEAD line pointers on page in
		 * dead tuple list.  This includes LP_DEAD line pointers that we
		 * pruned ourselves, as well as existing LP_DEAD line pointers that
		 * were pruned some time earlier.  Also considers freezing XIDs in the
		 * tuple headers of remaining items with storage.
		 */
		lazy_scan_prune(vacrel, buf, blkno, page, vistest, &prunestate);

		Assert(!prunestate.all_visible || !prunestate.has_lpdead_items);

		/* Remember the location of the last page with nonremovable tuples */
		if (prunestate.hastup)
			vacrel->nonempty_pages = blkno + 1;

		if (vacrel->nindexes == 0)
		{
			/*
			 * Consider the need to do page-at-a-time heap vacuuming when
			 * using the one-pass strategy now.
			 *
			 * The one-pass strategy will never call lazy_vacuum().  The steps
			 * performed here can be thought of as the one-pass equivalent of
			 * a call to lazy_vacuum().
			 */
			if (prunestate.has_lpdead_items)
			{
				Size		freespace;

				lazy_vacuum_heap_page(vacrel, blkno, buf, 0, &vmbuffer);

				/* Forget the now-vacuumed tuples */
				dead_tuples->num_tuples = 0;

				/*
				 * Periodically perform FSM vacuuming to make newly-freed
				 * space visible on upper FSM pages.  Note we have not yet
				 * performed FSM processing for blkno.
				 */
				if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
				{
					FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
											blkno);
					next_fsm_block_to_vacuum = blkno;
				}

				/*
				 * Now perform FSM processing for blkno, and move on to next
				 * page.
				 *
				 * Our call to lazy_vacuum_heap_page() will have considered if
				 * it's possible to set all_visible/all_frozen independently
				 * of lazy_scan_prune().  Note that prunestate was invalidated
				 * by lazy_vacuum_heap_page() call.
				 */
				freespace = PageGetHeapFreeSpace(page);

				UnlockReleaseBuffer(buf);
				RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
				continue;
			}

			/*
			 * There was no call to lazy_vacuum_heap_page() because pruning
			 * didn't encounter/create any LP_DEAD items that needed to be
			 * vacuumed.  Prune state has not been invalidated, so proceed
			 * with prunestate-driven visibility map and FSM steps (just like
			 * the two-pass strategy).
			 */
			Assert(dead_tuples->num_tuples == 0);
		}

		/*
		 * Handle setting visibility map bit based on what the VM said about
		 * the page before pruning started, and using prunestate
		 */
		if (!all_visible_according_to_vm && prunestate.all_visible)
		{
			uint8		flags = VISIBILITYMAP_ALL_VISIBLE;

			if (prunestate.all_frozen)
				flags |= VISIBILITYMAP_ALL_FROZEN;

			/*
			 * It should never be the case that the visibility map page is set
			 * while the page-level bit is clear, but the reverse is allowed
			 * (if checksums are not enabled).  Regardless, set both bits so
			 * that we get back in sync.
			 *
			 * NB: If the heap page is all-visible but the VM bit is not set,
			 * we don't need to dirty the heap page.  However, if checksums
			 * are enabled, we do need to make sure that the heap page is
			 * dirtied before passing it to visibilitymap_set(), because it
			 * may be logged.  Given that this situation should only happen in
			 * rare cases after a crash, it is not worth optimizing.
			 */
			PageSetAllVisible(page);
			MarkBufferDirty(buf);
			visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
							  vmbuffer, prunestate.visibility_cutoff_xid,
							  flags);
		}

		/*
		 * As of PostgreSQL 9.2, the visibility map bit should never be set if
		 * the page-level bit is clear.  However, it's possible that the bit
		 * got cleared after we checked it and before we took the buffer
		 * content lock, so we must recheck before jumping to the conclusion
		 * that something bad has happened.
		 */
		else if (all_visible_according_to_vm && !PageIsAllVisible(page)
				 && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
		{
			elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
				 vacrel->relname, blkno);
			visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
								VISIBILITYMAP_VALID_BITS);
		}

		/*
		 * It's possible for the value returned by
		 * GetOldestNonRemovableTransactionId() to move backwards, so it's not
		 * wrong for us to see tuples that appear to not be visible to
		 * everyone yet, while PD_ALL_VISIBLE is already set.  The real safe
		 * xmin value never moves backwards, but
		 * GetOldestNonRemovableTransactionId() is conservative and sometimes
		 * returns a value that's unnecessarily small, so if we see that
		 * contradiction it just means that the tuples that we think are not
		 * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag
		 * is correct.
		 *
		 * There should never be dead tuples on a page with PD_ALL_VISIBLE
		 * set, however.
		 */
		else if (prunestate.has_lpdead_items && PageIsAllVisible(page))
		{
			elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
				 vacrel->relname, blkno);
			PageClearAllVisible(page);
			MarkBufferDirty(buf);
			visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
								VISIBILITYMAP_VALID_BITS);
		}

		/*
		 * If the all-visible page is all-frozen but not marked as such yet,
		 * mark it as all-frozen.  Note that all_frozen is only valid if
		 * all_visible is true, so we must check both.
		 */
		else if (all_visible_according_to_vm && prunestate.all_visible &&
				 prunestate.all_frozen &&
				 !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
		{
			/*
			 * We can pass InvalidTransactionId as the cutoff XID here,
			 * because setting the all-frozen bit doesn't cause recovery
			 * conflicts.
			 */
			visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
							  vmbuffer, InvalidTransactionId,
							  VISIBILITYMAP_ALL_FROZEN);
		}

		/*
		 * Final steps for block: drop super-exclusive lock, record free space
		 * in the FSM
		 */
		if (prunestate.has_lpdead_items && vacrel->do_index_vacuuming)
		{
			/*
			 * Wait until lazy_vacuum_heap_rel() to save free space.  This
			 * doesn't just save us some cycles; it also allows us to record
			 * any additional free space that lazy_vacuum_heap_page() will
			 * make available in cases where it's possible to truncate the
			 * page's line pointer array.
			 *
			 * Note: It's not in fact 100% certain that we really will call
			 * lazy_vacuum_heap_rel() -- lazy_vacuum() might yet opt to skip
			 * index vacuuming (and so must skip heap vacuuming).  This is
			 * deemed okay because it only happens in emergencies, or when
			 * there is very little free space anyway.  (Besides, we start
			 * recording free space in the FSM once index vacuuming has been
			 * abandoned.)
			 *
			 * Note: The one-pass (no indexes) case is only supposed to make
			 * it this far when there were no LP_DEAD items during pruning.
			 */
			Assert(vacrel->nindexes > 0);
			UnlockReleaseBuffer(buf);
		}
		else
		{
			Size		freespace = PageGetHeapFreeSpace(page);

			UnlockReleaseBuffer(buf);
			RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
		}
	}
1575
1576 /* report that everything is now scanned */
1577 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
1578
1579 /* Clear the block number information */
1580 vacrel->blkno = InvalidBlockNumber;
1581
1582 /* now we can compute the new value for pg_class.reltuples */
1583 vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, nblocks,
1584 vacrel->tupcount_pages,
1585 vacrel->live_tuples);
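
/*
* A rough sketch of the estimate (see vac_estimate_reltuples() for the
* authoritative logic): the live_tuples counted over tupcount_pages is
* scaled up to the full nblocks, then blended with the previous
* pg_class.reltuples so a partial scan can't swing the estimate wildly.
*/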
1586
1587 /*
1588 * Also compute the total number of surviving heap entries. In the
1589 * (unlikely) scenario that new_live_tuples is -1, take it as zero.
1590 */
1591 vacrel->new_rel_tuples =
1592 Max(vacrel->new_live_tuples, 0) + vacrel->new_dead_tuples;
1593
1594 /*
1595 * Release any remaining pin on visibility map page.
1596 */
1597 if (BufferIsValid(vmbuffer))
1598 {
1599 ReleaseBuffer(vmbuffer);
1600 vmbuffer = InvalidBuffer;
1601 }
1602
1603 /* If any tuples need to be deleted, perform final vacuum cycle */
1604 if (dead_tuples->num_tuples > 0)
1605 lazy_vacuum(vacrel);
1606
1607 /*
1608 * Vacuum the remainder of the Free Space Map. We must do this whether or
1609 * not there were indexes, and whether or not we bypassed index vacuuming.
1610 */
1611 if (blkno > next_fsm_block_to_vacuum)
1612 FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, blkno);
1613
1614 /* report all blocks vacuumed */
1615 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
1616
1617 /* Do post-vacuum cleanup */
1618 if (vacrel->nindexes > 0 && vacrel->do_index_cleanup)
1619 lazy_cleanup_all_indexes(vacrel);
1620
1621 /*
1622 * Free resources managed by lazy_space_alloc(). (We must end parallel
1623 * mode/free shared memory before updating index statistics. We cannot
1624 * write while in parallel mode.)
1625 */
1626 lazy_space_free(vacrel);
1627
1628 /* Update index statistics */
1629 if (vacrel->nindexes > 0 && vacrel->do_index_cleanup)
1630 update_index_statistics(vacrel);
1631
1632 /*
1633 * When the table has no indexes (i.e. in the one-pass strategy case),
1634 * make the log report that lazy_vacuum_heap_rel would've made had there been
1635 * indexes. (As in the two-pass strategy case, only make this report when
1636 * there were LP_DEAD line pointers vacuumed in lazy_vacuum_heap_page.)
1637 */
1638 if (vacrel->nindexes == 0 && vacrel->lpdead_item_pages > 0)
1639 ereport(elevel,
1640 (errmsg("table \"%s\": removed %lld dead item identifiers in %u pages",
1641 vacrel->relname, (long long) vacrel->lpdead_items,
1642 vacrel->lpdead_item_pages)));
1643
1644 /*
1645 * Make a log report summarizing pruning and freezing.
1646 *
1647 * The autovacuum specific logging in heap_vacuum_rel summarizes an entire
1648 * VACUUM operation, whereas each VACUUM VERBOSE log report generally
1649 * summarizes a single round of index/heap vacuuming (or rel truncation).
1650 * It wouldn't make sense to report on pruning or freezing while following
1651 * that convention, though. You can think of this log report as a summary
1652 * of our first pass over the heap.
1653 */
1654 initStringInfo(&buf);
1655 appendStringInfo(&buf,
1656 _("%lld dead row versions cannot be removed yet, oldest xmin: %u\n"),
1657 (long long) vacrel->new_dead_tuples, vacrel->OldestXmin);
1658 appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins, ",
1659 "Skipped %u pages due to buffer pins, ",
1660 vacrel->pinskipped_pages),
1661 vacrel->pinskipped_pages);
1662 appendStringInfo(&buf, ngettext("%u frozen page.\n",
1663 "%u frozen pages.\n",
1664 vacrel->frozenskipped_pages),
1665 vacrel->frozenskipped_pages);
1666 appendStringInfo(&buf, _("%s."), pg_rusage_show(&ru0));
1667
1668 ereport(elevel,
1669 (errmsg("table \"%s\": found %lld removable, %lld nonremovable row versions in %u out of %u pages",
1670 vacrel->relname,
1671 (long long) vacrel->tuples_deleted,
1672 (long long) vacrel->num_tuples, vacrel->scanned_pages,
1673 nblocks),
1674 errdetail_internal("%s", buf.data)));
1675 pfree(buf.data);
1676 }
1677
1678 /*
1679 * lazy_scan_prune() -- lazy_scan_heap() pruning and freezing.
1680 *
1681 * Caller must hold pin and buffer cleanup lock on the buffer.
1682 *
1683 * Prior to PostgreSQL 14 there were very rare cases where heap_page_prune()
1684 * was allowed to disagree with our HeapTupleSatisfiesVacuum() call about
1685 * whether or not a tuple should be considered DEAD. This happened when an
1686 * inserting transaction concurrently aborted (after our heap_page_prune()
1687 * call, before our HeapTupleSatisfiesVacuum() call). There was rather a lot
1688 * of complexity just so we could deal with tuples that were DEAD to VACUUM,
1689 * but nevertheless were left with storage after pruning.
1690 *
1691 * The approach we take now is to restart pruning when the race condition is
1692 * detected. This allows heap_page_prune() to prune the tuples inserted by
1693 * the now-aborted transaction. This is a little crude, but it guarantees
1694 * that any items that make it into the dead_tuples array are simple LP_DEAD
1695 * line pointers, and that every remaining item with tuple storage is
1696 * considered as a candidate for freezing.
1697 */
1698 static void
1699 lazy_scan_prune(LVRelState *vacrel,
1700 Buffer buf,
1701 BlockNumber blkno,
1702 Page page,
1703 GlobalVisState *vistest,
1704 LVPagePruneState *prunestate)
1705 {
1706 Relation rel = vacrel->rel;
1707 OffsetNumber offnum,
1708 maxoff;
1709 ItemId itemid;
1710 HeapTupleData tuple;
1711 HTSV_Result res;
1712 int tuples_deleted,
1713 lpdead_items,
1714 new_dead_tuples,
1715 num_tuples,
1716 live_tuples;
1717 int nfrozen;
1718 OffsetNumber deadoffsets[MaxHeapTuplesPerPage];
1719 xl_heap_freeze_tuple frozen[MaxHeapTuplesPerPage];
1720
1721 maxoff = PageGetMaxOffsetNumber(page);
1722
1723 retry:
1724
1725 /* Initialize (or reset) page-level counters */
1726 tuples_deleted = 0;
1727 lpdead_items = 0;
1728 new_dead_tuples = 0;
1729 num_tuples = 0;
1730 live_tuples = 0;
1731
1732 /*
1733 * Prune all HOT-update chains in this page.
1734 *
1735 * We count tuples removed by the pruning step as tuples_deleted. Its
1736 * final value can be thought of as the number of tuples that have been
1737 * deleted from the table. It should not be confused with lpdead_items;
1738 * lpdead_items's final value can be thought of as the number of tuples
1739 * that were deleted from indexes.
1740 */
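/*
* Going by heap_page_prune()'s signature (old_snap_xmin, old_snap_ts,
* report_stats), the InvalidTransactionId/0 pair leaves "snapshot too
* old" limited pruning disabled here, and false suppresses
* heap_page_prune()'s own stats report; the counts are instead rolled
* into whole-VACUUM totals at the end of this function.
*/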
1741 tuples_deleted = heap_page_prune(rel, buf, vistest,
1742 InvalidTransactionId, 0, false,
1743 &vacrel->offnum);
1744
1745 /*
1746 * Now scan the page to collect LP_DEAD items and check for tuples
1747 * requiring freezing among remaining tuples with storage
1748 */
1749 prunestate->hastup = false;
1750 prunestate->has_lpdead_items = false;
1751 prunestate->all_visible = true;
1752 prunestate->all_frozen = true;
1753 prunestate->visibility_cutoff_xid = InvalidTransactionId;
1754 nfrozen = 0;
1755
1756 for (offnum = FirstOffsetNumber;
1757 offnum <= maxoff;
1758 offnum = OffsetNumberNext(offnum))
1759 {
1760 bool tuple_totally_frozen;
1761
1762 /*
1763 * Set the offset number so that we can display it along with any
1764 * error that occurred while processing this tuple.
1765 */
1766 vacrel->offnum = offnum;
1767 itemid = PageGetItemId(page, offnum);
1768
1769 if (!ItemIdIsUsed(itemid))
1770 continue;
1771
1772 /* Redirect items mustn't be touched */
1773 if (ItemIdIsRedirected(itemid))
1774 {
1775 prunestate->hastup = true; /* page won't be truncatable */
1776 continue;
1777 }
1778
1779 /*
1780 * LP_DEAD items are processed outside of the loop.
1781 *
1782 * Note that we deliberately don't set hastup=true in the case of an
1783 * LP_DEAD item here, which is not how lazy_check_needs_freeze() or
1784 * count_nondeletable_pages() do it -- they only consider pages empty
1785 * when they only have LP_UNUSED items, which is important for
1786 * correctness.
1787 *
1788 * Our assumption is that any LP_DEAD items we encounter here will
1789 * become LP_UNUSED inside lazy_vacuum_heap_page() before we actually
1790 * call count_nondeletable_pages(). In any case our opinion of
1791 * whether or not a page 'hastup' (which is how our caller sets its
1792 * vacrel->nonempty_pages value) is inherently race-prone. It must be
1793 * treated as advisory/unreliable, so we might as well be slightly
1794 * optimistic.
1795 */
1796 if (ItemIdIsDead(itemid))
1797 {
1798 deadoffsets[lpdead_items++] = offnum;
1799 prunestate->all_visible = false;
1800 prunestate->has_lpdead_items = true;
1801 continue;
1802 }
1803
1804 Assert(ItemIdIsNormal(itemid));
1805
1806 ItemPointerSet(&(tuple.t_self), blkno, offnum);
1807 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1808 tuple.t_len = ItemIdGetLength(itemid);
1809 tuple.t_tableOid = RelationGetRelid(rel);
1810
1811 /*
1812 * DEAD tuples are almost always pruned into LP_DEAD line pointers by
1813 * heap_page_prune(), but it's possible that the tuple state changed
1814 * since heap_page_prune() looked. Handle that here by restarting.
1815 * (See comments at the top of function for a full explanation.)
1816 */
1817 res = HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf);
1818
1819 if (unlikely(res == HEAPTUPLE_DEAD))
1820 goto retry;
1821
1822 /*
1823 * The criteria for counting a tuple as live in this block need to
1824 * match what analyze.c's acquire_sample_rows() does, otherwise VACUUM
1825 * and ANALYZE may produce wildly different reltuples values, e.g.
1826 * when there are many recently-dead tuples.
1827 *
1828 * The logic here is a bit simpler than acquire_sample_rows(), as
1829 * VACUUM can't run inside a transaction block, which makes some cases
1830 * impossible (e.g. in-progress insert from the same transaction).
1831 *
1832 * We treat LP_DEAD items a little differently, too -- we don't count
1833 * them as dead_tuples at all (we only consider new_dead_tuples). The
1834 * outcome is no different because we assume that any LP_DEAD items we
1835 * encounter here will become LP_UNUSED inside lazy_vacuum_heap_page()
1836 * before we report anything to the stats collector. (Cases where we
1837 * bypass index vacuuming will violate our assumption, but the overall
1838 * impact of that should be negligible.)
1839 */
1840 switch (res)
1841 {
1842 case HEAPTUPLE_LIVE:
1843
1844 /*
1845 * Count it as live. Not only is this natural, but it's also
1846 * what acquire_sample_rows() does.
1847 */
1848 live_tuples++;
1849
1850 /*
1851 * Is the tuple definitely visible to all transactions?
1852 *
1853 * NB: Like with per-tuple hint bits, we can't set the
1854 * PD_ALL_VISIBLE flag if the inserter committed
1855 * asynchronously. See SetHintBits for more info. Check that
1856 * the tuple is hinted xmin-committed because of that.
1857 */
1858 if (prunestate->all_visible)
1859 {
1860 TransactionId xmin;
1861
1862 if (!HeapTupleHeaderXminCommitted(tuple.t_data))
1863 {
1864 prunestate->all_visible = false;
1865 break;
1866 }
1867
1868 /*
1869 * The inserter definitely committed. But is it old enough
1870 * that everyone sees it as committed?
1871 */
1872 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
1873 if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
1874 {
1875 prunestate->all_visible = false;
1876 break;
1877 }
1878
1879 /* Track newest xmin on page. */
1880 if (TransactionIdFollows(xmin, prunestate->visibility_cutoff_xid))
1881 prunestate->visibility_cutoff_xid = xmin;
1882 }
1883 break;
1884 case HEAPTUPLE_RECENTLY_DEAD:
1885
1886 /*
1887 * If tuple is recently deleted then we must not remove it
1888 * from relation. (We only remove items that are LP_DEAD from
1889 * pruning.)
1890 */
1891 new_dead_tuples++;
1892 prunestate->all_visible = false;
1893 break;
1894 case HEAPTUPLE_INSERT_IN_PROGRESS:
1895
1896 /*
1897 * We do not count these rows as live, because we expect the
1898 * inserting transaction to update the counters at commit, and
1899 * we assume that will happen only after we report our
1900 * results. This assumption is a bit shaky, but it is what
1901 * acquire_sample_rows() does, so be consistent.
1902 */
1903 prunestate->all_visible = false;
1904 break;
1905 case HEAPTUPLE_DELETE_IN_PROGRESS:
1906 /* This is an expected case during concurrent vacuum */
1907 prunestate->all_visible = false;
1908
1909 /*
1910 * Count such rows as live. As above, we assume the deleting
1911 * transaction will commit and update the counters after we
1912 * report.
1913 */
1914 live_tuples++;
1915 break;
1916 default:
1917 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1918 break;
1919 }
1920
1921 /*
1922 * Non-removable tuple (i.e. tuple with storage).
1923 *
1924 * Check the tuple left behind after pruning to see if it needs to be
1925 * frozen now.
1926 */
1927 num_tuples++;
1928 prunestate->hastup = true;
1929 if (heap_prepare_freeze_tuple(tuple.t_data,
1930 vacrel->relfrozenxid,
1931 vacrel->relminmxid,
1932 vacrel->FreezeLimit,
1933 vacrel->MultiXactCutoff,
1934 &frozen[nfrozen],
1935 &tuple_totally_frozen))
1936 {
1937 /* Will execute freeze below */
1938 frozen[nfrozen++].offset = offnum;
1939 }
1940
1941 /*
1942 * If tuple is not frozen (and not about to become frozen) then caller
1943 * had better not go on to set this page's VM bit
1944 */
1945 if (!tuple_totally_frozen)
1946 prunestate->all_frozen = false;
1947 }
1948
1949 /*
1950 * We have now divided every item on the page into either an LP_DEAD item
1951 * that will need to be vacuumed in indexes later, or a LP_NORMAL tuple
1952 * that remains and needs to be considered for freezing now (LP_UNUSED and
1953 * LP_REDIRECT items also remain, but are of no further interest to us).
1954 */
1955 vacrel->offnum = InvalidOffsetNumber;
1956
1957 /*
1958 * Consider the need to freeze any items with tuple storage from the page
1959 * first (arbitrary)
1960 */
1961 if (nfrozen > 0)
1962 {
1963 Assert(prunestate->hastup);
1964
1965 /*
1966 * At least one tuple with storage needs to be frozen -- execute that
1967 * now.
1968 *
1969 * If we need to freeze any tuples we'll mark the buffer dirty, and
1970 * write a WAL record recording the changes. We must log the changes
1971 * to be crash-safe against future truncation of CLOG.
1972 */
1973 START_CRIT_SECTION();
1974
1975 MarkBufferDirty(buf);
1976
1977 /* execute collected freezes */
1978 for (int i = 0; i < nfrozen; i++)
1979 {
1980 HeapTupleHeader htup;
1981
1982 itemid = PageGetItemId(page, frozen[i].offset);
1983 htup = (HeapTupleHeader) PageGetItem(page, itemid);
1984
1985 heap_execute_freeze_tuple(htup, &frozen[i]);
1986 }
1987
1988 /* Now WAL-log freezing if necessary */
1989 if (RelationNeedsWAL(vacrel->rel))
1990 {
1991 XLogRecPtr recptr;
1992
1993 recptr = log_heap_freeze(vacrel->rel, buf, vacrel->FreezeLimit,
1994 frozen, nfrozen);
1995 PageSetLSN(page, recptr);
1996 }
1997
1998 END_CRIT_SECTION();
1999 }
2000
2001 /*
2002 * The second pass over the heap can also set visibility map bits, using
2003 * the same approach. This is important when the table frequently has a
2004 * few old LP_DEAD items on each page by the time we get to it (typically
2005 * because past opportunistic pruning operations freed some non-HOT
2006 * tuples).
2007 *
2008 * VACUUM will call heap_page_is_all_visible() during the second pass over
2009 * the heap to determine all_visible and all_frozen for the page -- this
2010 * is a specialized version of the logic from this function. Now that
2011 * we've finished pruning and freezing, make sure that we're in total
2012 * agreement with heap_page_is_all_visible() using an assertion.
2013 */
2014 #ifdef USE_ASSERT_CHECKING
2015 /* Note that all_frozen value does not matter when !all_visible */
2016 if (prunestate->all_visible)
2017 {
2018 TransactionId cutoff;
2019 bool all_frozen;
2020
2021 if (!heap_page_is_all_visible(vacrel, buf, &cutoff, &all_frozen))
2022 Assert(false);
2023
2024 Assert(lpdead_items == 0);
2025 Assert(prunestate->all_frozen == all_frozen);
2026
2027 /*
2028 * It's possible that we froze tuples and made the page's XID cutoff
2029 * (for recovery conflict purposes) FrozenTransactionId. This is okay
2030 * because visibility_cutoff_xid will be logged by our caller in a
2031 * moment.
2032 */
2033 Assert(cutoff == FrozenTransactionId ||
2034 cutoff == prunestate->visibility_cutoff_xid);
2035 }
2036 #endif
2037
2038 /*
2039 * Now save details of the LP_DEAD items from the page in the dead_tuples
2040 * array. Also record that page has dead items in per-page prunestate.
2041 */
2042 if (lpdead_items > 0)
2043 {
2044 LVDeadTuples *dead_tuples = vacrel->dead_tuples;
2045 ItemPointerData tmp;
2046
2047 Assert(!prunestate->all_visible);
2048 Assert(prunestate->has_lpdead_items);
2049
2050 vacrel->lpdead_item_pages++;
2051
2052 ItemPointerSetBlockNumber(&tmp, blkno);
2053
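/*
* Each entry is a complete TID: the block number set once above plus the
* per-item offset filled in below. lazy_tid_reaped() binary searches
* this array during index vacuuming, which works because heap blocks
* (and offsets within them) are added in ascending order.
*/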
2054 for (int i = 0; i < lpdead_items; i++)
2055 {
2056 ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]);
2057 dead_tuples->itemptrs[dead_tuples->num_tuples++] = tmp;
2058 }
2059
2060 Assert(dead_tuples->num_tuples <= dead_tuples->max_tuples);
2061 pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
2062 dead_tuples->num_tuples);
2063 }
2064
2065 /* Finally, add page-local counts to whole-VACUUM counts */
2066 vacrel->tuples_deleted += tuples_deleted;
2067 vacrel->lpdead_items += lpdead_items;
2068 vacrel->new_dead_tuples += new_dead_tuples;
2069 vacrel->num_tuples += num_tuples;
2070 vacrel->live_tuples += live_tuples;
2071 }
2072
2073 /*
2074 * Remove the collected garbage tuples from the table and its indexes.
2075 *
2076 * We may choose to bypass index vacuuming at this point, though only when the
2077 * ongoing VACUUM operation will definitely only have one index scan/round of
2078 * index vacuuming. Caller indicates whether or not this is such a VACUUM
2079 * operation by setting vacrel->consider_bypass_optimization.
2080 *
2081 * In rare emergencies, the ongoing VACUUM operation can be made to skip both
2082 * index vacuuming and index cleanup at the point we're called. This avoids
2083 * having the whole system refuse to allocate further XIDs/MultiXactIds due to
2084 * wraparound.
2085 */
2086 static void
2087 lazy_vacuum(LVRelState *vacrel)
2088 {
2089 bool bypass;
2090
2091 /* Should not end up here with no indexes */
2092 Assert(vacrel->nindexes > 0);
2093 Assert(!IsParallelWorker());
2094 Assert(vacrel->lpdead_item_pages > 0);
2095
2096 if (!vacrel->do_index_vacuuming)
2097 {
2098 Assert(!vacrel->do_index_cleanup);
2099 vacrel->dead_tuples->num_tuples = 0;
2100 return;
2101 }
2102
2103 /*
2104 * Consider bypassing index vacuuming (and heap vacuuming) entirely.
2105 *
2106 * We currently only do this in cases where the number of LP_DEAD items
2107 * for the entire VACUUM operation is close to zero. This avoids sharp
2108 * discontinuities in the duration and overhead of successive VACUUM
2109 * operations that run against the same table with a fixed workload.
2110 * Ideally, successive VACUUM operations will behave as if there are
2111 * exactly zero LP_DEAD items in cases where there are close to zero.
2112 *
2113 * This is likely to be helpful with a table that is continually affected
2114 * by UPDATEs that can mostly apply the HOT optimization, but occasionally
2115 * have small aberrations that lead to just a few heap pages retaining
2116 * only one or two LP_DEAD items. This is pretty common; even when the
2117 * DBA goes out of their way to make UPDATEs use HOT, it is practically
2118 * impossible to predict whether HOT will be applied in 100% of cases.
2119 * It's far easier to ensure that 99%+ of all UPDATEs against a table use
2120 * HOT through careful tuning.
2121 */
2122 bypass = false;
2123 if (vacrel->consider_bypass_optimization && vacrel->rel_pages > 0)
2124 {
2125 BlockNumber threshold;
2126
2127 Assert(vacrel->num_index_scans == 0);
2128 Assert(vacrel->lpdead_items == vacrel->dead_tuples->num_tuples);
2129 Assert(vacrel->do_index_vacuuming);
2130 Assert(vacrel->do_index_cleanup);
2131
2132 /*
2133 * This crossover point at which we'll start to do index vacuuming is
2134 * expressed as a percentage of the total number of heap pages in the
2135 * table that are known to have at least one LP_DEAD item. This is
2136 * much more important than the total number of LP_DEAD items, since
2137 * it's a proxy for the number of heap pages whose visibility map bits
2138 * cannot be set on account of bypassing index and heap vacuuming.
2139 *
2140 * We apply one further precautionary test: the space currently used
2141 * to store the TIDs (TIDs that now all point to LP_DEAD items) must
2142 * not exceed 32MB. This limits the risk that we will bypass index
2143 * vacuuming again and again until eventually there is a VACUUM whose
2144 * dead_tuples space is not CPU cache resident.
2145 *
2146 * We don't take any special steps to remember the LP_DEAD items (such
2147 * as counting them in new_dead_tuples report to the stats collector)
2148 * when the optimization is applied. Though the accounting used in
2149 * analyze.c's acquire_sample_rows() will recognize the same LP_DEAD
2150 * items as dead rows in its own stats collector report, that's okay.
2151 * The discrepancy should be negligible. If this optimization is ever
2152 * expanded to cover more cases then this may need to be reconsidered.
2153 */
2154 threshold = (double) vacrel->rel_pages * BYPASS_THRESHOLD_PAGES;
2155 bypass = (vacrel->lpdead_item_pages < threshold &&
2156 vacrel->lpdead_items < MAXDEADTUPLES(32L * 1024L * 1024L));
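
/*
* Worked example, assuming BYPASS_THRESHOLD_PAGES is 0.02 and 6-byte
* ItemPointerData entries: a 1,000,000 page table bypasses index
* vacuuming only when fewer than 20,000 pages have LP_DEAD items and
* the saved TIDs (at most roughly 5.5 million in 32MB) remain few
* enough to stay cache resident.
*/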
2157 }
2158
2159 if (bypass)
2160 {
2161 /*
2162 * There are almost zero TIDs. Behave as if there were precisely
2163 * zero: bypass index vacuuming, but do index cleanup.
2164 *
2165 * We expect that the ongoing VACUUM operation will finish very
2166 * quickly, so there is no point in considering speeding up as a
2167 * failsafe against wraparound failure. (Index cleanup is expected to
2168 * finish very quickly in cases where there were no ambulkdelete()
2169 * calls.)
2170 */
2171 vacrel->do_index_vacuuming = false;
2172 ereport(elevel,
2173 (errmsg("table \"%s\": index scan bypassed: %u pages from table (%.2f%% of total) have %lld dead item identifiers",
2174 vacrel->relname, vacrel->lpdead_item_pages,
2175 100.0 * vacrel->lpdead_item_pages / vacrel->rel_pages,
2176 (long long) vacrel->lpdead_items)));
2177 }
2178 else if (lazy_vacuum_all_indexes(vacrel))
2179 {
2180 /*
2181 * We successfully completed a round of index vacuuming. Do related
2182 * heap vacuuming now.
2183 */
2184 lazy_vacuum_heap_rel(vacrel);
2185 }
2186 else
2187 {
2188 /*
2189 * Failsafe case.
2190 *
2191 * We attempted index vacuuming, but didn't finish a full round/full
2192 * index scan. This happens when relfrozenxid or relminmxid is too
2193 * far in the past.
2194 *
2195 * From this point on the VACUUM operation will do no further index
2196 * vacuuming or heap vacuuming. This VACUUM operation won't end up
2197 * back here again.
2198 */
2199 Assert(vacrel->failsafe_active);
2200 }
2201
2202 /*
2203 * Forget the LP_DEAD items that we just vacuumed (or just decided to not
2204 * vacuum)
2205 */
2206 vacrel->dead_tuples->num_tuples = 0;
2207 }
2208
2209 /*
2210 * lazy_vacuum_all_indexes() -- Main entry for index vacuuming
2211 *
2212 * Returns true in the common case when all indexes were successfully
2213 * vacuumed. Returns false in rare cases where we determined that the ongoing
2214 * VACUUM operation is at risk of taking too long to finish, leading to
2215 * wraparound failure.
2216 */
2217 static bool
2218 lazy_vacuum_all_indexes(LVRelState *vacrel)
2219 {
2220 bool allindexes = true;
2221
2222 Assert(!IsParallelWorker());
2223 Assert(vacrel->nindexes > 0);
2224 Assert(vacrel->do_index_vacuuming);
2225 Assert(vacrel->do_index_cleanup);
2226 Assert(TransactionIdIsNormal(vacrel->relfrozenxid));
2227 Assert(MultiXactIdIsValid(vacrel->relminmxid));
2228
2229 /* Precheck for XID wraparound emergencies */
2230 if (lazy_check_wraparound_failsafe(vacrel))
2231 {
2232 /* Wraparound emergency -- don't even start an index scan */
2233 return false;
2234 }
2235
2236 /* Report that we are now vacuuming indexes */
2237 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2238 PROGRESS_VACUUM_PHASE_VACUUM_INDEX);
2239
2240 if (!ParallelVacuumIsActive(vacrel))
2241 {
2242 for (int idx = 0; idx < vacrel->nindexes; idx++)
2243 {
2244 Relation indrel = vacrel->indrels[idx];
2245 IndexBulkDeleteResult *istat = vacrel->indstats[idx];
2246
2247 vacrel->indstats[idx] =
2248 lazy_vacuum_one_index(indrel, istat, vacrel->old_live_tuples,
2249 vacrel);
2250
2251 if (lazy_check_wraparound_failsafe(vacrel))
2252 {
2253 /* Wraparound emergency -- end current index scan */
2254 allindexes = false;
2255 break;
2256 }
2257 }
2258 }
2259 else
2260 {
2261 /* Outsource everything to parallel variant */
2262 do_parallel_lazy_vacuum_all_indexes(vacrel);
2263
2264 /*
2265 * Do a postcheck to consider applying wraparound failsafe now. Note
2266 * that parallel VACUUM only gets the precheck and this postcheck.
2267 */
2268 if (lazy_check_wraparound_failsafe(vacrel))
2269 allindexes = false;
2270 }
2271
2272 /*
2273 * We delete all LP_DEAD items from the first heap pass in all indexes on
2274 * each call here (except calls where we choose to do the failsafe). This
2275 * makes the next call to lazy_vacuum_heap_rel() safe (except in the event
2276 * of the failsafe triggering, which prevents the next call from taking
2277 * place).
2278 */
2279 Assert(vacrel->num_index_scans > 0 ||
2280 vacrel->dead_tuples->num_tuples == vacrel->lpdead_items);
2281 Assert(allindexes || vacrel->failsafe_active);
2282
2283 /*
2284 * Increase and report the number of index scans.
2285 *
2286 * We deliberately include the case where we started a round of bulk
2287 * deletes that we weren't able to finish due to the failsafe triggering.
2288 */
2289 vacrel->num_index_scans++;
2290 pgstat_progress_update_param(PROGRESS_VACUUM_NUM_INDEX_VACUUMS,
2291 vacrel->num_index_scans);
2292
2293 return allindexes;
2294 }
2295
2296 /*
2297 * lazy_vacuum_heap_rel() -- second pass over the heap for two pass strategy
2298 *
2299 * This routine marks LP_DEAD items in vacrel->dead_tuples array as LP_UNUSED.
2300 * Pages where lazy_scan_prune never recorded any LP_DEAD items are not
2301 * visited at all.
2302 *
2303 * We may also be able to truncate the line pointer array of the heap pages we
2304 * visit. If there is a contiguous group of LP_UNUSED items at the end of the
2305 * array, it can be reclaimed as free space. These LP_UNUSED items usually
2306 * start out as LP_DEAD items recorded by lazy_scan_prune (we set items from
2307 * each page to LP_UNUSED, and then consider if it's possible to truncate the
2308 * page's line pointer array).
2309 *
2310 * Note: the reason for doing this as a second pass is we cannot remove the
2311 * tuples until we've removed their index entries, and we want to process
2312 * index entry removal in batches as large as possible.
2313 */
2314 static void
2315 lazy_vacuum_heap_rel(LVRelState *vacrel)
2316 {
2317 int tupindex;
2318 BlockNumber vacuumed_pages;
2319 PGRUsage ru0;
2320 Buffer vmbuffer = InvalidBuffer;
2321 LVSavedErrInfo saved_err_info;
2322
2323 Assert(vacrel->do_index_vacuuming);
2324 Assert(vacrel->do_index_cleanup);
2325 Assert(vacrel->num_index_scans > 0);
2326
2327 /* Report that we are now vacuuming the heap */
2328 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2329 PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
2330
2331 /* Update error traceback information */
2332 update_vacuum_error_info(vacrel, &saved_err_info,
2333 VACUUM_ERRCB_PHASE_VACUUM_HEAP,
2334 InvalidBlockNumber, InvalidOffsetNumber);
2335
2336 pg_rusage_init(&ru0);
2337 vacuumed_pages = 0;
2338
2339 tupindex = 0;
2340 while (tupindex < vacrel->dead_tuples->num_tuples)
2341 {
2342 BlockNumber tblk;
2343 Buffer buf;
2344 Page page;
2345 Size freespace;
2346
2347 vacuum_delay_point();
2348
2349 tblk = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[tupindex]);
2350 vacrel->blkno = tblk;
2351 buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL,
2352 vacrel->bstrategy);
2353 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2354 tupindex = lazy_vacuum_heap_page(vacrel, tblk, buf, tupindex,
2355 &vmbuffer);
2356
2357 /* Now that we've vacuumed the page, record its available space */
2358 page = BufferGetPage(buf);
2359 freespace = PageGetHeapFreeSpace(page);
2360
2361 UnlockReleaseBuffer(buf);
2362 RecordPageWithFreeSpace(vacrel->rel, tblk, freespace);
2363 vacuumed_pages++;
2364 }
2365
2366 /* Clear the block number information */
2367 vacrel->blkno = InvalidBlockNumber;
2368
2369 if (BufferIsValid(vmbuffer))
2370 {
2371 ReleaseBuffer(vmbuffer);
2372 vmbuffer = InvalidBuffer;
2373 }
2374
2375 /*
2376 * We set all LP_DEAD items from the first heap pass to LP_UNUSED during
2377 * the second heap pass. No more, no less.
2378 */
2379 Assert(tupindex > 0);
2380 Assert(vacrel->num_index_scans > 1 ||
2381 (tupindex == vacrel->lpdead_items &&
2382 vacuumed_pages == vacrel->lpdead_item_pages));
2383
2384 ereport(elevel,
2385 (errmsg("table \"%s\": removed %lld dead item identifiers in %u pages",
2386 vacrel->relname, (long long) tupindex, vacuumed_pages),
2387 errdetail_internal("%s", pg_rusage_show(&ru0))));
2388
2389 /* Revert to the previous phase information for error traceback */
2390 restore_vacuum_error_info(vacrel, &saved_err_info);
2391 }
2392
2393 /*
2394 * lazy_vacuum_heap_page() -- free page's LP_DEAD items listed in the
2395 * vacrel->dead_tuples array.
2396 *
2397 * Caller must have an exclusive buffer lock on the buffer (though a
2398 * super-exclusive lock is also acceptable).
2399 *
2400 * tupindex is the index in vacrel->dead_tuples of the first dead tuple for
2401 * this page. We assume the rest follow sequentially. The return value is
2402 * the first tupindex after the tuples of this page.
2403 *
2404 * Prior to PostgreSQL 14 there were rare cases where this routine had to set
2405 * tuples with storage to unused. These days it is strictly responsible for
2406 * marking LP_DEAD stub line pointers as unused. This only happens for those
2407 * LP_DEAD items on the page that were determined to be LP_DEAD items back
2408 * when the same page was visited by lazy_scan_prune() (i.e. those whose TID
2409 * was recorded in the dead_tuples array).
2410 */
2411 static int
2412 lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
2413 int tupindex, Buffer *vmbuffer)
2414 {
2415 LVDeadTuples *dead_tuples = vacrel->dead_tuples;
2416 Page page = BufferGetPage(buffer);
2417 OffsetNumber unused[MaxHeapTuplesPerPage];
2418 int uncnt = 0;
2419 TransactionId visibility_cutoff_xid;
2420 bool all_frozen;
2421 LVSavedErrInfo saved_err_info;
2422
2423 Assert(vacrel->nindexes == 0 || vacrel->do_index_vacuuming);
2424
2425 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
2426
2427 /* Update error traceback information */
2428 update_vacuum_error_info(vacrel, &saved_err_info,
2429 VACUUM_ERRCB_PHASE_VACUUM_HEAP, blkno,
2430 InvalidOffsetNumber);
2431
2432 START_CRIT_SECTION();
2433
2434 for (; tupindex < dead_tuples->num_tuples; tupindex++)
2435 {
2436 BlockNumber tblk;
2437 OffsetNumber toff;
2438 ItemId itemid;
2439
2440 tblk = ItemPointerGetBlockNumber(&dead_tuples->itemptrs[tupindex]);
2441 if (tblk != blkno)
2442 break; /* past end of tuples for this block */
2443 toff = ItemPointerGetOffsetNumber(&dead_tuples->itemptrs[tupindex]);
2444 itemid = PageGetItemId(page, toff);
2445
2446 Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid));
2447 ItemIdSetUnused(itemid);
2448 unused[uncnt++] = toff;
2449 }
2450
2451 Assert(uncnt > 0);
2452
2453 /* Attempt to truncate line pointer array now */
2454 PageTruncateLinePointerArray(page);
2455
2456 /*
2457 * Mark buffer dirty before we write WAL.
2458 */
2459 MarkBufferDirty(buffer);
2460
2461 /* XLOG stuff */
2462 if (RelationNeedsWAL(vacrel->rel))
2463 {
2464 xl_heap_vacuum xlrec;
2465 XLogRecPtr recptr;
2466
2467 xlrec.nunused = uncnt;
2468
2469 XLogBeginInsert();
2470 XLogRegisterData((char *) &xlrec, SizeOfHeapVacuum);
2471
2472 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
2473 XLogRegisterBufData(0, (char *) unused, uncnt * sizeof(OffsetNumber));
2474
2475 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VACUUM);
2476
2477 PageSetLSN(page, recptr);
2478 }
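
/*
* The record is deliberately small: just nunused in the main data, with
* the offset array attached as registered buffer data, which should be
* all that redo needs to mark the same line pointers unused (or to
* restore a full-page image).
*/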
2479
2480 /*
2481 * End critical section, so we safely can do visibility tests (which
2482 * possibly need to perform IO and allocate memory!). If we crash now the
2483 * page (including the corresponding vm bit) might not be marked all
2484 * visible, but that's fine. A later vacuum will fix that.
2485 */
2486 END_CRIT_SECTION();
2487
2488 /*
2489 * Now that we have removed the LP_DEAD items from the page, once again
2490 * check if the page has become all-visible. The page is already marked
2491 * dirty, exclusively locked, and, if needed, a full page image has been
2492 * emitted.
2493 */
2494 if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid,
2495 &all_frozen))
2496 PageSetAllVisible(page);
2497
2498 /*
2499 * All the changes to the heap page have been done. If the all-visible
2500 * flag is now set, also set the VM all-visible bit (and, if possible, the
2501 * all-frozen bit) unless this has already been done previously.
2502 */
2503 if (PageIsAllVisible(page))
2504 {
2505 uint8 flags = 0;
2506 uint8 vm_status = visibilitymap_get_status(vacrel->rel,
2507 blkno, vmbuffer);
2508
2509 /* Set flags for the all-visible and all-frozen VM bits, as needed */
2510 if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
2511 flags |= VISIBILITYMAP_ALL_VISIBLE;
2512 if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
2513 flags |= VISIBILITYMAP_ALL_FROZEN;
2514
2515 Assert(BufferIsValid(*vmbuffer));
2516 if (flags != 0)
2517 visibilitymap_set(vacrel->rel, blkno, buffer, InvalidXLogRecPtr,
2518 *vmbuffer, visibility_cutoff_xid, flags);
2519 }
2520
2521 /* Revert to the previous phase information for error traceback */
2522 restore_vacuum_error_info(vacrel, &saved_err_info);
2523 return tupindex;
2524 }
2525
2526 /*
2527 * lazy_check_needs_freeze() -- scan page to see if any tuples
2528 * need to be cleaned to avoid wraparound
2529 *
2530 * Returns true if the page needs to be vacuumed using cleanup lock.
2531 * Also returns a flag (via *hastup) indicating whether the page contains any tuples at all.
2532 */
2533 static bool
2534 lazy_check_needs_freeze(Buffer buf, bool *hastup, LVRelState *vacrel)
2535 {
2536 Page page = BufferGetPage(buf);
2537 OffsetNumber offnum,
2538 maxoff;
2539 HeapTupleHeader tupleheader;
2540
2541 *hastup = false;
2542
2543 /*
2544 * New and empty pages, obviously, don't contain tuples. We could make
2545 * sure that the page is registered in the FSM, but it doesn't seem worth
2546 * waiting for a cleanup lock just for that, especially because it's
2547 * likely that the pin holder will do so.
2548 */
2549 if (PageIsNew(page) || PageIsEmpty(page))
2550 return false;
2551
2552 maxoff = PageGetMaxOffsetNumber(page);
2553 for (offnum = FirstOffsetNumber;
2554 offnum <= maxoff;
2555 offnum = OffsetNumberNext(offnum))
2556 {
2557 ItemId itemid;
2558
2559 /*
2560 * Set the offset number so that we can display it along with any
2561 * error that occurred while processing this tuple.
2562 */
2563 vacrel->offnum = offnum;
2564 itemid = PageGetItemId(page, offnum);
2565
2566 /* this should match hastup test in count_nondeletable_pages() */
2567 if (ItemIdIsUsed(itemid))
2568 *hastup = true;
2569
2570 /* dead and redirect items never need freezing */
2571 if (!ItemIdIsNormal(itemid))
2572 continue;
2573
2574 tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
2575
2576 if (heap_tuple_needs_freeze(tupleheader, vacrel->FreezeLimit,
2577 vacrel->MultiXactCutoff, buf))
2578 break;
2579 } /* scan along page */
2580
2581 /* Clear the offset information once we have processed the given page. */
2582 vacrel->offnum = InvalidOffsetNumber;
2583
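/*
* If the scan above broke out early, offnum never advanced past maxoff,
* so offnum <= maxoff means at least one tuple still needs freezing.
*/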
2584 return (offnum <= maxoff);
2585 }
2586
2587 /*
2588 * Trigger the failsafe to avoid wraparound failure when the table being
2589 * vacuumed has a relfrozenxid and/or relminmxid dangerously far in the past.
2590 * Triggering the failsafe makes the ongoing VACUUM bypass any further index
2591 * vacuuming and heap vacuuming. Truncating the heap is also bypassed.
2592 *
2593 * Any remaining work (work that VACUUM cannot just bypass) is typically sped
2594 * up when the failsafe triggers. VACUUM stops applying any cost-based delay
2595 * that it started out with.
2596 *
2597 * Returns true when failsafe has been triggered.
2598 */
2599 static bool
2600 lazy_check_wraparound_failsafe(LVRelState *vacrel)
2601 {
2602 /* Don't warn more than once per VACUUM */
2603 if (vacrel->failsafe_active)
2604 return true;
2605
2606 if (unlikely(vacuum_xid_failsafe_check(vacrel->relfrozenxid,
2607 vacrel->relminmxid)))
2608 {
2609 vacrel->failsafe_active = true;
2610
2611 /* Disable index vacuuming, index cleanup, and heap rel truncation */
2612 vacrel->do_index_vacuuming = false;
2613 vacrel->do_index_cleanup = false;
2614 vacrel->do_rel_truncate = false;
2615
2616 ereport(WARNING,
2617 (errmsg("bypassing nonessential maintenance of table \"%s.%s.%s\" as a failsafe after %d index scans",
2618 get_database_name(MyDatabaseId),
2619 vacrel->relnamespace,
2620 vacrel->relname,
2621 vacrel->num_index_scans),
2622 errdetail("The table's relfrozenxid or relminmxid is too far in the past."),
2623 errhint("Consider increasing configuration parameter \"maintenance_work_mem\" or \"autovacuum_work_mem\".\n"
2624 "You might also need to consider other ways for VACUUM to keep up with the allocation of transaction IDs.")));
2625
2626 /* Stop applying cost limits from this point on */
2627 VacuumCostActive = false;
2628 VacuumCostBalance = 0;
2629
2630 return true;
2631 }
2632
2633 return false;
2634 }
2635
2636 /*
2637 * Perform lazy_vacuum_all_indexes() steps in parallel
2638 */
2639 static void
2640 do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel)
2641 {
2642 /* Tell parallel workers to do index vacuuming */
2643 vacrel->lps->lvshared->for_cleanup = false;
2644 vacrel->lps->lvshared->first_time = false;
2645
2646 /*
2647 * We can only provide an approximate value of num_heap_tuples, at least
2648 * for now. Matches serial VACUUM case.
2649 */
2650 vacrel->lps->lvshared->reltuples = vacrel->old_live_tuples;
2651 vacrel->lps->lvshared->estimated_count = true;
2652
2653 do_parallel_vacuum_or_cleanup(vacrel,
2654 vacrel->lps->nindexes_parallel_bulkdel);
2655 }
2656
2657 /*
2658 * Perform lazy_cleanup_all_indexes() steps in parallel
2659 */
2660 static void
2661 do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel)
2662 {
2663 int nworkers;
2664
2665 /*
2666 * If parallel vacuum is active we perform index cleanup with parallel
2667 * workers.
2668 *
2669 * Tell parallel workers to do index cleanup.
2670 */
2671 vacrel->lps->lvshared->for_cleanup = true;
2672 vacrel->lps->lvshared->first_time = (vacrel->num_index_scans == 0);
2673
2674 /*
2675 * Now we can provide a better estimate of total number of surviving
2676 * tuples (we assume indexes are more interested in that than in the
2677 * number of nominally live tuples).
2678 */
2679 vacrel->lps->lvshared->reltuples = vacrel->new_rel_tuples;
2680 vacrel->lps->lvshared->estimated_count =
2681 (vacrel->tupcount_pages < vacrel->rel_pages);
2682
2683 /* Determine the number of parallel workers to launch */
2684 if (vacrel->lps->lvshared->first_time)
2685 nworkers = vacrel->lps->nindexes_parallel_cleanup +
2686 vacrel->lps->nindexes_parallel_condcleanup;
2687 else
2688 nworkers = vacrel->lps->nindexes_parallel_cleanup;
2689
2690 do_parallel_vacuum_or_cleanup(vacrel, nworkers);
2691 }
2692
2693 /*
2694 * Perform index vacuum or index cleanup with parallel workers. This function
2695 * must be used by the parallel vacuum leader process. The caller must set
2696 * lps->lvshared->for_cleanup to indicate whether to perform vacuum or
2697 * cleanup.
2698 */
2699 static void
2700 do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers)
2701 {
2702 LVParallelState *lps = vacrel->lps;
2703
2704 Assert(!IsParallelWorker());
2705 Assert(ParallelVacuumIsActive(vacrel));
2706 Assert(vacrel->nindexes > 0);
2707
2708 /* The leader process will participate */
2709 nworkers--;
2710
2711 /*
2712 * It is possible that parallel context is initialized with fewer workers
2713 * than the number of indexes that need a separate worker in the current
2714 * phase, so we need to consider it. See compute_parallel_vacuum_workers.
2715 */
2716 nworkers = Min(nworkers, lps->pcxt->nworkers);
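
/*
* Hypothetical example: if 4 indexes want a worker in this phase, the
* leader's own participation leaves a request for 3; a parallel context
* initialized with only 2 workers clamps that to 2, and the leader picks
* up the remainder in do_parallel_processing().
*/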
2717
2718 /* Setup the shared cost-based vacuum delay and launch workers */
2719 if (nworkers > 0)
2720 {
2721 if (vacrel->num_index_scans > 0)
2722 {
2723 /* Reset the parallel index processing counter */
2724 pg_atomic_write_u32(&(lps->lvshared->idx), 0);
2725
2726 /* Reinitialize the parallel context to relaunch parallel workers */
2727 ReinitializeParallelDSM(lps->pcxt);
2728 }
2729
2730 /*
2731 * Set up shared cost balance and the number of active workers for
2732 * vacuum delay. We need to do this before launching workers as
2733 * otherwise, they might not see the updated values for these
2734 * parameters.
2735 */
2736 pg_atomic_write_u32(&(lps->lvshared->cost_balance), VacuumCostBalance);
2737 pg_atomic_write_u32(&(lps->lvshared->active_nworkers), 0);
2738
2739 /*
2740 * The number of workers can vary between bulkdelete and cleanup
2741 * phase.
2742 */
2743 ReinitializeParallelWorkers(lps->pcxt, nworkers);
2744
2745 LaunchParallelWorkers(lps->pcxt);
2746
2747 if (lps->pcxt->nworkers_launched > 0)
2748 {
2749 /*
2750 * Reset the local cost values for leader backend as we have
2751 * already accumulated the remaining balance of heap.
2752 */
2753 VacuumCostBalance = 0;
2754 VacuumCostBalanceLocal = 0;
2755
2756 /* Enable shared cost balance for leader backend */
2757 VacuumSharedCostBalance = &(lps->lvshared->cost_balance);
2758 VacuumActiveNWorkers = &(lps->lvshared->active_nworkers);
2759 }
2760
2761 if (lps->lvshared->for_cleanup)
2762 ereport(elevel,
2763 (errmsg(ngettext("launched %d parallel vacuum worker for index cleanup (planned: %d)",
2764 "launched %d parallel vacuum workers for index cleanup (planned: %d)",
2765 lps->pcxt->nworkers_launched),
2766 lps->pcxt->nworkers_launched, nworkers)));
2767 else
2768 ereport(elevel,
2769 (errmsg(ngettext("launched %d parallel vacuum worker for index vacuuming (planned: %d)",
2770 "launched %d parallel vacuum workers for index vacuuming (planned: %d)",
2771 lps->pcxt->nworkers_launched),
2772 lps->pcxt->nworkers_launched, nworkers)));
2773 }
2774
2775 /* Process the indexes that can be processed by only leader process */
2776 do_serial_processing_for_unsafe_indexes(vacrel, lps->lvshared);
2777
2778 /*
2779 * Join as a parallel worker. The leader process alone processes all the
2780 * indexes in the case where no workers are launched.
2781 */
2782 do_parallel_processing(vacrel, lps->lvshared);
2783
2784 /*
2785 * Next, accumulate buffer and WAL usage. (This must wait for the workers
2786 * to finish, or we might get incomplete data.)
2787 */
2788 if (nworkers > 0)
2789 {
2790 /* Wait for all vacuum workers to finish */
2791 WaitForParallelWorkersToFinish(lps->pcxt);
2792
2793 for (int i = 0; i < lps->pcxt->nworkers_launched; i++)
2794 InstrAccumParallelQuery(&lps->buffer_usage[i], &lps->wal_usage[i]);
2795 }
2796
2797 /*
2798 * Carry the shared balance value to heap scan and disable shared costing
2799 */
2800 if (VacuumSharedCostBalance)
2801 {
2802 VacuumCostBalance = pg_atomic_read_u32(VacuumSharedCostBalance);
2803 VacuumSharedCostBalance = NULL;
2804 VacuumActiveNWorkers = NULL;
2805 }
2806 }
2807
2808 /*
2809 * Index vacuum/cleanup routine used by the leader process and parallel
2810 * vacuum worker processes to process the indexes in parallel.
2811 */
2812 static void
2813 do_parallel_processing(LVRelState *vacrel, LVShared *lvshared)
2814 {
2815 /*
2816 * Increment the active worker count if we are able to launch any worker.
2817 */
2818 if (VacuumActiveNWorkers)
2819 pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1);
2820
2821 /* Loop until all indexes are vacuumed */
2822 for (;;)
2823 {
2824 int idx;
2825 LVSharedIndStats *shared_istat;
2826 Relation indrel;
2827 IndexBulkDeleteResult *istat;
2828
2829 /* Get an index number to process */
2830 idx = pg_atomic_fetch_add_u32(&(lvshared->idx), 1);
2831
2832 /* Done for all indexes? */
2833 if (idx >= vacrel->nindexes)
2834 break;
2835
2836 /* Get the index statistics space from DSM, if any */
2837 shared_istat = parallel_stats_for_idx(lvshared, idx);
2838
2839 /* Skip indexes not participating in parallelism */
2840 if (shared_istat == NULL)
2841 continue;
2842
2843 indrel = vacrel->indrels[idx];
2844
2845 /*
2846 * Skip processing indexes that are unsafe for workers (these are
2847 * processed in do_serial_processing_for_unsafe_indexes() by leader)
2848 */
2849 if (!parallel_processing_is_safe(indrel, lvshared))
2850 continue;
2851
2852 /* Do vacuum or cleanup of the index */
2853 istat = (vacrel->indstats[idx]);
2854 vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
2855 lvshared,
2856 shared_istat,
2857 vacrel);
2858 }
2859
2860 /*
2861 * We have completed the index vacuum so decrement the active worker
2862 * count.
2863 */
2864 if (VacuumActiveNWorkers)
2865 pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1);
2866 }
2867
2868 /*
2869 * Perform serial processing of indexes in the leader process.
2870 *
2871 * Handles index vacuuming (or index cleanup) for indexes that are not
2872 * parallel safe. It's possible that this will vary for a given index, based
2873 * on details like whether we're performing for_cleanup processing right now.
2874 *
2875 * Also performs processing of smaller indexes that fell under the size cutoff
2876 * enforced by compute_parallel_vacuum_workers(). These indexes never get a
2877 * slot for statistics in DSM.
2878 */
2879 static void
2880 do_serial_processing_for_unsafe_indexes(LVRelState *vacrel, LVShared *lvshared)
2881 {
2882 Assert(!IsParallelWorker());
2883
2884 /*
2885 * Increment the active worker count if we are able to launch any worker.
2886 */
2887 if (VacuumActiveNWorkers)
2888 pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1);
2889
2890 for (int idx = 0; idx < vacrel->nindexes; idx++)
2891 {
2892 LVSharedIndStats *shared_istat;
2893 Relation indrel;
2894 IndexBulkDeleteResult *istat;
2895
2896 shared_istat = parallel_stats_for_idx(lvshared, idx);
2897 indrel = vacrel->indrels[idx];
2898
2899 /*
2900 * We're only here for the indexes that parallel workers won't
2901 * process. Note that the shared_istat test ensures that we process
2902 * indexes that fell under initial size cutoff.
2903 */
2904 if (shared_istat != NULL &&
2905 parallel_processing_is_safe(indrel, lvshared))
2906 continue;
2907
2908 /* Do vacuum or cleanup of the index */
2909 istat = (vacrel->indstats[idx]);
2910 vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
2911 lvshared,
2912 shared_istat,
2913 vacrel);
2914 }
2915
2916 /*
2917 * We have completed the index vacuum so decrement the active worker
2918 * count.
2919 */
2920 if (VacuumActiveNWorkers)
2921 pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1);
2922 }
2923
2924 /*
2925 * Vacuum or cleanup an index, either by the leader process or by one of the
2926 * worker processes. After processing the index, this function copies the index
2927 * statistics returned from ambulkdelete and amvacuumcleanup to the DSM
2928 * segment.
2929 */
2930 static IndexBulkDeleteResult *
2931 parallel_process_one_index(Relation indrel,
2932 IndexBulkDeleteResult *istat,
2933 LVShared *lvshared,
2934 LVSharedIndStats *shared_istat,
2935 LVRelState *vacrel)
2936 {
2937 IndexBulkDeleteResult *istat_res;
2938
2939 /*
2940 * Update the pointer to the corresponding bulk-deletion result if someone
2941 * has already updated it
2942 */
2943 if (shared_istat && shared_istat->updated && istat == NULL)
2944 istat = &shared_istat->istat;
2945
2946 /* Do vacuum or cleanup of the index */
2947 if (lvshared->for_cleanup)
2948 istat_res = lazy_cleanup_one_index(indrel, istat, lvshared->reltuples,
2949 lvshared->estimated_count, vacrel);
2950 else
2951 istat_res = lazy_vacuum_one_index(indrel, istat, lvshared->reltuples,
2952 vacrel);
2953
2954 /*
2955 * Copy the index bulk-deletion result returned from ambulkdelete and
2956 * amvacuumcleanup to the DSM segment if it's the first cycle because they
2957 * allocate locally and it's possible that an index will be vacuumed by a
2958 * different vacuum process the next cycle. Copying the result normally
2959 * happens only the first time an index is vacuumed. For any additional
2960 * vacuum pass, we directly point to the result on the DSM segment and
2961 * pass it to vacuum index APIs so that workers can update it directly.
2962 *
2963 * Since all vacuum workers write the bulk-deletion result at different
2964 * slots we can write them without locking.
2965 */
2966 if (shared_istat && !shared_istat->updated && istat_res != NULL)
2967 {
2968 memcpy(&shared_istat->istat, istat_res, sizeof(IndexBulkDeleteResult));
2969 shared_istat->updated = true;
2970
2971 /* Free the locally-allocated bulk-deletion result */
2972 pfree(istat_res);
2973
2974 /* return the pointer to the result from shared memory */
2975 return &shared_istat->istat;
2976 }
2977
2978 return istat_res;
2979 }
2980
2981 /*
2982 * lazy_cleanup_all_indexes() -- cleanup all indexes of relation.
2983 */
2984 static void
2985 lazy_cleanup_all_indexes(LVRelState *vacrel)
2986 {
2987 Assert(!IsParallelWorker());
2988 Assert(vacrel->nindexes > 0);
2989
2990 /* Report that we are now cleaning up indexes */
2991 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2992 PROGRESS_VACUUM_PHASE_INDEX_CLEANUP);
2993
2994 if (!ParallelVacuumIsActive(vacrel))
2995 {
2996 double reltuples = vacrel->new_rel_tuples;
2997 bool estimated_count =
2998 vacrel->tupcount_pages < vacrel->rel_pages;
2999
3000 for (int idx = 0; idx < vacrel->nindexes; idx++)
3001 {
3002 Relation indrel = vacrel->indrels[idx];
3003 IndexBulkDeleteResult *istat = vacrel->indstats[idx];
3004
3005 vacrel->indstats[idx] =
3006 lazy_cleanup_one_index(indrel, istat, reltuples,
3007 estimated_count, vacrel);
3008 }
3009 }
3010 else
3011 {
3012 /* Outsource everything to parallel variant */
3013 do_parallel_lazy_cleanup_all_indexes(vacrel);
3014 }
3015 }
3016
3017 /*
3018 * lazy_vacuum_one_index() -- vacuum index relation.
3019 *
3020 * Delete all the index entries pointing to tuples listed in
3021 * dead_tuples, and update running statistics.
3022 *
3023 * reltuples is the number of heap tuples to be passed to the
3024 * bulkdelete callback. It's always assumed to be estimated.
3025 *
3026 * Returns bulk delete stats derived from input stats
3027 */
3028 static IndexBulkDeleteResult *
3029 lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat,
3030 double reltuples, LVRelState *vacrel)
3031 {
3032 IndexVacuumInfo ivinfo;
3033 PGRUsage ru0;
3034 LVSavedErrInfo saved_err_info;
3035
3036 pg_rusage_init(&ru0);
3037
3038 ivinfo.index = indrel;
3039 ivinfo.analyze_only = false;
3040 ivinfo.report_progress = false;
3041 ivinfo.estimated_count = true;
3042 ivinfo.message_level = elevel;
3043 ivinfo.num_heap_tuples = reltuples;
3044 ivinfo.strategy = vacrel->bstrategy;
3045
3046 /*
3047 * Update error traceback information.
3048 *
3049 * The index name is saved during this phase and restored immediately
3050 * after this phase. See vacuum_error_callback.
3051 */
3052 Assert(vacrel->indname == NULL);
3053 vacrel->indname = pstrdup(RelationGetRelationName(indrel));
3054 update_vacuum_error_info(vacrel, &saved_err_info,
3055 VACUUM_ERRCB_PHASE_VACUUM_INDEX,
3056 InvalidBlockNumber, InvalidOffsetNumber);
3057
3058 /* Do bulk deletion */
3059 istat = index_bulk_delete(&ivinfo, istat, lazy_tid_reaped,
3060 (void *) vacrel->dead_tuples);
3061
3062 ereport(elevel,
3063 (errmsg("scanned index \"%s\" to remove %d row versions",
3064 vacrel->indname, vacrel->dead_tuples->num_tuples),
3065 errdetail_internal("%s", pg_rusage_show(&ru0))));
3066
3067 /* Revert to the previous phase information for error traceback */
3068 restore_vacuum_error_info(vacrel, &saved_err_info);
3069 pfree(vacrel->indname);
3070 vacrel->indname = NULL;
3071
3072 return istat;
3073 }
3074
3075 /*
3076 * lazy_cleanup_one_index() -- do post-vacuum cleanup for index relation.
3077 *
3078 * reltuples is the number of heap tuples and estimated_count is true
3079 * if reltuples is an estimated value.
3080 *
3081 * Returns bulk delete stats derived from input stats
3082 */
3083 static IndexBulkDeleteResult *
3084 lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat,
3085 double reltuples, bool estimated_count,
3086 LVRelState *vacrel)
3087 {
3088 IndexVacuumInfo ivinfo;
3089 PGRUsage ru0;
3090 LVSavedErrInfo saved_err_info;
3091
3092 pg_rusage_init(&ru0);
3093
3094 ivinfo.index = indrel;
3095 ivinfo.analyze_only = false;
3096 ivinfo.report_progress = false;
3097 ivinfo.estimated_count = estimated_count;
3098 ivinfo.message_level = elevel;
3099
3100 ivinfo.num_heap_tuples = reltuples;
3101 ivinfo.strategy = vacrel->bstrategy;
3102
3103 /*
3104 * Update error traceback information.
3105 *
3106 * The index name is saved during this phase and restored immediately
3107 * after this phase. See vacuum_error_callback.
3108 */
3109 Assert(vacrel->indname == NULL);
3110 vacrel->indname = pstrdup(RelationGetRelationName(indrel));
3111 update_vacuum_error_info(vacrel, &saved_err_info,
3112 VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
3113 InvalidBlockNumber, InvalidOffsetNumber);
3114
3115 istat = index_vacuum_cleanup(&ivinfo, istat);
3116
3117 if (istat)
3118 {
3119 ereport(elevel,
3120 (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
3121 RelationGetRelationName(indrel),
3122 istat->num_index_tuples,
3123 istat->num_pages),
3124 errdetail("%.0f index row versions were removed.\n"
3125 "%u index pages were newly deleted.\n"
3126 "%u index pages are currently deleted, of which %u are currently reusable.\n"
3127 "%s.",
3128 istat->tuples_removed,
3129 istat->pages_newly_deleted,
3130 istat->pages_deleted, istat->pages_free,
3131 pg_rusage_show(&ru0))));
3132 }
3133
3134 /* Revert to the previous phase information for error traceback */
3135 restore_vacuum_error_info(vacrel, &saved_err_info);
3136 pfree(vacrel->indname);
3137 vacrel->indname = NULL;
3138
3139 return istat;
3140 }
3141
3142 /*
3143 * should_attempt_truncation - should we attempt to truncate the heap?
3144 *
3145 * Don't even think about it unless we have a shot at releasing a goodly
3146 * number of pages. Otherwise, the time taken isn't worth it.
3147 *
3148 * Also don't attempt it if wraparound failsafe is in effect. It's hard to
3149 * predict how long lazy_truncate_heap will take. Don't take any chances.
3150 * There is very little chance of truncation working out when the failsafe is
3151 * in effect in any case. lazy_scan_prune makes the optimistic assumption
3152 * that any LP_DEAD items it encounters will always be LP_UNUSED by the time
3153 * we're called.
3154 *
3155 * Also don't attempt it if we are doing early pruning/vacuuming, because a
3156 * scan which cannot find a truncated heap page cannot determine that the
3157 * snapshot is too old to read that page.
3158 *
3159 * This is split out so that we can test whether truncation is going to be
3160 * called for before we actually do it. If you change the logic here, be
3161 * careful to depend only on fields that lazy_scan_heap updates on-the-fly.
3162 */
3163 static bool
should_attempt_truncation(LVRelState * vacrel)3164 should_attempt_truncation(LVRelState *vacrel)
3165 {
3166 BlockNumber possibly_freeable;
3167
3168 if (!vacrel->do_rel_truncate || vacrel->failsafe_active)
3169 return false;
3170
3171 possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages;
3172 if (possibly_freeable > 0 &&
3173 (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
3174 possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION) &&
3175 old_snapshot_threshold < 0)
3176 return true;
3177 else
3178 return false;
3179 }
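
/*
 * Illustration of the thresholds above (values assumed from the
 * REL_TRUNCATE_MINIMUM and REL_TRUNCATE_FRACTION defines earlier in this
 * file, i.e. 1000 pages and 1/16 of the table): a 100,000-page table
 * attempts truncation once at least 1,000 tail pages look freeable, while a
 * 160-page table needs only 10 freeable pages, since 160 / 16 = 10 is
 * reached long before the 1,000-page minimum.
 */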

/*
 * lazy_truncate_heap - try to truncate off any empty pages at the end
 */
static void
lazy_truncate_heap(LVRelState *vacrel)
{
    BlockNumber old_rel_pages = vacrel->rel_pages;
    BlockNumber new_rel_pages;
    bool        lock_waiter_detected;
    int         lock_retry;

    /* Report that we are now truncating */
    pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
                                 PROGRESS_VACUUM_PHASE_TRUNCATE);

    /*
     * Loop until no more truncating can be done.
     */
    do
    {
        PGRUsage    ru0;

        pg_rusage_init(&ru0);

        /*
         * We need full exclusive lock on the relation in order to do
         * truncation. If we can't get it, give up rather than waiting --- we
         * don't want to block other backends, and we don't want to deadlock
         * (which is quite possible considering we already hold a lower-grade
         * lock).
         */
        lock_waiter_detected = false;
        lock_retry = 0;
        while (true)
        {
            if (ConditionalLockRelation(vacrel->rel, AccessExclusiveLock))
                break;

            /*
             * Check for interrupts while trying to (re-)acquire the exclusive
             * lock.
             */
            CHECK_FOR_INTERRUPTS();

            if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
                                VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
            {
                /*
                 * We failed to establish the lock in the specified number of
                 * retries. This means we give up truncating.
                 */
                ereport(elevel,
                        (errmsg("\"%s\": stopping truncate due to conflicting lock request",
                                vacrel->relname)));
                return;
            }

            pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L);
        }
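
        /*
         * Illustration (values assumed from the defines near the head of
         * this file, i.e. VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL = 50ms and
         * VACUUM_TRUNCATE_LOCK_TIMEOUT = 5000ms): the loop above sleeps 50ms
         * between attempts and gives up after 100 failed attempts, i.e.
         * after roughly five seconds of trying.
         */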

        /*
         * Now that we have exclusive lock, look to see if the rel has grown
         * whilst we were vacuuming with non-exclusive lock. If so, give up;
         * the newly added pages presumably contain non-deletable tuples.
         */
        new_rel_pages = RelationGetNumberOfBlocks(vacrel->rel);
        if (new_rel_pages != old_rel_pages)
        {
            /*
             * Note: we intentionally don't update vacrel->rel_pages with the
             * new rel size here. If we did, it would amount to assuming that
             * the new pages are empty, which is unlikely. Leaving the numbers
             * alone amounts to assuming that the new pages have the same
             * tuple density as existing ones, which is less unlikely.
             */
            UnlockRelation(vacrel->rel, AccessExclusiveLock);
            return;
        }

        /*
         * Scan backwards from the end to verify that the end pages actually
         * contain no tuples. This is *necessary*, not optional, because
         * other backends could have added tuples to these pages whilst we
         * were vacuuming.
         */
        new_rel_pages = count_nondeletable_pages(vacrel, &lock_waiter_detected);
        vacrel->blkno = new_rel_pages;

        if (new_rel_pages >= old_rel_pages)
        {
            /* can't do anything after all */
            UnlockRelation(vacrel->rel, AccessExclusiveLock);
            return;
        }

        /*
         * Okay to truncate.
         */
        RelationTruncate(vacrel->rel, new_rel_pages);

        /*
         * We can release the exclusive lock as soon as we have truncated.
         * Other backends can't safely access the relation until they have
         * processed the smgr invalidation that smgrtruncate sent out ... but
         * that should happen as part of standard invalidation processing once
         * they acquire lock on the relation.
         */
        UnlockRelation(vacrel->rel, AccessExclusiveLock);

        /*
         * Update statistics. Here, it *is* correct to adjust rel_pages
         * without also touching reltuples, since the tuple count wasn't
         * changed by the truncation.
         */
        vacrel->pages_removed += old_rel_pages - new_rel_pages;
        vacrel->rel_pages = new_rel_pages;

        ereport(elevel,
                (errmsg("table \"%s\": truncated %u to %u pages",
                        vacrel->relname,
                        old_rel_pages, new_rel_pages),
                 errdetail_internal("%s",
                                    pg_rusage_show(&ru0))));
        old_rel_pages = new_rel_pages;
    } while (new_rel_pages > vacrel->nonempty_pages && lock_waiter_detected);
}

/*
 * Rescan end pages to verify that they are (still) empty of tuples.
 *
 * Returns number of nondeletable pages (last nonempty page + 1).
 */
static BlockNumber
count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected)
{
    BlockNumber blkno;
    BlockNumber prefetchedUntil;
    instr_time  starttime;

    /* Initialize the starttime if we check for conflicting lock requests */
    INSTR_TIME_SET_CURRENT(starttime);

    /*
     * Start checking blocks at what we believe relation end to be and move
     * backwards. (Strange coding of loop control is needed because blkno is
     * unsigned.) To make the scan faster, we prefetch a few blocks at a time
     * in the forward direction, so that OS-level readahead can kick in.
     */
    blkno = vacrel->rel_pages;
    StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0,
                     "prefetch size must be power of 2");
    prefetchedUntil = InvalidBlockNumber;
    while (blkno > vacrel->nonempty_pages)
    {
        Buffer      buf;
        Page        page;
        OffsetNumber offnum,
                    maxoff;
        bool        hastup;

        /*
         * Check if another process requests a lock on our relation. We are
         * holding an AccessExclusiveLock here, so they will be waiting. We
         * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
         * only check if that interval has elapsed once every 32 blocks to
         * keep the number of system calls and actual shared lock table
         * lookups to a minimum.
         */
        if ((blkno % 32) == 0)
        {
            instr_time  currenttime;
            instr_time  elapsed;

            INSTR_TIME_SET_CURRENT(currenttime);
            elapsed = currenttime;
            INSTR_TIME_SUBTRACT(elapsed, starttime);
            if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
                >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
            {
                if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
                {
                    ereport(elevel,
                            (errmsg("table \"%s\": suspending truncate due to conflicting lock request",
                                    vacrel->relname)));

                    *lock_waiter_detected = true;
                    return blkno;
                }
                starttime = currenttime;
            }
        }

        /*
         * We don't insert a vacuum delay point here, because we have an
         * exclusive lock on the table which we want to hold for as short a
         * time as possible. We still need to check for interrupts however.
         */
        CHECK_FOR_INTERRUPTS();

        blkno--;

        /* If we haven't prefetched this lot yet, do so now. */
        if (prefetchedUntil > blkno)
        {
            BlockNumber prefetchStart;
            BlockNumber pblkno;

            prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
            for (pblkno = prefetchStart; pblkno <= blkno; pblkno++)
            {
                PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, pblkno);
                CHECK_FOR_INTERRUPTS();
            }
            prefetchedUntil = prefetchStart;
        }
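
        /*
         * Worked example of the masking above (PREFETCH_SIZE assumed to be
         * 32, per its define earlier in this file): for blkno = 70,
         * prefetchStart = 70 & ~31 = 64, so blocks 64..70 are prefetched in
         * forward order and prefetchedUntil drops to 64.
         */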

        buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
                                 vacrel->bstrategy);

        /* In this phase we only need shared access to the buffer */
        LockBuffer(buf, BUFFER_LOCK_SHARE);

        page = BufferGetPage(buf);

        if (PageIsNew(page) || PageIsEmpty(page))
        {
            UnlockReleaseBuffer(buf);
            continue;
        }

        hastup = false;
        maxoff = PageGetMaxOffsetNumber(page);
        for (offnum = FirstOffsetNumber;
             offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            ItemId      itemid;

            itemid = PageGetItemId(page, offnum);

            /*
             * Note: any non-unused item should be taken as a reason to keep
             * this page. Even an LP_DEAD item makes truncation unsafe, since
             * we must not have cleaned out its index entries.
             */
            if (ItemIdIsUsed(itemid))
            {
                hastup = true;
                break;          /* can stop scanning */
            }
        }                       /* scan along page */

        UnlockReleaseBuffer(buf);

        /* Done scanning if we found a tuple here */
        if (hastup)
            return blkno + 1;
    }

    /*
     * If we fall out of the loop, all the previously-thought-to-be-empty
     * pages still are; we need not bother to look at the last known-nonempty
     * page.
     */
    return vacrel->nonempty_pages;
}

/*
 * Return the maximum number of dead tuples we can record.
 */
static long
compute_max_dead_tuples(BlockNumber relblocks, bool hasindex)
{
    long        maxtuples;
    int         vac_work_mem = IsAutoVacuumWorkerProcess() &&
        autovacuum_work_mem != -1 ?
        autovacuum_work_mem : maintenance_work_mem;

    if (hasindex)
    {
        maxtuples = MAXDEADTUPLES(vac_work_mem * 1024L);
        maxtuples = Min(maxtuples, INT_MAX);
        maxtuples = Min(maxtuples, MAXDEADTUPLES(MaxAllocSize));

        /* curious coding here to ensure the multiplication can't overflow */
        if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
            maxtuples = relblocks * LAZY_ALLOC_TUPLES;

        /* stay sane if small maintenance_work_mem */
        maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
    }
    else
        maxtuples = MaxHeapTuplesPerPage;

    return maxtuples;
}
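
/*
 * Illustration of the sizing above (assuming 6-byte ItemPointerData entries
 * and the MAXDEADTUPLES macro defined earlier in this file): with the
 * default maintenance_work_mem of 64MB, the TID array can hold roughly
 * 64 * 1024 * 1024 / 6, or about 11 million, dead tuples -- enough to cover
 * some 38,000 heap pages even if every page were packed with
 * MaxHeapTuplesPerPage (291) LP_DEAD items.
 */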

/*
 * lazy_space_alloc - space allocation decisions for lazy vacuum
 *
 * See the comments at the head of this file for rationale.
 */
static void
lazy_space_alloc(LVRelState *vacrel, int nworkers, BlockNumber nblocks)
{
    LVDeadTuples *dead_tuples;
    long        maxtuples;

    /*
     * Initialize state for a parallel vacuum. As of now, only one worker can
     * be used for an index, so we invoke parallelism only if there are at
     * least two indexes on a table.
     */
    if (nworkers >= 0 && vacrel->nindexes > 1 && vacrel->do_index_vacuuming)
    {
        /*
         * Since parallel workers cannot access data in temporary tables, we
         * can't perform parallel vacuum on them.
         */
        if (RelationUsesLocalBuffers(vacrel->rel))
        {
            /*
             * Give warning only if the user explicitly tries to perform a
             * parallel vacuum on a temporary table.
             */
            if (nworkers > 0)
                ereport(WARNING,
                        (errmsg("disabling parallel option of vacuum on \"%s\" --- cannot vacuum temporary tables in parallel",
                                vacrel->relname)));
        }
        else
            vacrel->lps = begin_parallel_vacuum(vacrel, nblocks, nworkers);

        /* If parallel mode started, we're done */
        if (ParallelVacuumIsActive(vacrel))
            return;
    }

    maxtuples = compute_max_dead_tuples(nblocks, vacrel->nindexes > 0);

    dead_tuples = (LVDeadTuples *) palloc(SizeOfDeadTuples(maxtuples));
    dead_tuples->num_tuples = 0;
    dead_tuples->max_tuples = (int) maxtuples;

    vacrel->dead_tuples = dead_tuples;
}
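
/*
 * As a reading aid: the allocation above is a small header followed by the
 * TID array itself, i.e. SizeOfDeadTuples(maxtuples) is assumed (per the
 * macro defined earlier in this file) to work out to roughly
 *
 *     offsetof(LVDeadTuples, itemptrs) + maxtuples * sizeof(ItemPointerData)
 *
 * so nearly all of the budgeted memory goes to dead-tuple TIDs.
 */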

/*
 * lazy_space_free - free space allocated in lazy_space_alloc
 */
static void
lazy_space_free(LVRelState *vacrel)
{
    if (!ParallelVacuumIsActive(vacrel))
        return;

    /*
     * End parallel mode before updating index statistics as we cannot write
     * during parallel mode.
     */
    end_parallel_vacuum(vacrel);
}

/*
 * lazy_tid_reaped() -- is a particular tid deletable?
 *
 * This has the right signature to be an IndexBulkDeleteCallback.
 *
 * Assumes dead_tuples array is in sorted order.
 */
static bool
lazy_tid_reaped(ItemPointer itemptr, void *state)
{
    LVDeadTuples *dead_tuples = (LVDeadTuples *) state;
    int64       litem,
                ritem,
                item;
    ItemPointer res;

    litem = itemptr_encode(&dead_tuples->itemptrs[0]);
    ritem = itemptr_encode(&dead_tuples->itemptrs[dead_tuples->num_tuples - 1]);
    item = itemptr_encode(itemptr);

    /*
     * Doing a simple bound check before bsearch() is useful to avoid the
     * extra cost of bsearch(), especially if dead tuples on the heap are
     * concentrated in a certain range. Since this function is called for
     * every index tuple, it pays to be really fast.
     */
    if (item < litem || item > ritem)
        return false;

    res = (ItemPointer) bsearch((void *) itemptr,
                                (void *) dead_tuples->itemptrs,
                                dead_tuples->num_tuples,
                                sizeof(ItemPointerData),
                                vac_cmp_itemptr);

    return (res != NULL);
}
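
/*
 * Illustration of the encoding used above (itemptr_encode is assumed, per
 * catalog/index.h, to pack a TID into an int64 as (block << 16) | offset):
 * the TID (block 42, offset 7) encodes to (42 << 16) | 7 = 2752519. Because
 * the block number occupies the high bits, plain integer comparisons on
 * encoded values agree with the block-then-offset order of vac_cmp_itemptr
 * below, which is what makes the cheap litem/ritem range check valid.
 */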

/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_itemptr(const void *left, const void *right)
{
    BlockNumber lblk,
                rblk;
    OffsetNumber loff,
                roff;

    lblk = ItemPointerGetBlockNumber((ItemPointer) left);
    rblk = ItemPointerGetBlockNumber((ItemPointer) right);

    if (lblk < rblk)
        return -1;
    if (lblk > rblk)
        return 1;

    loff = ItemPointerGetOffsetNumber((ItemPointer) left);
    roff = ItemPointerGetOffsetNumber((ItemPointer) right);

    if (loff < roff)
        return -1;
    if (loff > roff)
        return 1;

    return 0;
}

/*
 * Check if every tuple in the given page is visible to all current and future
 * transactions. Also return the visibility_cutoff_xid, which is the highest
 * xmin amongst the visible tuples. Set *all_frozen to true if every tuple
 * on this page is frozen.
 */
static bool
heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
                         TransactionId *visibility_cutoff_xid,
                         bool *all_frozen)
{
    Page        page = BufferGetPage(buf);
    BlockNumber blockno = BufferGetBlockNumber(buf);
    OffsetNumber offnum,
                maxoff;
    bool        all_visible = true;

    *visibility_cutoff_xid = InvalidTransactionId;
    *all_frozen = true;

    /*
     * This is a stripped down version of the line pointer scan in
     * lazy_scan_heap(). So if you change anything here, also check that code.
     */
    maxoff = PageGetMaxOffsetNumber(page);
    for (offnum = FirstOffsetNumber;
         offnum <= maxoff && all_visible;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemid;
        HeapTupleData tuple;

        /*
         * Set the offset number so that we can display it along with any
         * error that occurred while processing this tuple.
         */
        vacrel->offnum = offnum;
        itemid = PageGetItemId(page, offnum);

        /* Unused or redirect line pointers are of no interest */
        if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
            continue;

        ItemPointerSet(&(tuple.t_self), blockno, offnum);

        /*
         * Dead line pointers can have index pointers pointing to them. So
         * they can't be treated as visible.
         */
        if (ItemIdIsDead(itemid))
        {
            all_visible = false;
            *all_frozen = false;
            break;
        }

        Assert(ItemIdIsNormal(itemid));

        tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
        tuple.t_len = ItemIdGetLength(itemid);
        tuple.t_tableOid = RelationGetRelid(vacrel->rel);

        switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf))
        {
            case HEAPTUPLE_LIVE:
                {
                    TransactionId xmin;

                    /* Check comments in lazy_scan_heap. */
                    if (!HeapTupleHeaderXminCommitted(tuple.t_data))
                    {
                        all_visible = false;
                        *all_frozen = false;
                        break;
                    }

                    /*
                     * The inserter definitely committed. But is it old enough
                     * that everyone sees it as committed?
                     */
                    xmin = HeapTupleHeaderGetXmin(tuple.t_data);
                    if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
                    {
                        all_visible = false;
                        *all_frozen = false;
                        break;
                    }

                    /* Track newest xmin on page. */
                    if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
                        *visibility_cutoff_xid = xmin;

                    /* Check whether this tuple is already frozen or not */
                    if (all_visible && *all_frozen &&
                        heap_tuple_needs_eventual_freeze(tuple.t_data))
                        *all_frozen = false;
                }
                break;

            case HEAPTUPLE_DEAD:
            case HEAPTUPLE_RECENTLY_DEAD:
            case HEAPTUPLE_INSERT_IN_PROGRESS:
            case HEAPTUPLE_DELETE_IN_PROGRESS:
                {
                    all_visible = false;
                    *all_frozen = false;
                    break;
                }
            default:
                elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
                break;
        }
    }                           /* scan along page */

    /* Clear the offset information once we have processed the given page. */
    vacrel->offnum = InvalidOffsetNumber;

    return all_visible;
}

/*
 * Compute the number of parallel worker processes to request. Both index
 * vacuum and index cleanup can be executed with parallel workers. The index
 * is eligible for parallel vacuum iff its size is greater than
 * min_parallel_index_scan_size, as invoking workers for very small indexes
 * can hurt performance.
 *
 * nrequested is the number of parallel workers that the user requested. If
 * nrequested is 0, we compute the parallel degree based on the number of
 * indexes that support parallel vacuum. This function also sets
 * will_parallel_vacuum to remember the indexes that participate in parallel
 * vacuum.
 */
static int
compute_parallel_vacuum_workers(LVRelState *vacrel, int nrequested,
                                bool *will_parallel_vacuum)
{
    int         nindexes_parallel = 0;
    int         nindexes_parallel_bulkdel = 0;
    int         nindexes_parallel_cleanup = 0;
    int         parallel_workers;

    /*
     * We don't allow performing parallel operations in a standalone backend
     * or when parallelism is disabled.
     */
    if (!IsUnderPostmaster || max_parallel_maintenance_workers == 0)
        return 0;

    /*
     * Compute the number of indexes that can participate in parallel vacuum.
     */
    for (int idx = 0; idx < vacrel->nindexes; idx++)
    {
        Relation    indrel = vacrel->indrels[idx];
        uint8       vacoptions = indrel->rd_indam->amparallelvacuumoptions;

        if (vacoptions == VACUUM_OPTION_NO_PARALLEL ||
            RelationGetNumberOfBlocks(indrel) < min_parallel_index_scan_size)
            continue;

        will_parallel_vacuum[idx] = true;

        if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
            nindexes_parallel_bulkdel++;
        if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0) ||
            ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
            nindexes_parallel_cleanup++;
    }

    nindexes_parallel = Max(nindexes_parallel_bulkdel,
                            nindexes_parallel_cleanup);

    /* The leader process takes one index */
    nindexes_parallel--;

    /* No index supports parallel vacuum */
    if (nindexes_parallel <= 0)
        return 0;

    /* Compute the parallel degree */
    parallel_workers = (nrequested > 0) ?
        Min(nrequested, nindexes_parallel) : nindexes_parallel;

    /* Cap by max_parallel_maintenance_workers */
    parallel_workers = Min(parallel_workers, max_parallel_maintenance_workers);

    return parallel_workers;
}
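
/*
 * Worked example of the computation above: with four indexes of which three
 * support parallel bulkdelete and two support some form of parallel cleanup,
 * nindexes_parallel = Max(3, 2) = 3; the leader takes one index, leaving 2,
 * so with nrequested = 0 and the (assumed default) value of 2 for
 * max_parallel_maintenance_workers we end up requesting two workers.
 */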

/*
 * Update index statistics in pg_class if the statistics are accurate.
 */
static void
update_index_statistics(LVRelState *vacrel)
{
    Relation   *indrels = vacrel->indrels;
    int         nindexes = vacrel->nindexes;
    IndexBulkDeleteResult **indstats = vacrel->indstats;

    Assert(!IsInParallelMode());

    for (int idx = 0; idx < nindexes; idx++)
    {
        Relation    indrel = indrels[idx];
        IndexBulkDeleteResult *istat = indstats[idx];

        if (istat == NULL || istat->estimated_count)
            continue;

        /* Update index statistics */
        vac_update_relstats(indrel,
                            istat->num_pages,
                            istat->num_index_tuples,
                            0,
                            false,
                            InvalidTransactionId,
                            InvalidMultiXactId,
                            false);
    }
}

/*
 * This function prepares and returns parallel vacuum state if we can launch
 * even one worker. It is responsible for entering parallel mode, creating a
 * parallel context, and initializing the DSM segment.
 */
static LVParallelState *
begin_parallel_vacuum(LVRelState *vacrel, BlockNumber nblocks,
                      int nrequested)
{
    LVParallelState *lps = NULL;
    Relation   *indrels = vacrel->indrels;
    int         nindexes = vacrel->nindexes;
    ParallelContext *pcxt;
    LVShared   *shared;
    LVDeadTuples *dead_tuples;
    BufferUsage *buffer_usage;
    WalUsage   *wal_usage;
    bool       *will_parallel_vacuum;
    long        maxtuples;
    Size        est_shared;
    Size        est_deadtuples;
    int         nindexes_mwm = 0;
    int         parallel_workers = 0;
    int         querylen;

    /*
     * A parallel vacuum must be requested and there must be indexes on the
     * relation
     */
    Assert(nrequested >= 0);
    Assert(nindexes > 0);

    /*
     * Compute the number of parallel vacuum workers to launch
     */
    will_parallel_vacuum = (bool *) palloc0(sizeof(bool) * nindexes);
    parallel_workers = compute_parallel_vacuum_workers(vacrel,
                                                       nrequested,
                                                       will_parallel_vacuum);

    /* Can't perform vacuum in parallel */
    if (parallel_workers <= 0)
    {
        pfree(will_parallel_vacuum);
        return lps;
    }

    lps = (LVParallelState *) palloc0(sizeof(LVParallelState));

    EnterParallelMode();
    pcxt = CreateParallelContext("postgres", "parallel_vacuum_main",
                                 parallel_workers);
    Assert(pcxt->nworkers > 0);
    lps->pcxt = pcxt;

    /* Estimate size for shared information -- PARALLEL_VACUUM_KEY_SHARED */
    est_shared = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));
    for (int idx = 0; idx < nindexes; idx++)
    {
        Relation    indrel = indrels[idx];
        uint8       vacoptions = indrel->rd_indam->amparallelvacuumoptions;

        /*
         * The cleanup option should be either disabled, always performed in
         * parallel, or conditionally performed in parallel.
         */
        Assert(((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) ||
               ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0));
        Assert(vacoptions <= VACUUM_OPTION_MAX_VALID_VALUE);

        /* Skip indexes that don't participate in parallel vacuum */
        if (!will_parallel_vacuum[idx])
            continue;

        if (indrel->rd_indam->amusemaintenanceworkmem)
            nindexes_mwm++;

        est_shared = add_size(est_shared, sizeof(LVSharedIndStats));

        /*
         * Remember the number of indexes that support parallel operation for
         * each phase.
         */
        if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
            lps->nindexes_parallel_bulkdel++;
        if ((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0)
            lps->nindexes_parallel_cleanup++;
        if ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0)
            lps->nindexes_parallel_condcleanup++;
    }
    shm_toc_estimate_chunk(&pcxt->estimator, est_shared);
    shm_toc_estimate_keys(&pcxt->estimator, 1);

    /* Estimate size for dead tuples -- PARALLEL_VACUUM_KEY_DEAD_TUPLES */
    maxtuples = compute_max_dead_tuples(nblocks, true);
    est_deadtuples = MAXALIGN(SizeOfDeadTuples(maxtuples));
    shm_toc_estimate_chunk(&pcxt->estimator, est_deadtuples);
    shm_toc_estimate_keys(&pcxt->estimator, 1);

    /*
     * Estimate space for BufferUsage and WalUsage --
     * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE.
     *
     * If there are no extensions loaded that care, we could skip this. We
     * have no way of knowing whether anyone's looking at pgBufferUsage or
     * pgWalUsage, so do it unconditionally.
     */
    shm_toc_estimate_chunk(&pcxt->estimator,
                           mul_size(sizeof(BufferUsage), pcxt->nworkers));
    shm_toc_estimate_keys(&pcxt->estimator, 1);
    shm_toc_estimate_chunk(&pcxt->estimator,
                           mul_size(sizeof(WalUsage), pcxt->nworkers));
    shm_toc_estimate_keys(&pcxt->estimator, 1);

    /* Finally, estimate PARALLEL_VACUUM_KEY_QUERY_TEXT space */
    if (debug_query_string)
    {
        querylen = strlen(debug_query_string);
        shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
        shm_toc_estimate_keys(&pcxt->estimator, 1);
    }
    else
        querylen = 0;           /* keep compiler quiet */

    InitializeParallelDSM(pcxt);

    /* Prepare shared information */
    shared = (LVShared *) shm_toc_allocate(pcxt->toc, est_shared);
    MemSet(shared, 0, est_shared);
    shared->relid = RelationGetRelid(vacrel->rel);
    shared->elevel = elevel;
    shared->maintenance_work_mem_worker =
        (nindexes_mwm > 0) ?
        maintenance_work_mem / Min(parallel_workers, nindexes_mwm) :
        maintenance_work_mem;

    pg_atomic_init_u32(&(shared->cost_balance), 0);
    pg_atomic_init_u32(&(shared->active_nworkers), 0);
    pg_atomic_init_u32(&(shared->idx), 0);
    shared->offset = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));

    /*
     * Initialize variables for shared index statistics, set NULL bitmap and
     * the size of stats for each index.
     */
    memset(shared->bitmap, 0x00, BITMAPLEN(nindexes));
    for (int idx = 0; idx < nindexes; idx++)
    {
        if (!will_parallel_vacuum[idx])
            continue;

        /* Set NOT NULL as this index does support parallelism */
        shared->bitmap[idx >> 3] |= 1 << (idx & 0x07);
    }
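
    /*
     * For example, index 10 sets bit (10 & 0x07) = 2 of bitmap byte
     * (10 >> 3) = 1, i.e. bitmap[1] |= 0x04; IndStatsIsNull() is assumed to
     * test the same bit later to decide whether shared stats exist for a
     * given index.
     */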

    shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_SHARED, shared);
    lps->lvshared = shared;

    /* Prepare the dead tuple space */
    dead_tuples = (LVDeadTuples *) shm_toc_allocate(pcxt->toc, est_deadtuples);
    dead_tuples->max_tuples = maxtuples;
    dead_tuples->num_tuples = 0;
    MemSet(dead_tuples->itemptrs, 0, sizeof(ItemPointerData) * maxtuples);
    shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_DEAD_TUPLES, dead_tuples);
    vacrel->dead_tuples = dead_tuples;

    /*
     * Allocate space for each worker's BufferUsage and WalUsage; no need to
     * initialize
     */
    buffer_usage = shm_toc_allocate(pcxt->toc,
                                    mul_size(sizeof(BufferUsage), pcxt->nworkers));
    shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, buffer_usage);
    lps->buffer_usage = buffer_usage;
    wal_usage = shm_toc_allocate(pcxt->toc,
                                 mul_size(sizeof(WalUsage), pcxt->nworkers));
    shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_WAL_USAGE, wal_usage);
    lps->wal_usage = wal_usage;

    /* Store query string for workers */
    if (debug_query_string)
    {
        char       *sharedquery;

        sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
        memcpy(sharedquery, debug_query_string, querylen + 1);
        sharedquery[querylen] = '\0';
        shm_toc_insert(pcxt->toc,
                       PARALLEL_VACUUM_KEY_QUERY_TEXT, sharedquery);
    }

    pfree(will_parallel_vacuum);
    return lps;
}
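
/*
 * Summary of the DSM layout built above, keyed by shm_toc entry (a reading
 * aid only, not additional behavior):
 *
 *   PARALLEL_VACUUM_KEY_SHARED        LVShared plus per-index LVSharedIndStats
 *   PARALLEL_VACUUM_KEY_DEAD_TUPLES   LVDeadTuples with room for maxtuples
 *   PARALLEL_VACUUM_KEY_BUFFER_USAGE  one BufferUsage slot per worker
 *   PARALLEL_VACUUM_KEY_WAL_USAGE     one WalUsage slot per worker
 *   PARALLEL_VACUUM_KEY_QUERY_TEXT    query string (only if available)
 */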

/*
 * Destroy the parallel context, and end parallel mode.
 *
 * Since writes are not allowed during parallel mode, we copy the updated
 * index statistics from DSM into local memory here, and use that copy to
 * update pg_class later. One might think that we could exit from parallel
 * mode, update the index statistics, and then destroy the parallel context,
 * but that won't be safe (see ExitParallelMode).
 */
static void
end_parallel_vacuum(LVRelState *vacrel)
{
    IndexBulkDeleteResult **indstats = vacrel->indstats;
    LVParallelState *lps = vacrel->lps;
    int         nindexes = vacrel->nindexes;

    Assert(!IsParallelWorker());

    /* Copy the updated statistics */
    for (int idx = 0; idx < nindexes; idx++)
    {
        LVSharedIndStats *shared_istat;

        shared_istat = parallel_stats_for_idx(lps->lvshared, idx);

        /*
         * Skip index -- it must have been processed by the leader, from
         * inside do_serial_processing_for_unsafe_indexes()
         */
        if (shared_istat == NULL)
            continue;

        if (shared_istat->updated)
        {
            indstats[idx] = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
            memcpy(indstats[idx], &(shared_istat->istat), sizeof(IndexBulkDeleteResult));
        }
        else
            indstats[idx] = NULL;
    }

    DestroyParallelContext(lps->pcxt);
    ExitParallelMode();

    /* Deactivate parallel vacuum */
    pfree(lps);
    vacrel->lps = NULL;
}

/*
 * Return shared memory statistics for index at offset 'getidx', if any
 *
 * Returning NULL indicates that compute_parallel_vacuum_workers() determined
 * up front that the index is a totally unsuitable target for all parallel
 * processing. For example, the index could be smaller than the
 * min_parallel_index_scan_size cutoff.
 */
static LVSharedIndStats *
parallel_stats_for_idx(LVShared *lvshared, int getidx)
{
    char       *p;

    if (IndStatsIsNull(lvshared, getidx))
        return NULL;

    p = (char *) GetSharedIndStats(lvshared);
    for (int idx = 0; idx < getidx; idx++)
    {
        if (IndStatsIsNull(lvshared, idx))
            continue;

        p += sizeof(LVSharedIndStats);
    }

    return (LVSharedIndStats *) p;
}
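
/*
 * Example walk through the loop above: if indexes 0 and 2 have null stats
 * (their bits are clear in lvshared->bitmap) and getidx = 3, then p starts
 * at the first LVSharedIndStats slot and advances exactly once -- for index
 * 1 -- before being returned, since null entries consume no space in the
 * shared stats array.
 */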

/*
 * Returns false if the given index can't participate in parallel index
 * vacuum or parallel index cleanup.
 */
static bool
parallel_processing_is_safe(Relation indrel, LVShared *lvshared)
{
    uint8       vacoptions = indrel->rd_indam->amparallelvacuumoptions;

    /* first_time must be true only if for_cleanup is true */
    Assert(lvshared->for_cleanup || !lvshared->first_time);

    if (lvshared->for_cleanup)
    {
        /* Skip, if the index does not support parallel cleanup */
        if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) &&
            ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0))
            return false;

        /*
         * Skip, if the index supports parallel cleanup conditionally, but we
         * have already processed the index (for bulkdelete). See the
         * comments for option VACUUM_OPTION_PARALLEL_COND_CLEANUP to know
         * when indexes support parallel cleanup conditionally.
         */
        if (!lvshared->first_time &&
            ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
            return false;
    }
    else if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) == 0)
    {
        /* Skip if the index does not support parallel bulk deletion */
        return false;
    }

    return true;
}
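
/*
 * For instance, an index AM advertising VACUUM_OPTION_PARALLEL_BULKDEL |
 * VACUUM_OPTION_PARALLEL_COND_CLEANUP (btree is assumed to be such a case)
 * is processed in parallel during bulkdelete, and during cleanup only when
 * no bulkdelete pass has run yet; a cleanup pass that follows a bulkdelete
 * falls back to serial processing for that index.
 */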

/*
 * Perform work within a launched parallel process.
 *
 * Since parallel vacuum workers perform only index vacuum or index cleanup,
 * we don't need to report progress information.
 */
void
parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
{
    Relation    rel;
    Relation   *indrels;
    LVShared   *lvshared;
    LVDeadTuples *dead_tuples;
    BufferUsage *buffer_usage;
    WalUsage   *wal_usage;
    int         nindexes;
    char       *sharedquery;
    LVRelState  vacrel;
    ErrorContextCallback errcallback;

    lvshared = (LVShared *) shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_SHARED,
                                           false);
    elevel = lvshared->elevel;

    if (lvshared->for_cleanup)
        elog(DEBUG1, "starting parallel vacuum worker for cleanup");
    else
        elog(DEBUG1, "starting parallel vacuum worker for bulk delete");

    /* Set debug_query_string for individual workers */
    sharedquery = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_QUERY_TEXT, true);
    debug_query_string = sharedquery;
    pgstat_report_activity(STATE_RUNNING, debug_query_string);

    /*
     * Open table. The lock mode is the same as the leader process's; that's
     * okay because the lock mode does not conflict among the parallel
     * workers.
     */
    rel = table_open(lvshared->relid, ShareUpdateExclusiveLock);

    /*
     * Open all indexes. indrels are sorted in order by OID, which should
     * match the leader's ordering.
     */
    vac_open_indexes(rel, RowExclusiveLock, &nindexes, &indrels);
    Assert(nindexes > 0);

    /* Set dead tuple space */
    dead_tuples = (LVDeadTuples *) shm_toc_lookup(toc,
                                                  PARALLEL_VACUUM_KEY_DEAD_TUPLES,
                                                  false);

    /* Set cost-based vacuum delay */
    VacuumCostActive = (VacuumCostDelay > 0);
    VacuumCostBalance = 0;
    VacuumPageHit = 0;
    VacuumPageMiss = 0;
    VacuumPageDirty = 0;
    VacuumCostBalanceLocal = 0;
    VacuumSharedCostBalance = &(lvshared->cost_balance);
    VacuumActiveNWorkers = &(lvshared->active_nworkers);

    vacrel.rel = rel;
    vacrel.indrels = indrels;
    vacrel.nindexes = nindexes;
    /* Each parallel VACUUM worker gets its own access strategy */
    vacrel.bstrategy = GetAccessStrategy(BAS_VACUUM);
    vacrel.indstats = (IndexBulkDeleteResult **)
        palloc0(nindexes * sizeof(IndexBulkDeleteResult *));

    if (lvshared->maintenance_work_mem_worker > 0)
        maintenance_work_mem = lvshared->maintenance_work_mem_worker;

    /*
     * Initialize vacrel for use as error callback arg by parallel worker.
     */
    vacrel.relnamespace = get_namespace_name(RelationGetNamespace(rel));
    vacrel.relname = pstrdup(RelationGetRelationName(rel));
    vacrel.indname = NULL;
    vacrel.phase = VACUUM_ERRCB_PHASE_UNKNOWN;  /* Not yet processing */
    vacrel.dead_tuples = dead_tuples;

    /* Setup error traceback support for ereport() */
    errcallback.callback = vacuum_error_callback;
    errcallback.arg = &vacrel;
    errcallback.previous = error_context_stack;
    error_context_stack = &errcallback;

    /* Prepare to track buffer usage during parallel execution */
    InstrStartParallelQuery();

    /* Process indexes to perform vacuum/cleanup */
    do_parallel_processing(&vacrel, lvshared);

    /* Report buffer/WAL usage during parallel execution */
    buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false);
    wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false);
    InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
                          &wal_usage[ParallelWorkerNumber]);

    /* Pop the error context stack */
    error_context_stack = errcallback.previous;

    vac_close_indexes(nindexes, indrels, RowExclusiveLock);
    table_close(rel, ShareUpdateExclusiveLock);
    FreeAccessStrategy(vacrel.bstrategy);
    pfree(vacrel.indstats);
}

/*
 * Error context callback for errors occurring during vacuum.
 */
static void
vacuum_error_callback(void *arg)
{
    LVRelState *errinfo = arg;

    switch (errinfo->phase)
    {
        case VACUUM_ERRCB_PHASE_SCAN_HEAP:
            if (BlockNumberIsValid(errinfo->blkno))
            {
                if (OffsetNumberIsValid(errinfo->offnum))
                    errcontext("while scanning block %u offset %u of relation \"%s.%s\"",
                               errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
                else
                    errcontext("while scanning block %u of relation \"%s.%s\"",
                               errinfo->blkno, errinfo->relnamespace, errinfo->relname);
            }
            else
                errcontext("while scanning relation \"%s.%s\"",
                           errinfo->relnamespace, errinfo->relname);
            break;

        case VACUUM_ERRCB_PHASE_VACUUM_HEAP:
            if (BlockNumberIsValid(errinfo->blkno))
            {
                if (OffsetNumberIsValid(errinfo->offnum))
                    errcontext("while vacuuming block %u offset %u of relation \"%s.%s\"",
                               errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
                else
                    errcontext("while vacuuming block %u of relation \"%s.%s\"",
                               errinfo->blkno, errinfo->relnamespace, errinfo->relname);
            }
            else
                errcontext("while vacuuming relation \"%s.%s\"",
                           errinfo->relnamespace, errinfo->relname);
            break;

        case VACUUM_ERRCB_PHASE_VACUUM_INDEX:
            errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"",
                       errinfo->indname, errinfo->relnamespace, errinfo->relname);
            break;

        case VACUUM_ERRCB_PHASE_INDEX_CLEANUP:
            errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"",
                       errinfo->indname, errinfo->relnamespace, errinfo->relname);
            break;

        case VACUUM_ERRCB_PHASE_TRUNCATE:
            if (BlockNumberIsValid(errinfo->blkno))
                errcontext("while truncating relation \"%s.%s\" to %u blocks",
                           errinfo->relnamespace, errinfo->relname, errinfo->blkno);
            break;

        case VACUUM_ERRCB_PHASE_UNKNOWN:
        default:
            return;             /* do nothing; the errinfo may not be
                                 * initialized */
    }
}

/*
 * Updates the information required for the vacuum error callback. This also
 * saves the current information, which can later be restored via
 * restore_vacuum_error_info.
 */
static void
update_vacuum_error_info(LVRelState *vacrel, LVSavedErrInfo *saved_vacrel,
                         int phase, BlockNumber blkno, OffsetNumber offnum)
{
    if (saved_vacrel)
    {
        saved_vacrel->offnum = vacrel->offnum;
        saved_vacrel->blkno = vacrel->blkno;
        saved_vacrel->phase = vacrel->phase;
    }

    vacrel->blkno = blkno;
    vacrel->offnum = offnum;
    vacrel->phase = phase;
}

/*
 * Restores the vacuum information saved via a prior call to
 * update_vacuum_error_info.
 */
static void
restore_vacuum_error_info(LVRelState *vacrel,
                          const LVSavedErrInfo *saved_vacrel)
{
    vacrel->blkno = saved_vacrel->blkno;
    vacrel->offnum = saved_vacrel->offnum;
    vacrel->phase = saved_vacrel->phase;
}