/*-------------------------------------------------------------------------
 *
 * vacuumlazy.c
 *	  Concurrent ("lazy") vacuuming.
 *
 *
 * The major space usage for LAZY VACUUM is storage for the array of dead tuple
 * TIDs.  We want to ensure we can vacuum even the very largest relations with
 * finite memory space usage.  To do that, we set upper bounds on the number of
 * tuples we will keep track of at once.
 *
 * We are willing to use at most maintenance_work_mem (or perhaps
 * autovacuum_work_mem) memory space to keep track of dead tuples.  We
 * initially allocate an array of TIDs of that size, with an upper limit that
 * depends on table size (this limit ensures we don't allocate a huge area
 * uselessly for vacuuming small tables).  If the array threatens to overflow,
 * we suspend the heap scan phase and perform a pass of index cleanup and page
 * compaction, then resume the heap scan with an empty TID array.
 *
 * If we're processing a table with no indexes, we can just vacuum each page
 * as we go; there's no need to save up multiple tuples to minimize the number
 * of index scans performed.  So we don't use maintenance_work_mem memory for
 * the TID array, just enough to hold as many heap tuples as fit on one page.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/vacuumlazy.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/genam.h"
#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/htup_details.h"
#include "access/multixact.h"
#include "access/transam.h"
#include "access/visibilitymap.h"
#include "access/xlog.h"
#include "catalog/storage.h"
#include "commands/dbcommands.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/autovacuum.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"
#include "utils/timestamp.h"

/*
 * Space/time tradeoff parameters: do these need to be user-tunable?
 *
 * To consider truncating the relation, we want there to be at least
 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
 * is less) potentially-freeable pages.
 */
#define REL_TRUNCATE_MINIMUM	1000
#define REL_TRUNCATE_FRACTION	16

/*
 * Timing parameters for truncate locking heuristics.
 *
 * These were not exposed as user tunable GUC values because it didn't seem
 * that the potential for improvement was great enough to merit the cost of
 * supporting them.
 */
#define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL		20	/* ms */
#define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL		50	/* ms */
#define VACUUM_TRUNCATE_LOCK_TIMEOUT			5000	/* ms */
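
/*
 * With these defaults, count_nondeletable_pages() checks for other backends
 * waiting on our AccessExclusiveLock roughly every 20ms, and
 * lazy_truncate_heap() retries the conditional lock acquisition every 50ms,
 * giving up on truncation after about 5s of failing to get the lock.
 */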

/*
 * When a table has no indexes, vacuum the FSM after every 8GB, approximately
 * (it won't be exact because we only vacuum FSM after processing a heap page
 * that has some removable tuples).  When there are indexes, this is ignored,
 * and we vacuum FSM after each index/heap cleaning pass.
 */
#define VACUUM_FSM_EVERY_PAGES \
	((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ))
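
/*
 * For example, with the default BLCKSZ of 8192 this works out to
 * (8 * 1024^3) / 8192 = 1048576 heap blocks between FSM vacuuming passes.
 */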

/*
 * Guesstimation of number of dead tuples per page.  This is used to
 * provide an upper limit to memory allocated when vacuuming small
 * tables.
 */
#define LAZY_ALLOC_TUPLES		MaxHeapTuplesPerPage

/*
 * Before we consider skipping a page that's marked as clean in the
 * visibility map, we must've seen at least this many clean pages.
 */
#define SKIP_PAGES_THRESHOLD	((BlockNumber) 32)
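
/*
 * See the comments ahead of the main loop in lazy_scan_heap() for the
 * reasoning: skipping only the odd page defeats OS readahead, and skipping
 * any page at all means relfrozenxid cannot be advanced.
 */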

/*
 * Size of the prefetch window for lazy vacuum backwards truncation scan.
 * Needs to be a power of 2.
 */
#define PREFETCH_SIZE			((BlockNumber) 32)
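
/*
 * Being a power of 2 lets the truncation scan align its prefetch window
 * with mask arithmetic rather than division; a sketch of the idiom
 * (variable name illustrative):
 *
 *		prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
 */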

typedef struct LVRelStats
{
	/* useindex = true means two-pass strategy; false means one-pass */
	bool		useindex;
	/* Overall statistics about rel */
	BlockNumber old_rel_pages;	/* previous value of pg_class.relpages */
	BlockNumber rel_pages;		/* total number of pages */
	BlockNumber scanned_pages;	/* number of pages we examined */
	BlockNumber pinskipped_pages;	/* # of pages we skipped due to a pin */
	BlockNumber frozenskipped_pages;	/* # of frozen pages we skipped */
	BlockNumber tupcount_pages; /* pages whose tuples we counted */
	double		old_live_tuples;	/* previous value of pg_class.reltuples */
	double		new_rel_tuples; /* new estimated total # of tuples */
	double		new_live_tuples;	/* new estimated total # of live tuples */
	double		new_dead_tuples;	/* new estimated total # of dead tuples */
	BlockNumber pages_removed;
	double		tuples_deleted;
	BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
	/* List of TIDs of tuples we intend to delete */
	/* NB: this list is ordered by TID address */
	int			num_dead_tuples;	/* current # of entries */
	int			max_dead_tuples;	/* # slots allocated in array */
	ItemPointer dead_tuples;	/* array of ItemPointerData */
	int			num_index_scans;
	TransactionId latestRemovedXid;
	bool		lock_waiter_detected;
} LVRelStats;
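
/*
 * Keeping dead_tuples sorted in TID order is what lets lazy_tid_reaped()
 * (the reaped-TID callback passed to the index AMs) probe the array with a
 * plain binary search; a minimal sketch of that lookup:
 *
 *		res = (ItemPointer) bsearch((void *) itemptr,
 *									(void *) vacrelstats->dead_tuples,
 *									vacrelstats->num_dead_tuples,
 *									sizeof(ItemPointerData),
 *									vac_cmp_itemptr);
 *		return (res != NULL);
 */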


/* A few variables that don't seem worth passing around as parameters */
static int	elevel = -1;

static TransactionId OldestXmin;
static TransactionId FreezeLimit;
static MultiXactId MultiXactCutoff;

static BufferAccessStrategy vac_strategy;


/* non-export function prototypes */
static void lazy_scan_heap(Relation onerel, VacuumParams *params,
						   LVRelStats *vacrelstats, Relation *Irel, int nindexes,
						   bool aggressive);
static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
static bool lazy_check_needs_freeze(Buffer buf, bool *hastup);
static void lazy_vacuum_index(Relation indrel,
							  IndexBulkDeleteResult **stats,
							  LVRelStats *vacrelstats);
static void lazy_cleanup_index(Relation indrel,
							   IndexBulkDeleteResult *stats,
							   LVRelStats *vacrelstats);
static int	lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
							 int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer);
static bool should_attempt_truncation(VacuumParams *params,
									  LVRelStats *vacrelstats);
static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
static BlockNumber count_nondeletable_pages(Relation onerel,
											LVRelStats *vacrelstats);
static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
								   ItemPointer itemptr);
static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
static int	vac_cmp_itemptr(const void *left, const void *right);
static bool heap_page_is_all_visible(Relation rel, Buffer buf,
									 TransactionId *visibility_cutoff_xid, bool *all_frozen);


/*
 *	heap_vacuum_rel() -- perform VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its relpages and reltuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
void
heap_vacuum_rel(Relation onerel, VacuumParams *params,
				BufferAccessStrategy bstrategy)
{
	LVRelStats *vacrelstats;
	Relation   *Irel;
	int			nindexes;
	PGRUsage	ru0;
	TimestampTz starttime = 0;
	long		secs;
	int			usecs;
	double		read_rate,
				write_rate;
	bool		aggressive;		/* should we scan all unfrozen pages? */
	bool		scanned_all_unfrozen;	/* actually scanned all such pages? */
	TransactionId xidFullScanLimit;
	MultiXactId mxactFullScanLimit;
	BlockNumber new_rel_pages;
	BlockNumber new_rel_allvisible;
	double		new_live_tuples;
	TransactionId new_frozen_xid;
	MultiXactId new_min_multi;

	Assert(params != NULL);
	Assert(params->index_cleanup != VACOPT_TERNARY_DEFAULT);
	Assert(params->truncate != VACOPT_TERNARY_DEFAULT);

	/* not every AM requires these to be valid, but heap does */
	Assert(TransactionIdIsNormal(onerel->rd_rel->relfrozenxid));
	Assert(MultiXactIdIsValid(onerel->rd_rel->relminmxid));

	/* measure elapsed time iff autovacuum logging requires it */
	if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
	{
		pg_rusage_init(&ru0);
		starttime = GetCurrentTimestamp();
	}

	if (params->options & VACOPT_VERBOSE)
		elevel = INFO;
	else
		elevel = DEBUG2;

	pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM,
								  RelationGetRelid(onerel));

	vac_strategy = bstrategy;

	vacuum_set_xid_limits(onerel,
						  params->freeze_min_age,
						  params->freeze_table_age,
						  params->multixact_freeze_min_age,
						  params->multixact_freeze_table_age,
						  &OldestXmin, &FreezeLimit, &xidFullScanLimit,
						  &MultiXactCutoff, &mxactFullScanLimit);

	/*
	 * We request an aggressive scan if the table's frozen Xid is now older
	 * than or equal to the requested Xid full-table scan limit; or if the
	 * table's minimum MultiXactId is older than or equal to the requested
	 * mxid full-table scan limit; or if DISABLE_PAGE_SKIPPING was specified.
	 */
	aggressive = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
											   xidFullScanLimit);
	aggressive |= MultiXactIdPrecedesOrEquals(onerel->rd_rel->relminmxid,
											  mxactFullScanLimit);
	if (params->options & VACOPT_DISABLE_PAGE_SKIPPING)
		aggressive = true;

	vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));

	vacrelstats->old_rel_pages = onerel->rd_rel->relpages;
	vacrelstats->old_live_tuples = onerel->rd_rel->reltuples;
	vacrelstats->num_index_scans = 0;
	vacrelstats->pages_removed = 0;
	vacrelstats->lock_waiter_detected = false;

	/* Open all indexes of the relation */
	vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
	vacrelstats->useindex = (nindexes > 0 &&
							 params->index_cleanup == VACOPT_TERNARY_ENABLED);

	/* Do the vacuuming */
	lazy_scan_heap(onerel, params, vacrelstats, Irel, nindexes, aggressive);

	/* Done with indexes */
	vac_close_indexes(nindexes, Irel, NoLock);

	/*
	 * Compute whether we actually scanned all the unfrozen pages. If we did,
	 * we can adjust relfrozenxid and relminmxid.
	 *
	 * NB: We need to check this before truncating the relation, because that
	 * will change ->rel_pages.
	 */
	if ((vacrelstats->scanned_pages + vacrelstats->frozenskipped_pages)
		< vacrelstats->rel_pages)
	{
		Assert(!aggressive);
		scanned_all_unfrozen = false;
	}
	else
		scanned_all_unfrozen = true;

	/*
	 * Optionally truncate the relation.
	 */
	if (should_attempt_truncation(params, vacrelstats))
		lazy_truncate_heap(onerel, vacrelstats);

	/* Report that we are now doing final cleanup */
	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
								 PROGRESS_VACUUM_PHASE_FINAL_CLEANUP);

	/*
	 * Update statistics in pg_class.
	 *
	 * A corner case here is that if we scanned no pages at all because every
	 * page is all-visible, we should not update relpages/reltuples, because
	 * we have no new information to contribute.  In particular this keeps us
	 * from replacing relpages=reltuples=0 (which means "unknown tuple
	 * density") with nonzero relpages and reltuples=0 (which means "zero
	 * tuple density") unless there's some actual evidence for the latter.
	 *
	 * It's important that we use tupcount_pages and not scanned_pages for the
	 * check described above; scanned_pages counts pages where we could not
	 * get cleanup lock, and which were processed only for frozenxid purposes.
	 *
	 * We do update relallvisible even in the corner case, since if the table
	 * is all-visible we'd definitely like to know that.  But clamp the value
	 * to be not more than what we're setting relpages to.
	 *
	 * Also, don't change relfrozenxid/relminmxid if we skipped any pages,
	 * since then we don't know for certain that all tuples have a newer xmin.
	 */
	new_rel_pages = vacrelstats->rel_pages;
	new_live_tuples = vacrelstats->new_live_tuples;
	if (vacrelstats->tupcount_pages == 0 && new_rel_pages > 0)
	{
		new_rel_pages = vacrelstats->old_rel_pages;
		new_live_tuples = vacrelstats->old_live_tuples;
	}

	visibilitymap_count(onerel, &new_rel_allvisible, NULL);
	if (new_rel_allvisible > new_rel_pages)
		new_rel_allvisible = new_rel_pages;

	new_frozen_xid = scanned_all_unfrozen ? FreezeLimit : InvalidTransactionId;
	new_min_multi = scanned_all_unfrozen ? MultiXactCutoff : InvalidMultiXactId;

	vac_update_relstats(onerel,
						new_rel_pages,
						new_live_tuples,
						new_rel_allvisible,
						nindexes > 0,
						new_frozen_xid,
						new_min_multi,
						false);

	/* report results to the stats collector, too */
	pgstat_report_vacuum(RelationGetRelid(onerel),
						 onerel->rd_rel->relisshared,
						 new_live_tuples,
						 vacrelstats->new_dead_tuples);
	pgstat_progress_end_command();

	/* and log the action if appropriate */
	if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
	{
		TimestampTz endtime = GetCurrentTimestamp();

		if (params->log_min_duration == 0 ||
			TimestampDifferenceExceeds(starttime, endtime,
									   params->log_min_duration))
		{
			StringInfoData buf;
			char	   *msgfmt;

			TimestampDifference(starttime, endtime, &secs, &usecs);

			read_rate = 0;
			write_rate = 0;
			if ((secs > 0) || (usecs > 0))
			{
				read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) /
					(secs + usecs / 1000000.0);
				write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) /
					(secs + usecs / 1000000.0);
			}

			/*
			 * This is pretty messy, but we split it up so that we can skip
			 * emitting individual parts of the message when not applicable.
			 */
			initStringInfo(&buf);
			if (params->is_wraparound)
			{
				if (aggressive)
					msgfmt = _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
				else
					msgfmt = _("automatic vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
			}
			else
			{
				if (aggressive)
					msgfmt = _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n");
				else
					msgfmt = _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n");
			}
			appendStringInfo(&buf, msgfmt,
							 get_database_name(MyDatabaseId),
							 get_namespace_name(RelationGetNamespace(onerel)),
							 RelationGetRelationName(onerel),
							 vacrelstats->num_index_scans);
			appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n"),
							 vacrelstats->pages_removed,
							 vacrelstats->rel_pages,
							 vacrelstats->pinskipped_pages,
							 vacrelstats->frozenskipped_pages);
			appendStringInfo(&buf,
							 _("tuples: %.0f removed, %.0f remain, %.0f are dead but not yet removable, oldest xmin: %u\n"),
							 vacrelstats->tuples_deleted,
							 vacrelstats->new_rel_tuples,
							 vacrelstats->new_dead_tuples,
							 OldestXmin);
			appendStringInfo(&buf,
							 _("buffer usage: %d hits, %d misses, %d dirtied\n"),
							 VacuumPageHit,
							 VacuumPageMiss,
							 VacuumPageDirty);
			appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
							 read_rate, write_rate);
			appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0));

			ereport(LOG,
					(errmsg_internal("%s", buf.data)));
			pfree(buf.data);
		}
	}
}

/*
 * For Hot Standby we need to know the highest transaction id that will
 * be removed by any change.  VACUUM proceeds in a number of passes so
 * we need to consider how each pass operates.  The first phase runs
 * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
 * progresses - these will have a latestRemovedXid on each record.
 * In some cases this removes all of the tuples to be removed, though
 * often we have dead tuples with index pointers so we must remember them
 * for removal in phase 3.  Index records for those rows are removed
 * in phase 2 and index blocks do not have MVCC information attached.
 * So before we can allow removal of any index tuples we need to issue
 * a WAL record containing the latestRemovedXid of rows that will be
 * removed in phase 3.  This allows recovery queries to block at the
 * correct place, i.e. before phase 2, rather than during phase 3
 * which would be after the rows have become inaccessible.
 */
static void
vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
{
	/*
	 * Skip this for relations for which no WAL is to be written, or if we're
	 * not trying to support archive recovery.
	 */
	if (!RelationNeedsWAL(rel) || !XLogIsNeeded())
		return;

	/*
	 * No need to write the record at all unless it contains a valid value
	 */
	if (TransactionIdIsValid(vacrelstats->latestRemovedXid))
		(void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid);
}

/*
 *	lazy_scan_heap() -- scan an open heap relation
 *
 *		This routine prunes each page in the heap, which will among other
 *		things truncate dead tuples to dead line pointers, defragment the
 *		page, and set commit status bits (see heap_page_prune).  It also builds
 *		lists of dead tuples and pages with free space, calculates statistics
 *		on the number of live tuples in the heap, and marks pages as
 *		all-visible if appropriate.  When done, or when we run low on space for
 *		dead-tuple TIDs, invoke vacuuming of indexes and call lazy_vacuum_heap
 *		to reclaim dead line pointers.
 *
 *		If there are no indexes then we can reclaim line pointers on the fly;
 *		dead line pointers need only be retained until all index pointers that
 *		reference them have been killed.
 */
static void
lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
			   Relation *Irel, int nindexes, bool aggressive)
{
	BlockNumber nblocks,
				blkno;
	HeapTupleData tuple;
	char	   *relname;
	TransactionId relfrozenxid = onerel->rd_rel->relfrozenxid;
	TransactionId relminmxid = onerel->rd_rel->relminmxid;
	BlockNumber empty_pages,
				vacuumed_pages,
				next_fsm_block_to_vacuum;
	double		num_tuples,		/* total number of nonremovable tuples */
				live_tuples,	/* live tuples (reltuples estimate) */
				tups_vacuumed,	/* tuples cleaned up by vacuum */
				nkeep,			/* dead-but-not-removable tuples */
				nunused;		/* unused line pointers */
	IndexBulkDeleteResult **indstats;
	int			i;
	PGRUsage	ru0;
	Buffer		vmbuffer = InvalidBuffer;
	BlockNumber next_unskippable_block;
	bool		skipping_blocks;
	xl_heap_freeze_tuple *frozen;
	StringInfoData buf;
	const int	initprog_index[] = {
		PROGRESS_VACUUM_PHASE,
		PROGRESS_VACUUM_TOTAL_HEAP_BLKS,
		PROGRESS_VACUUM_MAX_DEAD_TUPLES
	};
	int64		initprog_val[3];

	pg_rusage_init(&ru0);

	relname = RelationGetRelationName(onerel);
	if (aggressive)
		ereport(elevel,
				(errmsg("aggressively vacuuming \"%s.%s\"",
						get_namespace_name(RelationGetNamespace(onerel)),
						relname)));
	else
		ereport(elevel,
				(errmsg("vacuuming \"%s.%s\"",
						get_namespace_name(RelationGetNamespace(onerel)),
						relname)));

	empty_pages = vacuumed_pages = 0;
	next_fsm_block_to_vacuum = (BlockNumber) 0;
	num_tuples = live_tuples = tups_vacuumed = nkeep = nunused = 0;

	indstats = (IndexBulkDeleteResult **)
		palloc0(nindexes * sizeof(IndexBulkDeleteResult *));

	nblocks = RelationGetNumberOfBlocks(onerel);
	vacrelstats->rel_pages = nblocks;
	vacrelstats->scanned_pages = 0;
	vacrelstats->tupcount_pages = 0;
	vacrelstats->nonempty_pages = 0;
	vacrelstats->latestRemovedXid = InvalidTransactionId;

	lazy_space_alloc(vacrelstats, nblocks);
	frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);

	/* Report that we're scanning the heap, advertising total # of blocks */
	initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP;
	initprog_val[1] = nblocks;
	initprog_val[2] = vacrelstats->max_dead_tuples;
	pgstat_progress_update_multi_param(3, initprog_index, initprog_val);

	/*
	 * Except when aggressive is set, we want to skip pages that are
	 * all-visible according to the visibility map, but only when we can skip
	 * at least SKIP_PAGES_THRESHOLD consecutive pages.  Since we're reading
	 * sequentially, the OS should be doing readahead for us, so there's no
	 * gain in skipping a page now and then; that's likely to disable
	 * readahead and so be counterproductive.  Also, skipping even a single
	 * page means that we can't update relfrozenxid, so we only want to do it
	 * if we can skip a goodly number of pages.
	 *
	 * When aggressive is set, we can't skip pages just because they are
	 * all-visible, but we can still skip pages that are all-frozen, since
	 * such pages do not need freezing and do not affect the value that we can
	 * safely set for relfrozenxid or relminmxid.
	 *
	 * Before entering the main loop, establish the invariant that
	 * next_unskippable_block is the next block number >= blkno that we can't
	 * skip based on the visibility map, either all-visible for a regular scan
	 * or all-frozen for an aggressive scan.  We set it to nblocks if there's
	 * no such block.  We also set up the skipping_blocks flag correctly at
	 * this stage.
	 *
	 * Note: The value returned by visibilitymap_get_status could be slightly
	 * out-of-date, since we make this test before reading the corresponding
	 * heap page or locking the buffer.  This is OK.  If we mistakenly think
	 * that the page is all-visible or all-frozen when in fact the flag's just
	 * been cleared, we might fail to vacuum the page.  It's easy to see that
	 * skipping a page when aggressive is not set is not a very big deal; we
	 * might leave some dead tuples lying around, but the next vacuum will
	 * find them.  But even when aggressive *is* set, it's still OK if we miss
	 * a page whose all-frozen marking has just been cleared.  Any new XIDs
	 * just added to that page are necessarily newer than the GlobalXmin we
	 * computed, so they'll have no effect on the value to which we can safely
	 * set relfrozenxid.  A similar argument applies for MXIDs and relminmxid.
	 *
	 * We will scan the table's last page, at least to the extent of
	 * determining whether it has tuples or not, even if it should be skipped
	 * according to the above rules; except when we've already determined that
	 * it's not worth trying to truncate the table.  This avoids having
	 * lazy_truncate_heap() take access-exclusive lock on the table to attempt
	 * a truncation that just fails immediately because there are tuples in
	 * the last page.  This is worth avoiding mainly because such a lock must
	 * be replayed on any hot standby, where it can be disruptive.
	 */
	next_unskippable_block = 0;
	if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
	{
		while (next_unskippable_block < nblocks)
		{
			uint8		vmstatus;

			vmstatus = visibilitymap_get_status(onerel, next_unskippable_block,
												&vmbuffer);
			if (aggressive)
			{
				if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0)
					break;
			}
			else
			{
				if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
					break;
			}
			vacuum_delay_point();
			next_unskippable_block++;
		}
	}

	if (next_unskippable_block >= SKIP_PAGES_THRESHOLD)
		skipping_blocks = true;
	else
		skipping_blocks = false;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		tupgone,
					hastup;
		int			prev_dead_count;
		int			nfrozen;
		Size		freespace;
		bool		all_visible_according_to_vm = false;
		bool		all_visible;
		bool		all_frozen = true;	/* provided all_visible is also true */
		bool		has_dead_tuples;
		TransactionId visibility_cutoff_xid = InvalidTransactionId;

		/* see note above about forcing scanning of last page */
#define FORCE_CHECK_PAGE() \
		(blkno == nblocks - 1 && should_attempt_truncation(params, vacrelstats))

		pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);

		if (blkno == next_unskippable_block)
		{
			/* Time to advance next_unskippable_block */
			next_unskippable_block++;
			if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
			{
				while (next_unskippable_block < nblocks)
				{
					uint8		vmskipflags;

					vmskipflags = visibilitymap_get_status(onerel,
														   next_unskippable_block,
														   &vmbuffer);
					if (aggressive)
					{
						if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0)
							break;
					}
					else
					{
						if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0)
							break;
					}
					vacuum_delay_point();
					next_unskippable_block++;
				}
			}

			/*
			 * We know we can't skip the current block.  But set up
			 * skipping_blocks to do the right thing at the following blocks.
			 */
			if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD)
				skipping_blocks = true;
			else
				skipping_blocks = false;

			/*
			 * Normally, the fact that we can't skip this block must mean that
			 * it's not all-visible.  But in an aggressive vacuum we know only
			 * that it's not all-frozen, so it might still be all-visible.
			 */
			if (aggressive && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer))
				all_visible_according_to_vm = true;
		}
		else
		{
			/*
			 * The current block is potentially skippable; if we've seen a
			 * long enough run of skippable blocks to justify skipping it, and
			 * we're not forced to check it, then go ahead and skip.
			 * Otherwise, the page must be at least all-visible if not
			 * all-frozen, so we can set all_visible_according_to_vm = true.
			 */
			if (skipping_blocks && !FORCE_CHECK_PAGE())
			{
				/*
				 * Tricky, tricky.  If this is in aggressive vacuum, the page
				 * must have been all-frozen at the time we checked whether it
				 * was skippable, but it might not be any more.  We must be
				 * careful to count it as a skipped all-frozen page in that
				 * case, or else we'll think we can't update relfrozenxid and
				 * relminmxid.  If it's not an aggressive vacuum, we don't
				 * know whether it was all-frozen, so we have to recheck; but
				 * in this case an approximate answer is OK.
				 */
				if (aggressive || VM_ALL_FROZEN(onerel, blkno, &vmbuffer))
					vacrelstats->frozenskipped_pages++;
				continue;
			}
			all_visible_according_to_vm = true;
		}

		vacuum_delay_point();

		/*
		 * If we are close to overrunning the available space for dead-tuple
		 * TIDs, pause and do a cycle of vacuuming before we tackle this page.
		 */
		if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
			vacrelstats->num_dead_tuples > 0)
		{
			const int	hvp_index[] = {
				PROGRESS_VACUUM_PHASE,
				PROGRESS_VACUUM_NUM_INDEX_VACUUMS
			};
			int64		hvp_val[2];

			/*
			 * Before beginning index vacuuming, we release any pin we may
			 * hold on the visibility map page.  This isn't necessary for
			 * correctness, but we do it anyway to avoid holding the pin
			 * across a lengthy, unrelated operation.
			 */
			if (BufferIsValid(vmbuffer))
			{
				ReleaseBuffer(vmbuffer);
				vmbuffer = InvalidBuffer;
			}

			/* Log cleanup info before we touch indexes */
			vacuum_log_cleanup_info(onerel, vacrelstats);

			/* Report that we are now vacuuming indexes */
			pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
										 PROGRESS_VACUUM_PHASE_VACUUM_INDEX);

			/* Remove index entries */
			for (i = 0; i < nindexes; i++)
				lazy_vacuum_index(Irel[i],
								  &indstats[i],
								  vacrelstats);

			/*
			 * Report that we are now vacuuming the heap.  We also increase
			 * the number of index scans here; note that by using
			 * pgstat_progress_update_multi_param we can update both
			 * parameters atomically.
			 */
			hvp_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_HEAP;
			hvp_val[1] = vacrelstats->num_index_scans + 1;
			pgstat_progress_update_multi_param(2, hvp_index, hvp_val);

			/* Remove tuples from heap */
			lazy_vacuum_heap(onerel, vacrelstats);

			/*
			 * Forget the now-vacuumed tuples, and press on, but be careful
			 * not to reset latestRemovedXid since we want that value to be
			 * valid.
			 */
			vacrelstats->num_dead_tuples = 0;
			vacrelstats->num_index_scans++;

			/*
			 * Vacuum the Free Space Map to make newly-freed space visible on
			 * upper-level FSM pages.  Note we have not yet processed blkno.
			 */
			FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, blkno);
			next_fsm_block_to_vacuum = blkno;

			/* Report that we are once again scanning the heap */
			pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
										 PROGRESS_VACUUM_PHASE_SCAN_HEAP);
		}

		/*
		 * Pin the visibility map page in case we need to mark the page
		 * all-visible.  In most cases this will be very cheap, because we'll
		 * already have the correct page pinned anyway.  However, it's
		 * possible that (a) next_unskippable_block is covered by a different
		 * VM page than the current block or (b) we released our pin and did a
		 * cycle of index vacuuming.
		 */
		visibilitymap_pin(onerel, blkno, &vmbuffer);

		buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
								 RBM_NORMAL, vac_strategy);

		/* We need buffer cleanup lock so that we can prune HOT chains. */
		if (!ConditionalLockBufferForCleanup(buf))
		{
			/*
			 * If we're not performing an aggressive scan to guard against XID
			 * wraparound, and we don't want to forcibly check the page, then
			 * it's OK to skip vacuuming pages we get a lock conflict on. They
			 * will be dealt with in some future vacuum.
			 */
			if (!aggressive && !FORCE_CHECK_PAGE())
			{
				ReleaseBuffer(buf);
				vacrelstats->pinskipped_pages++;
				continue;
			}

			/*
			 * Read the page with share lock to see if any xids on it need to
			 * be frozen.  If not we just skip the page, after updating our
			 * scan statistics.  If there are some, we wait for cleanup lock.
			 *
			 * We could defer the lock request further by remembering the page
			 * and coming back to it later, or we could even register
			 * ourselves for multiple buffers and then service whichever one
			 * is received first.  For now, this seems good enough.
			 *
			 * If we get here with aggressive false, then we're just forcibly
			 * checking the page, and so we don't want to insist on getting
			 * the lock; we only need to know if the page contains tuples, so
			 * that we can update nonempty_pages correctly.  It's convenient
			 * to use lazy_check_needs_freeze() for both situations, though.
			 */
			LockBuffer(buf, BUFFER_LOCK_SHARE);
			if (!lazy_check_needs_freeze(buf, &hastup))
			{
				UnlockReleaseBuffer(buf);
				vacrelstats->scanned_pages++;
				vacrelstats->pinskipped_pages++;
				if (hastup)
					vacrelstats->nonempty_pages = blkno + 1;
				continue;
			}
			if (!aggressive)
			{
				/*
				 * Here, we must not advance scanned_pages; that would amount
				 * to claiming that the page contains no freezable tuples.
				 */
				UnlockReleaseBuffer(buf);
				vacrelstats->pinskipped_pages++;
				if (hastup)
					vacrelstats->nonempty_pages = blkno + 1;
				continue;
			}
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			LockBufferForCleanup(buf);
			/* drop through to normal processing */
		}

		vacrelstats->scanned_pages++;
		vacrelstats->tupcount_pages++;

		page = BufferGetPage(buf);

		if (PageIsNew(page))
		{
			bool		still_new;

			/*
			 * All-zeroes pages can be left over if either a backend extends
			 * the relation by a single page, but crashes before the newly
			 * initialized page has been written out, or when bulk-extending
			 * the relation (which creates a number of empty pages at the tail
			 * end of the relation, but enters them into the FSM).
			 *
			 * Make sure these pages are in the FSM, to ensure they can be
			 * reused.  Do that by testing if there's any space recorded for
			 * the page.  If not, enter it.
			 *
			 * Note we do not enter the page into the visibilitymap. That has
			 * the downside that we repeatedly visit this page in subsequent
			 * vacuums, but otherwise we'll never discover the space on a
			 * promoted standby.  The harm of repeated checking ought to
			 * normally not be too bad - the space usually should be used at
			 * some point, otherwise there wouldn't be any regular vacuums.
			 */

			/*
			 * Perform checking of FSM after releasing lock, the fsm is
			 * approximate, after all.
			 */
			still_new = PageIsNew(page);
			UnlockReleaseBuffer(buf);

			if (still_new)
			{
				empty_pages++;

				if (GetRecordedFreeSpace(onerel, blkno) == 0)
				{
					Size		freespace;

					freespace = BufferGetPageSize(buf) - SizeOfPageHeaderData;
					RecordPageWithFreeSpace(onerel, blkno, freespace);
				}
			}
			continue;
		}

		if (PageIsEmpty(page))
		{
			empty_pages++;
			freespace = PageGetHeapFreeSpace(page);

			/*
			 * Empty pages are always all-visible and all-frozen (note that
			 * the same is currently not true for new pages, see above).
			 */
			if (!PageIsAllVisible(page))
			{
				START_CRIT_SECTION();

				/* mark buffer dirty before writing a WAL record */
				MarkBufferDirty(buf);

				/*
				 * It's possible that another backend has extended the heap,
				 * initialized the page, and then failed to WAL-log the page
				 * due to an ERROR.  Since heap extension is not WAL-logged,
				 * recovery might try to replay our record setting the page
				 * all-visible and find that the page isn't initialized, which
				 * will cause a PANIC.  To prevent that, check whether the
				 * page has been previously WAL-logged, and if not, do that
				 * now.
				 */
				if (RelationNeedsWAL(onerel) &&
					PageGetLSN(page) == InvalidXLogRecPtr)
					log_newpage_buffer(buf, true);

				PageSetAllVisible(page);
				visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
								  vmbuffer, InvalidTransactionId,
								  VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
				END_CRIT_SECTION();
			}

			UnlockReleaseBuffer(buf);
			RecordPageWithFreeSpace(onerel, blkno, freespace);
			continue;
		}

		/*
		 * Prune all HOT-update chains in this page.
		 *
		 * We count tuples removed by the pruning step as removed by VACUUM.
		 */
		tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false,
										 &vacrelstats->latestRemovedXid);

		/*
		 * Now scan the page to collect vacuumable items and check for tuples
		 * requiring freezing.
		 */
		all_visible = true;
		has_dead_tuples = false;
		nfrozen = 0;
		hastup = false;
		prev_dead_count = vacrelstats->num_dead_tuples;
		maxoff = PageGetMaxOffsetNumber(page);

		/*
		 * Note: If you change anything in the loop below, also look at
		 * heap_page_is_all_visible to see if that needs to be changed.
		 */
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			/* Unused items require no processing, but we count 'em */
			if (!ItemIdIsUsed(itemid))
			{
				nunused += 1;
				continue;
			}

			/* Redirect items mustn't be touched */
			if (ItemIdIsRedirected(itemid))
			{
				hastup = true;	/* this page won't be truncatable */
				continue;
			}

			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			/*
			 * DEAD line pointers are to be vacuumed normally; but we don't
			 * count them in tups_vacuumed, else we'd be double-counting (at
			 * least in the common case where heap_page_prune() just freed up
			 * a non-HOT tuple).
			 */
			if (ItemIdIsDead(itemid))
			{
				lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
				all_visible = false;
				continue;
			}

			Assert(ItemIdIsNormal(itemid));

			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			tuple.t_tableOid = RelationGetRelid(onerel);

			tupgone = false;

			/*
			 * The criteria for counting a tuple as live in this block need to
			 * match what analyze.c's acquire_sample_rows() does, otherwise
			 * VACUUM and ANALYZE may produce wildly different reltuples
			 * values, e.g. when there are many recently-dead tuples.
			 *
			 * The logic here is a bit simpler than acquire_sample_rows(), as
			 * VACUUM can't run inside a transaction block, which makes some
			 * cases impossible (e.g. in-progress insert from the same
			 * transaction).
			 */
			switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
			{
				case HEAPTUPLE_DEAD:

					/*
					 * Ordinarily, DEAD tuples would have been removed by
					 * heap_page_prune(), but it's possible that the tuple
					 * state changed since heap_page_prune() looked.  In
					 * particular an INSERT_IN_PROGRESS tuple could have
					 * changed to DEAD if the inserter aborted.  So this
					 * cannot be considered an error condition.
					 *
					 * If the tuple is HOT-updated then it must only be
					 * removed by a prune operation; so we keep it just as if
					 * it were RECENTLY_DEAD.  Also, if it's a heap-only
					 * tuple, we choose to keep it, because it'll be a lot
					 * cheaper to get rid of it in the next pruning pass than
					 * to treat it like an indexed tuple.  Finally, if index
					 * cleanup is disabled, the second heap pass will not
					 * execute, and the tuple will not get removed, so we must
					 * treat it like any other dead tuple that we choose to
					 * keep.
					 *
					 * If this were to happen for a tuple that actually needed
					 * to be deleted, we'd be in trouble, because it'd
					 * possibly leave a tuple below the relation's xmin
					 * horizon alive.  heap_prepare_freeze_tuple() is prepared
					 * to detect that case and abort the transaction,
					 * preventing corruption.
					 */
					if (HeapTupleIsHotUpdated(&tuple) ||
						HeapTupleIsHeapOnly(&tuple) ||
						params->index_cleanup == VACOPT_TERNARY_DISABLED)
						nkeep += 1;
					else
						tupgone = true; /* we can delete the tuple */
					all_visible = false;
					break;
				case HEAPTUPLE_LIVE:

					/*
					 * Count it as live.  Not only is this natural, but it's
					 * also what acquire_sample_rows() does.
					 */
					live_tuples += 1;

					/*
					 * Is the tuple definitely visible to all transactions?
					 *
					 * NB: Like with per-tuple hint bits, we can't set the
					 * PD_ALL_VISIBLE flag if the inserter committed
					 * asynchronously.  See SetHintBits for more info.  Check
					 * that the tuple is hinted xmin-committed because of
					 * that.
					 */
					if (all_visible)
					{
						TransactionId xmin;

						if (!HeapTupleHeaderXminCommitted(tuple.t_data))
						{
							all_visible = false;
							break;
						}

						/*
						 * The inserter definitely committed.  But is it old
						 * enough that everyone sees it as committed?
						 */
						xmin = HeapTupleHeaderGetXmin(tuple.t_data);
						if (!TransactionIdPrecedes(xmin, OldestXmin))
						{
							all_visible = false;
							break;
						}

						/* Track newest xmin on page. */
						if (TransactionIdFollows(xmin, visibility_cutoff_xid))
							visibility_cutoff_xid = xmin;
					}
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must not remove it
					 * from relation.
					 */
					nkeep += 1;
					all_visible = false;
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:

					/*
					 * This is an expected case during concurrent vacuum.
					 *
					 * We do not count these rows as live, because we expect
					 * the inserting transaction to update the counters at
					 * commit, and we assume that will happen only after we
					 * report our results.  This assumption is a bit shaky,
					 * but it is what acquire_sample_rows() does, so be
					 * consistent.
					 */
					all_visible = false;
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					all_visible = false;

					/*
					 * Count such rows as live.  As above, we assume the
					 * deleting transaction will commit and update the
					 * counters after we report.
					 */
					live_tuples += 1;
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					break;
			}

			if (tupgone)
			{
				lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
				HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
													   &vacrelstats->latestRemovedXid);
				tups_vacuumed += 1;
				has_dead_tuples = true;
			}
			else
			{
				bool		tuple_totally_frozen;

				num_tuples += 1;
				hastup = true;

				/*
				 * Each non-removable tuple must be checked to see if it needs
				 * freezing.  Note we already have exclusive buffer lock.
				 */
				if (heap_prepare_freeze_tuple(tuple.t_data,
											  relfrozenxid, relminmxid,
											  FreezeLimit, MultiXactCutoff,
											  &frozen[nfrozen],
											  &tuple_totally_frozen))
					frozen[nfrozen++].offset = offnum;

				if (!tuple_totally_frozen)
					all_frozen = false;
			}
		}						/* scan along page */

		/*
		 * If we froze any tuples, mark the buffer dirty, and write a WAL
		 * record recording the changes.  We must log the changes to be
		 * crash-safe against future truncation of CLOG.
		 */
		if (nfrozen > 0)
		{
			START_CRIT_SECTION();

			MarkBufferDirty(buf);

			/* execute collected freezes */
			for (i = 0; i < nfrozen; i++)
			{
				ItemId		itemid;
				HeapTupleHeader htup;

				itemid = PageGetItemId(page, frozen[i].offset);
				htup = (HeapTupleHeader) PageGetItem(page, itemid);

				heap_execute_freeze_tuple(htup, &frozen[i]);
			}

			/* Now WAL-log freezing if necessary */
			if (RelationNeedsWAL(onerel))
			{
				XLogRecPtr	recptr;

				recptr = log_heap_freeze(onerel, buf, FreezeLimit,
										 frozen, nfrozen);
				PageSetLSN(page, recptr);
			}

			END_CRIT_SECTION();
		}

		/*
		 * If there are no indexes, we can vacuum the page right now instead
		 * of doing a second scan.  If index cleanup is disabled, we don't
		 * vacuum the page either, but just forget its dead tuples.
		 */
		if (!vacrelstats->useindex && vacrelstats->num_dead_tuples > 0)
		{
			if (nindexes == 0)
			{
				/* Remove tuples from heap if the table has no index */
				lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer);
				vacuumed_pages++;
				has_dead_tuples = false;
			}
			else
			{
				/*
				 * Here, we have indexes but index cleanup is disabled.
				 * Instead of vacuuming the dead tuples on the heap, we just
				 * forget them.
				 *
				 * Note that vacrelstats->dead_tuples could have tuples which
				 * became dead after HOT-pruning but are not marked dead yet.
				 * We do not process them because it's a very rare condition,
				 * and the next vacuum will process them anyway.
				 */
				Assert(params->index_cleanup == VACOPT_TERNARY_DISABLED);
			}

			/*
			 * Forget the now-vacuumed tuples, and press on, but be careful
			 * not to reset latestRemovedXid since we want that value to be
			 * valid.
			 */
			vacrelstats->num_dead_tuples = 0;

			/*
			 * Periodically do incremental FSM vacuuming to make newly-freed
			 * space visible on upper FSM pages.  Note: although we've cleaned
			 * the current block, we haven't yet updated its FSM entry (that
			 * happens further down), so passing end == blkno is correct.
			 */
			if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
			{
				FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum,
										blkno);
				next_fsm_block_to_vacuum = blkno;
			}
		}

		freespace = PageGetHeapFreeSpace(page);

		/* mark page all-visible, if appropriate */
		if (all_visible && !all_visible_according_to_vm)
		{
			uint8		flags = VISIBILITYMAP_ALL_VISIBLE;

			if (all_frozen)
				flags |= VISIBILITYMAP_ALL_FROZEN;

			/*
			 * It should never be the case that the visibility map page is set
			 * while the page-level bit is clear, but the reverse is allowed
			 * (if checksums are not enabled).  Regardless, set both bits
			 * so that we get back in sync.
			 *
			 * NB: If the heap page is all-visible but the VM bit is not set,
			 * we don't need to dirty the heap page.  However, if checksums
			 * are enabled, we do need to make sure that the heap page is
			 * dirtied before passing it to visibilitymap_set(), because it
			 * may be logged.  Given that this situation should only happen in
			 * rare cases after a crash, it is not worth optimizing.
			 */
			PageSetAllVisible(page);
			MarkBufferDirty(buf);
			visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
							  vmbuffer, visibility_cutoff_xid, flags);
		}

		/*
		 * As of PostgreSQL 9.2, the visibility map bit should never be set if
		 * the page-level bit is clear.  However, it's possible that the bit
		 * got cleared after we checked it and before we took the buffer
		 * content lock, so we must recheck before jumping to the conclusion
		 * that something bad has happened.
		 */
		else if (all_visible_according_to_vm && !PageIsAllVisible(page)
				 && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer))
		{
			elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
				 relname, blkno);
			visibilitymap_clear(onerel, blkno, vmbuffer,
								VISIBILITYMAP_VALID_BITS);
		}

		/*
		 * It's possible for the value returned by GetOldestXmin() to move
		 * backwards, so it's not wrong for us to see tuples that appear to
		 * not be visible to everyone yet, while PD_ALL_VISIBLE is already
		 * set.  The real safe xmin value never moves backwards, but
		 * GetOldestXmin() is conservative and sometimes returns a value
		 * that's unnecessarily small, so if we see that contradiction it just
		 * means that the tuples that we think are not visible to everyone yet
		 * actually are, and the PD_ALL_VISIBLE flag is correct.
		 *
		 * There should never be dead tuples on a page with PD_ALL_VISIBLE
		 * set, however.
		 */
		else if (PageIsAllVisible(page) && has_dead_tuples)
		{
			elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
				 relname, blkno);
			PageClearAllVisible(page);
			MarkBufferDirty(buf);
			visibilitymap_clear(onerel, blkno, vmbuffer,
								VISIBILITYMAP_VALID_BITS);
		}

		/*
		 * If the all-visible page turns out to be all-frozen but not marked
		 * as such, mark it now.  Note that all_frozen is only valid if
		 * all_visible is true, so we must check both.
		 */
		else if (all_visible_according_to_vm && all_visible && all_frozen &&
				 !VM_ALL_FROZEN(onerel, blkno, &vmbuffer))
		{
			/*
			 * We can pass InvalidTransactionId as the cutoff XID here,
			 * because setting the all-frozen bit doesn't cause recovery
			 * conflicts.
			 */
			visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
							  vmbuffer, InvalidTransactionId,
							  VISIBILITYMAP_ALL_FROZEN);
		}

		UnlockReleaseBuffer(buf);

		/* Remember the location of the last page with nonremovable tuples */
		if (hastup)
			vacrelstats->nonempty_pages = blkno + 1;

		/*
		 * If we remembered any tuples for deletion, then the page will be
		 * visited again by lazy_vacuum_heap, which will compute and record
		 * its post-compaction free space.  If not, then we're done with this
		 * page, so remember its free space as-is.  (This path will always be
		 * taken if there are no indexes.)
		 */
		if (vacrelstats->num_dead_tuples == prev_dead_count)
			RecordPageWithFreeSpace(onerel, blkno, freespace);
	}

	/* report that everything is scanned and vacuumed */
	pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);

	pfree(frozen);

	/* save stats for use later */
	vacrelstats->tuples_deleted = tups_vacuumed;
	vacrelstats->new_dead_tuples = nkeep;

	/* now we can compute the new value for pg_class.reltuples */
	vacrelstats->new_live_tuples = vac_estimate_reltuples(onerel,
														  nblocks,
														  vacrelstats->tupcount_pages,
														  live_tuples);

	/* also compute total number of surviving heap entries */
	vacrelstats->new_rel_tuples =
		vacrelstats->new_live_tuples + vacrelstats->new_dead_tuples;

	/*
	 * Release any remaining pin on visibility map page.
	 */
	if (BufferIsValid(vmbuffer))
	{
		ReleaseBuffer(vmbuffer);
		vmbuffer = InvalidBuffer;
	}

	/* If any tuples need to be deleted, perform final vacuum cycle */
	/* XXX put a threshold on min number of tuples here? */
	if (vacrelstats->num_dead_tuples > 0)
	{
		const int	hvp_index[] = {
			PROGRESS_VACUUM_PHASE,
			PROGRESS_VACUUM_NUM_INDEX_VACUUMS
		};
		int64		hvp_val[2];

		/* Log cleanup info before we touch indexes */
		vacuum_log_cleanup_info(onerel, vacrelstats);

		/* Report that we are now vacuuming indexes */
		pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
									 PROGRESS_VACUUM_PHASE_VACUUM_INDEX);

		/* Remove index entries */
		for (i = 0; i < nindexes; i++)
			lazy_vacuum_index(Irel[i],
							  &indstats[i],
							  vacrelstats);

		/* Report that we are now vacuuming the heap */
		hvp_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_HEAP;
		hvp_val[1] = vacrelstats->num_index_scans + 1;
		pgstat_progress_update_multi_param(2, hvp_index, hvp_val);

		/* Remove tuples from heap */
		pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
									 PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
		lazy_vacuum_heap(onerel, vacrelstats);
		vacrelstats->num_index_scans++;
	}

	/*
	 * Vacuum the remainder of the Free Space Map.  We must do this whether or
	 * not there were indexes.
	 */
	if (blkno > next_fsm_block_to_vacuum)
		FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, blkno);

	/* report all blocks vacuumed; and that we're cleaning up */
	pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
								 PROGRESS_VACUUM_PHASE_INDEX_CLEANUP);

	/* Do post-vacuum cleanup and statistics update for each index */
	if (vacrelstats->useindex)
	{
		for (i = 0; i < nindexes; i++)
			lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);
	}

	/* If no indexes, make log report that lazy_vacuum_heap would've made */
	if (vacuumed_pages)
		ereport(elevel,
				(errmsg("\"%s\": removed %.0f row versions in %u pages",
						RelationGetRelationName(onerel),
						tups_vacuumed, vacuumed_pages)));

	/*
	 * This is pretty messy, but we split it up so that we can skip emitting
	 * individual parts of the message when not applicable.
	 */
	initStringInfo(&buf);
	appendStringInfo(&buf,
					 _("%.0f dead row versions cannot be removed yet, oldest xmin: %u\n"),
					 nkeep, OldestXmin);
	appendStringInfo(&buf, _("There were %.0f unused item identifiers.\n"),
					 nunused);
	appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins, ",
									"Skipped %u pages due to buffer pins, ",
									vacrelstats->pinskipped_pages),
					 vacrelstats->pinskipped_pages);
	appendStringInfo(&buf, ngettext("%u frozen page.\n",
									"%u frozen pages.\n",
									vacrelstats->frozenskipped_pages),
					 vacrelstats->frozenskipped_pages);
	appendStringInfo(&buf, ngettext("%u page is entirely empty.\n",
									"%u pages are entirely empty.\n",
									empty_pages),
					 empty_pages);
	appendStringInfo(&buf, _("%s."), pg_rusage_show(&ru0));

	ereport(elevel,
			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
					RelationGetRelationName(onerel),
					tups_vacuumed, num_tuples,
					vacrelstats->scanned_pages, nblocks),
			 errdetail_internal("%s", buf.data)));
	pfree(buf.data);
}


/*
 *	lazy_vacuum_heap() -- second pass over the heap
 *
 *		This routine marks dead tuples as unused and compacts out free
 *		space on their pages.  Pages not having dead tuples recorded from
 *		lazy_scan_heap are not visited at all.
 *
 * Note: the reason for doing this as a second pass is we cannot remove
 * the tuples until we've removed their index entries, and we want to
 * process index entry removal in batches as large as possible.
 */
static void
lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
{
	int			tupindex;
	int			npages;
	PGRUsage	ru0;
	Buffer		vmbuffer = InvalidBuffer;

	pg_rusage_init(&ru0);
	npages = 0;

	tupindex = 0;
	while (tupindex < vacrelstats->num_dead_tuples)
	{
		BlockNumber tblk;
		Buffer		buf;
		Page		page;
		Size		freespace;

		vacuum_delay_point();

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
		buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
								 vac_strategy);
		if (!ConditionalLockBufferForCleanup(buf))
		{
			ReleaseBuffer(buf);
			++tupindex;
			continue;
		}
		tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats,
									&vmbuffer);

		/* Now that we've compacted the page, record its available space */
		page = BufferGetPage(buf);
		freespace = PageGetHeapFreeSpace(page);

		UnlockReleaseBuffer(buf);
		RecordPageWithFreeSpace(onerel, tblk, freespace);
		npages++;
	}

	if (BufferIsValid(vmbuffer))
	{
		ReleaseBuffer(vmbuffer);
		vmbuffer = InvalidBuffer;
	}

	ereport(elevel,
			(errmsg("\"%s\": removed %d row versions in %d pages",
					RelationGetRelationName(onerel),
					tupindex, npages),
			 errdetail_internal("%s", pg_rusage_show(&ru0))));
}

/*
 *	lazy_vacuum_page() -- free dead tuples on a page
 *					 and repair its fragmentation.
 *
 * Caller must hold pin and buffer cleanup lock on the buffer.
 *
 * tupindex is the index in vacrelstats->dead_tuples of the first dead
 * tuple for this page.  We assume the rest follow sequentially.
 * The return value is the first tupindex after the tuples of this page.
 */
static int
lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
				 int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer)
{
	Page		page = BufferGetPage(buffer);
	OffsetNumber unused[MaxOffsetNumber];
	int			uncnt = 0;
	TransactionId visibility_cutoff_xid;
	bool		all_frozen;

	pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);

	START_CRIT_SECTION();

	for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
	{
		BlockNumber tblk;
		OffsetNumber toff;
		ItemId		itemid;

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
		if (tblk != blkno)
			break;				/* past end of tuples for this block */
		toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
		itemid = PageGetItemId(page, toff);
		ItemIdSetUnused(itemid);
		unused[uncnt++] = toff;
	}

	PageRepairFragmentation(page);

	/*
	 * Mark buffer dirty before we write WAL.
	 */
	MarkBufferDirty(buffer);

	/* XLOG stuff */
	if (RelationNeedsWAL(onerel))
	{
		XLogRecPtr	recptr;

		recptr = log_heap_clean(onerel, buffer,
								NULL, 0, NULL, 0,
								unused, uncnt,
								vacrelstats->latestRemovedXid);
		PageSetLSN(page, recptr);
	}

	/*
	 * End critical section, so we safely can do visibility tests (which
	 * possibly need to perform IO and allocate memory!). If we crash now the
	 * page (including the corresponding vm bit) might not be marked all
	 * visible, but that's fine.  A later vacuum will fix that.
	 */
	END_CRIT_SECTION();

	/*
	 * Now that we have removed the dead tuples from the page, once again
	 * check if the page has become all-visible.  The page is already marked
	 * dirty, exclusively locked, and, if needed, a full page image has been
	 * emitted in the log_heap_clean() above.
	 */
	if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid,
								 &all_frozen))
		PageSetAllVisible(page);

	/*
	 * All the changes to the heap page have been done. If the all-visible
	 * flag is now set, also set the VM all-visible bit (and, if possible, the
	 * all-frozen bit) unless this has already been done previously.
	 */
	if (PageIsAllVisible(page))
	{
		uint8		vm_status = visibilitymap_get_status(onerel, blkno, vmbuffer);
		uint8		flags = 0;

		/* Determine which VM bits (all-visible, and possibly all-frozen) still need to be set */
		if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
			flags |= VISIBILITYMAP_ALL_VISIBLE;
		if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
			flags |= VISIBILITYMAP_ALL_FROZEN;

		Assert(BufferIsValid(*vmbuffer));
		if (flags != 0)
			visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr,
							  *vmbuffer, visibility_cutoff_xid, flags);
	}

	return tupindex;
}

/*
 *	lazy_check_needs_freeze() -- scan page to see if any tuples
 *					 need to be cleaned to avoid wraparound
 *
 * Returns true if the page needs to be vacuumed using cleanup lock.
 * Also returns a flag indicating whether page contains any tuples at all.
 */
static bool
lazy_check_needs_freeze(Buffer buf, bool *hastup)
{
	Page		page = BufferGetPage(buf);
	OffsetNumber offnum,
				maxoff;
	HeapTupleHeader tupleheader;

	*hastup = false;

	/*
	 * New and empty pages, obviously, don't contain tuples. We could make
	 * sure that the page is registered in the FSM, but it doesn't seem worth
	 * waiting for a cleanup lock just for that, especially because it's
	 * likely that the pin holder will do so.
	 */
	if (PageIsNew(page) || PageIsEmpty(page))
		return false;

	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid;

		itemid = PageGetItemId(page, offnum);

		/* this should match hastup test in count_nondeletable_pages() */
		if (ItemIdIsUsed(itemid))
			*hastup = true;

		/* dead and redirect items never need freezing */
		if (!ItemIdIsNormal(itemid))
			continue;

		tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);

		if (heap_tuple_needs_freeze(tupleheader, FreezeLimit,
									MultiXactCutoff, buf))
			return true;
	}							/* scan along page */

	return false;
}


/*
 *	lazy_vacuum_index() -- vacuum one index relation.
 *
 *		Delete all the index entries pointing to tuples listed in
 *		vacrelstats->dead_tuples, and update running statistics.
 */
static void
lazy_vacuum_index(Relation indrel,
				  IndexBulkDeleteResult **stats,
				  LVRelStats *vacrelstats)
{
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.analyze_only = false;
	ivinfo.report_progress = false;
	ivinfo.estimated_count = true;
	ivinfo.message_level = elevel;
	/* We can only provide an approximate value of num_heap_tuples here */
	ivinfo.num_heap_tuples = vacrelstats->old_live_tuples;
	ivinfo.strategy = vac_strategy;

	/* Do bulk deletion */
	*stats = index_bulk_delete(&ivinfo, *stats,
							   lazy_tid_reaped, (void *) vacrelstats);

	ereport(elevel,
			(errmsg("scanned index \"%s\" to remove %d row versions",
					RelationGetRelationName(indrel),
					vacrelstats->num_dead_tuples),
			 errdetail_internal("%s", pg_rusage_show(&ru0))));
}

/*
 *	lazy_cleanup_index() -- do post-vacuum cleanup for one index relation.
 */
static void
lazy_cleanup_index(Relation indrel,
				   IndexBulkDeleteResult *stats,
				   LVRelStats *vacrelstats)
{
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.analyze_only = false;
	ivinfo.report_progress = false;
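	/*
	 * If the heap scan skipped any pages (e.g. because the visibility map
	 * showed them all-visible), the tuple count passed below is only an
	 * estimate.
	 */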
	ivinfo.estimated_count = (vacrelstats->tupcount_pages < vacrelstats->rel_pages);
	ivinfo.message_level = elevel;

	/*
	 * Now we can provide a better estimate of total number of surviving
	 * tuples (we assume indexes are more interested in that than in the
	 * number of nominally live tuples).
	 */
	ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples;
	ivinfo.strategy = vac_strategy;

	stats = index_vacuum_cleanup(&ivinfo, stats);

	if (!stats)
		return;

	/*
	 * Now update statistics in pg_class, but only if the index says the count
	 * is accurate.
	 */
	if (!stats->estimated_count)
		vac_update_relstats(indrel,
							stats->num_pages,
							stats->num_index_tuples,
							0,
							false,
							InvalidTransactionId,
							InvalidMultiXactId,
							false);

	ereport(elevel,
			(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
					RelationGetRelationName(indrel),
					stats->num_index_tuples,
					stats->num_pages),
			 errdetail("%.0f index row versions were removed.\n"
					   "%u index pages have been deleted, %u are currently reusable.\n"
					   "%s.",
					   stats->tuples_removed,
					   stats->pages_deleted, stats->pages_free,
					   pg_rusage_show(&ru0))));

	pfree(stats);
}

/*
 * should_attempt_truncation - should we attempt to truncate the heap?
 *
 * Don't even think about it unless we have a shot at releasing a goodly
 * number of pages.  Otherwise, the time taken isn't worth it.
 *
 * Also don't attempt it if we are doing early pruning/vacuuming, because a
 * scan which cannot find a truncated heap page cannot determine that the
 * snapshot is too old to read that page.  We might be able to get away with
 * truncating all except one of the pages, setting its LSN to (at least) the
 * maximum of the truncated range if we also treated an index leaf tuple
 * pointing to a missing heap page as something to trigger the "snapshot too
 * old" error, but that seems fragile and seems like it deserves its own patch
 * if we consider it.
 *
 * This is split out so that we can test whether truncation is going to be
 * called for before we actually do it.  If you change the logic here, be
 * careful to depend only on fields that lazy_scan_heap updates on-the-fly.
 */
static bool
should_attempt_truncation(VacuumParams *params, LVRelStats *vacrelstats)
{
	BlockNumber possibly_freeable;

	if (params->truncate == VACOPT_TERNARY_DISABLED)
		return false;

	possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
	if (possibly_freeable > 0 &&
		(possibly_freeable >= REL_TRUNCATE_MINIMUM ||
		 possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION) &&
		old_snapshot_threshold < 0)
		return true;
	else
		return false;
}

/*
 * lazy_truncate_heap - try to truncate off any empty pages at the end
 */
static void
lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber old_rel_pages = vacrelstats->rel_pages;
	BlockNumber new_rel_pages;
	int			lock_retry;

	/* Report that we are now truncating */
	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
								 PROGRESS_VACUUM_PHASE_TRUNCATE);

	/*
	 * Loop until no more truncating can be done.
	 */
	do
	{
		PGRUsage	ru0;

		pg_rusage_init(&ru0);

		/*
		 * We need full exclusive lock on the relation in order to do
		 * truncation. If we can't get it, give up rather than waiting --- we
		 * don't want to block other backends, and we don't want to deadlock
		 * (which is quite possible considering we already hold a lower-grade
		 * lock).
		 */
		vacrelstats->lock_waiter_detected = false;
		lock_retry = 0;
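		/*
		 * Try to take the lock without blocking; with the intervals defined
		 * above, this amounts to roughly VACUUM_TRUNCATE_LOCK_TIMEOUT /
		 * VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL (about 100) attempts, sleeping
		 * 50 ms between them, before giving up on truncation.
		 */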
		while (true)
		{
			if (ConditionalLockRelation(onerel, AccessExclusiveLock))
				break;

			/*
			 * Check for interrupts while trying to (re-)acquire the exclusive
			 * lock.
			 */
			CHECK_FOR_INTERRUPTS();

			if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
								VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
			{
				/*
				 * We failed to establish the lock in the specified number of
				 * retries. This means we give up truncating.
				 */
				vacrelstats->lock_waiter_detected = true;
				ereport(elevel,
						(errmsg("\"%s\": stopping truncate due to conflicting lock request",
								RelationGetRelationName(onerel))));
				return;
			}

			pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L);
		}

		/*
		 * Now that we have exclusive lock, look to see if the rel has grown
		 * whilst we were vacuuming with non-exclusive lock.  If so, give up;
		 * the newly added pages presumably contain non-deletable tuples.
		 */
		new_rel_pages = RelationGetNumberOfBlocks(onerel);
		if (new_rel_pages != old_rel_pages)
		{
			/*
			 * Note: we intentionally don't update vacrelstats->rel_pages with
			 * the new rel size here.  If we did, it would amount to assuming
			 * that the new pages are empty, which is unlikely.  Leaving the
			 * numbers alone amounts to assuming that the new pages have the
			 * same tuple density as existing ones, which is less unlikely.
			 */
			UnlockRelation(onerel, AccessExclusiveLock);
			return;
		}

		/*
		 * Scan backwards from the end to verify that the end pages actually
		 * contain no tuples.  This is *necessary*, not optional, because
		 * other backends could have added tuples to these pages whilst we
		 * were vacuuming.
		 */
		new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);

		if (new_rel_pages >= old_rel_pages)
		{
			/* can't do anything after all */
			UnlockRelation(onerel, AccessExclusiveLock);
			return;
		}

		/*
		 * Okay to truncate.
		 */
		RelationTruncate(onerel, new_rel_pages);

		/*
		 * We can release the exclusive lock as soon as we have truncated.
		 * Other backends can't safely access the relation until they have
		 * processed the smgr invalidation that smgrtruncate sent out ... but
		 * that should happen as part of standard invalidation processing once
		 * they acquire lock on the relation.
		 */
		UnlockRelation(onerel, AccessExclusiveLock);

		/*
		 * Update statistics.  Here, it *is* correct to adjust rel_pages
		 * without also touching reltuples, since the tuple count wasn't
		 * changed by the truncation.
		 */
		vacrelstats->pages_removed += old_rel_pages - new_rel_pages;
		vacrelstats->rel_pages = new_rel_pages;

		ereport(elevel,
				(errmsg("\"%s\": truncated %u to %u pages",
						RelationGetRelationName(onerel),
						old_rel_pages, new_rel_pages),
				 errdetail_internal("%s",
									pg_rusage_show(&ru0))));
		old_rel_pages = new_rel_pages;
	} while (new_rel_pages > vacrelstats->nonempty_pages &&
			 vacrelstats->lock_waiter_detected);
}

/*
 * Rescan end pages to verify that they are (still) empty of tuples.
 *
 * Returns number of nondeletable pages (last nonempty page + 1).
 */
static BlockNumber
count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber blkno;
	BlockNumber prefetchedUntil;
	instr_time	starttime;

	/* Initialize the starttime if we check for conflicting lock requests */
	INSTR_TIME_SET_CURRENT(starttime);

	/*
	 * Start checking blocks at what we believe relation end to be and move
	 * backwards.  (Strange coding of loop control is needed because blkno is
	 * unsigned.)  To make the scan faster, we prefetch a few blocks at a time
	 * in forward direction, so that OS-level readahead can kick in.
	 */
	blkno = vacrelstats->rel_pages;
	StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0,
					 "prefetch size must be power of 2");
	prefetchedUntil = InvalidBlockNumber;
	while (blkno > vacrelstats->nonempty_pages)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		hastup;

		/*
		 * Check if another process requests a lock on our relation.  We are
		 * holding an AccessExclusiveLock here, so they will be waiting.  We
		 * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
		 * only check if that interval has elapsed once every 32 blocks to
		 * keep the number of system calls and actual shared lock table
		 * lookups to a minimum.
		 */
		if ((blkno % 32) == 0)
		{
			instr_time	currenttime;
			instr_time	elapsed;

			INSTR_TIME_SET_CURRENT(currenttime);
			elapsed = currenttime;
			INSTR_TIME_SUBTRACT(elapsed, starttime);
			if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
				>= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
			{
				if (LockHasWaitersRelation(onerel, AccessExclusiveLock))
				{
					ereport(elevel,
							(errmsg("\"%s\": suspending truncate due to conflicting lock request",
									RelationGetRelationName(onerel))));

					vacrelstats->lock_waiter_detected = true;
					return blkno;
				}
				starttime = currenttime;
			}
		}

		/*
		 * We don't insert a vacuum delay point here, because we have an
		 * exclusive lock on the table which we want to hold for as short a
		 * time as possible.  We still need to check for interrupts however.
		 */
		CHECK_FOR_INTERRUPTS();

		blkno--;

		/* If we haven't prefetched this lot yet, do so now. */
		if (prefetchedUntil > blkno)
		{
			BlockNumber prefetchStart;
			BlockNumber pblkno;

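			/*
			 * Round blkno down to the previous PREFETCH_SIZE boundary; the
			 * bit-mask trick is valid because PREFETCH_SIZE is a power of
			 * two (asserted above).
			 */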
			prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
			for (pblkno = prefetchStart; pblkno <= blkno; pblkno++)
			{
				PrefetchBuffer(onerel, MAIN_FORKNUM, pblkno);
				CHECK_FOR_INTERRUPTS();
			}
			prefetchedUntil = prefetchStart;
		}

		buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
								 RBM_NORMAL, vac_strategy);

		/* In this phase we only need shared access to the buffer */
		LockBuffer(buf, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buf);

		if (PageIsNew(page) || PageIsEmpty(page))
		{
			UnlockReleaseBuffer(buf);
			continue;
		}

		hastup = false;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			/*
			 * Note: any non-unused item should be taken as a reason to keep
			 * this page.  We formerly thought that DEAD tuples could be
			 * thrown away, but that's not so, because we'd not have cleaned
			 * out their index entries.
			 */
			if (ItemIdIsUsed(itemid))
			{
				hastup = true;
				break;			/* can stop scanning */
			}
		}						/* scan along page */

		UnlockReleaseBuffer(buf);

		/* Done scanning if we found a tuple here */
		if (hastup)
			return blkno + 1;
	}

	/*
	 * If we fall out of the loop, all the previously-thought-to-be-empty
	 * pages still are; we need not bother to look at the last known-nonempty
	 * page.
	 */
	return vacrelstats->nonempty_pages;
}

/*
 * lazy_space_alloc - space allocation decisions for lazy vacuum
 *
 * See the comments at the head of this file for rationale.
 */
static void
lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
{
	long		maxtuples;
	int			vac_work_mem = IsAutoVacuumWorkerProcess() &&
	autovacuum_work_mem != -1 ?
	autovacuum_work_mem : maintenance_work_mem;

	if (vacrelstats->useindex)
	{
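		/*
		 * For instance, assuming the default maintenance_work_mem of 64MB
		 * and 6-byte ItemPointerData entries, this budget allows roughly
		 * 11 million dead-tuple TIDs to accumulate before an index
		 * vacuuming pass is forced.
		 */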
		maxtuples = (vac_work_mem * 1024L) / sizeof(ItemPointerData);
		maxtuples = Min(maxtuples, INT_MAX);
		maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));

		/* curious coding here to ensure the multiplication can't overflow */
		if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
			maxtuples = relblocks * LAZY_ALLOC_TUPLES;

		/* stay sane if small maintenance_work_mem */
		maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
	}
	else
	{
		maxtuples = MaxHeapTuplesPerPage;
	}

	vacrelstats->num_dead_tuples = 0;
	vacrelstats->max_dead_tuples = (int) maxtuples;
	vacrelstats->dead_tuples = (ItemPointer)
		palloc(maxtuples * sizeof(ItemPointerData));
}

/*
 * lazy_record_dead_tuple - remember one deletable tuple
 */
static void
lazy_record_dead_tuple(LVRelStats *vacrelstats,
					   ItemPointer itemptr)
{
	/*
	 * The array shouldn't overflow under normal behavior, but perhaps it
	 * could if we are given a really small maintenance_work_mem.  In that
	 * case, just forget the last few tuples (we'll get 'em next time).
	 */
	if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
	{
		vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
		vacrelstats->num_dead_tuples++;
		pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
									 vacrelstats->num_dead_tuples);
	}
}

/*
 *	lazy_tid_reaped() -- is a particular tid deletable?
 *
 *		This has the right signature to be an IndexBulkDeleteCallback.
 *
 *		Assumes dead_tuples array is in sorted order.
 */
static bool
lazy_tid_reaped(ItemPointer itemptr, void *state)
{
	LVRelStats *vacrelstats = (LVRelStats *) state;
	ItemPointer res;

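	/*
	 * dead_tuples is kept in heap (block, offset) order, so a binary search
	 * decides in O(log N) time whether this index entry's heap TID was
	 * recorded as dead.
	 */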
	res = (ItemPointer) bsearch((void *) itemptr,
								(void *) vacrelstats->dead_tuples,
								vacrelstats->num_dead_tuples,
								sizeof(ItemPointerData),
								vac_cmp_itemptr);

	return (res != NULL);
}

/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_itemptr(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;
	OffsetNumber loff,
				roff;

	lblk = ItemPointerGetBlockNumber((ItemPointer) left);
	rblk = ItemPointerGetBlockNumber((ItemPointer) right);

	if (lblk < rblk)
		return -1;
	if (lblk > rblk)
		return 1;

	loff = ItemPointerGetOffsetNumber((ItemPointer) left);
	roff = ItemPointerGetOffsetNumber((ItemPointer) right);

	if (loff < roff)
		return -1;
	if (loff > roff)
		return 1;

	return 0;
}

/*
 * Check if every tuple in the given page is visible to all current and future
 * transactions.  Also return the visibility_cutoff_xid which is the highest
 * xmin amongst the visible tuples.  Set *all_frozen to true if every tuple
 * on this page is frozen.
 */
static bool
heap_page_is_all_visible(Relation rel, Buffer buf,
						 TransactionId *visibility_cutoff_xid,
						 bool *all_frozen)
{
	Page		page = BufferGetPage(buf);
	BlockNumber blockno = BufferGetBlockNumber(buf);
	OffsetNumber offnum,
				maxoff;
	bool		all_visible = true;

	*visibility_cutoff_xid = InvalidTransactionId;
	*all_frozen = true;

	/*
	 * This is a stripped down version of the line pointer scan in
	 * lazy_scan_heap().  So if you change anything here, also check that code.
	 */
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff && all_visible;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid;
		HeapTupleData tuple;

		itemid = PageGetItemId(page, offnum);

		/* Unused or redirect line pointers are of no interest */
		if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
			continue;

		ItemPointerSet(&(tuple.t_self), blockno, offnum);

		/*
		 * Dead line pointers can have index pointers pointing to them.  So
		 * they can't be treated as visible
		 */
		if (ItemIdIsDead(itemid))
		{
			all_visible = false;
			*all_frozen = false;
			break;
		}

		Assert(ItemIdIsNormal(itemid));

		tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
		tuple.t_len = ItemIdGetLength(itemid);
		tuple.t_tableOid = RelationGetRelid(rel);

		switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
		{
			case HEAPTUPLE_LIVE:
				{
					TransactionId xmin;

					/* Check comments in lazy_scan_heap. */
					if (!HeapTupleHeaderXminCommitted(tuple.t_data))
					{
						all_visible = false;
						*all_frozen = false;
						break;
					}

					/*
					 * The inserter definitely committed.  But is it old enough
					 * that everyone sees it as committed?
					 */
					xmin = HeapTupleHeaderGetXmin(tuple.t_data);
					if (!TransactionIdPrecedes(xmin, OldestXmin))
					{
						all_visible = false;
						*all_frozen = false;
						break;
					}

					/* Track newest xmin on page. */
					if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
						*visibility_cutoff_xid = xmin;

					/* Check whether this tuple is already frozen or not */
					if (all_visible && *all_frozen &&
						heap_tuple_needs_eventual_freeze(tuple.t_data))
						*all_frozen = false;
				}
				break;

			case HEAPTUPLE_DEAD:
			case HEAPTUPLE_RECENTLY_DEAD:
			case HEAPTUPLE_INSERT_IN_PROGRESS:
			case HEAPTUPLE_DELETE_IN_PROGRESS:
				{
					all_visible = false;
					*all_frozen = false;
					break;
				}
			default:
				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
				break;
		}
	}							/* scan along page */

	return all_visible;
}