1 /*-------------------------------------------------------------------------
2 *
3 * vacuumlazy.c
4 * Concurrent ("lazy") vacuuming.
5 *
6 *
7 * The major space usage for LAZY VACUUM is storage for the array of dead tuple
8 * TIDs. We want to ensure we can vacuum even the very largest relations with
9 * finite memory space usage. To do that, we set upper bounds on the number of
10 * tuples we will keep track of at once.
11 *
12 * We are willing to use at most maintenance_work_mem (or perhaps
13 * autovacuum_work_mem) memory space to keep track of dead tuples. We
14 * initially allocate an array of TIDs of that size, with an upper limit that
15 * depends on table size (this limit ensures we don't allocate a huge area
16 * uselessly for vacuuming small tables). If the array threatens to overflow,
17 * we suspend the heap scan phase and perform a pass of index cleanup and page
18 * compaction, then resume the heap scan with an empty TID array.
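 *
 * As a rough illustration (example figures, not taken from the code): each
 * dead-tuple TID is a 6-byte ItemPointerData, so a maintenance_work_mem of
 * 64MB leaves room for roughly 11 million dead-tuple TIDs per index-cleanup
 * pass.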
19 *
20 * If we're processing a table with no indexes, we can just vacuum each page
21 * as we go; there's no need to save up multiple tuples to minimize the number
22 * of index scans performed. So we don't use maintenance_work_mem memory for
23 * the TID array, just enough to hold as many heap tuples as fit on one page.
24 *
25 *
26 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
27 * Portions Copyright (c) 1994, Regents of the University of California
28 *
29 *
30 * IDENTIFICATION
31 * src/backend/commands/vacuumlazy.c
32 *
33 *-------------------------------------------------------------------------
34 */
35 #include "postgres.h"
36
37 #include <math.h>
38
39 #include "access/genam.h"
40 #include "access/heapam.h"
41 #include "access/heapam_xlog.h"
42 #include "access/htup_details.h"
43 #include "access/multixact.h"
44 #include "access/transam.h"
45 #include "access/visibilitymap.h"
46 #include "access/xlog.h"
47 #include "catalog/storage.h"
48 #include "commands/dbcommands.h"
49 #include "commands/progress.h"
50 #include "commands/vacuum.h"
51 #include "miscadmin.h"
52 #include "pgstat.h"
53 #include "portability/instr_time.h"
54 #include "postmaster/autovacuum.h"
55 #include "storage/bufmgr.h"
56 #include "storage/freespace.h"
57 #include "storage/lmgr.h"
58 #include "utils/lsyscache.h"
59 #include "utils/memutils.h"
60 #include "utils/pg_rusage.h"
61 #include "utils/timestamp.h"
62 #include "utils/tqual.h"
63
64
65 /*
66 * Space/time tradeoff parameters: do these need to be user-tunable?
67 *
68 * To consider truncating the relation, we want there to be at least
69 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
70 * is less) potentially-freeable pages.
71 */
72 #define REL_TRUNCATE_MINIMUM 1000
73 #define REL_TRUNCATE_FRACTION 16
74
75 /*
76 * Timing parameters for truncate locking heuristics.
77 *
78 * These were not exposed as user tunable GUC values because it didn't seem
79 * that the potential for improvement was great enough to merit the cost of
80 * supporting them.
81 */
82 #define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */
83 #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */
84 #define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */
85
86 /*
87 * When a table has no indexes, vacuum the FSM approximately every 8GB of
88 * heap (it won't be exact because we only vacuum the FSM after processing a
89 * heap page that has some removable tuples). When there are indexes, this is
90 * ignored, and we vacuum the FSM after each index/heap cleaning pass.
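 *
 * (For reference, with the default 8kB block size the interval below works
 * out to 1,048,576 heap pages between FSM vacuuming passes.)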
91 */
92 #define VACUUM_FSM_EVERY_PAGES \
93 ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ))
94
95 /*
96 * Guesstimation of number of dead tuples per page. This is used to
97 * provide an upper limit to memory allocated when vacuuming small
98 * tables.
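 *
 * (Illustrative example: with the default 8kB block size MaxHeapTuplesPerPage
 * is 291, so a 10-page table is capped at about 2,910 dead-tuple slots, i.e.
 * roughly 17kB, regardless of maintenance_work_mem.)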
99 */
100 #define LAZY_ALLOC_TUPLES MaxHeapTuplesPerPage
101
102 /*
103 * Before we consider skipping a page that's marked as clean in
104 * visibility map, we must've seen at least this many clean pages.
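 * (With the default 8kB block size, 32 pages correspond to 256kB of
 * contiguous heap.)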
105 */
106 #define SKIP_PAGES_THRESHOLD ((BlockNumber) 32)
107
108 /*
109 * Size of the prefetch window for lazy vacuum backwards truncation scan.
110 * Needs to be a power of 2.
111 */
112 #define PREFETCH_SIZE ((BlockNumber) 32)
113
114 typedef struct LVRelStats
115 {
116 /* hasindex = true means two-pass strategy; false means one-pass */
117 bool hasindex;
118 /* Overall statistics about rel */
119 BlockNumber old_rel_pages; /* previous value of pg_class.relpages */
120 BlockNumber rel_pages; /* total number of pages */
121 BlockNumber scanned_pages; /* number of pages we examined */
122 BlockNumber pinskipped_pages; /* # of pages we skipped due to a pin */
123 BlockNumber frozenskipped_pages; /* # of frozen pages we skipped */
124 BlockNumber tupcount_pages; /* pages whose tuples we counted */
125 double old_live_tuples; /* previous value of pg_class.reltuples */
126 double new_rel_tuples; /* new estimated total # of tuples */
127 double new_live_tuples; /* new estimated total # of live tuples */
128 double new_dead_tuples; /* new estimated total # of dead tuples */
129 BlockNumber pages_removed;
130 double tuples_deleted;
131 BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
132 /* List of TIDs of tuples we intend to delete */
133 /* NB: this list is ordered by TID address */
134 int num_dead_tuples; /* current # of entries */
135 int max_dead_tuples; /* # slots allocated in array */
136 ItemPointer dead_tuples; /* array of ItemPointerData */
137 int num_index_scans;
138 TransactionId latestRemovedXid;
139 bool lock_waiter_detected;
140 } LVRelStats;
141
142
143 /* A few variables that don't seem worth passing around as parameters */
144 static int elevel = -1;
145
146 static TransactionId OldestXmin;
147 static TransactionId FreezeLimit;
148 static MultiXactId MultiXactCutoff;
149
150 static BufferAccessStrategy vac_strategy;
151
152
153 /* non-export function prototypes */
154 static void lazy_scan_heap(Relation onerel, int options,
155 LVRelStats *vacrelstats, Relation *Irel, int nindexes,
156 bool aggressive);
157 static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
158 static bool lazy_check_needs_freeze(Buffer buf, bool *hastup);
159 static void lazy_vacuum_index(Relation indrel,
160 IndexBulkDeleteResult **stats,
161 LVRelStats *vacrelstats);
162 static void lazy_cleanup_index(Relation indrel,
163 IndexBulkDeleteResult *stats,
164 LVRelStats *vacrelstats);
165 static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
166 int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer);
167 static bool should_attempt_truncation(LVRelStats *vacrelstats);
168 static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
169 static BlockNumber count_nondeletable_pages(Relation onerel,
170 LVRelStats *vacrelstats);
171 static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
172 static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
173 ItemPointer itemptr);
174 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
175 static int vac_cmp_itemptr(const void *left, const void *right);
176 static bool heap_page_is_all_visible(Relation rel, Buffer buf,
177 TransactionId *visibility_cutoff_xid, bool *all_frozen);
178
179
180 /*
181 * lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
182 *
183 * This routine vacuums a single heap, cleans out its indexes, and
184 * updates its relpages and reltuples statistics.
185 *
186 * At entry, we have already established a transaction and opened
187 * and locked the relation.
188 */
189 void
190 lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params,
191 BufferAccessStrategy bstrategy)
192 {
193 LVRelStats *vacrelstats;
194 Relation *Irel;
195 int nindexes;
196 PGRUsage ru0;
197 TimestampTz starttime = 0;
198 long secs;
199 int usecs;
200 double read_rate,
201 write_rate;
202 bool aggressive; /* should we scan all unfrozen pages? */
203 bool scanned_all_unfrozen; /* actually scanned all such pages? */
204 TransactionId xidFullScanLimit;
205 MultiXactId mxactFullScanLimit;
206 BlockNumber new_rel_pages;
207 BlockNumber new_rel_allvisible;
208 double new_live_tuples;
209 TransactionId new_frozen_xid;
210 MultiXactId new_min_multi;
211
212 Assert(params != NULL);
213
214 /* measure elapsed time iff autovacuum logging requires it */
215 if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
216 {
217 pg_rusage_init(&ru0);
218 starttime = GetCurrentTimestamp();
219 }
220
221 if (options & VACOPT_VERBOSE)
222 elevel = INFO;
223 else
224 elevel = DEBUG2;
225
226 pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM,
227 RelationGetRelid(onerel));
228
229 vac_strategy = bstrategy;
230
231 vacuum_set_xid_limits(onerel,
232 params->freeze_min_age,
233 params->freeze_table_age,
234 params->multixact_freeze_min_age,
235 params->multixact_freeze_table_age,
236 &OldestXmin, &FreezeLimit, &xidFullScanLimit,
237 &MultiXactCutoff, &mxactFullScanLimit);
238
239 /*
240 * We request an aggressive scan if the table's frozen Xid is now older
241 * than or equal to the requested Xid full-table scan limit; or if the
242 * table's minimum MultiXactId is older than or equal to the requested
243 * mxid full-table scan limit; or if DISABLE_PAGE_SKIPPING was specified.
244 */
245 aggressive = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
246 xidFullScanLimit);
247 aggressive |= MultiXactIdPrecedesOrEquals(onerel->rd_rel->relminmxid,
248 mxactFullScanLimit);
249 if (options & VACOPT_DISABLE_PAGE_SKIPPING)
250 aggressive = true;
251
252 vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
253
254 vacrelstats->old_rel_pages = onerel->rd_rel->relpages;
255 vacrelstats->old_live_tuples = onerel->rd_rel->reltuples;
256 vacrelstats->num_index_scans = 0;
257 vacrelstats->pages_removed = 0;
258 vacrelstats->lock_waiter_detected = false;
259
260 /* Open all indexes of the relation */
261 vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
262 vacrelstats->hasindex = (nindexes > 0);
263
264 /* Do the vacuuming */
265 lazy_scan_heap(onerel, options, vacrelstats, Irel, nindexes, aggressive);
266
267 /* Done with indexes */
268 vac_close_indexes(nindexes, Irel, NoLock);
269
270 /*
271 * Compute whether we actually scanned all of the unfrozen pages. If we did,
272 * we can adjust relfrozenxid and relminmxid.
273 *
274 * NB: We need to check this before truncating the relation, because that
275 * will change ->rel_pages.
276 */
277 if ((vacrelstats->scanned_pages + vacrelstats->frozenskipped_pages)
278 < vacrelstats->rel_pages)
279 {
280 Assert(!aggressive);
281 scanned_all_unfrozen = false;
282 }
283 else
284 scanned_all_unfrozen = true;
285
286 /*
287 * Optionally truncate the relation.
288 */
289 if (should_attempt_truncation(vacrelstats))
290 lazy_truncate_heap(onerel, vacrelstats);
291
292 /* Report that we are now doing final cleanup */
293 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
294 PROGRESS_VACUUM_PHASE_FINAL_CLEANUP);
295
296 /*
297 * Update statistics in pg_class.
298 *
299 * A corner case here is that if we scanned no pages at all because every
300 * page is all-visible, we should not update relpages/reltuples, because
301 * we have no new information to contribute. In particular this keeps us
302 * from replacing relpages=reltuples=0 (which means "unknown tuple
303 * density") with nonzero relpages and reltuples=0 (which means "zero
304 * tuple density") unless there's some actual evidence for the latter.
305 *
306 * It's important that we use tupcount_pages and not scanned_pages for the
307 * check described above; scanned_pages counts pages where we could not
308 * get cleanup lock, and which were processed only for frozenxid purposes.
309 *
310 * We do update relallvisible even in the corner case, since if the table
311 * is all-visible we'd definitely like to know that. But clamp the value
312 * to be not more than what we're setting relpages to.
313 *
314 * Also, don't change relfrozenxid/relminmxid if we skipped any pages,
315 * since then we don't know for certain that all tuples have a newer xmin.
316 */
317 new_rel_pages = vacrelstats->rel_pages;
318 new_live_tuples = vacrelstats->new_live_tuples;
319 if (vacrelstats->tupcount_pages == 0 && new_rel_pages > 0)
320 {
321 new_rel_pages = vacrelstats->old_rel_pages;
322 new_live_tuples = vacrelstats->old_live_tuples;
323 }
324
325 visibilitymap_count(onerel, &new_rel_allvisible, NULL);
326 if (new_rel_allvisible > new_rel_pages)
327 new_rel_allvisible = new_rel_pages;
328
329 new_frozen_xid = scanned_all_unfrozen ? FreezeLimit : InvalidTransactionId;
330 new_min_multi = scanned_all_unfrozen ? MultiXactCutoff : InvalidMultiXactId;
331
332 vac_update_relstats(onerel,
333 new_rel_pages,
334 new_live_tuples,
335 new_rel_allvisible,
336 vacrelstats->hasindex,
337 new_frozen_xid,
338 new_min_multi,
339 false);
340
341 /* report results to the stats collector, too */
342 pgstat_report_vacuum(RelationGetRelid(onerel),
343 onerel->rd_rel->relisshared,
344 new_live_tuples,
345 vacrelstats->new_dead_tuples);
346 pgstat_progress_end_command();
347
348 /* and log the action if appropriate */
349 if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
350 {
351 TimestampTz endtime = GetCurrentTimestamp();
352
353 if (params->log_min_duration == 0 ||
354 TimestampDifferenceExceeds(starttime, endtime,
355 params->log_min_duration))
356 {
357 StringInfoData buf;
358 char *msgfmt;
359
360 TimestampDifference(starttime, endtime, &secs, &usecs);
361
362 read_rate = 0;
363 write_rate = 0;
364 if ((secs > 0) || (usecs > 0))
365 {
366 read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) /
367 (secs + usecs / 1000000.0);
368 write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) /
369 (secs + usecs / 1000000.0);
370 }
371
372 /*
373 * This is pretty messy, but we split it up so that we can skip
374 * emitting individual parts of the message when not applicable.
375 */
376 initStringInfo(&buf);
377 if (aggressive)
378 msgfmt = _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n");
379 else
380 msgfmt = _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n");
381 appendStringInfo(&buf, msgfmt,
382 get_database_name(MyDatabaseId),
383 get_namespace_name(RelationGetNamespace(onerel)),
384 RelationGetRelationName(onerel),
385 vacrelstats->num_index_scans);
386 appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n"),
387 vacrelstats->pages_removed,
388 vacrelstats->rel_pages,
389 vacrelstats->pinskipped_pages,
390 vacrelstats->frozenskipped_pages);
391 appendStringInfo(&buf,
392 _("tuples: %.0f removed, %.0f remain, %.0f are dead but not yet removable, oldest xmin: %u\n"),
393 vacrelstats->tuples_deleted,
394 vacrelstats->new_rel_tuples,
395 vacrelstats->new_dead_tuples,
396 OldestXmin);
397 appendStringInfo(&buf,
398 _("buffer usage: %d hits, %d misses, %d dirtied\n"),
399 VacuumPageHit,
400 VacuumPageMiss,
401 VacuumPageDirty);
402 appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
403 read_rate, write_rate);
404 appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0));
405
406 ereport(LOG,
407 (errmsg_internal("%s", buf.data)));
408 pfree(buf.data);
409 }
410 }
411 }
412
413 /*
414 * For Hot Standby we need to know the highest transaction id that will
415 * be removed by any change. VACUUM proceeds in a number of passes so
416 * we need to consider how each pass operates. The first phase runs
417 * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
418 * progresses - these will have a latestRemovedXid on each record.
419 * In some cases this removes all of the tuples to be removed, though
420 * often we have dead tuples with index pointers so we must remember them
421 * for removal in phase 3. Index records for those rows are removed
422 * in phase 2 and index blocks do not have MVCC information attached.
423 * So before we can allow removal of any index tuples we need to issue
424 * a WAL record containing the latestRemovedXid of rows that will be
425 * removed in phase three. This allows recovery queries to block at the
426 * correct place, i.e. before phase two, rather than during phase three
427 * which would be after the rows have become inaccessible.
428 */
429 static void
430 vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
431 {
432 /*
433 * Skip this for relations for which no WAL is to be written, or if we're
434 * not trying to support archive recovery.
435 */
436 if (!RelationNeedsWAL(rel) || !XLogIsNeeded())
437 return;
438
439 /*
440 * No need to write the record at all unless it contains a valid value
441 */
442 if (TransactionIdIsValid(vacrelstats->latestRemovedXid))
443 (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid);
444 }
445
446 /*
447 * lazy_scan_heap() -- scan an open heap relation
448 *
449 * This routine prunes each page in the heap, which will among other
450 * things truncate dead tuples to dead line pointers, defragment the
451 * page, and set commit status bits (see heap_page_prune). It also builds
452 * lists of dead tuples and pages with free space, calculates statistics
453 * on the number of live tuples in the heap, and marks pages as
454 * all-visible if appropriate. When done, or when we run low on space for
455 * dead-tuple TIDs, invoke vacuuming of indexes and call lazy_vacuum_heap
456 * to reclaim dead line pointers.
457 *
458 * If there are no indexes then we can reclaim line pointers on the fly;
459 * dead line pointers need only be retained until all index pointers that
460 * reference them have been killed.
461 */
462 static void
463 lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
464 Relation *Irel, int nindexes, bool aggressive)
465 {
466 BlockNumber nblocks,
467 blkno;
468 HeapTupleData tuple;
469 char *relname;
470 TransactionId relfrozenxid = onerel->rd_rel->relfrozenxid;
471 TransactionId relminmxid = onerel->rd_rel->relminmxid;
472 BlockNumber empty_pages,
473 vacuumed_pages,
474 next_fsm_block_to_vacuum;
475 double num_tuples, /* total number of nonremovable tuples */
476 live_tuples, /* live tuples (reltuples estimate) */
477 tups_vacuumed, /* tuples cleaned up by vacuum */
478 nkeep, /* dead-but-not-removable tuples */
479 nunused; /* unused item pointers */
480 IndexBulkDeleteResult **indstats;
481 int i;
482 PGRUsage ru0;
483 Buffer vmbuffer = InvalidBuffer;
484 BlockNumber next_unskippable_block;
485 bool skipping_blocks;
486 xl_heap_freeze_tuple *frozen;
487 StringInfoData buf;
488 const int initprog_index[] = {
489 PROGRESS_VACUUM_PHASE,
490 PROGRESS_VACUUM_TOTAL_HEAP_BLKS,
491 PROGRESS_VACUUM_MAX_DEAD_TUPLES
492 };
493 int64 initprog_val[3];
494
495 pg_rusage_init(&ru0);
496
497 relname = RelationGetRelationName(onerel);
498 if (aggressive)
499 ereport(elevel,
500 (errmsg("aggressively vacuuming \"%s.%s\"",
501 get_namespace_name(RelationGetNamespace(onerel)),
502 relname)));
503 else
504 ereport(elevel,
505 (errmsg("vacuuming \"%s.%s\"",
506 get_namespace_name(RelationGetNamespace(onerel)),
507 relname)));
508
509 empty_pages = vacuumed_pages = 0;
510 next_fsm_block_to_vacuum = (BlockNumber) 0;
511 num_tuples = live_tuples = tups_vacuumed = nkeep = nunused = 0;
512
513 indstats = (IndexBulkDeleteResult **)
514 palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
515
516 nblocks = RelationGetNumberOfBlocks(onerel);
517 vacrelstats->rel_pages = nblocks;
518 vacrelstats->scanned_pages = 0;
519 vacrelstats->tupcount_pages = 0;
520 vacrelstats->nonempty_pages = 0;
521 vacrelstats->latestRemovedXid = InvalidTransactionId;
522
523 lazy_space_alloc(vacrelstats, nblocks);
524 frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);
525
526 /* Report that we're scanning the heap, advertising total # of blocks */
527 initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP;
528 initprog_val[1] = nblocks;
529 initprog_val[2] = vacrelstats->max_dead_tuples;
530 pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
531
532 /*
533 * Except when aggressive is set, we want to skip pages that are
534 * all-visible according to the visibility map, but only when we can skip
535 * at least SKIP_PAGES_THRESHOLD consecutive pages. Since we're reading
536 * sequentially, the OS should be doing readahead for us, so there's no
537 * gain in skipping a page now and then; that's likely to disable
538 * readahead and so be counterproductive. Also, skipping even a single
539 * page means that we can't update relfrozenxid, so we only want to do it
540 * if we can skip a goodly number of pages.
541 *
542 * When aggressive is set, we can't skip pages just because they are
543 * all-visible, but we can still skip pages that are all-frozen, since
544 * such pages do not need freezing and do not affect the value that we can
545 * safely set for relfrozenxid or relminmxid.
546 *
547 * Before entering the main loop, establish the invariant that
548 * next_unskippable_block is the next block number >= blkno that we can't
549 * skip based on the visibility map, either all-visible for a regular scan
550 * or all-frozen for an aggressive scan. We set it to nblocks if there's
551 * no such block. We also set up the skipping_blocks flag correctly at
552 * this stage.
553 *
554 * Note: The value returned by visibilitymap_get_status could be slightly
555 * out-of-date, since we make this test before reading the corresponding
556 * heap page or locking the buffer. This is OK. If we mistakenly think
557 * that the page is all-visible or all-frozen when in fact the flag's just
558 * been cleared, we might fail to vacuum the page. It's easy to see that
559 * skipping a page when aggressive is not set is not a very big deal; we
560 * might leave some dead tuples lying around, but the next vacuum will
561 * find them. But even when aggressive *is* set, it's still OK if we miss
562 * a page whose all-frozen marking has just been cleared. Any new XIDs
563 * just added to that page are necessarily newer than the GlobalXmin we
564 * computed, so they'll have no effect on the value to which we can safely
565 * set relfrozenxid. A similar argument applies for MXIDs and relminmxid.
566 *
567 * We will scan the table's last page, at least to the extent of
568 * determining whether it has tuples or not, even if it should be skipped
569 * according to the above rules; except when we've already determined that
570 * it's not worth trying to truncate the table. This avoids having
571 * lazy_truncate_heap() take access-exclusive lock on the table to attempt
572 * a truncation that just fails immediately because there are tuples in
573 * the last page. This is worth avoiding mainly because such a lock must
574 * be replayed on any hot standby, where it can be disruptive.
575 */
576 next_unskippable_block = 0;
577 if ((options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
578 {
579 while (next_unskippable_block < nblocks)
580 {
581 uint8 vmstatus;
582
583 vmstatus = visibilitymap_get_status(onerel, next_unskippable_block,
584 &vmbuffer);
585 if (aggressive)
586 {
587 if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0)
588 break;
589 }
590 else
591 {
592 if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
593 break;
594 }
595 vacuum_delay_point();
596 next_unskippable_block++;
597 }
598 }
599
600 if (next_unskippable_block >= SKIP_PAGES_THRESHOLD)
601 skipping_blocks = true;
602 else
603 skipping_blocks = false;
604
605 for (blkno = 0; blkno < nblocks; blkno++)
606 {
607 Buffer buf;
608 Page page;
609 OffsetNumber offnum,
610 maxoff;
611 bool tupgone,
612 hastup;
613 int prev_dead_count;
614 int nfrozen;
615 Size freespace;
616 bool all_visible_according_to_vm = false;
617 bool all_visible;
618 bool all_frozen = true; /* provided all_visible is also true */
619 bool has_dead_tuples;
620 TransactionId visibility_cutoff_xid = InvalidTransactionId;
621
622 /* see note above about forcing scanning of last page */
623 #define FORCE_CHECK_PAGE() \
624 (blkno == nblocks - 1 && should_attempt_truncation(vacrelstats))
625
626 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
627
628 if (blkno == next_unskippable_block)
629 {
630 /* Time to advance next_unskippable_block */
631 next_unskippable_block++;
632 if ((options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
633 {
634 while (next_unskippable_block < nblocks)
635 {
636 uint8 vmskipflags;
637
638 vmskipflags = visibilitymap_get_status(onerel,
639 next_unskippable_block,
640 &vmbuffer);
641 if (aggressive)
642 {
643 if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0)
644 break;
645 }
646 else
647 {
648 if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0)
649 break;
650 }
651 vacuum_delay_point();
652 next_unskippable_block++;
653 }
654 }
655
656 /*
657 * We know we can't skip the current block. But set up
658 * skipping_blocks to do the right thing at the following blocks.
659 */
660 if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD)
661 skipping_blocks = true;
662 else
663 skipping_blocks = false;
664
665 /*
666 * Normally, the fact that we can't skip this block must mean that
667 * it's not all-visible. But in an aggressive vacuum we know only
668 * that it's not all-frozen, so it might still be all-visible.
669 */
670 if (aggressive && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer))
671 all_visible_according_to_vm = true;
672 }
673 else
674 {
675 /*
676 * The current block is potentially skippable; if we've seen a
677 * long enough run of skippable blocks to justify skipping it, and
678 * we're not forced to check it, then go ahead and skip.
679 * Otherwise, the page must be at least all-visible if not
680 * all-frozen, so we can set all_visible_according_to_vm = true.
681 */
682 if (skipping_blocks && !FORCE_CHECK_PAGE())
683 {
684 /*
685 * Tricky, tricky. If this is in aggressive vacuum, the page
686 * must have been all-frozen at the time we checked whether it
687 * was skippable, but it might not be any more. We must be
688 * careful to count it as a skipped all-frozen page in that
689 * case, or else we'll think we can't update relfrozenxid and
690 * relminmxid. If it's not an aggressive vacuum, we don't
691 * know whether it was all-frozen, so we have to recheck; but
692 * in this case an approximate answer is OK.
693 */
694 if (aggressive || VM_ALL_FROZEN(onerel, blkno, &vmbuffer))
695 vacrelstats->frozenskipped_pages++;
696 continue;
697 }
698 all_visible_according_to_vm = true;
699 }
700
701 vacuum_delay_point();
702
703 /*
704 * If we are close to overrunning the available space for dead-tuple
705 * TIDs, pause and do a cycle of vacuuming before we tackle this page.
706 */
707 if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
708 vacrelstats->num_dead_tuples > 0)
709 {
710 const int hvp_index[] = {
711 PROGRESS_VACUUM_PHASE,
712 PROGRESS_VACUUM_NUM_INDEX_VACUUMS
713 };
714 int64 hvp_val[2];
715
716 /*
717 * Before beginning index vacuuming, we release any pin we may
718 * hold on the visibility map page. This isn't necessary for
719 * correctness, but we do it anyway to avoid holding the pin
720 * across a lengthy, unrelated operation.
721 */
722 if (BufferIsValid(vmbuffer))
723 {
724 ReleaseBuffer(vmbuffer);
725 vmbuffer = InvalidBuffer;
726 }
727
728 /* Log cleanup info before we touch indexes */
729 vacuum_log_cleanup_info(onerel, vacrelstats);
730
731 /* Report that we are now vacuuming indexes */
732 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
733 PROGRESS_VACUUM_PHASE_VACUUM_INDEX);
734
735 /* Remove index entries */
736 for (i = 0; i < nindexes; i++)
737 lazy_vacuum_index(Irel[i],
738 &indstats[i],
739 vacrelstats);
740
741 /*
742 * Report that we are now vacuuming the heap. We also increase
743 * the number of index scans here; note that by using
744 * pgstat_progress_update_multi_param we can update both
745 * parameters atomically.
746 */
747 hvp_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_HEAP;
748 hvp_val[1] = vacrelstats->num_index_scans + 1;
749 pgstat_progress_update_multi_param(2, hvp_index, hvp_val);
750
751 /* Remove tuples from heap */
752 lazy_vacuum_heap(onerel, vacrelstats);
753
754 /*
755 * Forget the now-vacuumed tuples, and press on, but be careful
756 * not to reset latestRemovedXid since we want that value to be
757 * valid.
758 */
759 vacrelstats->num_dead_tuples = 0;
760 vacrelstats->num_index_scans++;
761
762 /*
763 * Vacuum the Free Space Map to make newly-freed space visible on
764 * upper-level FSM pages. Note we have not yet processed blkno.
765 */
766 FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, blkno);
767 next_fsm_block_to_vacuum = blkno;
768
769 /* Report that we are once again scanning the heap */
770 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
771 PROGRESS_VACUUM_PHASE_SCAN_HEAP);
772 }
773
774 /*
775 * Pin the visibility map page in case we need to mark the page
776 * all-visible. In most cases this will be very cheap, because we'll
777 * already have the correct page pinned anyway. However, it's
778 * possible that (a) next_unskippable_block is covered by a different
779 * VM page than the current block or (b) we released our pin and did a
780 * cycle of index vacuuming.
781 *
782 */
783 visibilitymap_pin(onerel, blkno, &vmbuffer);
784
785 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
786 RBM_NORMAL, vac_strategy);
787
788 /* We need buffer cleanup lock so that we can prune HOT chains. */
789 if (!ConditionalLockBufferForCleanup(buf))
790 {
791 /*
792 * If we're not performing an aggressive scan to guard against XID
793 * wraparound, and we don't want to forcibly check the page, then
794 * it's OK to skip vacuuming pages we get a lock conflict on. They
795 * will be dealt with in some future vacuum.
796 */
797 if (!aggressive && !FORCE_CHECK_PAGE())
798 {
799 ReleaseBuffer(buf);
800 vacrelstats->pinskipped_pages++;
801 continue;
802 }
803
804 /*
805 * Read the page with share lock to see if any xids on it need to
806 * be frozen. If not we just skip the page, after updating our
807 * scan statistics. If there are some, we wait for cleanup lock.
808 *
809 * We could defer the lock request further by remembering the page
810 * and coming back to it later, or we could even register
811 * ourselves for multiple buffers and then service whichever one
812 * is received first. For now, this seems good enough.
813 *
814 * If we get here with aggressive false, then we're just forcibly
815 * checking the page, and so we don't want to insist on getting
816 * the lock; we only need to know if the page contains tuples, so
817 * that we can update nonempty_pages correctly. It's convenient
818 * to use lazy_check_needs_freeze() for both situations, though.
819 */
820 LockBuffer(buf, BUFFER_LOCK_SHARE);
821 if (!lazy_check_needs_freeze(buf, &hastup))
822 {
823 UnlockReleaseBuffer(buf);
824 vacrelstats->scanned_pages++;
825 vacrelstats->pinskipped_pages++;
826 if (hastup)
827 vacrelstats->nonempty_pages = blkno + 1;
828 continue;
829 }
830 if (!aggressive)
831 {
832 /*
833 * Here, we must not advance scanned_pages; that would amount
834 * to claiming that the page contains no freezable tuples.
835 */
836 UnlockReleaseBuffer(buf);
837 vacrelstats->pinskipped_pages++;
838 if (hastup)
839 vacrelstats->nonempty_pages = blkno + 1;
840 continue;
841 }
842 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
843 LockBufferForCleanup(buf);
844 /* drop through to normal processing */
845 }
846
847 vacrelstats->scanned_pages++;
848 vacrelstats->tupcount_pages++;
849
850 page = BufferGetPage(buf);
851
852 if (PageIsNew(page))
853 {
854 /*
855 * An all-zeroes page could be left over if a backend extends the
856 * relation but crashes before initializing the page. Reclaim such
857 * pages for use.
858 *
859 * We have to be careful here because we could be looking at a
860 * page that someone has just added to the relation and not yet
861 * been able to initialize (see RelationGetBufferForTuple). To
862 * protect against that, release the buffer lock, grab the
863 * relation extension lock momentarily, and re-lock the buffer. If
864 * the page is still uninitialized by then, it must be left over
865 * from a crashed backend, and we can initialize it.
866 *
867 * We don't really need the relation lock when this is a new or
868 * temp relation, but it's probably not worth the code space to
869 * check that, since this surely isn't a critical path.
870 *
871 * Note: the comparable code in vacuum.c need not worry because
872 * it's got exclusive lock on the whole relation.
873 */
874 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
875 LockRelationForExtension(onerel, ExclusiveLock);
876 UnlockRelationForExtension(onerel, ExclusiveLock);
877 LockBufferForCleanup(buf);
878 if (PageIsNew(page))
879 {
880 ereport(WARNING,
881 (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
882 relname, blkno)));
883 PageInit(page, BufferGetPageSize(buf), 0);
884 empty_pages++;
885 }
886 freespace = PageGetHeapFreeSpace(page);
887 MarkBufferDirty(buf);
888 UnlockReleaseBuffer(buf);
889
890 RecordPageWithFreeSpace(onerel, blkno, freespace);
891 continue;
892 }
893
894 if (PageIsEmpty(page))
895 {
896 empty_pages++;
897 freespace = PageGetHeapFreeSpace(page);
898
899 /* empty pages are always all-visible and all-frozen */
900 if (!PageIsAllVisible(page))
901 {
902 START_CRIT_SECTION();
903
904 /* mark buffer dirty before writing a WAL record */
905 MarkBufferDirty(buf);
906
907 /*
908 * It's possible that another backend has extended the heap,
909 * initialized the page, and then failed to WAL-log the page
910 * due to an ERROR. Since heap extension is not WAL-logged,
911 * recovery might try to replay our record setting the page
912 * all-visible and find that the page isn't initialized, which
913 * will cause a PANIC. To prevent that, check whether the
914 * page has been previously WAL-logged, and if not, do that
915 * now.
916 */
917 if (RelationNeedsWAL(onerel) &&
918 PageGetLSN(page) == InvalidXLogRecPtr)
919 log_newpage_buffer(buf, true);
920
921 PageSetAllVisible(page);
922 visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
923 vmbuffer, InvalidTransactionId,
924 VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
925 END_CRIT_SECTION();
926 }
927
928 UnlockReleaseBuffer(buf);
929 RecordPageWithFreeSpace(onerel, blkno, freespace);
930 continue;
931 }
932
933 /*
934 * Prune all HOT-update chains in this page.
935 *
936 * We count tuples removed by the pruning step as removed by VACUUM.
937 */
938 tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false,
939 &vacrelstats->latestRemovedXid);
940
941 /*
942 * Now scan the page to collect vacuumable items and check for tuples
943 * requiring freezing.
944 */
945 all_visible = true;
946 has_dead_tuples = false;
947 nfrozen = 0;
948 hastup = false;
949 prev_dead_count = vacrelstats->num_dead_tuples;
950 maxoff = PageGetMaxOffsetNumber(page);
951
952 /*
953 * Note: If you change anything in the loop below, also look at
954 * heap_page_is_all_visible to see if that needs to be changed.
955 */
956 for (offnum = FirstOffsetNumber;
957 offnum <= maxoff;
958 offnum = OffsetNumberNext(offnum))
959 {
960 ItemId itemid;
961
962 itemid = PageGetItemId(page, offnum);
963
964 /* Unused items require no processing, but we count 'em */
965 if (!ItemIdIsUsed(itemid))
966 {
967 nunused += 1;
968 continue;
969 }
970
971 /* Redirect items mustn't be touched */
972 if (ItemIdIsRedirected(itemid))
973 {
974 hastup = true; /* this page won't be truncatable */
975 continue;
976 }
977
978 ItemPointerSet(&(tuple.t_self), blkno, offnum);
979
980 /*
981 * DEAD item pointers are to be vacuumed normally; but we don't
982 * count them in tups_vacuumed, else we'd be double-counting (at
983 * least in the common case where heap_page_prune() just freed up
984 * a non-HOT tuple).
985 */
986 if (ItemIdIsDead(itemid))
987 {
988 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
989 all_visible = false;
990 continue;
991 }
992
993 Assert(ItemIdIsNormal(itemid));
994
995 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
996 tuple.t_len = ItemIdGetLength(itemid);
997 tuple.t_tableOid = RelationGetRelid(onerel);
998
999 tupgone = false;
1000
1001 /*
1002 * The criteria for counting a tuple as live in this block need to
1003 * match what analyze.c's acquire_sample_rows() does, otherwise
1004 * VACUUM and ANALYZE may produce wildly different reltuples
1005 * values, e.g. when there are many recently-dead tuples.
1006 *
1007 * The logic here is a bit simpler than acquire_sample_rows(), as
1008 * VACUUM can't run inside a transaction block, which makes some
1009 * cases impossible (e.g. in-progress insert from the same
1010 * transaction).
1011 */
1012 switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
1013 {
1014 case HEAPTUPLE_DEAD:
1015
1016 /*
1017 * Ordinarily, DEAD tuples would have been removed by
1018 * heap_page_prune(), but it's possible that the tuple
1019 * state changed since heap_page_prune() looked. In
1020 * particular an INSERT_IN_PROGRESS tuple could have
1021 * changed to DEAD if the inserter aborted. So this
1022 * cannot be considered an error condition.
1023 *
1024 * If the tuple is HOT-updated then it must only be
1025 * removed by a prune operation; so we keep it just as if
1026 * it were RECENTLY_DEAD. Also, if it's a heap-only
1027 * tuple, we choose to keep it, because it'll be a lot
1028 * cheaper to get rid of it in the next pruning pass than
1029 * to treat it like an indexed tuple.
1030 *
1031 * If this were to happen for a tuple that actually needed
1032 * to be deleted, we'd be in trouble, because it'd
1033 * possibly leave a tuple below the relation's xmin
1034 * horizon alive. heap_prepare_freeze_tuple() is prepared
1035 * to detect that case and abort the transaction,
1036 * preventing corruption.
1037 */
1038 if (HeapTupleIsHotUpdated(&tuple) ||
1039 HeapTupleIsHeapOnly(&tuple))
1040 nkeep += 1;
1041 else
1042 tupgone = true; /* we can delete the tuple */
1043 all_visible = false;
1044 break;
1045 case HEAPTUPLE_LIVE:
1046 /* Tuple is good --- but let's do some validity checks */
1047 if (onerel->rd_rel->relhasoids &&
1048 !OidIsValid(HeapTupleGetOid(&tuple)))
1049 elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
1050 relname, blkno, offnum);
1051
1052 /*
1053 * Count it as live. Not only is this natural, but it's
1054 * also what acquire_sample_rows() does.
1055 */
1056 live_tuples += 1;
1057
1058 /*
1059 * Is the tuple definitely visible to all transactions?
1060 *
1061 * NB: Like with per-tuple hint bits, we can't set the
1062 * PD_ALL_VISIBLE flag if the inserter committed
1063 * asynchronously. See SetHintBits for more info. Check
1064 * that the tuple is hinted xmin-committed because of
1065 * that.
1066 */
1067 if (all_visible)
1068 {
1069 TransactionId xmin;
1070
1071 if (!HeapTupleHeaderXminCommitted(tuple.t_data))
1072 {
1073 all_visible = false;
1074 break;
1075 }
1076
1077 /*
1078 * The inserter definitely committed. But is it old
1079 * enough that everyone sees it as committed?
1080 */
1081 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
1082 if (!TransactionIdPrecedes(xmin, OldestXmin))
1083 {
1084 all_visible = false;
1085 break;
1086 }
1087
1088 /* Track newest xmin on page. */
1089 if (TransactionIdFollows(xmin, visibility_cutoff_xid))
1090 visibility_cutoff_xid = xmin;
1091 }
1092 break;
1093 case HEAPTUPLE_RECENTLY_DEAD:
1094
1095 /*
1096 * If tuple is recently deleted then we must not remove it
1097 * from relation.
1098 */
1099 nkeep += 1;
1100 all_visible = false;
1101 break;
1102 case HEAPTUPLE_INSERT_IN_PROGRESS:
1103
1104 /*
1105 * This is an expected case during concurrent vacuum.
1106 *
1107 * We do not count these rows as live, because we expect
1108 * the inserting transaction to update the counters at
1109 * commit, and we assume that will happen only after we
1110 * report our results. This assumption is a bit shaky,
1111 * but it is what acquire_sample_rows() does, so be
1112 * consistent.
1113 */
1114 all_visible = false;
1115 break;
1116 case HEAPTUPLE_DELETE_IN_PROGRESS:
1117 /* This is an expected case during concurrent vacuum */
1118 all_visible = false;
1119
1120 /*
1121 * Count such rows as live. As above, we assume the
1122 * deleting transaction will commit and update the
1123 * counters after we report.
1124 */
1125 live_tuples += 1;
1126 break;
1127 default:
1128 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1129 break;
1130 }
1131
1132 if (tupgone)
1133 {
1134 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
1135 HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
1136 &vacrelstats->latestRemovedXid);
1137 tups_vacuumed += 1;
1138 has_dead_tuples = true;
1139 }
1140 else
1141 {
1142 bool tuple_totally_frozen;
1143
1144 num_tuples += 1;
1145 hastup = true;
1146
1147 /*
1148 * Each non-removable tuple must be checked to see if it needs
1149 * freezing. Note we already have exclusive buffer lock.
1150 */
1151 if (heap_prepare_freeze_tuple(tuple.t_data,
1152 relfrozenxid, relminmxid,
1153 FreezeLimit, MultiXactCutoff,
1154 &frozen[nfrozen],
1155 &tuple_totally_frozen))
1156 frozen[nfrozen++].offset = offnum;
1157
1158 if (!tuple_totally_frozen)
1159 all_frozen = false;
1160 }
1161 } /* scan along page */
1162
1163 /*
1164 * If we froze any tuples, mark the buffer dirty, and write a WAL
1165 * record recording the changes. We must log the changes to be
1166 * crash-safe against future truncation of CLOG.
1167 */
1168 if (nfrozen > 0)
1169 {
1170 START_CRIT_SECTION();
1171
1172 MarkBufferDirty(buf);
1173
1174 /* execute collected freezes */
1175 for (i = 0; i < nfrozen; i++)
1176 {
1177 ItemId itemid;
1178 HeapTupleHeader htup;
1179
1180 itemid = PageGetItemId(page, frozen[i].offset);
1181 htup = (HeapTupleHeader) PageGetItem(page, itemid);
1182
1183 heap_execute_freeze_tuple(htup, &frozen[i]);
1184 }
1185
1186 /* Now WAL-log freezing if necessary */
1187 if (RelationNeedsWAL(onerel))
1188 {
1189 XLogRecPtr recptr;
1190
1191 recptr = log_heap_freeze(onerel, buf, FreezeLimit,
1192 frozen, nfrozen);
1193 PageSetLSN(page, recptr);
1194 }
1195
1196 END_CRIT_SECTION();
1197 }
1198
1199 /*
1200 * If there are no indexes then we can vacuum the page right now
1201 * instead of doing a second scan.
1202 */
1203 if (nindexes == 0 &&
1204 vacrelstats->num_dead_tuples > 0)
1205 {
1206 /* Remove tuples from heap */
1207 lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer);
1208 has_dead_tuples = false;
1209
1210 /*
1211 * Forget the now-vacuumed tuples, and press on, but be careful
1212 * not to reset latestRemovedXid since we want that value to be
1213 * valid.
1214 */
1215 vacrelstats->num_dead_tuples = 0;
1216 vacuumed_pages++;
1217
1218 /*
1219 * Periodically do incremental FSM vacuuming to make newly-freed
1220 * space visible on upper FSM pages. Note: although we've cleaned
1221 * the current block, we haven't yet updated its FSM entry (that
1222 * happens further down), so passing end == blkno is correct.
1223 */
1224 if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
1225 {
1226 FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum,
1227 blkno);
1228 next_fsm_block_to_vacuum = blkno;
1229 }
1230 }
1231
1232 freespace = PageGetHeapFreeSpace(page);
1233
1234 /* mark page all-visible, if appropriate */
1235 if (all_visible && !all_visible_according_to_vm)
1236 {
1237 uint8 flags = VISIBILITYMAP_ALL_VISIBLE;
1238
1239 if (all_frozen)
1240 flags |= VISIBILITYMAP_ALL_FROZEN;
1241
1242 /*
1243 * It should never be the case that the visibility map page is set
1244 * while the page-level bit is clear, but the reverse is allowed
1245 * (if checksums are not enabled). Regardless, set both bits
1246 * so that we get back in sync.
1247 *
1248 * NB: If the heap page is all-visible but the VM bit is not set,
1249 * we don't need to dirty the heap page. However, if checksums
1250 * are enabled, we do need to make sure that the heap page is
1251 * dirtied before passing it to visibilitymap_set(), because it
1252 * may be logged. Given that this situation should only happen in
1253 * rare cases after a crash, it is not worth optimizing.
1254 */
1255 PageSetAllVisible(page);
1256 MarkBufferDirty(buf);
1257 visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
1258 vmbuffer, visibility_cutoff_xid, flags);
1259 }
1260
1261 /*
1262 * As of PostgreSQL 9.2, the visibility map bit should never be set if
1263 * the page-level bit is clear. However, it's possible that the bit
1264 * got cleared after we checked it and before we took the buffer
1265 * content lock, so we must recheck before jumping to the conclusion
1266 * that something bad has happened.
1267 */
1268 else if (all_visible_according_to_vm && !PageIsAllVisible(page)
1269 && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer))
1270 {
1271 elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
1272 relname, blkno);
1273 visibilitymap_clear(onerel, blkno, vmbuffer,
1274 VISIBILITYMAP_VALID_BITS);
1275 }
1276
1277 /*
1278 * It's possible for the value returned by GetOldestXmin() to move
1279 * backwards, so it's not wrong for us to see tuples that appear to
1280 * not be visible to everyone yet, while PD_ALL_VISIBLE is already
1281 * set. The real safe xmin value never moves backwards, but
1282 * GetOldestXmin() is conservative and sometimes returns a value
1283 * that's unnecessarily small, so if we see that contradiction it just
1284 * means that the tuples that we think are not visible to everyone yet
1285 * actually are, and the PD_ALL_VISIBLE flag is correct.
1286 *
1287 * There should never be dead tuples on a page with PD_ALL_VISIBLE
1288 * set, however.
1289 */
1290 else if (PageIsAllVisible(page) && has_dead_tuples)
1291 {
1292 elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
1293 relname, blkno);
1294 PageClearAllVisible(page);
1295 MarkBufferDirty(buf);
1296 visibilitymap_clear(onerel, blkno, vmbuffer,
1297 VISIBILITYMAP_VALID_BITS);
1298 }
1299
1300 /*
1301 * If the all-visible page turns out to be all-frozen but is not yet
1302 * marked as such, mark it now. Note that all_frozen is only valid
1303 * if all_visible is true, so we must check both.
1304 */
1305 else if (all_visible_according_to_vm && all_visible && all_frozen &&
1306 !VM_ALL_FROZEN(onerel, blkno, &vmbuffer))
1307 {
1308 /*
1309 * We can pass InvalidTransactionId as the cutoff XID here,
1310 * because setting the all-frozen bit doesn't cause recovery
1311 * conflicts.
1312 */
1313 visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
1314 vmbuffer, InvalidTransactionId,
1315 VISIBILITYMAP_ALL_FROZEN);
1316 }
1317
1318 UnlockReleaseBuffer(buf);
1319
1320 /* Remember the location of the last page with nonremovable tuples */
1321 if (hastup)
1322 vacrelstats->nonempty_pages = blkno + 1;
1323
1324 /*
1325 * If we remembered any tuples for deletion, then the page will be
1326 * visited again by lazy_vacuum_heap, which will compute and record
1327 * its post-compaction free space. If not, then we're done with this
1328 * page, so remember its free space as-is. (This path will always be
1329 * taken if there are no indexes.)
1330 */
1331 if (vacrelstats->num_dead_tuples == prev_dead_count)
1332 RecordPageWithFreeSpace(onerel, blkno, freespace);
1333 }
1334
1335 /* report that everything is scanned and vacuumed */
1336 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
1337
1338 pfree(frozen);
1339
1340 /* save stats for use later */
1341 vacrelstats->tuples_deleted = tups_vacuumed;
1342 vacrelstats->new_dead_tuples = nkeep;
1343
1344 /* now we can compute the new value for pg_class.reltuples */
1345 vacrelstats->new_live_tuples = vac_estimate_reltuples(onerel,
1346 nblocks,
1347 vacrelstats->tupcount_pages,
1348 live_tuples);
1349
1350 /* also compute total number of surviving heap entries */
1351 vacrelstats->new_rel_tuples =
1352 vacrelstats->new_live_tuples + vacrelstats->new_dead_tuples;
1353
1354 /*
1355 * Release any remaining pin on visibility map page.
1356 */
1357 if (BufferIsValid(vmbuffer))
1358 {
1359 ReleaseBuffer(vmbuffer);
1360 vmbuffer = InvalidBuffer;
1361 }
1362
1363 /* If any tuples need to be deleted, perform final vacuum cycle */
1364 /* XXX put a threshold on min number of tuples here? */
1365 if (vacrelstats->num_dead_tuples > 0)
1366 {
1367 const int hvp_index[] = {
1368 PROGRESS_VACUUM_PHASE,
1369 PROGRESS_VACUUM_NUM_INDEX_VACUUMS
1370 };
1371 int64 hvp_val[2];
1372
1373 /* Log cleanup info before we touch indexes */
1374 vacuum_log_cleanup_info(onerel, vacrelstats);
1375
1376 /* Report that we are now vacuuming indexes */
1377 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1378 PROGRESS_VACUUM_PHASE_VACUUM_INDEX);
1379
1380 /* Remove index entries */
1381 for (i = 0; i < nindexes; i++)
1382 lazy_vacuum_index(Irel[i],
1383 &indstats[i],
1384 vacrelstats);
1385
1386 /* Report that we are now vacuuming the heap */
1387 hvp_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_HEAP;
1388 hvp_val[1] = vacrelstats->num_index_scans + 1;
1389 pgstat_progress_update_multi_param(2, hvp_index, hvp_val);
1390
1391 /* Remove tuples from heap */
1392 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1393 PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
1394 lazy_vacuum_heap(onerel, vacrelstats);
1395 vacrelstats->num_index_scans++;
1396 }
1397
1398 /*
1399 * Vacuum the remainder of the Free Space Map. We must do this whether or
1400 * not there were indexes.
1401 */
1402 if (blkno > next_fsm_block_to_vacuum)
1403 FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, blkno);
1404
1405 /* report all blocks vacuumed; and that we're cleaning up */
1406 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
1407 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1408 PROGRESS_VACUUM_PHASE_INDEX_CLEANUP);
1409
1410 /* Do post-vacuum cleanup and statistics update for each index */
1411 for (i = 0; i < nindexes; i++)
1412 lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);
1413
1414 /* If no indexes, make log report that lazy_vacuum_heap would've made */
1415 if (vacuumed_pages)
1416 ereport(elevel,
1417 (errmsg("\"%s\": removed %.0f row versions in %u pages",
1418 RelationGetRelationName(onerel),
1419 tups_vacuumed, vacuumed_pages)));
1420
1421 /*
1422 * This is pretty messy, but we split it up so that we can skip emitting
1423 * individual parts of the message when not applicable.
1424 */
1425 initStringInfo(&buf);
1426 appendStringInfo(&buf,
1427 _("%.0f dead row versions cannot be removed yet, oldest xmin: %u\n"),
1428 nkeep, OldestXmin);
1429 appendStringInfo(&buf, _("There were %.0f unused item pointers.\n"),
1430 nunused);
1431 appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins, ",
1432 "Skipped %u pages due to buffer pins, ",
1433 vacrelstats->pinskipped_pages),
1434 vacrelstats->pinskipped_pages);
1435 appendStringInfo(&buf, ngettext("%u frozen page.\n",
1436 "%u frozen pages.\n",
1437 vacrelstats->frozenskipped_pages),
1438 vacrelstats->frozenskipped_pages);
1439 appendStringInfo(&buf, ngettext("%u page is entirely empty.\n",
1440 "%u pages are entirely empty.\n",
1441 empty_pages),
1442 empty_pages);
1443 appendStringInfo(&buf, _("%s."), pg_rusage_show(&ru0));
1444
1445 ereport(elevel,
1446 (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
1447 RelationGetRelationName(onerel),
1448 tups_vacuumed, num_tuples,
1449 vacrelstats->scanned_pages, nblocks),
1450 errdetail_internal("%s", buf.data)));
1451 pfree(buf.data);
1452 }
1453
1454
1455 /*
1456 * lazy_vacuum_heap() -- second pass over the heap
1457 *
1458 * This routine marks dead tuples as unused and compacts out free
1459 * space on their pages. Pages not having dead tuples recorded from
1460 * lazy_scan_heap are not visited at all.
1461 *
1462 * Note: the reason for doing this as a second pass is we cannot remove
1463 * the tuples until we've removed their index entries, and we want to
1464 * process index entry removal in batches as large as possible.
1465 */
1466 static void
1467 lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
1468 {
1469 int tupindex;
1470 int npages;
1471 PGRUsage ru0;
1472 Buffer vmbuffer = InvalidBuffer;
1473
1474 pg_rusage_init(&ru0);
1475 npages = 0;
1476
1477 tupindex = 0;
1478 while (tupindex < vacrelstats->num_dead_tuples)
1479 {
1480 BlockNumber tblk;
1481 Buffer buf;
1482 Page page;
1483 Size freespace;
1484
1485 vacuum_delay_point();
1486
1487 tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
1488 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
1489 vac_strategy);
1490 if (!ConditionalLockBufferForCleanup(buf))
1491 {
1492 ReleaseBuffer(buf);
1493 ++tupindex;
1494 continue;
1495 }
1496 tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats,
1497 &vmbuffer);
1498
1499 /* Now that we've compacted the page, record its available space */
1500 page = BufferGetPage(buf);
1501 freespace = PageGetHeapFreeSpace(page);
1502
1503 UnlockReleaseBuffer(buf);
1504 RecordPageWithFreeSpace(onerel, tblk, freespace);
1505 npages++;
1506 }
1507
1508 if (BufferIsValid(vmbuffer))
1509 {
1510 ReleaseBuffer(vmbuffer);
1511 vmbuffer = InvalidBuffer;
1512 }
1513
1514 ereport(elevel,
1515 (errmsg("\"%s\": removed %d row versions in %d pages",
1516 RelationGetRelationName(onerel),
1517 tupindex, npages),
1518 errdetail_internal("%s", pg_rusage_show(&ru0))));
1519 }
1520
1521 /*
1522 * lazy_vacuum_page() -- free dead tuples on a page
1523 * and repair its fragmentation.
1524 *
1525 * Caller must hold pin and buffer cleanup lock on the buffer.
1526 *
1527 * tupindex is the index in vacrelstats->dead_tuples of the first dead
1528 * tuple for this page. We assume the rest follow sequentially.
1529 * The return value is the first tupindex after the tuples of this page.
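 *
 * Illustrative example (not part of the original comment): if dead_tuples[]
 * holds the TIDs (5,1), (5,4), (6,2) and blkno is 5, then a call with
 * tupindex = 0 marks offsets 1 and 4 of block 5 unused and returns 2.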
1530 */
1531 static int
1532 lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
1533 int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer)
1534 {
1535 Page page = BufferGetPage(buffer);
1536 OffsetNumber unused[MaxOffsetNumber];
1537 int uncnt = 0;
1538 TransactionId visibility_cutoff_xid;
1539 bool all_frozen;
1540
1541 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
1542
1543 START_CRIT_SECTION();
1544
1545 for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
1546 {
1547 BlockNumber tblk;
1548 OffsetNumber toff;
1549 ItemId itemid;
1550
1551 tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
1552 if (tblk != blkno)
1553 break; /* past end of tuples for this block */
1554 toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
1555 itemid = PageGetItemId(page, toff);
1556 ItemIdSetUnused(itemid);
1557 unused[uncnt++] = toff;
1558 }
1559
1560 PageRepairFragmentation(page);
1561
1562 /*
1563 * Mark buffer dirty before we write WAL.
1564 */
1565 MarkBufferDirty(buffer);
1566
1567 /* XLOG stuff */
1568 if (RelationNeedsWAL(onerel))
1569 {
1570 XLogRecPtr recptr;
1571
1572 recptr = log_heap_clean(onerel, buffer,
1573 NULL, 0, NULL, 0,
1574 unused, uncnt,
1575 vacrelstats->latestRemovedXid);
1576 PageSetLSN(page, recptr);
1577 }
1578
1579 /*
1580 * End critical section, so we safely can do visibility tests (which
1581 * possibly need to perform IO and allocate memory!). If we crash now the
1582 * page (including the corresponding vm bit) might not be marked all
1583 * visible, but that's fine. A later vacuum will fix that.
1584 */
1585 END_CRIT_SECTION();
1586
1587 /*
1588 * Now that we have removed the dead tuples from the page, once again
1589 * check if the page has become all-visible. The page is already marked
1590 * dirty, exclusively locked, and, if needed, a full page image has been
1591 * emitted in the log_heap_clean() above.
1592 */
1593 if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid,
1594 &all_frozen))
1595 PageSetAllVisible(page);
1596
1597 /*
1598 * All the changes to the heap page have been done. If the all-visible
1599 * flag is now set, also set the VM all-visible bit (and, if possible, the
1600 * all-frozen bit) unless this has already been done previously.
1601 */
1602 if (PageIsAllVisible(page))
1603 {
1604 uint8 vm_status = visibilitymap_get_status(onerel, blkno, vmbuffer);
1605 uint8 flags = 0;
1606
1607 /* Figure out which VM bits (all-visible, all-frozen) still need to be set */
1608 if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
1609 flags |= VISIBILITYMAP_ALL_VISIBLE;
1610 if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
1611 flags |= VISIBILITYMAP_ALL_FROZEN;
1612
1613 Assert(BufferIsValid(*vmbuffer));
1614 if (flags != 0)
1615 visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr,
1616 *vmbuffer, visibility_cutoff_xid, flags);
1617 }
1618
1619 return tupindex;
1620 }
1621
1622 /*
1623 * lazy_check_needs_freeze() -- scan page to see if any tuples
1624 * need to be cleaned to avoid wraparound
1625 *
1626 * Returns true if the page needs to be vacuumed using cleanup lock.
1627 * Also returns a flag indicating whether the page contains any tuples at all.
1628 */
1629 static bool
1630 lazy_check_needs_freeze(Buffer buf, bool *hastup)
1631 {
1632 Page page = BufferGetPage(buf);
1633 OffsetNumber offnum,
1634 maxoff;
1635 HeapTupleHeader tupleheader;
1636
1637 *hastup = false;
1638
1639 /* If we hit an uninitialized page, we want to force vacuuming it. */
1640 if (PageIsNew(page))
1641 return true;
1642
1643 /* Quick out for ordinary empty page. */
1644 if (PageIsEmpty(page))
1645 return false;
1646
1647 maxoff = PageGetMaxOffsetNumber(page);
1648 for (offnum = FirstOffsetNumber;
1649 offnum <= maxoff;
1650 offnum = OffsetNumberNext(offnum))
1651 {
1652 ItemId itemid;
1653
1654 itemid = PageGetItemId(page, offnum);
1655
1656 /* this should match hastup test in count_nondeletable_pages() */
1657 if (ItemIdIsUsed(itemid))
1658 *hastup = true;
1659
1660 /* dead and redirect items never need freezing */
1661 if (!ItemIdIsNormal(itemid))
1662 continue;
1663
1664 tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
1665
1666 if (heap_tuple_needs_freeze(tupleheader, FreezeLimit,
1667 MultiXactCutoff, buf))
1668 return true;
1669 } /* scan along page */
1670
1671 return false;
1672 }
1673
1674
1675 /*
1676 * lazy_vacuum_index() -- vacuum one index relation.
1677 *
1678 * Delete all the index entries pointing to tuples listed in
1679 * vacrelstats->dead_tuples, and update running statistics.
1680 */
1681 static void
1682 lazy_vacuum_index(Relation indrel,
1683 IndexBulkDeleteResult **stats,
1684 LVRelStats *vacrelstats)
1685 {
1686 IndexVacuumInfo ivinfo;
1687 PGRUsage ru0;
1688
1689 pg_rusage_init(&ru0);
1690
1691 ivinfo.index = indrel;
1692 ivinfo.analyze_only = false;
1693 ivinfo.estimated_count = true;
1694 ivinfo.message_level = elevel;
1695 /* We can only provide an approximate value of num_heap_tuples here */
1696 ivinfo.num_heap_tuples = vacrelstats->old_live_tuples;
1697 ivinfo.strategy = vac_strategy;
1698
1699 /* Do bulk deletion */
1700 *stats = index_bulk_delete(&ivinfo, *stats,
1701 lazy_tid_reaped, (void *) vacrelstats);
1702
1703 ereport(elevel,
1704 (errmsg("scanned index \"%s\" to remove %d row versions",
1705 RelationGetRelationName(indrel),
1706 vacrelstats->num_dead_tuples),
1707 errdetail_internal("%s", pg_rusage_show(&ru0))));
1708 }
1709
1710 /*
1711 * lazy_cleanup_index() -- do post-vacuum cleanup for one index relation.
1712 */
1713 static void
1714 lazy_cleanup_index(Relation indrel,
1715 IndexBulkDeleteResult *stats,
1716 LVRelStats *vacrelstats)
1717 {
1718 IndexVacuumInfo ivinfo;
1719 PGRUsage ru0;
1720
1721 pg_rusage_init(&ru0);
1722
1723 ivinfo.index = indrel;
1724 ivinfo.analyze_only = false;
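/*
 * The heap tuple count is exact only if every heap page was actually
 * scanned; otherwise flag it as an estimate.
 */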
1725 ivinfo.estimated_count = (vacrelstats->tupcount_pages < vacrelstats->rel_pages);
1726 ivinfo.message_level = elevel;
1727
1728 /*
1729 * Now we can provide a better estimate of total number of surviving
1730 * tuples (we assume indexes are more interested in that than in the
1731 * number of nominally live tuples).
1732 */
1733 ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples;
1734 ivinfo.strategy = vac_strategy;
1735
1736 stats = index_vacuum_cleanup(&ivinfo, stats);
1737
1738 if (!stats)
1739 return;
1740
1741 /*
1742 * Now update statistics in pg_class, but only if the index says the count
1743 * is accurate.
1744 */
1745 if (!stats->estimated_count)
1746 vac_update_relstats(indrel,
1747 stats->num_pages,
1748 stats->num_index_tuples,
1749 0,
1750 false,
1751 InvalidTransactionId,
1752 InvalidMultiXactId,
1753 false);
1754
1755 ereport(elevel,
1756 (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
1757 RelationGetRelationName(indrel),
1758 stats->num_index_tuples,
1759 stats->num_pages),
1760 errdetail("%.0f index row versions were removed.\n"
1761 "%u index pages have been deleted, %u are currently reusable.\n"
1762 "%s.",
1763 stats->tuples_removed,
1764 stats->pages_deleted, stats->pages_free,
1765 pg_rusage_show(&ru0))));
1766
1767 pfree(stats);
1768 }
1769
1770 /*
1771 * should_attempt_truncation - should we attempt to truncate the heap?
1772 *
1773 * Don't even think about it unless we have a shot at releasing a goodly
1774 * number of pages. Otherwise, the time taken isn't worth it.
1775 *
1776 * Also don't attempt it if we are doing early pruning/vacuuming, because a
1777 * scan which cannot find a truncated heap page cannot determine that the
1778 * snapshot is too old to read that page. We might be able to get away with
1779 * truncating all except one of the pages, setting its LSN to (at least) the
1780 * maximum of the truncated range if we also treated an index leaf tuple
1781 * pointing to a missing heap page as something to trigger the "snapshot too
1782 * old" error, but that seems fragile and seems like it deserves its own patch
1783 * if we consider it.
1784 *
1785 * This is split out so that we can test whether truncation is going to be
1786 * called for before we actually do it. If you change the logic here, be
1787 * careful to depend only on fields that lazy_scan_heap updates on-the-fly.
1788 */
1789 static bool
1790 should_attempt_truncation(LVRelStats *vacrelstats)
1791 {
1792 BlockNumber possibly_freeable;
1793
1794 possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
1795 if (possibly_freeable > 0 &&
1796 (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
1797 possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION) &&
1798 old_snapshot_threshold < 0)
1799 return true;
1800 else
1801 return false;
1802 }
1803
1804 /*
1805 * lazy_truncate_heap - try to truncate off any empty pages at the end
1806 */
1807 static void
1808 lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
1809 {
1810 BlockNumber old_rel_pages = vacrelstats->rel_pages;
1811 BlockNumber new_rel_pages;
1812 int lock_retry;
1813
1814 /* Report that we are now truncating */
1815 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1816 PROGRESS_VACUUM_PHASE_TRUNCATE);
1817
1818 /*
1819 * Loop until no more truncating can be done.
1820 */
1821 do
1822 {
1823 PGRUsage ru0;
1824
1825 pg_rusage_init(&ru0);
1826
1827 /*
1828 * We need full exclusive lock on the relation in order to do
1829 * truncation. If we can't get it, give up rather than waiting --- we
1830 * don't want to block other backends, and we don't want to deadlock
1831 * (which is quite possible considering we already hold a lower-grade
1832 * lock).
1833 */
1834 vacrelstats->lock_waiter_detected = false;
1835 lock_retry = 0;
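/*
 * Retry the conditional lock acquisition every
 * VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL ms; with the settings above (50 ms
 * waits, 5000 ms timeout) that works out to roughly 100 attempts before
 * giving up.
 */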
1836 while (true)
1837 {
1838 if (ConditionalLockRelation(onerel, AccessExclusiveLock))
1839 break;
1840
1841 /*
1842 * Check for interrupts while trying to (re-)acquire the exclusive
1843 * lock.
1844 */
1845 CHECK_FOR_INTERRUPTS();
1846
1847 if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
1848 VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
1849 {
1850 /*
1851 * We failed to establish the lock in the specified number of
1852 * retries. This means we give up truncating.
1853 */
1854 vacrelstats->lock_waiter_detected = true;
1855 ereport(elevel,
1856 (errmsg("\"%s\": stopping truncate due to conflicting lock request",
1857 RelationGetRelationName(onerel))));
1858 return;
1859 }
1860
1861 pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L);
1862 }
1863
1864 /*
1865 * Now that we have exclusive lock, look to see if the rel has grown
1866 * whilst we were vacuuming with non-exclusive lock. If so, give up;
1867 * the newly added pages presumably contain non-deletable tuples.
1868 */
1869 new_rel_pages = RelationGetNumberOfBlocks(onerel);
1870 if (new_rel_pages != old_rel_pages)
1871 {
1872 /*
1873 * Note: we intentionally don't update vacrelstats->rel_pages with
1874 * the new rel size here. If we did, it would amount to assuming
1875 * that the new pages are empty, which is unlikely. Leaving the
1876 * numbers alone amounts to assuming that the new pages have the
1877 * same tuple density as existing ones, which is less unlikely.
1878 */
1879 UnlockRelation(onerel, AccessExclusiveLock);
1880 return;
1881 }
1882
1883 /*
1884 * Scan backwards from the end to verify that the end pages actually
1885 * contain no tuples. This is *necessary*, not optional, because
1886 * other backends could have added tuples to these pages whilst we
1887 * were vacuuming.
1888 */
1889 new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);
1890
1891 if (new_rel_pages >= old_rel_pages)
1892 {
1893 /* can't do anything after all */
1894 UnlockRelation(onerel, AccessExclusiveLock);
1895 return;
1896 }
1897
1898 /*
1899 * Okay to truncate.
1900 */
1901 RelationTruncate(onerel, new_rel_pages);
1902
1903 /*
1904 * We can release the exclusive lock as soon as we have truncated.
1905 * Other backends can't safely access the relation until they have
1906 * processed the smgr invalidation that smgrtruncate sent out ... but
1907 * that should happen as part of standard invalidation processing once
1908 * they acquire lock on the relation.
1909 */
1910 UnlockRelation(onerel, AccessExclusiveLock);
1911
1912 /*
1913 * Update statistics. Here, it *is* correct to adjust rel_pages
1914 * without also touching reltuples, since the tuple count wasn't
1915 * changed by the truncation.
1916 */
1917 vacrelstats->pages_removed += old_rel_pages - new_rel_pages;
1918 vacrelstats->rel_pages = new_rel_pages;
1919
1920 ereport(elevel,
1921 (errmsg("\"%s\": truncated %u to %u pages",
1922 RelationGetRelationName(onerel),
1923 old_rel_pages, new_rel_pages),
1924 errdetail_internal("%s",
1925 pg_rusage_show(&ru0))));
1926 old_rel_pages = new_rel_pages;
1927 } while (new_rel_pages > vacrelstats->nonempty_pages &&
1928 vacrelstats->lock_waiter_detected);
1929 }
1930
1931 /*
1932 * Rescan end pages to verify that they are (still) empty of tuples.
1933 *
1934 * Returns number of nondeletable pages (last nonempty page + 1).
1935 */
1936 static BlockNumber
1937 count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
1938 {
1939 BlockNumber blkno;
1940 BlockNumber prefetchedUntil;
1941 instr_time starttime;
1942
1943 /* Initialize the starttime if we check for conflicting lock requests */
1944 INSTR_TIME_SET_CURRENT(starttime);
1945
1946 /*
1947 * Start checking blocks at what we believe relation end to be and move
1948 * backwards. (Strange coding of loop control is needed because blkno is
1949 * unsigned.) To make the scan faster, we prefetch a few blocks at a time
1950 * in forward direction, so that OS-level readahead can kick in.
1951 */
1952 blkno = vacrelstats->rel_pages;
1953 StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0,
1954 "prefetch size must be power of 2");
1955 prefetchedUntil = InvalidBlockNumber;
1956 while (blkno > vacrelstats->nonempty_pages)
1957 {
1958 Buffer buf;
1959 Page page;
1960 OffsetNumber offnum,
1961 maxoff;
1962 bool hastup;
1963
1964 /*
1965 * Check if another process requests a lock on our relation. We are
1966 * holding an AccessExclusiveLock here, so they will be waiting. We
1967 * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
1968 * only check if that interval has elapsed once every 32 blocks to
1969 * keep the number of system calls and actual shared lock table
1970 * lookups to a minimum.
1971 */
1972 if ((blkno % 32) == 0)
1973 {
1974 instr_time currenttime;
1975 instr_time elapsed;
1976
1977 INSTR_TIME_SET_CURRENT(currenttime);
1978 elapsed = currenttime;
1979 INSTR_TIME_SUBTRACT(elapsed, starttime);
1980 if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
1981 >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
1982 {
1983 if (LockHasWaitersRelation(onerel, AccessExclusiveLock))
1984 {
1985 ereport(elevel,
1986 (errmsg("\"%s\": suspending truncate due to conflicting lock request",
1987 RelationGetRelationName(onerel))));
1988
1989 vacrelstats->lock_waiter_detected = true;
1990 return blkno;
1991 }
1992 starttime = currenttime;
1993 }
1994 }
1995
1996 /*
1997 * We don't insert a vacuum delay point here, because we have an
1998 * exclusive lock on the table which we want to hold for as short a
1999 * time as possible. We still need to check for interrupts however.
2000 */
2001 CHECK_FOR_INTERRUPTS();
2002
2003 blkno--;
2004
2005 /* If we haven't prefetched this lot yet, do so now. */
2006 if (prefetchedUntil > blkno)
2007 {
2008 BlockNumber prefetchStart;
2009 BlockNumber pblkno;
2010
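/*
 * PREFETCH_SIZE is a power of 2 (asserted above), so the mask below
 * rounds blkno down to the start of its prefetch window; for example, if
 * PREFETCH_SIZE were 32 and blkno were 1000, prefetchStart would be 992.
 */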
2011 prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
2012 for (pblkno = prefetchStart; pblkno <= blkno; pblkno++)
2013 {
2014 PrefetchBuffer(onerel, MAIN_FORKNUM, pblkno);
2015 CHECK_FOR_INTERRUPTS();
2016 }
2017 prefetchedUntil = prefetchStart;
2018 }
2019
2020 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
2021 RBM_NORMAL, vac_strategy);
2022
2023 /* In this phase we only need shared access to the buffer */
2024 LockBuffer(buf, BUFFER_LOCK_SHARE);
2025
2026 page = BufferGetPage(buf);
2027
2028 if (PageIsNew(page) || PageIsEmpty(page))
2029 {
2030 /* PageIsNew probably shouldn't happen... */
2031 UnlockReleaseBuffer(buf);
2032 continue;
2033 }
2034
2035 hastup = false;
2036 maxoff = PageGetMaxOffsetNumber(page);
2037 for (offnum = FirstOffsetNumber;
2038 offnum <= maxoff;
2039 offnum = OffsetNumberNext(offnum))
2040 {
2041 ItemId itemid;
2042
2043 itemid = PageGetItemId(page, offnum);
2044
2045 /*
2046 * Note: any non-unused item should be taken as a reason to keep
2047 * this page. We formerly thought that DEAD tuples could be
2048 * thrown away, but that's not so, because we'd not have cleaned
2049 * out their index entries.
2050 */
2051 if (ItemIdIsUsed(itemid))
2052 {
2053 hastup = true;
2054 break; /* can stop scanning */
2055 }
2056 } /* scan along page */
2057
2058 UnlockReleaseBuffer(buf);
2059
2060 /* Done scanning if we found a tuple here */
2061 if (hastup)
2062 return blkno + 1;
2063 }
2064
2065 /*
2066 * If we fall out of the loop, all the previously-thought-to-be-empty
2067 * pages still are; we need not bother to look at the last known-nonempty
2068 * page.
2069 */
2070 return vacrelstats->nonempty_pages;
2071 }
2072
2073 /*
2074 * lazy_space_alloc - space allocation decisions for lazy vacuum
2075 *
2076 * See the comments at the head of this file for rationale.
2077 */
2078 static void
2079 lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
2080 {
2081 long maxtuples;
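/* Autovacuum workers use autovacuum_work_mem as the budget when it is set. */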
2082 int vac_work_mem = IsAutoVacuumWorkerProcess() &&
2083 autovacuum_work_mem != -1 ?
2084 autovacuum_work_mem : maintenance_work_mem;
2085
2086 if (vacrelstats->hasindex)
2087 {
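/*
 * Size the dead-tuple TID array from the memory budget.  As a rough
 * illustration (before the clamps below): with, say, a 64MB budget and
 * the 6-byte ItemPointerData, about 11 million TIDs can be tracked per
 * index-cleanup pass.
 */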
2088 maxtuples = (vac_work_mem * 1024L) / sizeof(ItemPointerData);
2089 maxtuples = Min(maxtuples, INT_MAX);
2090 maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));
2091
2092 /* curious coding here to ensure the multiplication can't overflow */
2093 if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
2094 maxtuples = relblocks * LAZY_ALLOC_TUPLES;
2095
2096 /* stay sane if small maintenance_work_mem */
2097 maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
2098 }
2099 else
2100 {
2101 maxtuples = MaxHeapTuplesPerPage;
2102 }
2103
2104 vacrelstats->num_dead_tuples = 0;
2105 vacrelstats->max_dead_tuples = (int) maxtuples;
2106 vacrelstats->dead_tuples = (ItemPointer)
2107 palloc(maxtuples * sizeof(ItemPointerData));
2108 }
2109
2110 /*
2111 * lazy_record_dead_tuple - remember one deletable tuple
2112 */
2113 static void
2114 lazy_record_dead_tuple(LVRelStats *vacrelstats,
2115 ItemPointer itemptr)
2116 {
2117 /*
2118 * The array shouldn't overflow under normal behavior, but perhaps it
2119 * could if we are given a really small maintenance_work_mem. In that
2120 * case, just forget the last few tuples (we'll get 'em next time).
2121 */
2122 if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
2123 {
2124 vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
2125 vacrelstats->num_dead_tuples++;
2126 pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
2127 vacrelstats->num_dead_tuples);
2128 }
2129 }
2130
2131 /*
2132 * lazy_tid_reaped() -- is a particular tid deletable?
2133 *
2134 * This has the right signature to be an IndexBulkDeleteCallback.
2135 *
2136 * Assumes dead_tuples array is in sorted order.
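* (The array is filled in physical heap order during the scan, which keeps
* it sorted without an explicit qsort.)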
2137 */
2138 static bool
2139 lazy_tid_reaped(ItemPointer itemptr, void *state)
2140 {
2141 LVRelStats *vacrelstats = (LVRelStats *) state;
2142 ItemPointer res;
2143
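/* Binary search the sorted dead-tuple array: O(log n) per index entry. */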
2144 res = (ItemPointer) bsearch((void *) itemptr,
2145 (void *) vacrelstats->dead_tuples,
2146 vacrelstats->num_dead_tuples,
2147 sizeof(ItemPointerData),
2148 vac_cmp_itemptr);
2149
2150 return (res != NULL);
2151 }
2152
2153 /*
2154 * Comparator routines for use with qsort() and bsearch().
2155 */
2156 static int
2157 vac_cmp_itemptr(const void *left, const void *right)
2158 {
2159 BlockNumber lblk,
2160 rblk;
2161 OffsetNumber loff,
2162 roff;
2163
2164 lblk = ItemPointerGetBlockNumber((ItemPointer) left);
2165 rblk = ItemPointerGetBlockNumber((ItemPointer) right);
2166
2167 if (lblk < rblk)
2168 return -1;
2169 if (lblk > rblk)
2170 return 1;
2171
2172 loff = ItemPointerGetOffsetNumber((ItemPointer) left);
2173 roff = ItemPointerGetOffsetNumber((ItemPointer) right);
2174
2175 if (loff < roff)
2176 return -1;
2177 if (loff > roff)
2178 return 1;
2179
2180 return 0;
2181 }
2182
2183 /*
2184 * Check if every tuple in the given page is visible to all current and future
2185 * transactions. Also return the visibility_cutoff_xid which is the highest
2186 * xmin amongst the visible tuples. Set *all_frozen to true if every tuple
2187 * on this page is frozen.
2188 */
2189 static bool
2190 heap_page_is_all_visible(Relation rel, Buffer buf,
2191 TransactionId *visibility_cutoff_xid,
2192 bool *all_frozen)
2193 {
2194 Page page = BufferGetPage(buf);
2195 BlockNumber blockno = BufferGetBlockNumber(buf);
2196 OffsetNumber offnum,
2197 maxoff;
2198 bool all_visible = true;
2199
2200 *visibility_cutoff_xid = InvalidTransactionId;
2201 *all_frozen = true;
2202
2203 /*
2204 * This is a stripped down version of the line pointer scan in
2205 * lazy_scan_heap(). So if you change anything here, also check that code.
2206 */
2207 maxoff = PageGetMaxOffsetNumber(page);
2208 for (offnum = FirstOffsetNumber;
2209 offnum <= maxoff && all_visible;
2210 offnum = OffsetNumberNext(offnum))
2211 {
2212 ItemId itemid;
2213 HeapTupleData tuple;
2214
2215 itemid = PageGetItemId(page, offnum);
2216
2217 /* Unused or redirect line pointers are of no interest */
2218 if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
2219 continue;
2220
2221 ItemPointerSet(&(tuple.t_self), blockno, offnum);
2222
2223 /*
2224 * Dead line pointers can have index pointers pointing to them. So
2225 * they can't be treated as visible.
2226 */
2227 if (ItemIdIsDead(itemid))
2228 {
2229 all_visible = false;
2230 *all_frozen = false;
2231 break;
2232 }
2233
2234 Assert(ItemIdIsNormal(itemid));
2235
2236 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
2237 tuple.t_len = ItemIdGetLength(itemid);
2238 tuple.t_tableOid = RelationGetRelid(rel);
2239
2240 switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
2241 {
2242 case HEAPTUPLE_LIVE:
2243 {
2244 TransactionId xmin;
2245
2246 /* Check comments in lazy_scan_heap. */
2247 if (!HeapTupleHeaderXminCommitted(tuple.t_data))
2248 {
2249 all_visible = false;
2250 *all_frozen = false;
2251 break;
2252 }
2253
2254 /*
2255 * The inserter definitely committed. But is it old enough
2256 * that everyone sees it as committed?
2257 */
2258 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
2259 if (!TransactionIdPrecedes(xmin, OldestXmin))
2260 {
2261 all_visible = false;
2262 *all_frozen = false;
2263 break;
2264 }
2265
2266 /* Track newest xmin on page. */
2267 if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
2268 *visibility_cutoff_xid = xmin;
2269
2270 /* Check whether this tuple is already frozen or not */
2271 if (all_visible && *all_frozen &&
2272 heap_tuple_needs_eventual_freeze(tuple.t_data))
2273 *all_frozen = false;
2274 }
2275 break;
2276
2277 case HEAPTUPLE_DEAD:
2278 case HEAPTUPLE_RECENTLY_DEAD:
2279 case HEAPTUPLE_INSERT_IN_PROGRESS:
2280 case HEAPTUPLE_DELETE_IN_PROGRESS:
2281 {
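/*
 * Dead, recently-dead, or in-progress tuples are not visible to every
 * transaction, so the page cannot be treated as all-visible or all-frozen.
 */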
2282 all_visible = false;
2283 *all_frozen = false;
2284 break;
2285 }
2286 default:
2287 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
2288 break;
2289 }
2290 } /* scan along page */
2291
2292 return all_visible;
2293 }
2294