1 /*-------------------------------------------------------------------------
2  *
3  * autovacuum.c
4  *
5  * PostgreSQL Integrated Autovacuum Daemon
6  *
7  * The autovacuum system is structured in two different kinds of processes: the
8  * autovacuum launcher and the autovacuum worker.  The launcher is an
9  * always-running process, started by the postmaster when the autovacuum GUC
10  * parameter is set.  The launcher schedules autovacuum workers to be started
11  * when appropriate.  The workers are the processes which execute the actual
12  * vacuuming; they connect to a database as determined in the launcher, and
13  * once connected they examine the catalogs to select the tables to vacuum.
14  *
15  * The autovacuum launcher cannot start the worker processes by itself,
16  * because doing so would cause robustness issues (namely, failure to shut
17  * them down on exceptional conditions, and also, since the launcher is
18  * connected to shared memory and is thus subject to corruption there, it is
19  * not as robust as the postmaster).  So it leaves that task to the postmaster.
20  *
21  * There is an autovacuum shared memory area, where the launcher stores
22  * information about the database it wants vacuumed.  When it wants a new
23  * worker to start, it sets a flag in shared memory and sends a signal to the
24  * postmaster.  The postmaster knows nothing more than that it must start a
25  * worker, so it forks a new child, which turns into a worker.  This new process
26  * connects to shared memory, and there it can inspect the information that the
27  * launcher has set up.
28  *
29  * If the fork() call fails in the postmaster, it sets a flag in the shared
30  * memory area, and sends a signal to the launcher.  The launcher, upon
31  * noticing the flag, can try starting the worker again by resending the
32  * signal.  Note that the failure can only be transient (fork failure due to
33  * high load, memory pressure, too many processes, etc); more permanent
34  * problems, like failure to connect to a database, are detected later in the
35  * worker and dealt with just by having the worker exit normally.  The launcher
36  * will launch a new worker again later, per schedule.
37  *
38  * When the worker is done vacuuming it sends SIGUSR2 to the launcher.  The
39  * launcher then wakes up and is able to launch another worker, if the schedule
40  * is so tight that a new worker is needed immediately.  At this time the
41  * launcher can also balance the settings for the various remaining workers'
42  * cost-based vacuum delay feature.
43  *
44  * Note that there can be more than one worker in a database concurrently.
45  * They will store the table they are currently vacuuming in shared memory, so
46  * that other workers avoid being blocked waiting for the vacuum lock for that
47  * table.  They will also reload the pgstats data just before vacuuming each
48  * table, to avoid vacuuming a table that was just finished being vacuumed by
49  * another worker and thus is no longer noted in shared memory.  However,
50  * there is a window (caused by pgstat delay) during which a worker may choose a
51  * table that was already vacuumed; this is a bug in the current design.
52  *
53  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
54  * Portions Copyright (c) 1994, Regents of the University of California
55  *
56  *
57  * IDENTIFICATION
58  *	  src/backend/postmaster/autovacuum.c
59  *
60  *-------------------------------------------------------------------------
61  */
62 #include "postgres.h"
63 
64 #include <signal.h>
65 #include <sys/time.h>
66 #include <unistd.h>
67 
68 #include "access/heapam.h"
69 #include "access/htup_details.h"
70 #include "access/multixact.h"
71 #include "access/reloptions.h"
72 #include "access/transam.h"
73 #include "access/xact.h"
74 #include "catalog/dependency.h"
75 #include "catalog/namespace.h"
76 #include "catalog/pg_database.h"
77 #include "commands/dbcommands.h"
78 #include "commands/vacuum.h"
79 #include "lib/ilist.h"
80 #include "libpq/pqsignal.h"
81 #include "miscadmin.h"
82 #include "pgstat.h"
83 #include "postmaster/autovacuum.h"
84 #include "postmaster/fork_process.h"
85 #include "postmaster/postmaster.h"
86 #include "storage/bufmgr.h"
87 #include "storage/ipc.h"
88 #include "storage/latch.h"
89 #include "storage/lmgr.h"
90 #include "storage/pmsignal.h"
91 #include "storage/proc.h"
92 #include "storage/procsignal.h"
93 #include "storage/sinvaladt.h"
94 #include "storage/smgr.h"
95 #include "tcop/tcopprot.h"
96 #include "utils/fmgroids.h"
97 #include "utils/fmgrprotos.h"
98 #include "utils/lsyscache.h"
99 #include "utils/memutils.h"
100 #include "utils/ps_status.h"
101 #include "utils/rel.h"
102 #include "utils/snapmgr.h"
103 #include "utils/syscache.h"
104 #include "utils/timeout.h"
105 #include "utils/timestamp.h"
106 #include "utils/tqual.h"
107 
108 
109 /*
110  * GUC parameters
 *
 * Only two of these are referenced by the launcher code in this part of the
 * file: autovacuum_naptime is treated as a number of seconds (it is assigned
 * directly to tv_sec when the launcher computes its sleep), and
 * autovacuum_max_workers is what the shared array of WorkerInfo structs is
 * sized by.
 *
 * NOTE(review): units, ranges and boot defaults of the other settings are
 * established by the GUC machinery rather than here; the initializers below
 * are only the values the variables hold before that machinery runs.
111  */
112 bool		autovacuum_start_daemon = false;
113 int			autovacuum_max_workers;
114 int			autovacuum_work_mem = -1;
115 int			autovacuum_naptime;
116 int			autovacuum_vac_thresh;
117 double		autovacuum_vac_scale;
118 int			autovacuum_anl_thresh;
119 double		autovacuum_anl_scale;
120 int			autovacuum_freeze_max_age;
121 int			autovacuum_multixact_freeze_max_age;
122 
123 int			autovacuum_vac_cost_delay;
124 int			autovacuum_vac_cost_limit;
125 
126 int			Log_autovacuum_min_duration = -1;
127 
128 /* how long to keep pgstat data in the launcher, in milliseconds */
129 #define STATS_READ_DELAY 1000
130 
131 /* the minimum allowed time between two awakenings of the launcher */
/*
 * Both bounds are applied in launcher_determine_sleep().  Mind the units:
 * the minimum is a floating-point number of milliseconds (it is multiplied
 * by 1000 there to obtain microseconds for tv_usec), whereas the maximum is
 * a whole number of seconds compared against tv_sec.
 */
132 #define MIN_AUTOVAC_SLEEPTIME 100.0 /* milliseconds */
133 #define MAX_AUTOVAC_SLEEPTIME 300	/* seconds */
134 
135 /* Flags to tell if we are in an autovacuum process */
/* am_autovacuum_launcher is set at the top of AutoVacLauncherMain (and by
 * AutovacuumLauncherIAm in EXEC_BACKEND builds). */
136 static bool am_autovacuum_launcher = false;
137 static bool am_autovacuum_worker = false;
138 
139 /* Flags set by signal handlers */
/*
 * The launcher's main loop polls these after each latch wakeup: got_SIGTERM
 * ends the loop, while got_SIGHUP and got_SIGUSR2 are cleared there before
 * the corresponding work is done.
 */
140 static volatile sig_atomic_t got_SIGHUP = false;
141 static volatile sig_atomic_t got_SIGUSR2 = false;
142 static volatile sig_atomic_t got_SIGTERM = false;
143 
144 /* Comparison points for determining whether freeze_max_age is exceeded */
145 static TransactionId recentXid;
146 static MultiXactId recentMulti;
147 
148 /* Default freeze ages to use for autovacuum (varies by database) */
149 static int	default_freeze_min_age;
150 static int	default_freeze_table_age;
151 static int	default_multixact_freeze_min_age;
152 static int	default_multixact_freeze_table_age;
153 
154 /* Memory context for long-lived data */
/*
 * In the launcher this is created as a child of TopMemoryContext, made the
 * current context, and reset (children deleted) during error recovery.
 */
155 static MemoryContext AutovacMemCxt;
156 
157 /* struct to keep track of databases in launcher */
/*
 * Elements of this type are linked into DatabaseList through adl_node; the
 * launcher always looks at the tail element to find the database whose
 * adl_next_worker time comes due first.
 */
158 typedef struct avl_dbase
159 {
160 	Oid			adl_datid;		/* hash key -- must be first */
161 	TimestampTz adl_next_worker;	/* time at which a worker is next wanted */
162 	int			adl_score;		/* NOTE(review): presumably an ordering key
								 * used when the list is rebuilt -- confirm in
								 * rebuild_database_list */
163 	dlist_node	adl_node;		/* link in DatabaseList */
164 } avl_dbase;
165 
166 /* struct to keep track of databases in worker */
/*
 * NOTE(review): none of the code that fills in or reads these fields is in
 * the nearby launcher section; the field notes below are inferred from the
 * names and types only and should be confirmed against get_database_list
 * and do_start_worker.
 */
167 typedef struct avw_dbase
168 {
169 	Oid			adw_datid;		/* database OID */
170 	char	   *adw_name;		/* database name */
171 	TransactionId adw_frozenxid;	/* presumably the database's frozen XID
									 * horizon -- confirm */
172 	MultiXactId adw_minmulti;	/* presumably the database's minimum
								 * multixact ID -- confirm */
173 	PgStat_StatDBEntry *adw_entry;	/* presumably the pgstat entry for this
									 * database, if any -- confirm */
174 } avw_dbase;
175 
176 /* struct to keep track of tables to vacuum and/or analyze, in 1st pass */
177 typedef struct av_relation
178 {
179 	Oid			ar_toastrelid;	/* hash key - must be first */
180 	Oid			ar_relid;
181 	bool		ar_hasrelopts;
182 	AutoVacOpts ar_reloptions;	/* copy of AutoVacOpts from the main table's
183 								 * reloptions.  This member is stored by value,
								 * so it cannot itself be NULL; presumably it
								 * is meaningful only when ar_hasrelopts is
								 * true (NOTE(review): confirm at the point
								 * where entries are created). */
184 } av_relation;
185 
186 /* struct to keep track of tables to vacuum and/or analyze, after rechecking */
/*
 * NOTE(review): the code that builds and consumes this struct is not part of
 * the launcher section; the notes below are inferred from names and types
 * and should be confirmed against table_recheck_autovac and
 * autovacuum_do_vac_analyze.
 */
187 typedef struct autovac_table
188 {
189 	Oid			at_relid;
190 	int			at_vacoptions;	/* bitmask of VacuumOption */
191 	VacuumParams at_params;
192 	int			at_vacuum_cost_delay;
193 	int			at_vacuum_cost_limit;
194 	bool		at_dobalance;	/* compare wi_dobalance in WorkerInfoData */
195 	bool		at_sharedrel;	/* compare wi_sharedrel in WorkerInfoData */
196 	char	   *at_relname;
197 	char	   *at_nspname;
198 	char	   *at_datname;
199 } autovac_table;
200 
201 /*-------------
202  * This struct holds information about a single worker's whereabouts.  We keep
203  * an array of these in shared memory, sized according to
204  * autovacuum_max_workers.
205  *
206  * wi_links		entry into free list or running list
207  * wi_dboid		OID of the database this worker is supposed to work on
208  * wi_tableoid	OID of the table currently being vacuumed, if any
209  * wi_sharedrel flag indicating whether table is marked relisshared
210  * wi_proc		pointer to PGPROC of the running worker, NULL if not started
211  * wi_launchtime Time at which this worker was launched
212  * wi_cost_*	Vacuum cost-based delay parameters current in this worker
 * wi_dobalance (not described in the original list) presumably whether this
 *				worker takes part in cost balancing -- NOTE(review): confirm
 *				in autovac_balance_cost
213  *
214  * All fields are protected by AutovacuumLock, except for wi_tableoid and
215  * wi_sharedrel which are protected by AutovacuumScheduleLock (note these
216  * two fields are read-only for everyone except that worker itself).
 *
 * When the launcher gives up on a worker that took too long to start, it
 * resets wi_dboid, wi_tableoid, wi_sharedrel, wi_proc and wi_launchtime and
 * pushes the entry back onto the free list.
217  *-------------
218  */
219 typedef struct WorkerInfoData
220 {
221 	dlist_node	wi_links;
222 	Oid			wi_dboid;
223 	Oid			wi_tableoid;
224 	PGPROC	   *wi_proc;
225 	TimestampTz wi_launchtime;
226 	bool		wi_dobalance;
227 	bool		wi_sharedrel;
228 	int			wi_cost_delay;
229 	int			wi_cost_limit;
230 	int			wi_cost_limit_base;
231 } WorkerInfoData;
232 
/* Handle type used throughout this file for entries of the shared array */
233 typedef struct WorkerInfoData *WorkerInfo;
234 
235 /*
236  * Possible signals received by the launcher from remote processes.  These are
237  * stored atomically in shared memory so that other processes can set them
238  * without locking.
 *
 * The launcher examines the matching av_signal[] slots once it has noticed
 * a SIGUSR2, and resets each slot to false as it handles it.  The enum
 * values double as indexes into that array.
239  */
240 typedef enum
241 {
242 	AutoVacForkFailed,			/* failed trying to start a worker */
243 	AutoVacRebalance,			/* rebalance the cost limits */
244 	AutoVacNumSignals			/* must be last; gives the length of
								 * av_signal[] */
245 }			AutoVacuumSignal;
246 
247 /*
248  * Autovacuum workitem array, stored in AutoVacuumShmem->av_workItems.  This
249  * list is mostly protected by AutovacuumLock, except that if an item is
250  * marked 'active' other processes must not modify the work-identifying
251  * members.
252  */
253 typedef struct AutoVacuumWorkItem
254 {
255 	AutoVacuumWorkItemType avw_type;
256 	bool		avw_used;		/* below data is valid */
257 	bool		avw_active;		/* being processed */
258 	Oid			avw_database;
259 	Oid			avw_relation;
260 	BlockNumber avw_blockNumber;
261 } AutoVacuumWorkItem;
262 
/* Fixed number of slots in the shared av_workItems array */
263 #define NUM_WORKITEMS	256
264 
265 /*-------------
266  * The main autovacuum shmem struct.  On shared memory we store this main
267  * struct and the array of WorkerInfo structs.  This struct keeps:
268  *
269  * av_signal		set by other processes to indicate various conditions
270  * av_launcherpid	the PID of the autovacuum launcher
271  * av_freeWorkers	the WorkerInfo freelist
272  * av_runningWorkers the WorkerInfo non-free queue
273  * av_startingWorker pointer to WorkerInfo currently being started (cleared by
274  *					the worker itself as soon as it's up and running)
275  * av_workItems		work item array
276  *
277  * This struct is protected by AutovacuumLock, except for av_signal and parts
278  * of the worker list (see above).
 *
 * The launcher stores its own PID in av_launcherpid once it enters normal
 * (non-emergency) operation and zeroes it again on its normal shutdown path.
 * It may also clear av_startingWorker itself when a worker has failed to
 * come up within the allowed time.
279  *-------------
280  */
281 typedef struct
282 {
283 	sig_atomic_t av_signal[AutoVacNumSignals];
284 	pid_t		av_launcherpid;
285 	dlist_head	av_freeWorkers;
286 	dlist_head	av_runningWorkers;
287 	WorkerInfo	av_startingWorker;
288 	AutoVacuumWorkItem av_workItems[NUM_WORKITEMS];
289 } AutoVacuumShmemStruct;
290 
/* This process's pointer to the shared struct described above */
291 static AutoVacuumShmemStruct *AutoVacuumShmem;
292 
293 /*
294  * the database list (of avl_dbase elements) in the launcher, and the context
295  * that contains it
 *
 * The launcher's error-recovery path resets DatabaseListCxt to NULL and
 * re-initializes DatabaseList, because resetting AutovacMemCxt frees the
 * memory both of them refer to.
296  */
297 static dlist_head DatabaseList = DLIST_STATIC_INIT(DatabaseList);
298 static MemoryContext DatabaseListCxt = NULL;
299 
300 /* Pointer to my own WorkerInfo, valid on each worker */
301 static WorkerInfo MyWorkerInfo = NULL;
302 
303 /* PID of launcher, valid only in worker while shutting down */
304 int			AutovacuumLauncherPid = 0;
305 
306 #ifdef EXEC_BACKEND
307 static pid_t avlauncher_forkexec(void);
308 static pid_t avworker_forkexec(void);
309 #endif
310 NON_EXEC_STATIC void AutoVacWorkerMain(int argc, char *argv[]) pg_attribute_noreturn();
311 NON_EXEC_STATIC void AutoVacLauncherMain(int argc, char *argv[]) pg_attribute_noreturn();
312 
313 static Oid	do_start_worker(void);
314 static void launcher_determine_sleep(bool canlaunch, bool recursing,
315 						 struct timeval *nap);
316 static void launch_worker(TimestampTz now);
317 static List *get_database_list(void);
318 static void rebuild_database_list(Oid newdb);
319 static int	db_comparator(const void *a, const void *b);
320 static void autovac_balance_cost(void);
321 
322 static void do_autovacuum(void);
323 static void FreeWorkerInfo(int code, Datum arg);
324 
325 static autovac_table *table_recheck_autovac(Oid relid, HTAB *table_toast_map,
326 					  TupleDesc pg_class_desc,
327 					  int effective_multixact_freeze_max_age);
328 static void relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts,
329 						  Form_pg_class classForm,
330 						  PgStat_StatTabEntry *tabentry,
331 						  int effective_multixact_freeze_max_age,
332 						  bool *dovacuum, bool *doanalyze, bool *wraparound);
333 
334 static void autovacuum_do_vac_analyze(autovac_table *tab,
335 						  BufferAccessStrategy bstrategy);
336 static AutoVacOpts *extract_autovac_opts(HeapTuple tup,
337 					 TupleDesc pg_class_desc);
338 static PgStat_StatTabEntry *get_pgstat_tabentry_relid(Oid relid, bool isshared,
339 						  PgStat_StatDBEntry *shared,
340 						  PgStat_StatDBEntry *dbentry);
341 static void perform_work_item(AutoVacuumWorkItem *workitem);
342 static void autovac_report_activity(autovac_table *tab);
343 static void autovac_report_workitem(AutoVacuumWorkItem *workitem,
344 						const char *nspname, const char *relname);
345 static void av_sighup_handler(SIGNAL_ARGS);
346 static void avl_sigusr2_handler(SIGNAL_ARGS);
347 static void avl_sigterm_handler(SIGNAL_ARGS);
348 static void autovac_refresh_stats(void);
349 
350 
351 
352 /********************************************************************
353  *					  AUTOVACUUM LAUNCHER CODE
354  ********************************************************************/
355 
356 #ifdef EXEC_BACKEND
357 /*
358  * forkexec routine for the autovacuum launcher process.
359  *
360  * Format up the arglist, then fork and exec.
361  */
362 static pid_t
avlauncher_forkexec(void)363 avlauncher_forkexec(void)
364 {
365 	char	   *av[10];
366 	int			ac = 0;
367 
368 	av[ac++] = "postgres";
369 	av[ac++] = "--forkavlauncher";
370 	av[ac++] = NULL;			/* filled in by postmaster_forkexec */
371 	av[ac] = NULL;
372 
373 	Assert(ac < lengthof(av));
374 
375 	return postmaster_forkexec(ac, av);
376 }
377 
378 /*
379  * We need this set from the outside, before InitProcess is called
380  */
381 void
AutovacuumLauncherIAm(void)382 AutovacuumLauncherIAm(void)
383 {
384 	am_autovacuum_launcher = true;
385 }
386 #endif
387 
388 /*
389  * Main entry point for autovacuum launcher process, to be called from the
390  * postmaster.
391  */
392 int
StartAutoVacLauncher(void)393 StartAutoVacLauncher(void)
394 {
395 	pid_t		AutoVacPID;
396 
397 #ifdef EXEC_BACKEND
398 	switch ((AutoVacPID = avlauncher_forkexec()))
399 #else
400 	switch ((AutoVacPID = fork_process()))
401 #endif
402 	{
403 		case -1:
404 			ereport(LOG,
405 					(errmsg("could not fork autovacuum launcher process: %m")));
406 			return 0;
407 
408 #ifndef EXEC_BACKEND
409 		case 0:
410 			/* in postmaster child ... */
411 			InitPostmasterChild();
412 
413 			/* Close the postmaster's sockets */
414 			ClosePostmasterPorts(false);
415 
416 			AutoVacLauncherMain(0, NULL);
417 			break;
418 #endif
419 		default:
420 			return (int) AutoVacPID;
421 	}
422 
423 	/* shouldn't get here */
424 	return 0;
425 }
426 
427 /*
428  * Main loop for the autovacuum launcher process.
 *
 * Installs the launcher's signal handlers, performs backend-style
 * initialization without connecting to any particular database, establishes
 * a sigsetjmp-based error recovery point, forces a handful of settings to
 * safe values, and then loops: sleep on the process latch, react to the
 * SIGHUP/SIGUSR2 flags, and request a new worker when a slot is free and a
 * database is due.
 *
 * argc and argv are not referenced anywhere in the body.  The function does
 * not return; every way out of it ends in proc_exit().
429  */
430 NON_EXEC_STATIC void
AutoVacLauncherMain(int argc,char * argv[])431 AutoVacLauncherMain(int argc, char *argv[])
432 {
433 	sigjmp_buf	local_sigjmp_buf;
434 
435 	am_autovacuum_launcher = true;
436 
437 	/* Identify myself via ps */
438 	init_ps_display("autovacuum launcher process", "", "", "");
439 
440 	ereport(DEBUG1,
441 			(errmsg("autovacuum launcher started")));
442 
	/* PostAuthDelay is scaled by one million here for pg_usleep */
443 	if (PostAuthDelay)
444 		pg_usleep(PostAuthDelay * 1000000L);
445 
446 	SetProcessingMode(InitProcessing);
447 
448 	/*
449 	 * Set up signal handlers.  We operate on databases much like a regular
450 	 * backend, so we use the same signal handling.  See equivalent code in
451 	 * tcop/postgres.c.
452 	 */
453 	pqsignal(SIGHUP, av_sighup_handler);
454 	pqsignal(SIGINT, StatementCancelHandler);
455 	pqsignal(SIGTERM, avl_sigterm_handler);
456 
457 	pqsignal(SIGQUIT, quickdie);
458 	InitializeTimeouts();		/* establishes SIGALRM handler */
459 
460 	pqsignal(SIGPIPE, SIG_IGN);
461 	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
462 	pqsignal(SIGUSR2, avl_sigusr2_handler);
463 	pqsignal(SIGFPE, FloatExceptionHandler);
464 	pqsignal(SIGCHLD, SIG_DFL);
465 
466 	/* Early initialization */
467 	BaseInit();
468 
469 	/*
470 	 * Create a per-backend PGPROC struct in shared memory, except in the
471 	 * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
472 	 * this before we can use LWLocks (and in the EXEC_BACKEND case we already
473 	 * had to do some stuff with LWLocks).
474 	 */
475 #ifndef EXEC_BACKEND
476 	InitProcess();
477 #endif
478 
	/* No database name or OID is supplied: the launcher is not bound to one */
479 	InitPostgres(NULL, InvalidOid, NULL, InvalidOid, NULL);
480 
481 	SetProcessingMode(NormalProcessing);
482 
483 	/*
484 	 * Create a memory context that we will do all our work in.  We do this so
485 	 * that we can reset the context during error recovery and thereby avoid
486 	 * possible memory leaks.
487 	 */
488 	AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
489 										  "Autovacuum Launcher",
490 										  ALLOCSET_DEFAULT_SIZES);
491 	MemoryContextSwitchTo(AutovacMemCxt);
492 
493 	/*
494 	 * If an exception is encountered, processing resumes here.
495 	 *
496 	 * This code is a stripped down version of PostgresMain error recovery.
497 	 */
498 	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
499 	{
500 		/* since not using PG_TRY, must reset error stack by hand */
501 		error_context_stack = NULL;
502 
503 		/* Prevents interrupts while cleaning up */
504 		HOLD_INTERRUPTS();
505 
506 		/* Forget any pending QueryCancel or timeout request */
507 		disable_all_timeouts(false);
508 		QueryCancelPending = false; /* second to avoid race condition */
509 
510 		/* Report the error to the server log */
511 		EmitErrorReport();
512 
513 		/* Abort the current transaction in order to recover */
514 		AbortCurrentTransaction();
515 
516 		/*
517 		 * Release any other resources, for the case where we were not in a
518 		 * transaction.
519 		 */
520 		LWLockReleaseAll();
521 		pgstat_report_wait_end();
522 		AbortBufferIO();
523 		UnlockBuffers();
524 		if (CurrentResourceOwner)
525 		{
526 			ResourceOwnerRelease(CurrentResourceOwner,
527 								 RESOURCE_RELEASE_BEFORE_LOCKS,
528 								 false, true);
529 			/* we needn't bother with the other ResourceOwnerRelease phases */
530 		}
531 		AtEOXact_Buffers(false);
532 		AtEOXact_SMgr();
533 		AtEOXact_Files();
534 		AtEOXact_HashTables(false);
535 
536 		/*
537 		 * Now return to normal top-level context and clear ErrorContext for
538 		 * next time.
539 		 */
540 		MemoryContextSwitchTo(AutovacMemCxt);
541 		FlushErrorState();
542 
543 		/* Flush any leaked data in the top-level context */
544 		MemoryContextResetAndDeleteChildren(AutovacMemCxt);
545 
546 		/* don't leave dangling pointers to freed memory */
547 		DatabaseListCxt = NULL;
548 		dlist_init(&DatabaseList);
549 
550 		/*
551 		 * Make sure pgstat also considers our stat data as gone.  Note: we
552 		 * mustn't use autovac_refresh_stats here.
553 		 */
554 		pgstat_clear_snapshot();
555 
556 		/* Now we can allow interrupts again */
557 		RESUME_INTERRUPTS();
558 
559 		/* if in shutdown mode, no need for anything further; just go away */
560 		if (got_SIGTERM)
561 			goto shutdown;
562 
563 		/*
564 		 * Sleep at least 1 second after any error.  We don't want to be
565 		 * filling the error logs as fast as we can.
566 		 */
567 		pg_usleep(1000000L);
568 	}
569 
570 	/* We can now handle ereport(ERROR) */
571 	PG_exception_stack = &local_sigjmp_buf;
572 
573 	/* must unblock signals before calling rebuild_database_list */
574 	PG_SETMASK(&UnBlockSig);
575 
	/*
	 * Everything from here down to the main loop is executed again after
	 * each recovered error, since control falls through from the recovery
	 * block above.
	 */

576 	/*
577 	 * Set always-secure search path.  Launcher doesn't connect to a database,
578 	 * so this has no effect.
579 	 */
580 	SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE);
581 
582 	/*
583 	 * Force zero_damaged_pages OFF in the autovac process, even if it is set
584 	 * in postgresql.conf.  We don't really want such a dangerous option being
585 	 * applied non-interactively.
586 	 */
587 	SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
588 
589 	/*
590 	 * Force settable timeouts off to avoid letting these settings prevent
591 	 * regular maintenance from being executed.
592 	 */
593 	SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
594 	SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
595 	SetConfigOption("idle_in_transaction_session_timeout", "0",
596 					PGC_SUSET, PGC_S_OVERRIDE);
597 
598 	/*
599 	 * Force default_transaction_isolation to READ COMMITTED.  We don't want
600 	 * to pay the overhead of serializable mode, nor add any risk of causing
601 	 * deadlocks or delaying other transactions.
602 	 */
603 	SetConfigOption("default_transaction_isolation", "read committed",
604 					PGC_SUSET, PGC_S_OVERRIDE);
605 
606 	/*
607 	 * In emergency mode, just start a worker (unless shutdown was requested)
608 	 * and go away.
	 *
	 * The test for emergency mode is simply that AutoVacuumingActive()
	 * reports false even though this launcher process is running.
609 	 */
610 	if (!AutoVacuumingActive())
611 	{
612 		if (!got_SIGTERM)
613 			do_start_worker();
614 		proc_exit(0);			/* done */
615 	}
616 
	/* Advertise our PID in shared memory; it is zeroed again at shutdown */
617 	AutoVacuumShmem->av_launcherpid = MyProcPid;
618 
619 	/*
620 	 * Create the initial database list.  The invariant we want this list to
621 	 * keep is that it's ordered by decreasing next_time.  As soon as an entry
622 	 * is updated to a higher time, it will be moved to the front (which is
623 	 * correct because the only operation is to add autovacuum_naptime to the
624 	 * entry, and time always increases).
625 	 */
626 	rebuild_database_list(InvalidOid);
627 
628 	/* loop until shutdown request */
629 	while (!got_SIGTERM)
630 	{
631 		struct timeval nap;
632 		TimestampTz current_time = 0;
633 		bool		can_launch;
634 		int			rc;
635 
636 		/*
637 		 * This loop is a bit different from the normal use of WaitLatch,
638 		 * because we'd like to sleep before the first launch of a child
639 		 * process.  So it's WaitLatch, then ResetLatch, then check for
640 		 * wakeup conditions.
641 		 */
642 
		/*
		 * The free list is inspected here without taking AutovacuumLock; the
		 * answer only influences how long we sleep, and the same test is
		 * repeated under the lock further down before anything is launched.
		 */
643 		launcher_determine_sleep(!dlist_is_empty(&AutoVacuumShmem->av_freeWorkers),
644 								 false, &nap);
645 
646 		/*
647 		 * Wait until naptime expires or we get some type of signal (all the
648 		 * signal handlers will wake us by calling SetLatch).
		 *
		 * The timeval is folded into a single millisecond count: whole
		 * seconds times 1000 plus microseconds divided by 1000.
649 		 */
650 		rc = WaitLatch(MyLatch,
651 					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
652 					   (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
653 					   WAIT_EVENT_AUTOVACUUM_MAIN);
654 
655 		ResetLatch(MyLatch);
656 
657 		/* Process sinval catchup interrupts that happened while sleeping */
658 		ProcessCatchupInterrupt();
659 
660 		/*
661 		 * Emergency bailout if postmaster has died.  This is to avoid the
662 		 * necessity for manual cleanup of all postmaster children.
663 		 */
664 		if (rc & WL_POSTMASTER_DEATH)
665 			proc_exit(1);
666 
667 		/* the normal shutdown case */
668 		if (got_SIGTERM)
669 			break;
670 
671 		if (got_SIGHUP)
672 		{
673 			got_SIGHUP = false;
674 			ProcessConfigFile(PGC_SIGHUP);
675 
676 			/* shutdown requested in config file? */
677 			if (!AutoVacuumingActive())
678 				break;
679 
680 			/* rebalance in case the default cost parameters changed */
681 			LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
682 			autovac_balance_cost();
683 			LWLockRelease(AutovacuumLock);
684 
685 			/* rebuild the list in case the naptime changed */
686 			rebuild_database_list(InvalidOid);
687 		}
688 
689 		/*
690 		 * a worker finished, or postmaster signalled failure to start a
691 		 * worker
692 		 */
693 		if (got_SIGUSR2)
694 		{
695 			got_SIGUSR2 = false;
696 
697 			/* rebalance cost limits, if needed */
698 			if (AutoVacuumShmem->av_signal[AutoVacRebalance])
699 			{
700 				LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
701 				AutoVacuumShmem->av_signal[AutoVacRebalance] = false;
702 				autovac_balance_cost();
703 				LWLockRelease(AutovacuumLock);
704 			}
705 
706 			if (AutoVacuumShmem->av_signal[AutoVacForkFailed])
707 			{
708 				/*
709 				 * If the postmaster failed to start a new worker, we sleep
710 				 * for a little while and resend the signal.  The new worker's
711 				 * state is still in memory, so this is sufficient.  After
712 				 * that, we restart the main loop.
713 				 *
714 				 * XXX should we put a limit to the number of times we retry?
715 				 * I don't think it makes much sense, because a future start
716 				 * of a worker will continue to fail in the same way.
717 				 */
718 				AutoVacuumShmem->av_signal[AutoVacForkFailed] = false;
719 				pg_usleep(1000000L);	/* 1s */
720 				SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
721 				continue;
722 			}
723 		}
724 
725 		/*
726 		 * There are some conditions that we need to check before trying to
727 		 * start a worker.  First, we need to make sure that there is a worker
728 		 * slot available.  Second, we need to make sure that no other worker
729 		 * failed while starting up.
730 		 */
731 
732 		current_time = GetCurrentTimestamp();
733 		LWLockAcquire(AutovacuumLock, LW_SHARED);
734 
735 		can_launch = !dlist_is_empty(&AutoVacuumShmem->av_freeWorkers);
736 
737 		if (AutoVacuumShmem->av_startingWorker != NULL)
738 		{
739 			int			waittime;
740 			WorkerInfo	worker = AutoVacuumShmem->av_startingWorker;
741 
742 			/*
743 			 * We can't launch another worker when another one is still
744 			 * starting up (or failed while doing so), so just sleep for a bit
745 			 * more; that worker will wake us up again as soon as it's ready.
746 			 * We will only wait autovacuum_naptime seconds (up to a maximum
747 			 * of 60 seconds) for this to happen however.  Note that failure
748 			 * to connect to a particular database is not a problem here,
749 			 * because the worker removes itself from the startingWorker
750 			 * pointer before trying to connect.  Problems detected by the
751 			 * postmaster (like fork() failure) are also reported and handled
752 			 * differently.  The only problems that may cause this code to
753 			 * fire are errors in the earlier sections of AutoVacWorkerMain,
754 			 * before the worker removes the WorkerInfo from the
755 			 * startingWorker pointer.
756 			 */
			/* grace period in milliseconds */
757 			waittime = Min(autovacuum_naptime, 60) * 1000;
758 			if (TimestampDifferenceExceeds(worker->wi_launchtime, current_time,
759 										   waittime))
760 			{
				/* trade the shared lock for an exclusive one; not atomic */
761 				LWLockRelease(AutovacuumLock);
762 				LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
763 
764 				/*
765 				 * No other process can put a worker in starting mode, so if
766 				 * startingWorker is still set (non-NULL) after exchanging our
767 				 * lock, we assume it's the same one we saw above (so we don't
768 				 * recheck the launch time).
769 				 */
770 				if (AutoVacuumShmem->av_startingWorker != NULL)
771 				{
					/* wipe the slot and hand it back to the free list */
772 					worker = AutoVacuumShmem->av_startingWorker;
773 					worker->wi_dboid = InvalidOid;
774 					worker->wi_tableoid = InvalidOid;
775 					worker->wi_sharedrel = false;
776 					worker->wi_proc = NULL;
777 					worker->wi_launchtime = 0;
778 					dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
779 									&worker->wi_links);
780 					AutoVacuumShmem->av_startingWorker = NULL;
781 					elog(WARNING, "worker took too long to start; canceled");
					/*
					 * NOTE(review): can_launch is not recomputed after the
					 * slot is returned, so it still reflects the free list
					 * as it was before the reclaim.
					 */
782 				}
783 			}
784 			else
785 				can_launch = false;	/* still within the grace period */
786 		}
787 		LWLockRelease(AutovacuumLock);	/* either shared or exclusive */
788 
789 		/* if we can't do anything, just go back to sleep */
790 		if (!can_launch)
791 			continue;
792 
793 		/* We're OK to start a new worker */
794 
795 		if (dlist_is_empty(&DatabaseList))
796 		{
797 			/*
798 			 * Special case when the list is empty: start a worker right away.
799 			 * This covers the initial case, when no database is in pgstats
800 			 * (thus the list is empty).  Note that the constraints in
801 			 * launcher_determine_sleep keep us from starting workers too
802 			 * quickly (at most once every autovacuum_naptime when the list is
803 			 * empty).
804 			 */
805 			launch_worker(current_time);
806 		}
807 		else
808 		{
809 			/*
810 			 * because rebuild_database_list constructs a list with most
811 			 * distant adl_next_worker first, we obtain our database from the
812 			 * tail of the list.
813 			 */
814 			avl_dbase  *avdb;
815 
816 			avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
817 
818 			/*
819 			 * launch a worker if next_worker is right now or it is in the
820 			 * past
821 			 */
822 			if (TimestampDifferenceExceeds(avdb->adl_next_worker,
823 										   current_time, 0))
824 				launch_worker(current_time);
825 		}
826 	}
827 
828 	/* Normal exit from the autovac launcher is here */
829 shutdown:
830 	ereport(DEBUG1,
831 			(errmsg("autovacuum launcher shutting down")));
832 	AutoVacuumShmem->av_launcherpid = 0;
833 
834 	proc_exit(0);				/* done */
835 }
836 
837 /*
838  * Determine the time to sleep, based on the database list.
839  *
840  * The "canlaunch" parameter indicates whether we can start a worker right now,
841  * for example due to the workers being all busy.  If this is false, we will
842  * cause a long sleep, which will be interrupted when a worker exits.
843  */
844 static void
launcher_determine_sleep(bool canlaunch,bool recursing,struct timeval * nap)845 launcher_determine_sleep(bool canlaunch, bool recursing, struct timeval *nap)
846 {
847 	/*
848 	 * We sleep until the next scheduled vacuum.  We trust that when the
849 	 * database list was built, care was taken so that no entries have times
850 	 * in the past; if the first entry has too close a next_worker value, or a
851 	 * time in the past, we will sleep a small nominal time.
852 	 */
853 	if (!canlaunch)
854 	{
855 		nap->tv_sec = autovacuum_naptime;
856 		nap->tv_usec = 0;
857 	}
858 	else if (!dlist_is_empty(&DatabaseList))
859 	{
860 		TimestampTz current_time = GetCurrentTimestamp();
861 		TimestampTz next_wakeup;
862 		avl_dbase  *avdb;
863 		long		secs;
864 		int			usecs;
865 
866 		avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
867 
868 		next_wakeup = avdb->adl_next_worker;
869 		TimestampDifference(current_time, next_wakeup, &secs, &usecs);
870 
871 		nap->tv_sec = secs;
872 		nap->tv_usec = usecs;
873 	}
874 	else
875 	{
876 		/* list is empty, sleep for whole autovacuum_naptime seconds  */
877 		nap->tv_sec = autovacuum_naptime;
878 		nap->tv_usec = 0;
879 	}
880 
881 	/*
882 	 * If the result is exactly zero, it means a database had an entry with
883 	 * time in the past.  Rebuild the list so that the databases are evenly
884 	 * distributed again, and recalculate the time to sleep.  This can happen
885 	 * if there are more tables needing vacuum than workers, and they all take
886 	 * longer to vacuum than autovacuum_naptime.
887 	 *
888 	 * We only recurse once.  rebuild_database_list should always return times
889 	 * in the future, but it seems best not to trust too much on that.
890 	 */
891 	if (nap->tv_sec == 0 && nap->tv_usec == 0 && !recursing)
892 	{
893 		rebuild_database_list(InvalidOid);
894 		launcher_determine_sleep(canlaunch, true, nap);
895 		return;
896 	}
897 
898 	/* The smallest time we'll allow the launcher to sleep. */
899 	if (nap->tv_sec <= 0 && nap->tv_usec <= MIN_AUTOVAC_SLEEPTIME * 1000)
900 	{
901 		nap->tv_sec = 0;
902 		nap->tv_usec = MIN_AUTOVAC_SLEEPTIME * 1000;
903 	}
904 
905 	/*
906 	 * If the sleep time is too large, clamp it to an arbitrary maximum (plus
907 	 * any fractional seconds, for simplicity).  This avoids an essentially
908 	 * infinite sleep in strange cases like the system clock going backwards a
909 	 * few years.
910 	 */
911 	if (nap->tv_sec > MAX_AUTOVAC_SLEEPTIME)
912 		nap->tv_sec = MAX_AUTOVAC_SLEEPTIME;
913 }
914 
915 /*
916  * Build an updated DatabaseList.  It must only contain databases that appear
917  * in pgstats, and must be sorted by next_worker from highest to lowest,
918  * distributed regularly across the next autovacuum_naptime interval.
919  *
920  * Receives the Oid of the database that made this list be generated (we call
921  * this the "new" database, because when the database was already present on
922  * the list, we expect that this function is not called at all).  The
923  * preexisting list, if any, will be used to preserve the order of the
924  * databases in the autovacuum_naptime period.  The new database is put at the
925  * end of the interval.  The actual values are not saved, which should not be
926  * much of a problem.
927  */
928 static void
rebuild_database_list(Oid newdb)929 rebuild_database_list(Oid newdb)
930 {
931 	List	   *dblist;
932 	ListCell   *cell;
933 	MemoryContext newcxt;
934 	MemoryContext oldcxt;
935 	MemoryContext tmpcxt;
936 	HASHCTL		hctl;
937 	int			score;
938 	int			nelems;
939 	HTAB	   *dbhash;
940 	dlist_iter	iter;
941 
942 	/* use fresh stats */
943 	autovac_refresh_stats();
944 
945 	newcxt = AllocSetContextCreate(AutovacMemCxt,
946 								   "AV dblist",
947 								   ALLOCSET_DEFAULT_SIZES);
948 	tmpcxt = AllocSetContextCreate(newcxt,
949 								   "tmp AV dblist",
950 								   ALLOCSET_DEFAULT_SIZES);
951 	oldcxt = MemoryContextSwitchTo(tmpcxt);
952 
953 	/*
954 	 * Implementing this is not as simple as it sounds, because we need to put
955 	 * the new database at the end of the list; next the databases that were
956 	 * already on the list, and finally (at the tail of the list) all the
957 	 * other databases that are not on the existing list.
958 	 *
959 	 * To do this, we build an empty hash table of scored databases.  We will
960 	 * start with the lowest score (zero) for the new database, then
961 	 * increasing scores for the databases in the existing list, in order, and
962 	 * lastly increasing scores for all databases gotten via
963 	 * get_database_list() that are not already on the hash.
964 	 *
965 	 * Then we will put all the hash elements into an array, sort the array by
966 	 * score, and finally put the array elements into the new doubly linked
967 	 * list.
968 	 */
969 	hctl.keysize = sizeof(Oid);
970 	hctl.entrysize = sizeof(avl_dbase);
971 	hctl.hcxt = tmpcxt;
972 	dbhash = hash_create("db hash", 20, &hctl,	/* magic number here FIXME */
973 						 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
974 
975 	/* start by inserting the new database */
976 	score = 0;
977 	if (OidIsValid(newdb))
978 	{
979 		avl_dbase  *db;
980 		PgStat_StatDBEntry *entry;
981 
982 		/* only consider this database if it has a pgstat entry */
983 		entry = pgstat_fetch_stat_dbentry(newdb);
984 		if (entry != NULL)
985 		{
986 			/* we assume it isn't found because the hash was just created */
987 			db = hash_search(dbhash, &newdb, HASH_ENTER, NULL);
988 
989 			/* hash_search already filled in the key */
990 			db->adl_score = score++;
991 			/* next_worker is filled in later */
992 		}
993 	}
994 
995 	/* Now insert the databases from the existing list */
996 	dlist_foreach(iter, &DatabaseList)
997 	{
998 		avl_dbase  *avdb = dlist_container(avl_dbase, adl_node, iter.cur);
999 		avl_dbase  *db;
1000 		bool		found;
1001 		PgStat_StatDBEntry *entry;
1002 
1003 		/*
1004 		 * skip databases with no stat entries -- in particular, this gets rid
1005 		 * of dropped databases
1006 		 */
1007 		entry = pgstat_fetch_stat_dbentry(avdb->adl_datid);
1008 		if (entry == NULL)
1009 			continue;
1010 
1011 		db = hash_search(dbhash, &(avdb->adl_datid), HASH_ENTER, &found);
1012 
1013 		if (!found)
1014 		{
1015 			/* hash_search already filled in the key */
1016 			db->adl_score = score++;
1017 			/* next_worker is filled in later */
1018 		}
1019 	}
1020 
1021 	/* finally, insert all qualifying databases not previously inserted */
1022 	dblist = get_database_list();
1023 	foreach(cell, dblist)
1024 	{
1025 		avw_dbase  *avdb = lfirst(cell);
1026 		avl_dbase  *db;
1027 		bool		found;
1028 		PgStat_StatDBEntry *entry;
1029 
1030 		/* only consider databases with a pgstat entry */
1031 		entry = pgstat_fetch_stat_dbentry(avdb->adw_datid);
1032 		if (entry == NULL)
1033 			continue;
1034 
1035 		db = hash_search(dbhash, &(avdb->adw_datid), HASH_ENTER, &found);
1036 		/* only update the score if the database was not already on the hash */
1037 		if (!found)
1038 		{
1039 			/* hash_search already filled in the key */
1040 			db->adl_score = score++;
1041 			/* next_worker is filled in later */
1042 		}
1043 	}
1044 	nelems = score;
1045 
1046 	/* from here on, the allocated memory belongs to the new list */
1047 	MemoryContextSwitchTo(newcxt);
1048 	dlist_init(&DatabaseList);
1049 
1050 	if (nelems > 0)
1051 	{
1052 		TimestampTz current_time;
1053 		int			millis_increment;
1054 		avl_dbase  *dbary;
1055 		avl_dbase  *db;
1056 		HASH_SEQ_STATUS seq;
1057 		int			i;
1058 
1059 		/* put all the hash elements into an array */
1060 		dbary = palloc(nelems * sizeof(avl_dbase));
1061 
1062 		i = 0;
1063 		hash_seq_init(&seq, dbhash);
1064 		while ((db = hash_seq_search(&seq)) != NULL)
1065 			memcpy(&(dbary[i++]), db, sizeof(avl_dbase));
1066 
1067 		/* sort the array */
1068 		qsort(dbary, nelems, sizeof(avl_dbase), db_comparator);
1069 
1070 		/*
1071 		 * Determine the time interval between databases in the schedule. If
1072 		 * we see that the configured naptime would take us to sleep times
1073 		 * lower than our min sleep time (which launcher_determine_sleep is
1074 		 * coded not to allow), silently use a larger naptime (but don't touch
1075 		 * the GUC variable).
1076 		 */
1077 		millis_increment = 1000.0 * autovacuum_naptime / nelems;
1078 		if (millis_increment <= MIN_AUTOVAC_SLEEPTIME)
1079 			millis_increment = MIN_AUTOVAC_SLEEPTIME * 1.1;
1080 
1081 		current_time = GetCurrentTimestamp();
1082 
1083 		/*
1084 		 * move the elements from the array into the dllist, setting the
1085 		 * next_worker while walking the array
1086 		 */
1087 		for (i = 0; i < nelems; i++)
1088 		{
1089 			avl_dbase  *db = &(dbary[i]);
1090 
1091 			current_time = TimestampTzPlusMilliseconds(current_time,
1092 													   millis_increment);
1093 			db->adl_next_worker = current_time;
1094 
1095 			/* later elements should go closer to the head of the list */
1096 			dlist_push_head(&DatabaseList, &db->adl_node);
1097 		}
1098 	}
1099 
1100 	/* all done, clean up memory */
1101 	if (DatabaseListCxt != NULL)
1102 		MemoryContextDelete(DatabaseListCxt);
1103 	MemoryContextDelete(tmpcxt);
1104 	DatabaseListCxt = newcxt;
1105 	MemoryContextSwitchTo(oldcxt);
1106 }
1107 
1108 /* qsort comparator for avl_dbase, using adl_score */
1109 static int
db_comparator(const void * a,const void * b)1110 db_comparator(const void *a, const void *b)
1111 {
1112 	if (((const avl_dbase *) a)->adl_score == ((const avl_dbase *) b)->adl_score)
1113 		return 0;
1114 	else
1115 		return (((const avl_dbase *) a)->adl_score < ((const avl_dbase *) b)->adl_score) ? 1 : -1;
1116 }
1117 
1118 /*
1119  * do_start_worker
1120  *
1121  * Bare-bones procedure for starting an autovacuum worker from the launcher.
1122  * It determines what database to work on, sets up shared memory stuff and
1123  * signals postmaster to start the worker.  It fails gracefully if invoked when
1124  * autovacuum_workers are already active.
1125  *
1126  * Return value is the OID of the database that the worker is going to process,
1127  * or InvalidOid if no worker was actually started.
1128  */
1129 static Oid
do_start_worker(void)1130 do_start_worker(void)
1131 {
1132 	List	   *dblist;
1133 	ListCell   *cell;
1134 	TransactionId xidForceLimit;
1135 	MultiXactId multiForceLimit;
1136 	bool		for_xid_wrap;
1137 	bool		for_multi_wrap;
1138 	avw_dbase  *avdb;
1139 	TimestampTz current_time;
1140 	bool		skipit = false;
1141 	Oid			retval = InvalidOid;
1142 	MemoryContext tmpcxt,
1143 				oldcxt;
1144 
1145 	/* return quickly when there are no free workers */
1146 	LWLockAcquire(AutovacuumLock, LW_SHARED);
1147 	if (dlist_is_empty(&AutoVacuumShmem->av_freeWorkers))
1148 	{
1149 		LWLockRelease(AutovacuumLock);
1150 		return InvalidOid;
1151 	}
1152 	LWLockRelease(AutovacuumLock);
1153 
1154 	/*
1155 	 * Create and switch to a temporary context to avoid leaking the memory
1156 	 * allocated for the database list.
1157 	 */
1158 	tmpcxt = AllocSetContextCreate(CurrentMemoryContext,
1159 								   "Start worker tmp cxt",
1160 								   ALLOCSET_DEFAULT_SIZES);
1161 	oldcxt = MemoryContextSwitchTo(tmpcxt);
1162 
1163 	/* use fresh stats */
1164 	autovac_refresh_stats();
1165 
1166 	/* Get a list of databases */
1167 	dblist = get_database_list();
1168 
1169 	/*
1170 	 * Determine the oldest datfrozenxid/relfrozenxid that we will allow to
1171 	 * pass without forcing a vacuum.  (This limit can be tightened for
1172 	 * particular tables, but not loosened.)
1173 	 */
1174 	recentXid = ReadNewTransactionId();
1175 	xidForceLimit = recentXid - autovacuum_freeze_max_age;
1176 	/* ensure it's a "normal" XID, else TransactionIdPrecedes misbehaves */
1177 	/* this can cause the limit to go backwards by 3, but that's OK */
1178 	if (xidForceLimit < FirstNormalTransactionId)
1179 		xidForceLimit -= FirstNormalTransactionId;
1180 
1181 	/* Also determine the oldest datminmxid we will consider. */
1182 	recentMulti = ReadNextMultiXactId();
1183 	multiForceLimit = recentMulti - MultiXactMemberFreezeThreshold();
1184 	if (multiForceLimit < FirstMultiXactId)
1185 		multiForceLimit -= FirstMultiXactId;
1186 
1187 	/*
1188 	 * Choose a database to connect to.  We pick the database that was least
1189 	 * recently auto-vacuumed, or one that needs vacuuming to prevent Xid
1190 	 * wraparound-related data loss.  If any db at risk of Xid wraparound is
1191 	 * found, we pick the one with oldest datfrozenxid, independently of
1192 	 * autovacuum times; similarly we pick the one with the oldest datminmxid
1193 	 * if any is in MultiXactId wraparound.  Note that those in Xid wraparound
1194 	 * danger are given more priority than those in multi wraparound danger.
1195 	 *
1196 	 * Note that a database with no stats entry is not considered, except for
1197 	 * Xid wraparound purposes.  The theory is that if no one has ever
1198 	 * connected to it since the stats were last initialized, it doesn't need
1199 	 * vacuuming.
1200 	 *
1201 	 * XXX This could be improved if we had more info about whether it needs
1202 	 * vacuuming before connecting to it.  Perhaps look through the pgstats
1203 	 * data for the database's tables?  One idea is to keep track of the
1204 	 * number of new and dead tuples per database in pgstats.  However it
1205 	 * isn't clear how to construct a metric that measures that and not cause
1206 	 * starvation for less busy databases.
1207 	 */
1208 	avdb = NULL;
1209 	for_xid_wrap = false;
1210 	for_multi_wrap = false;
1211 	current_time = GetCurrentTimestamp();
1212 	foreach(cell, dblist)
1213 	{
1214 		avw_dbase  *tmp = lfirst(cell);
1215 		dlist_iter	iter;
1216 
1217 		/* Check to see if this one is at risk of wraparound */
1218 		if (TransactionIdPrecedes(tmp->adw_frozenxid, xidForceLimit))
1219 		{
1220 			if (avdb == NULL ||
1221 				TransactionIdPrecedes(tmp->adw_frozenxid,
1222 									  avdb->adw_frozenxid))
1223 				avdb = tmp;
1224 			for_xid_wrap = true;
1225 			continue;
1226 		}
1227 		else if (for_xid_wrap)
1228 			continue;			/* ignore not-at-risk DBs */
1229 		else if (MultiXactIdPrecedes(tmp->adw_minmulti, multiForceLimit))
1230 		{
1231 			if (avdb == NULL ||
1232 				MultiXactIdPrecedes(tmp->adw_minmulti, avdb->adw_minmulti))
1233 				avdb = tmp;
1234 			for_multi_wrap = true;
1235 			continue;
1236 		}
1237 		else if (for_multi_wrap)
1238 			continue;			/* ignore not-at-risk DBs */
1239 
1240 		/* Find pgstat entry if any */
1241 		tmp->adw_entry = pgstat_fetch_stat_dbentry(tmp->adw_datid);
1242 
1243 		/*
1244 		 * Skip a database with no pgstat entry; it means it hasn't seen any
1245 		 * activity.
1246 		 */
1247 		if (!tmp->adw_entry)
1248 			continue;
1249 
1250 		/*
1251 		 * Also, skip a database that appears on the database list as having
1252 		 * been processed recently (less than autovacuum_naptime seconds ago).
1253 		 * We do this so that we don't select a database which we just
1254 		 * selected, but that pgstat hasn't gotten around to updating the last
1255 		 * autovacuum time yet.
1256 		 */
1257 		skipit = false;
1258 
1259 		dlist_reverse_foreach(iter, &DatabaseList)
1260 		{
1261 			avl_dbase  *dbp = dlist_container(avl_dbase, adl_node, iter.cur);
1262 
1263 			if (dbp->adl_datid == tmp->adw_datid)
1264 			{
1265 				/*
1266 				 * Skip this database if its next_worker value falls between
1267 				 * the current time and the current time plus naptime.
1268 				 */
1269 				if (!TimestampDifferenceExceeds(dbp->adl_next_worker,
1270 												current_time, 0) &&
1271 					!TimestampDifferenceExceeds(current_time,
1272 												dbp->adl_next_worker,
1273 												autovacuum_naptime * 1000))
1274 					skipit = true;
1275 
1276 				break;
1277 			}
1278 		}
1279 		if (skipit)
1280 			continue;
1281 
1282 		/*
1283 		 * Remember the db with oldest autovac time.  (If we are here, both
1284 		 * tmp->entry and db->entry must be non-null.)
1285 		 */
1286 		if (avdb == NULL ||
1287 			tmp->adw_entry->last_autovac_time < avdb->adw_entry->last_autovac_time)
1288 			avdb = tmp;
1289 	}
1290 
1291 	/* Found a database -- process it */
1292 	if (avdb != NULL)
1293 	{
1294 		WorkerInfo	worker;
1295 		dlist_node *wptr;
1296 
1297 		LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1298 
1299 		/*
1300 		 * Get a worker entry from the freelist.  We checked above, so there
1301 		 * really should be a free slot.
1302 		 */
1303 		wptr = dlist_pop_head_node(&AutoVacuumShmem->av_freeWorkers);
1304 
1305 		worker = dlist_container(WorkerInfoData, wi_links, wptr);
1306 		worker->wi_dboid = avdb->adw_datid;
1307 		worker->wi_proc = NULL;
1308 		worker->wi_launchtime = GetCurrentTimestamp();
1309 
1310 		AutoVacuumShmem->av_startingWorker = worker;
1311 
1312 		LWLockRelease(AutovacuumLock);
1313 
1314 		SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
1315 
1316 		retval = avdb->adw_datid;
1317 	}
1318 	else if (skipit)
1319 	{
1320 		/*
1321 		 * If we skipped all databases on the list, rebuild it, because it
1322 		 * probably contains a dropped database.
1323 		 */
1324 		rebuild_database_list(InvalidOid);
1325 	}
1326 
1327 	MemoryContextSwitchTo(oldcxt);
1328 	MemoryContextDelete(tmpcxt);
1329 
1330 	return retval;
1331 }
1332 
1333 /*
1334  * launch_worker
1335  *
1336  * Wrapper for starting a worker from the launcher.  Besides actually starting
1337  * it, update the database list to reflect the next time that another one will
1338  * need to be started on the selected database.  The actual database choice is
1339  * left to do_start_worker.
1340  *
1341  * This routine is also expected to insert an entry into the database list if
1342  * the selected database was previously absent from the list.
1343  */
1344 static void
launch_worker(TimestampTz now)1345 launch_worker(TimestampTz now)
1346 {
1347 	Oid			dbid;
1348 	dlist_iter	iter;
1349 
1350 	dbid = do_start_worker();
1351 	if (OidIsValid(dbid))
1352 	{
1353 		bool		found = false;
1354 
1355 		/*
1356 		 * Walk the database list and update the corresponding entry.  If the
1357 		 * database is not on the list, we'll recreate the list.
1358 		 */
1359 		dlist_foreach(iter, &DatabaseList)
1360 		{
1361 			avl_dbase  *avdb = dlist_container(avl_dbase, adl_node, iter.cur);
1362 
1363 			if (avdb->adl_datid == dbid)
1364 			{
1365 				found = true;
1366 
1367 				/*
1368 				 * add autovacuum_naptime seconds to the current time, and use
1369 				 * that as the new "next_worker" field for this database.
1370 				 */
1371 				avdb->adl_next_worker =
1372 					TimestampTzPlusMilliseconds(now, autovacuum_naptime * 1000);
1373 
1374 				dlist_move_head(&DatabaseList, iter.cur);
1375 				break;
1376 			}
1377 		}
1378 
1379 		/*
1380 		 * If the database was not present in the database list, we rebuild
1381 		 * the list.  It's possible that the database does not get into the
1382 		 * list anyway, for example if it's a database that doesn't have a
1383 		 * pgstat entry, but this is not a problem because we don't want to
1384 		 * schedule workers regularly into those in any case.
1385 		 */
1386 		if (!found)
1387 			rebuild_database_list(dbid);
1388 	}
1389 }
1390 
/*
 * AutoVacWorkerFailed
 *		Note in shared memory that forking a worker process failed.
 *
 * Called from postmaster to signal a failure to fork a process to become
 * worker.  All this does is set the AutoVacForkFailed entry of the shared
 * av_signal array; no lock is taken here.  The postmaster should
 * kill(SIGUSR2) the launcher shortly after calling this function.
 */
void
AutoVacWorkerFailed(void)
{
	AutoVacuumShmem->av_signal[AutoVacForkFailed] = true;
}
1401 
1402 /* SIGHUP: set flag to re-read config file at next convenient time */
1403 static void
av_sighup_handler(SIGNAL_ARGS)1404 av_sighup_handler(SIGNAL_ARGS)
1405 {
1406 	int			save_errno = errno;
1407 
1408 	got_SIGHUP = true;
1409 	SetLatch(MyLatch);
1410 
1411 	errno = save_errno;
1412 }
1413 
1414 /* SIGUSR2: a worker is up and running, or just finished, or failed to fork */
1415 static void
avl_sigusr2_handler(SIGNAL_ARGS)1416 avl_sigusr2_handler(SIGNAL_ARGS)
1417 {
1418 	int			save_errno = errno;
1419 
1420 	got_SIGUSR2 = true;
1421 	SetLatch(MyLatch);
1422 
1423 	errno = save_errno;
1424 }
1425 
1426 /* SIGTERM: time to die */
1427 static void
avl_sigterm_handler(SIGNAL_ARGS)1428 avl_sigterm_handler(SIGNAL_ARGS)
1429 {
1430 	int			save_errno = errno;
1431 
1432 	got_SIGTERM = true;
1433 	SetLatch(MyLatch);
1434 
1435 	errno = save_errno;
1436 }
1437 
1438 
1439 /********************************************************************
1440  *					  AUTOVACUUM WORKER CODE
1441  ********************************************************************/
1442 
1443 #ifdef EXEC_BACKEND
/*
 * forkexec routine for the autovacuum worker.
 *
 * Builds the argument vector and hands it to postmaster_forkexec, whose
 * result is returned unchanged.
 */
static pid_t
avworker_forkexec(void)
{
	char	   *args[10];
	int			nargs;

	args[0] = "postgres";
	args[1] = "--forkavworker";
	args[2] = NULL;				/* filled in by postmaster_forkexec */
	nargs = 3;

	/* terminate the vector */
	args[nargs] = NULL;

	Assert(nargs < lengthof(args));

	return postmaster_forkexec(nargs, args);
}
1464 
/*
 * AutovacuumWorkerIAm
 *		Flag the current process as an autovacuum worker.
 *
 * We need this set from the outside, before InitProcess is called.  It does
 * nothing but set am_autovacuum_worker.
 */
void
AutovacuumWorkerIAm(void)
{
	am_autovacuum_worker = true;
}
1473 #endif
1474 
1475 /*
1476  * Main entry point for autovacuum worker process.
1477  *
1478  * This code is heavily based on pgarch.c, q.v.
1479  */
1480 int
StartAutoVacWorker(void)1481 StartAutoVacWorker(void)
1482 {
1483 	pid_t		worker_pid;
1484 
1485 #ifdef EXEC_BACKEND
1486 	switch ((worker_pid = avworker_forkexec()))
1487 #else
1488 	switch ((worker_pid = fork_process()))
1489 #endif
1490 	{
1491 		case -1:
1492 			ereport(LOG,
1493 					(errmsg("could not fork autovacuum worker process: %m")));
1494 			return 0;
1495 
1496 #ifndef EXEC_BACKEND
1497 		case 0:
1498 			/* in postmaster child ... */
1499 			InitPostmasterChild();
1500 
1501 			/* Close the postmaster's sockets */
1502 			ClosePostmasterPorts(false);
1503 
1504 			AutoVacWorkerMain(0, NULL);
1505 			break;
1506 #endif
1507 		default:
1508 			return (int) worker_pid;
1509 	}
1510 
1511 	/* shouldn't get here */
1512 	return 0;
1513 }
1514 
1515 /*
1516  * AutoVacWorkerMain
1517  */
1518 NON_EXEC_STATIC void
AutoVacWorkerMain(int argc,char * argv[])1519 AutoVacWorkerMain(int argc, char *argv[])
1520 {
1521 	sigjmp_buf	local_sigjmp_buf;
1522 	Oid			dbid;
1523 
1524 	am_autovacuum_worker = true;
1525 
1526 	/* Identify myself via ps */
1527 	init_ps_display("autovacuum worker process", "", "", "");
1528 
1529 	SetProcessingMode(InitProcessing);
1530 
1531 	/*
1532 	 * Set up signal handlers.  We operate on databases much like a regular
1533 	 * backend, so we use the same signal handling.  See equivalent code in
1534 	 * tcop/postgres.c.
1535 	 */
1536 	pqsignal(SIGHUP, av_sighup_handler);
1537 
1538 	/*
1539 	 * SIGINT is used to signal canceling the current table's vacuum; SIGTERM
1540 	 * means abort and exit cleanly, and SIGQUIT means abandon ship.
1541 	 */
1542 	pqsignal(SIGINT, StatementCancelHandler);
1543 	pqsignal(SIGTERM, die);
1544 	pqsignal(SIGQUIT, quickdie);
1545 	InitializeTimeouts();		/* establishes SIGALRM handler */
1546 
1547 	pqsignal(SIGPIPE, SIG_IGN);
1548 	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
1549 	pqsignal(SIGUSR2, SIG_IGN);
1550 	pqsignal(SIGFPE, FloatExceptionHandler);
1551 	pqsignal(SIGCHLD, SIG_DFL);
1552 
1553 	/* Early initialization */
1554 	BaseInit();
1555 
1556 	/*
1557 	 * Create a per-backend PGPROC struct in shared memory, except in the
1558 	 * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
1559 	 * this before we can use LWLocks (and in the EXEC_BACKEND case we already
1560 	 * had to do some stuff with LWLocks).
1561 	 */
1562 #ifndef EXEC_BACKEND
1563 	InitProcess();
1564 #endif
1565 
1566 	/*
1567 	 * If an exception is encountered, processing resumes here.
1568 	 *
1569 	 * See notes in postgres.c about the design of this coding.
1570 	 */
1571 	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
1572 	{
1573 		/* since not using PG_TRY, must reset error stack by hand */
1574 		error_context_stack = NULL;
1575 
1576 		/* Prevents interrupts while cleaning up */
1577 		HOLD_INTERRUPTS();
1578 
1579 		/* Report the error to the server log */
1580 		EmitErrorReport();
1581 
1582 		/*
1583 		 * We can now go away.  Note that because we called InitProcess, a
1584 		 * callback was registered to do ProcKill, which will clean up
1585 		 * necessary state.
1586 		 */
1587 		proc_exit(0);
1588 	}
1589 
1590 	/* We can now handle ereport(ERROR) */
1591 	PG_exception_stack = &local_sigjmp_buf;
1592 
1593 	PG_SETMASK(&UnBlockSig);
1594 
1595 	/*
1596 	 * Set always-secure search path, so malicious users can't redirect user
1597 	 * code (e.g. pg_index.indexprs).  (That code runs in a
1598 	 * SECURITY_RESTRICTED_OPERATION sandbox, so malicious users could not
1599 	 * take control of the entire autovacuum worker in any case.)
1600 	 */
1601 	SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE);
1602 
1603 	/*
1604 	 * Force zero_damaged_pages OFF in the autovac process, even if it is set
1605 	 * in postgresql.conf.  We don't really want such a dangerous option being
1606 	 * applied non-interactively.
1607 	 */
1608 	SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
1609 
1610 	/*
1611 	 * Force settable timeouts off to avoid letting these settings prevent
1612 	 * regular maintenance from being executed.
1613 	 */
1614 	SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
1615 	SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
1616 	SetConfigOption("idle_in_transaction_session_timeout", "0",
1617 					PGC_SUSET, PGC_S_OVERRIDE);
1618 
1619 	/*
1620 	 * Force default_transaction_isolation to READ COMMITTED.  We don't want
1621 	 * to pay the overhead of serializable mode, nor add any risk of causing
1622 	 * deadlocks or delaying other transactions.
1623 	 */
1624 	SetConfigOption("default_transaction_isolation", "read committed",
1625 					PGC_SUSET, PGC_S_OVERRIDE);
1626 
1627 	/*
1628 	 * Force synchronous replication off to allow regular maintenance even if
1629 	 * we are waiting for standbys to connect. This is important to ensure we
1630 	 * aren't blocked from performing anti-wraparound tasks.
1631 	 */
1632 	if (synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH)
1633 		SetConfigOption("synchronous_commit", "local",
1634 						PGC_SUSET, PGC_S_OVERRIDE);
1635 
1636 	/*
1637 	 * Get the info about the database we're going to work on.
1638 	 */
1639 	LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1640 
1641 	/*
1642 	 * beware of startingWorker being INVALID; this should normally not
1643 	 * happen, but if a worker fails after forking and before this, the
1644 	 * launcher might have decided to remove it from the queue and start
1645 	 * again.
1646 	 */
1647 	if (AutoVacuumShmem->av_startingWorker != NULL)
1648 	{
1649 		MyWorkerInfo = AutoVacuumShmem->av_startingWorker;
1650 		dbid = MyWorkerInfo->wi_dboid;
1651 		MyWorkerInfo->wi_proc = MyProc;
1652 
1653 		/* insert into the running list */
1654 		dlist_push_head(&AutoVacuumShmem->av_runningWorkers,
1655 						&MyWorkerInfo->wi_links);
1656 
1657 		/*
1658 		 * remove from the "starting" pointer, so that the launcher can start
1659 		 * a new worker if required
1660 		 */
1661 		AutoVacuumShmem->av_startingWorker = NULL;
1662 		LWLockRelease(AutovacuumLock);
1663 
1664 		on_shmem_exit(FreeWorkerInfo, 0);
1665 
1666 		/* wake up the launcher */
1667 		if (AutoVacuumShmem->av_launcherpid != 0)
1668 			kill(AutoVacuumShmem->av_launcherpid, SIGUSR2);
1669 	}
1670 	else
1671 	{
1672 		/* no worker entry for me, go away */
1673 		elog(WARNING, "autovacuum worker started without a worker entry");
1674 		dbid = InvalidOid;
1675 		LWLockRelease(AutovacuumLock);
1676 	}
1677 
1678 	if (OidIsValid(dbid))
1679 	{
1680 		char		dbname[NAMEDATALEN];
1681 
1682 		/*
1683 		 * Report autovac startup to the stats collector.  We deliberately do
1684 		 * this before InitPostgres, so that the last_autovac_time will get
1685 		 * updated even if the connection attempt fails.  This is to prevent
1686 		 * autovac from getting "stuck" repeatedly selecting an unopenable
1687 		 * database, rather than making any progress on stuff it can connect
1688 		 * to.
1689 		 */
1690 		pgstat_report_autovac(dbid);
1691 
1692 		/*
1693 		 * Connect to the selected database
1694 		 *
1695 		 * Note: if we have selected a just-deleted database (due to using
1696 		 * stale stats info), we'll fail and exit here.
1697 		 */
1698 		InitPostgres(NULL, dbid, NULL, InvalidOid, dbname);
1699 		SetProcessingMode(NormalProcessing);
1700 		set_ps_display(dbname, false);
1701 		ereport(DEBUG1,
1702 				(errmsg("autovacuum: processing database \"%s\"", dbname)));
1703 
1704 		if (PostAuthDelay)
1705 			pg_usleep(PostAuthDelay * 1000000L);
1706 
1707 		/* And do an appropriate amount of work */
1708 		recentXid = ReadNewTransactionId();
1709 		recentMulti = ReadNextMultiXactId();
1710 		do_autovacuum();
1711 	}
1712 
1713 	/*
1714 	 * The launcher will be notified of my death in ProcKill, *if* we managed
1715 	 * to get a worker slot at all
1716 	 */
1717 
1718 	/* All done, go away */
1719 	proc_exit(0);
1720 }
1721 
1722 /*
1723  * Return a WorkerInfo to the free list
1724  */
1725 static void
FreeWorkerInfo(int code,Datum arg)1726 FreeWorkerInfo(int code, Datum arg)
1727 {
1728 	if (MyWorkerInfo != NULL)
1729 	{
1730 		LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1731 
1732 		/*
1733 		 * Wake the launcher up so that he can launch a new worker immediately
1734 		 * if required.  We only save the launcher's PID in local memory here;
1735 		 * the actual signal will be sent when the PGPROC is recycled.  Note
1736 		 * that we always do this, so that the launcher can rebalance the cost
1737 		 * limit setting of the remaining workers.
1738 		 *
1739 		 * We somewhat ignore the risk that the launcher changes its PID
1740 		 * between us reading it and the actual kill; we expect ProcKill to be
1741 		 * called shortly after us, and we assume that PIDs are not reused too
1742 		 * quickly after a process exits.
1743 		 */
1744 		AutovacuumLauncherPid = AutoVacuumShmem->av_launcherpid;
1745 
1746 		dlist_delete(&MyWorkerInfo->wi_links);
1747 		MyWorkerInfo->wi_dboid = InvalidOid;
1748 		MyWorkerInfo->wi_tableoid = InvalidOid;
1749 		MyWorkerInfo->wi_sharedrel = false;
1750 		MyWorkerInfo->wi_proc = NULL;
1751 		MyWorkerInfo->wi_launchtime = 0;
1752 		MyWorkerInfo->wi_dobalance = false;
1753 		MyWorkerInfo->wi_cost_delay = 0;
1754 		MyWorkerInfo->wi_cost_limit = 0;
1755 		MyWorkerInfo->wi_cost_limit_base = 0;
1756 		dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
1757 						&MyWorkerInfo->wi_links);
1758 		/* not mine anymore */
1759 		MyWorkerInfo = NULL;
1760 
1761 		/*
1762 		 * now that we're inactive, cause a rebalancing of the surviving
1763 		 * workers
1764 		 */
1765 		AutoVacuumShmem->av_signal[AutoVacRebalance] = true;
1766 		LWLockRelease(AutovacuumLock);
1767 	}
1768 }
1769 
1770 /*
1771  * Update the cost-based delay parameters, so that multiple workers consume
1772  * each a fraction of the total available I/O.
1773  */
1774 void
AutoVacuumUpdateDelay(void)1775 AutoVacuumUpdateDelay(void)
1776 {
1777 	if (MyWorkerInfo)
1778 	{
1779 		VacuumCostDelay = MyWorkerInfo->wi_cost_delay;
1780 		VacuumCostLimit = MyWorkerInfo->wi_cost_limit;
1781 	}
1782 }
1783 
1784 /*
1785  * autovac_balance_cost
1786  *		Recalculate the cost limit setting for each active worker.
1787  *
1788  * Caller must hold the AutovacuumLock in exclusive mode.
1789  */
1790 static void
autovac_balance_cost(void)1791 autovac_balance_cost(void)
1792 {
1793 	/*
1794 	 * The idea here is that we ration out I/O equally.  The amount of I/O
1795 	 * that a worker can consume is determined by cost_limit/cost_delay, so we
1796 	 * try to equalize those ratios rather than the raw limit settings.
1797 	 *
1798 	 * note: in cost_limit, zero also means use value from elsewhere, because
1799 	 * zero is not a valid value.
1800 	 */
1801 	int			vac_cost_limit = (autovacuum_vac_cost_limit > 0 ?
1802 								  autovacuum_vac_cost_limit : VacuumCostLimit);
1803 	int			vac_cost_delay = (autovacuum_vac_cost_delay >= 0 ?
1804 								  autovacuum_vac_cost_delay : VacuumCostDelay);
1805 	double		cost_total;
1806 	double		cost_avail;
1807 	dlist_iter	iter;
1808 
1809 	/* not set? nothing to do */
1810 	if (vac_cost_limit <= 0 || vac_cost_delay <= 0)
1811 		return;
1812 
1813 	/* calculate the total base cost limit of participating active workers */
1814 	cost_total = 0.0;
1815 	dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
1816 	{
1817 		WorkerInfo	worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
1818 
1819 		if (worker->wi_proc != NULL &&
1820 			worker->wi_dobalance &&
1821 			worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
1822 			cost_total +=
1823 				(double) worker->wi_cost_limit_base / worker->wi_cost_delay;
1824 	}
1825 
1826 	/* there are no cost limits -- nothing to do */
1827 	if (cost_total <= 0)
1828 		return;
1829 
1830 	/*
1831 	 * Adjust cost limit of each active worker to balance the total of cost
1832 	 * limit to autovacuum_vacuum_cost_limit.
1833 	 */
1834 	cost_avail = (double) vac_cost_limit / vac_cost_delay;
1835 	dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
1836 	{
1837 		WorkerInfo	worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
1838 
1839 		if (worker->wi_proc != NULL &&
1840 			worker->wi_dobalance &&
1841 			worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
1842 		{
1843 			int			limit = (int)
1844 			(cost_avail * worker->wi_cost_limit_base / cost_total);
1845 
1846 			/*
1847 			 * We put a lower bound of 1 on the cost_limit, to avoid division-
1848 			 * by-zero in the vacuum code.  Also, in case of roundoff trouble
1849 			 * in these calculations, let's be sure we don't ever set
1850 			 * cost_limit to more than the base value.
1851 			 */
1852 			worker->wi_cost_limit = Max(Min(limit,
1853 											worker->wi_cost_limit_base),
1854 										1);
1855 		}
1856 
1857 		if (worker->wi_proc != NULL)
1858 			elog(DEBUG2, "autovac_balance_cost(pid=%u db=%u, rel=%u, dobalance=%s cost_limit=%d, cost_limit_base=%d, cost_delay=%d)",
1859 				 worker->wi_proc->pid, worker->wi_dboid, worker->wi_tableoid,
1860 				 worker->wi_dobalance ? "yes" : "no",
1861 				 worker->wi_cost_limit, worker->wi_cost_limit_base,
1862 				 worker->wi_cost_delay);
1863 	}
1864 }
1865 
1866 /*
1867  * get_database_list
1868  *		Return a list of all databases found in pg_database.
1869  *
1870  * The list and associated data is allocated in the caller's memory context,
1871  * which is in charge of ensuring that it's properly cleaned up afterwards.
1872  *
1873  * Note: this is the only function in which the autovacuum launcher uses a
1874  * transaction.  Although we aren't attached to any particular database and
1875  * therefore can't access most catalogs, we do have enough infrastructure
1876  * to do a seqscan on pg_database.
1877  */
1878 static List *
get_database_list(void)1879 get_database_list(void)
1880 {
1881 	List	   *dblist = NIL;
1882 	Relation	rel;
1883 	HeapScanDesc scan;
1884 	HeapTuple	tup;
1885 	MemoryContext resultcxt;
1886 
1887 	/* This is the context that we will allocate our output data in */
1888 	resultcxt = CurrentMemoryContext;
1889 
1890 	/*
1891 	 * Start a transaction so we can access pg_database, and get a snapshot.
1892 	 * We don't have a use for the snapshot itself, but we're interested in
1893 	 * the secondary effect that it sets RecentGlobalXmin.  (This is critical
1894 	 * for anything that reads heap pages, because HOT may decide to prune
1895 	 * them even if the process doesn't attempt to modify any tuples.)
1896 	 */
1897 	StartTransactionCommand();
1898 	(void) GetTransactionSnapshot();
1899 
1900 	rel = heap_open(DatabaseRelationId, AccessShareLock);
1901 	scan = heap_beginscan_catalog(rel, 0, NULL);
1902 
1903 	while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
1904 	{
1905 		Form_pg_database pgdatabase = (Form_pg_database) GETSTRUCT(tup);
1906 		avw_dbase  *avdb;
1907 		MemoryContext oldcxt;
1908 
1909 		/*
1910 		 * Allocate our results in the caller's context, not the
1911 		 * transaction's. We do this inside the loop, and restore the original
1912 		 * context at the end, so that leaky things like heap_getnext() are
1913 		 * not called in a potentially long-lived context.
1914 		 */
1915 		oldcxt = MemoryContextSwitchTo(resultcxt);
1916 
1917 		avdb = (avw_dbase *) palloc(sizeof(avw_dbase));
1918 
1919 		avdb->adw_datid = HeapTupleGetOid(tup);
1920 		avdb->adw_name = pstrdup(NameStr(pgdatabase->datname));
1921 		avdb->adw_frozenxid = pgdatabase->datfrozenxid;
1922 		avdb->adw_minmulti = pgdatabase->datminmxid;
1923 		/* this gets set later: */
1924 		avdb->adw_entry = NULL;
1925 
1926 		dblist = lappend(dblist, avdb);
1927 		MemoryContextSwitchTo(oldcxt);
1928 	}
1929 
1930 	heap_endscan(scan);
1931 	heap_close(rel, AccessShareLock);
1932 
1933 	CommitTransactionCommand();
1934 
1935 	return dblist;
1936 }
1937 
1938 /*
1939  * Process a database table-by-table
1940  *
1941  * Note that CHECK_FOR_INTERRUPTS is supposed to be used in certain spots in
1942  * order not to ignore shutdown commands for too long.
1943  */
1944 static void
do_autovacuum(void)1945 do_autovacuum(void)
1946 {
1947 	Relation	classRel;
1948 	HeapTuple	tuple;
1949 	HeapScanDesc relScan;
1950 	Form_pg_database dbForm;
1951 	List	   *table_oids = NIL;
1952 	List	   *orphan_oids = NIL;
1953 	HASHCTL		ctl;
1954 	HTAB	   *table_toast_map;
1955 	ListCell   *volatile cell;
1956 	PgStat_StatDBEntry *shared;
1957 	PgStat_StatDBEntry *dbentry;
1958 	BufferAccessStrategy bstrategy;
1959 	ScanKeyData key;
1960 	TupleDesc	pg_class_desc;
1961 	int			effective_multixact_freeze_max_age;
1962 	bool		did_vacuum = false;
1963 	bool		found_concurrent_worker = false;
1964 	int			i;
1965 
1966 	/*
1967 	 * StartTransactionCommand and CommitTransactionCommand will automatically
1968 	 * switch to other contexts.  We need this one to keep the list of
1969 	 * relations to vacuum/analyze across transactions.
1970 	 */
1971 	AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
1972 										  "AV worker",
1973 										  ALLOCSET_DEFAULT_SIZES);
1974 	MemoryContextSwitchTo(AutovacMemCxt);
1975 
1976 	/*
1977 	 * may be NULL if we couldn't find an entry (only happens if we are
1978 	 * forcing a vacuum for anti-wrap purposes).
1979 	 */
1980 	dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
1981 
1982 	/* Start a transaction so our commands have one to play into. */
1983 	StartTransactionCommand();
1984 
1985 	/*
1986 	 * Clean up any dead statistics collector entries for this DB. We always
1987 	 * want to do this exactly once per DB-processing cycle, even if we find
1988 	 * nothing worth vacuuming in the database.
1989 	 */
1990 	pgstat_vacuum_stat();
1991 
1992 	/*
1993 	 * Compute the multixact age for which freezing is urgent.  This is
1994 	 * normally autovacuum_multixact_freeze_max_age, but may be less if we are
1995 	 * short of multixact member space.
1996 	 */
1997 	effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold();
1998 
1999 	/*
2000 	 * Find the pg_database entry and select the default freeze ages. We use
2001 	 * zero in template and nonconnectable databases, else the system-wide
2002 	 * default.
2003 	 */
2004 	tuple = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
2005 	if (!HeapTupleIsValid(tuple))
2006 		elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
2007 	dbForm = (Form_pg_database) GETSTRUCT(tuple);
2008 
2009 	if (dbForm->datistemplate || !dbForm->datallowconn)
2010 	{
2011 		default_freeze_min_age = 0;
2012 		default_freeze_table_age = 0;
2013 		default_multixact_freeze_min_age = 0;
2014 		default_multixact_freeze_table_age = 0;
2015 	}
2016 	else
2017 	{
2018 		default_freeze_min_age = vacuum_freeze_min_age;
2019 		default_freeze_table_age = vacuum_freeze_table_age;
2020 		default_multixact_freeze_min_age = vacuum_multixact_freeze_min_age;
2021 		default_multixact_freeze_table_age = vacuum_multixact_freeze_table_age;
2022 	}
2023 
2024 	ReleaseSysCache(tuple);
2025 
2026 	/* StartTransactionCommand changed elsewhere */
2027 	MemoryContextSwitchTo(AutovacMemCxt);
2028 
2029 	/* The database hash where pgstat keeps shared relations */
2030 	shared = pgstat_fetch_stat_dbentry(InvalidOid);
2031 
2032 	classRel = heap_open(RelationRelationId, AccessShareLock);
2033 
2034 	/* create a copy so we can use it after closing pg_class */
2035 	pg_class_desc = CreateTupleDescCopy(RelationGetDescr(classRel));
2036 
2037 	/* create hash table for toast <-> main relid mapping */
2038 	MemSet(&ctl, 0, sizeof(ctl));
2039 	ctl.keysize = sizeof(Oid);
2040 	ctl.entrysize = sizeof(av_relation);
2041 
2042 	table_toast_map = hash_create("TOAST to main relid map",
2043 								  100,
2044 								  &ctl,
2045 								  HASH_ELEM | HASH_BLOBS);
2046 
2047 	/*
2048 	 * Scan pg_class to determine which tables to vacuum.
2049 	 *
2050 	 * We do this in two passes: on the first one we collect the list of plain
2051 	 * relations and materialized views, and on the second one we collect
2052 	 * TOAST tables. The reason for doing the second pass is that during it we
2053 	 * want to use the main relation's pg_class.reloptions entry if the TOAST
2054 	 * table does not have any, and we cannot obtain it unless we know
2055 	 * beforehand what's the main table OID.
2056 	 *
2057 	 * We need to check TOAST tables separately because in cases with short,
2058 	 * wide tables there might be proportionally much more activity in the
2059 	 * TOAST table than in its parent.
2060 	 */
2061 	relScan = heap_beginscan_catalog(classRel, 0, NULL);
2062 
2063 	/*
2064 	 * On the first pass, we collect main tables to vacuum, and also the main
2065 	 * table relid to TOAST relid mapping.
2066 	 */
2067 	while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
2068 	{
2069 		Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
2070 		PgStat_StatTabEntry *tabentry;
2071 		AutoVacOpts *relopts;
2072 		Oid			relid;
2073 		bool		dovacuum;
2074 		bool		doanalyze;
2075 		bool		wraparound;
2076 
2077 		if (classForm->relkind != RELKIND_RELATION &&
2078 			classForm->relkind != RELKIND_MATVIEW)
2079 			continue;
2080 
2081 		relid = HeapTupleGetOid(tuple);
2082 
2083 		/*
2084 		 * Check if it is a temp table (presumably, of some other backend's).
2085 		 * We cannot safely process other backends' temp tables.
2086 		 */
2087 		if (classForm->relpersistence == RELPERSISTENCE_TEMP)
2088 		{
2089 			int			backendID;
2090 			PGPROC	   *proc;
2091 
2092 			backendID = GetTempNamespaceBackendId(classForm->relnamespace);
2093 
2094 			/*
2095 			 * We just ignore it if the owning backend is still active in the
2096 			 * same database.
2097 			 */
2098 			if (backendID != InvalidBackendId &&
2099 				(backendID == MyBackendId ||
2100 				 (proc = BackendIdGetProc(backendID)) == NULL ||
2101 				 proc->databaseId != MyDatabaseId))
2102 			{
2103 				/*
2104 				 * The table seems to be orphaned -- although it might be that
2105 				 * the owning backend has already deleted it and exited; our
2106 				 * pg_class scan snapshot is not necessarily up-to-date
2107 				 * anymore, so we could be looking at a committed-dead entry.
2108 				 * Remember it so we can try to delete it later.
2109 				 */
2110 				orphan_oids = lappend_oid(orphan_oids, relid);
2111 			}
2112 			continue;
2113 		}
2114 
2115 		/* Fetch reloptions and the pgstat entry for this table */
2116 		relopts = extract_autovac_opts(tuple, pg_class_desc);
2117 		tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2118 											 shared, dbentry);
2119 
2120 		/* Check if it needs vacuum or analyze */
2121 		relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
2122 								  effective_multixact_freeze_max_age,
2123 								  &dovacuum, &doanalyze, &wraparound);
2124 
2125 		/* Relations that need work are added to table_oids */
2126 		if (dovacuum || doanalyze)
2127 			table_oids = lappend_oid(table_oids, relid);
2128 
2129 		/*
2130 		 * Remember TOAST associations for the second pass.  Note: we must do
2131 		 * this whether or not the table is going to be vacuumed, because we
2132 		 * don't automatically vacuum toast tables along the parent table.
2133 		 */
2134 		if (OidIsValid(classForm->reltoastrelid))
2135 		{
2136 			av_relation *hentry;
2137 			bool		found;
2138 
2139 			hentry = hash_search(table_toast_map,
2140 								 &classForm->reltoastrelid,
2141 								 HASH_ENTER, &found);
2142 
2143 			if (!found)
2144 			{
2145 				/* hash_search already filled in the key */
2146 				hentry->ar_relid = relid;
2147 				hentry->ar_hasrelopts = false;
2148 				if (relopts != NULL)
2149 				{
2150 					hentry->ar_hasrelopts = true;
2151 					memcpy(&hentry->ar_reloptions, relopts,
2152 						   sizeof(AutoVacOpts));
2153 				}
2154 			}
2155 		}
2156 	}
2157 
2158 	heap_endscan(relScan);
2159 
2160 	/* second pass: check TOAST tables */
2161 	ScanKeyInit(&key,
2162 				Anum_pg_class_relkind,
2163 				BTEqualStrategyNumber, F_CHAREQ,
2164 				CharGetDatum(RELKIND_TOASTVALUE));
2165 
2166 	relScan = heap_beginscan_catalog(classRel, 1, &key);
2167 	while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
2168 	{
2169 		Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
2170 		PgStat_StatTabEntry *tabentry;
2171 		Oid			relid;
2172 		AutoVacOpts *relopts = NULL;
2173 		bool		dovacuum;
2174 		bool		doanalyze;
2175 		bool		wraparound;
2176 
2177 		/*
2178 		 * We cannot safely process other backends' temp tables, so skip 'em.
2179 		 */
2180 		if (classForm->relpersistence == RELPERSISTENCE_TEMP)
2181 			continue;
2182 
2183 		relid = HeapTupleGetOid(tuple);
2184 
2185 		/*
2186 		 * fetch reloptions -- if this toast table does not have them, try the
2187 		 * main rel
2188 		 */
2189 		relopts = extract_autovac_opts(tuple, pg_class_desc);
2190 		if (relopts == NULL)
2191 		{
2192 			av_relation *hentry;
2193 			bool		found;
2194 
2195 			hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found);
2196 			if (found && hentry->ar_hasrelopts)
2197 				relopts = &hentry->ar_reloptions;
2198 		}
2199 
2200 		/* Fetch the pgstat entry for this table */
2201 		tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2202 											 shared, dbentry);
2203 
2204 		relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
2205 								  effective_multixact_freeze_max_age,
2206 								  &dovacuum, &doanalyze, &wraparound);
2207 
2208 		/* ignore analyze for toast tables */
2209 		if (dovacuum)
2210 			table_oids = lappend_oid(table_oids, relid);
2211 	}
2212 
2213 	heap_endscan(relScan);
2214 	heap_close(classRel, AccessShareLock);
2215 
2216 	/*
2217 	 * Recheck orphan temporary tables, and if they still seem orphaned, drop
2218 	 * them.  We'll eat a transaction per dropped table, which might seem
2219 	 * excessive, but we should only need to do anything as a result of a
2220 	 * previous backend crash, so this should not happen often enough to
2221 	 * justify "optimizing".  Using separate transactions ensures that we
2222 	 * don't bloat the lock table if there are many temp tables to be dropped,
2223 	 * and it ensures that we don't lose work if a deletion attempt fails.
2224 	 */
2225 	foreach(cell, orphan_oids)
2226 	{
2227 		Oid			relid = lfirst_oid(cell);
2228 		Form_pg_class classForm;
2229 		int			backendID;
2230 		ObjectAddress object;
2231 
2232 		/*
2233 		 * Check for user-requested abort.
2234 		 */
2235 		CHECK_FOR_INTERRUPTS();
2236 
2237 		/*
2238 		 * Try to lock the table.  If we can't get the lock immediately,
2239 		 * somebody else is using (or dropping) the table, so it's not our
2240 		 * concern anymore.  Having the lock prevents race conditions below.
2241 		 */
2242 		if (!ConditionalLockRelationOid(relid, AccessExclusiveLock))
2243 			continue;
2244 
2245 		/*
2246 		 * Re-fetch the pg_class tuple and re-check whether it still seems to
2247 		 * be an orphaned temp table.  If it's not there or no longer the same
2248 		 * relation, ignore it.
2249 		 */
2250 		tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
2251 		if (!HeapTupleIsValid(tuple))
2252 		{
2253 			/* be sure to drop useless lock so we don't bloat lock table */
2254 			UnlockRelationOid(relid, AccessExclusiveLock);
2255 			continue;
2256 		}
2257 		classForm = (Form_pg_class) GETSTRUCT(tuple);
2258 
2259 		/*
2260 		 * Make all the same tests made in the loop above.  In event of OID
2261 		 * counter wraparound, the pg_class entry we have now might be
2262 		 * completely unrelated to the one we saw before.
2263 		 */
2264 		if (!((classForm->relkind == RELKIND_RELATION ||
2265 			   classForm->relkind == RELKIND_MATVIEW) &&
2266 			  classForm->relpersistence == RELPERSISTENCE_TEMP))
2267 		{
2268 			UnlockRelationOid(relid, AccessExclusiveLock);
2269 			continue;
2270 		}
2271 		backendID = GetTempNamespaceBackendId(classForm->relnamespace);
2272 		if (!(backendID != InvalidBackendId &&
2273 			  (backendID == MyBackendId ||
2274 			   BackendIdGetProc(backendID) == NULL)))
2275 		{
2276 			UnlockRelationOid(relid, AccessExclusiveLock);
2277 			continue;
2278 		}
2279 
2280 		/* OK, let's delete it */
2281 		ereport(LOG,
2282 				(errmsg("autovacuum: dropping orphan temp table \"%s.%s.%s\"",
2283 						get_database_name(MyDatabaseId),
2284 						get_namespace_name(classForm->relnamespace),
2285 						NameStr(classForm->relname))));
2286 
2287 		object.classId = RelationRelationId;
2288 		object.objectId = relid;
2289 		object.objectSubId = 0;
2290 		performDeletion(&object, DROP_CASCADE,
2291 						PERFORM_DELETION_INTERNAL |
2292 						PERFORM_DELETION_QUIETLY |
2293 						PERFORM_DELETION_SKIP_EXTENSIONS);
2294 
2295 		/*
2296 		 * To commit the deletion, end current transaction and start a new
2297 		 * one.  Note this also releases the lock we took.
2298 		 */
2299 		CommitTransactionCommand();
2300 		StartTransactionCommand();
2301 
2302 		/* StartTransactionCommand changed current memory context */
2303 		MemoryContextSwitchTo(AutovacMemCxt);
2304 	}
2305 
2306 	/*
2307 	 * Create a buffer access strategy object for VACUUM to use.  We want to
2308 	 * use the same one across all the vacuum operations we perform, since the
2309 	 * point is for VACUUM not to blow out the shared cache.
2310 	 */
2311 	bstrategy = GetAccessStrategy(BAS_VACUUM);
2312 
2313 	/*
2314 	 * create a memory context to act as fake PortalContext, so that the
2315 	 * contexts created in the vacuum code are cleaned up for each table.
2316 	 */
2317 	PortalContext = AllocSetContextCreate(AutovacMemCxt,
2318 										  "Autovacuum Portal",
2319 										  ALLOCSET_DEFAULT_SIZES);
2320 
2321 	/*
2322 	 * Perform operations on collected tables.
2323 	 */
2324 	foreach(cell, table_oids)
2325 	{
2326 		Oid			relid = lfirst_oid(cell);
2327 		HeapTuple	classTup;
2328 		autovac_table *tab;
2329 		bool		isshared;
2330 		bool		skipit;
2331 		int			stdVacuumCostDelay;
2332 		int			stdVacuumCostLimit;
2333 		dlist_iter	iter;
2334 
2335 		CHECK_FOR_INTERRUPTS();
2336 
2337 		/*
2338 		 * Check for config changes before processing each collected table.
2339 		 */
2340 		if (got_SIGHUP)
2341 		{
2342 			got_SIGHUP = false;
2343 			ProcessConfigFile(PGC_SIGHUP);
2344 
2345 			/*
2346 			 * You might be tempted to bail out if we see autovacuum is now
2347 			 * disabled.  Must resist that temptation -- this might be a
2348 			 * for-wraparound emergency worker, in which case that would be
2349 			 * entirely inappropriate.
2350 			 */
2351 		}
2352 
2353 		/*
2354 		 * Find out whether the table is shared or not.  (It's slightly
2355 		 * annoying to fetch the syscache entry just for this, but in typical
2356 		 * cases it adds little cost because table_recheck_autovac would
2357 		 * refetch the entry anyway.  We could buy that back by copying the
2358 		 * tuple here and passing it to table_recheck_autovac, but that
2359 		 * increases the odds of that function working with stale data.)
2360 		 */
2361 		classTup = SearchSysCache1(RELOID, ObjectIdGetDatum(relid));
2362 		if (!HeapTupleIsValid(classTup))
2363 			continue;			/* somebody deleted the rel, forget it */
2364 		isshared = ((Form_pg_class) GETSTRUCT(classTup))->relisshared;
2365 		ReleaseSysCache(classTup);
2366 
2367 		/*
2368 		 * Hold schedule lock from here until we've claimed the table.  We
2369 		 * also need the AutovacuumLock to walk the worker array, but that one
2370 		 * can just be a shared lock.
2371 		 */
2372 		LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
2373 		LWLockAcquire(AutovacuumLock, LW_SHARED);
2374 
2375 		/*
2376 		 * Check whether the table is being vacuumed concurrently by another
2377 		 * worker.
2378 		 */
2379 		skipit = false;
2380 		dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
2381 		{
2382 			WorkerInfo	worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
2383 
2384 			/* ignore myself */
2385 			if (worker == MyWorkerInfo)
2386 				continue;
2387 
2388 			/* ignore workers in other databases (unless table is shared) */
2389 			if (!worker->wi_sharedrel && worker->wi_dboid != MyDatabaseId)
2390 				continue;
2391 
2392 			if (worker->wi_tableoid == relid)
2393 			{
2394 				skipit = true;
2395 				found_concurrent_worker = true;
2396 				break;
2397 			}
2398 		}
2399 		LWLockRelease(AutovacuumLock);
2400 		if (skipit)
2401 		{
2402 			LWLockRelease(AutovacuumScheduleLock);
2403 			continue;
2404 		}
2405 
2406 		/*
2407 		 * Store the table's OID in shared memory before releasing the
2408 		 * schedule lock, so that other workers don't try to vacuum it
2409 		 * concurrently.  (We claim it here so as not to hold
2410 		 * AutovacuumScheduleLock while rechecking the stats.)
2411 		 */
2412 		MyWorkerInfo->wi_tableoid = relid;
2413 		MyWorkerInfo->wi_sharedrel = isshared;
2414 		LWLockRelease(AutovacuumScheduleLock);
2415 
2416 		/*
2417 		 * Check whether pgstat data still says we need to vacuum this table.
2418 		 * It could have changed if something else processed the table while
2419 		 * we weren't looking.
2420 		 *
2421 		 * Note: we have a special case in pgstat code to ensure that the
2422 		 * stats we read are as up-to-date as possible, to avoid the problem
2423 		 * that somebody just finished vacuuming this table.  The window to
2424 		 * the race condition is not closed but it is very small.
2425 		 */
2426 		MemoryContextSwitchTo(AutovacMemCxt);
2427 		tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc,
2428 									effective_multixact_freeze_max_age);
2429 		if (tab == NULL)
2430 		{
2431 			/* someone else vacuumed the table, or it went away */
2432 			LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
2433 			MyWorkerInfo->wi_tableoid = InvalidOid;
2434 			MyWorkerInfo->wi_sharedrel = false;
2435 			LWLockRelease(AutovacuumScheduleLock);
2436 			continue;
2437 		}
2438 
2439 		/*
2440 		 * Remember the prevailing values of the vacuum cost GUCs.  We have to
2441 		 * restore these at the bottom of the loop, else we'll compute wrong
2442 		 * values in the next iteration of autovac_balance_cost().
2443 		 */
2444 		stdVacuumCostDelay = VacuumCostDelay;
2445 		stdVacuumCostLimit = VacuumCostLimit;
2446 
2447 		/* Must hold AutovacuumLock while mucking with cost balance info */
2448 		LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2449 
2450 		/* advertise my cost delay parameters for the balancing algorithm */
2451 		MyWorkerInfo->wi_dobalance = tab->at_dobalance;
2452 		MyWorkerInfo->wi_cost_delay = tab->at_vacuum_cost_delay;
2453 		MyWorkerInfo->wi_cost_limit = tab->at_vacuum_cost_limit;
2454 		MyWorkerInfo->wi_cost_limit_base = tab->at_vacuum_cost_limit;
2455 
2456 		/* do a balance */
2457 		autovac_balance_cost();
2458 
2459 		/* set the active cost parameters from the result of that */
2460 		AutoVacuumUpdateDelay();
2461 
2462 		/* done */
2463 		LWLockRelease(AutovacuumLock);
2464 
2465 		/* clean up memory before each iteration */
2466 		MemoryContextResetAndDeleteChildren(PortalContext);
2467 
2468 		/*
2469 		 * Save the relation name for a possible error message, to avoid a
2470 		 * catalog lookup in case of an error.  If any of these return NULL,
2471 		 * then the relation has been dropped since last we checked; skip it.
2472 		 * Note: they must live in a long-lived memory context because we call
2473 		 * vacuum and analyze in different transactions.
2474 		 */
2475 
2476 		tab->at_relname = get_rel_name(tab->at_relid);
2477 		tab->at_nspname = get_namespace_name(get_rel_namespace(tab->at_relid));
2478 		tab->at_datname = get_database_name(MyDatabaseId);
2479 		if (!tab->at_relname || !tab->at_nspname || !tab->at_datname)
2480 			goto deleted;
2481 
2482 		/*
2483 		 * We will abort vacuuming the current table if something errors out,
2484 		 * and continue with the next one in schedule; in particular, this
2485 		 * happens if we are interrupted with SIGINT.
2486 		 */
2487 		PG_TRY();
2488 		{
2489 			/* Use PortalContext for any per-table allocations */
2490 			MemoryContextSwitchTo(PortalContext);
2491 
2492 			/* have at it */
2493 			autovacuum_do_vac_analyze(tab, bstrategy);
2494 
2495 			/*
2496 			 * Clear a possible query-cancel signal, to avoid a late reaction
2497 			 * to an automatically-sent signal because of vacuuming the
2498 			 * current table (we're done with it, so it would make no sense to
2499 			 * cancel at this point.)
2500 			 */
2501 			QueryCancelPending = false;
2502 		}
2503 		PG_CATCH();
2504 		{
2505 			/*
2506 			 * Abort the transaction, start a new one, and proceed with the
2507 			 * next table in our list.
2508 			 */
2509 			HOLD_INTERRUPTS();
2510 			if (tab->at_vacoptions & VACOPT_VACUUM)
2511 				errcontext("automatic vacuum of table \"%s.%s.%s\"",
2512 						   tab->at_datname, tab->at_nspname, tab->at_relname);
2513 			else
2514 				errcontext("automatic analyze of table \"%s.%s.%s\"",
2515 						   tab->at_datname, tab->at_nspname, tab->at_relname);
2516 			EmitErrorReport();
2517 
2518 			/* this resets the PGXACT flags too */
2519 			AbortOutOfAnyTransaction();
2520 			FlushErrorState();
2521 			MemoryContextResetAndDeleteChildren(PortalContext);
2522 
2523 			/* restart our transaction for the following operations */
2524 			StartTransactionCommand();
2525 			RESUME_INTERRUPTS();
2526 		}
2527 		PG_END_TRY();
2528 
2529 		/* Make sure we're back in AutovacMemCxt */
2530 		MemoryContextSwitchTo(AutovacMemCxt);
2531 
2532 		did_vacuum = true;
2533 
2534 		/* the PGXACT flags are reset at the next end of transaction */
2535 
2536 		/* be tidy */
2537 deleted:
2538 		if (tab->at_datname != NULL)
2539 			pfree(tab->at_datname);
2540 		if (tab->at_nspname != NULL)
2541 			pfree(tab->at_nspname);
2542 		if (tab->at_relname != NULL)
2543 			pfree(tab->at_relname);
2544 		pfree(tab);
2545 
2546 		/*
2547 		 * Remove my info from shared memory.  We could, but intentionally
2548 		 * don't, clear wi_cost_limit and friends --- this is on the
2549 		 * assumption that we probably have more to do with similar cost
2550 		 * settings, so we don't want to give up our share of I/O for a very
2551 		 * short interval and thereby thrash the global balance.
2552 		 */
2553 		LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
2554 		MyWorkerInfo->wi_tableoid = InvalidOid;
2555 		MyWorkerInfo->wi_sharedrel = false;
2556 		LWLockRelease(AutovacuumScheduleLock);
2557 
2558 		/* restore vacuum cost GUCs for the next iteration */
2559 		VacuumCostDelay = stdVacuumCostDelay;
2560 		VacuumCostLimit = stdVacuumCostLimit;
2561 	}
2562 
2563 	/*
2564 	 * Perform additional work items, as requested by backends.
2565 	 */
2566 	LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2567 	for (i = 0; i < NUM_WORKITEMS; i++)
2568 	{
2569 		AutoVacuumWorkItem *workitem = &AutoVacuumShmem->av_workItems[i];
2570 
2571 		if (!workitem->avw_used)
2572 			continue;
2573 		if (workitem->avw_active)
2574 			continue;
2575 		if (workitem->avw_database != MyDatabaseId)
2576 			continue;
2577 
2578 		/* claim this one, and release lock while performing it */
2579 		workitem->avw_active = true;
2580 		LWLockRelease(AutovacuumLock);
2581 
2582 		perform_work_item(workitem);
2583 
2584 		/*
2585 		 * Check for config changes before acquiring lock for further jobs.
2586 		 */
2587 		CHECK_FOR_INTERRUPTS();
2588 		if (got_SIGHUP)
2589 		{
2590 			got_SIGHUP = false;
2591 			ProcessConfigFile(PGC_SIGHUP);
2592 		}
2593 
2594 		LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2595 
2596 		/* and mark it done */
2597 		workitem->avw_active = false;
2598 		workitem->avw_used = false;
2599 	}
2600 	LWLockRelease(AutovacuumLock);
2601 
2602 	/*
2603 	 * We leak table_toast_map here (among other things), but since we're
2604 	 * going away soon, it's not a problem.
2605 	 */
2606 
2607 	/*
2608 	 * Update pg_database.datfrozenxid, and truncate pg_xact if possible. We
2609 	 * only need to do this once, not after each table.
2610 	 *
2611 	 * Even if we didn't vacuum anything, it may still be important to do
2612 	 * this, because one indirect effect of vac_update_datfrozenxid() is to
2613 	 * update ShmemVariableCache->xidVacLimit.  That might need to be done
2614 	 * even if we haven't vacuumed anything, because relations with older
2615 	 * relfrozenxid values or other databases with older datfrozenxid values
2616 	 * might have been dropped, allowing xidVacLimit to advance.
2617 	 *
2618 	 * However, it's also important not to do this blindly in all cases,
2619 	 * because when autovacuum=off this will restart the autovacuum launcher.
2620 	 * If we're not careful, an infinite loop can result, where workers find
2621 	 * no work to do and restart the launcher, which starts another worker in
2622 	 * the same database that finds no work to do.  To prevent that, we skip
2623 	 * this if (1) we found no work to do and (2) we skipped at least one
2624 	 * table due to concurrent autovacuum activity.  In that case, the other
2625 	 * worker has already done it, or will do so when it finishes.
2626 	 */
2627 	if (did_vacuum || !found_concurrent_worker)
2628 		vac_update_datfrozenxid();
2629 
2630 	/* Finally close out the last transaction. */
2631 	CommitTransactionCommand();
2632 }
2633 
2634 /*
2635  * Execute a previously registered work item.
2636  */
2637 static void
perform_work_item(AutoVacuumWorkItem * workitem)2638 perform_work_item(AutoVacuumWorkItem *workitem)
2639 {
2640 	char	   *cur_datname = NULL;
2641 	char	   *cur_nspname = NULL;
2642 	char	   *cur_relname = NULL;
2643 
2644 	/*
2645 	 * Note we do not store table info in MyWorkerInfo, since this is not
2646 	 * vacuuming proper.
2647 	 */
2648 
2649 	/*
2650 	 * Save the relation name for a possible error message, to avoid a catalog
2651 	 * lookup in case of an error.  If any of these return NULL, then the
2652 	 * relation has been dropped since last we checked; skip it.
2653 	 */
2654 	Assert(CurrentMemoryContext == AutovacMemCxt);
2655 
2656 	cur_relname = get_rel_name(workitem->avw_relation);
2657 	cur_nspname = get_namespace_name(get_rel_namespace(workitem->avw_relation));
2658 	cur_datname = get_database_name(MyDatabaseId);
2659 	if (!cur_relname || !cur_nspname || !cur_datname)
2660 		goto deleted2;
2661 
2662 	autovac_report_workitem(workitem, cur_nspname, cur_relname);
2663 
2664 	/* clean up memory before each work item */
2665 	MemoryContextResetAndDeleteChildren(PortalContext);
2666 
2667 	/*
2668 	 * We will abort the current work item if something errors out, and
2669 	 * continue with the next one; in particular, this happens if we are
2670 	 * interrupted with SIGINT.  Note that this means that the work item list
2671 	 * can be lossy.
2672 	 */
2673 	PG_TRY();
2674 	{
2675 		/* Use PortalContext for any per-work-item allocations */
2676 		MemoryContextSwitchTo(PortalContext);
2677 
2678 		/* have at it */
2679 		switch (workitem->avw_type)
2680 		{
2681 			case AVW_BRINSummarizeRange:
2682 				DirectFunctionCall2(brin_summarize_range,
2683 									ObjectIdGetDatum(workitem->avw_relation),
2684 									Int64GetDatum((int64) workitem->avw_blockNumber));
2685 				break;
2686 			default:
2687 				elog(WARNING, "unrecognized work item found: type %d",
2688 					 workitem->avw_type);
2689 				break;
2690 		}
2691 
2692 		/*
2693 		 * Clear a possible query-cancel signal, to avoid a late reaction to
2694 		 * an automatically-sent signal because of vacuuming the current table
2695 		 * (we're done with it, so it would make no sense to cancel at this
2696 		 * point.)
2697 		 */
2698 		QueryCancelPending = false;
2699 	}
2700 	PG_CATCH();
2701 	{
2702 		/*
2703 		 * Abort the transaction, start a new one, and proceed with the next
2704 		 * table in our list.
2705 		 */
2706 		HOLD_INTERRUPTS();
2707 		errcontext("processing work entry for relation \"%s.%s.%s\"",
2708 				   cur_datname, cur_nspname, cur_relname);
2709 		EmitErrorReport();
2710 
2711 		/* this resets the PGXACT flags too */
2712 		AbortOutOfAnyTransaction();
2713 		FlushErrorState();
2714 		MemoryContextResetAndDeleteChildren(PortalContext);
2715 
2716 		/* restart our transaction for the following operations */
2717 		StartTransactionCommand();
2718 		RESUME_INTERRUPTS();
2719 	}
2720 	PG_END_TRY();
2721 
2722 	/* Make sure we're back in AutovacMemCxt */
2723 	MemoryContextSwitchTo(AutovacMemCxt);
2724 
2725 	/* We intentionally do not set did_vacuum here */
2726 
2727 	/* be tidy */
2728 deleted2:
2729 	if (cur_datname)
2730 		pfree(cur_datname);
2731 	if (cur_nspname)
2732 		pfree(cur_nspname);
2733 	if (cur_relname)
2734 		pfree(cur_relname);
2735 }
2736 
2737 /*
2738  * extract_autovac_opts
2739  *
2740  * Given a relation's pg_class tuple, return the AutoVacOpts portion of
2741  * reloptions, if set; otherwise, return NULL.
2742  */
2743 static AutoVacOpts *
extract_autovac_opts(HeapTuple tup,TupleDesc pg_class_desc)2744 extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc)
2745 {
2746 	bytea	   *relopts;
2747 	AutoVacOpts *av;
2748 
2749 	Assert(((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_RELATION ||
2750 		   ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_MATVIEW ||
2751 		   ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_TOASTVALUE);
2752 
2753 	relopts = extractRelOptions(tup, pg_class_desc, NULL);
2754 	if (relopts == NULL)
2755 		return NULL;
2756 
2757 	av = palloc(sizeof(AutoVacOpts));
2758 	memcpy(av, &(((StdRdOptions *) relopts)->autovacuum), sizeof(AutoVacOpts));
2759 	pfree(relopts);
2760 
2761 	return av;
2762 }
2763 
2764 /*
2765  * get_pgstat_tabentry_relid
2766  *
2767  * Fetch the pgstat entry of a table, either local to a database or shared.
2768  */
2769 static PgStat_StatTabEntry *
get_pgstat_tabentry_relid(Oid relid,bool isshared,PgStat_StatDBEntry * shared,PgStat_StatDBEntry * dbentry)2770 get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared,
2771 						  PgStat_StatDBEntry *dbentry)
2772 {
2773 	PgStat_StatTabEntry *tabentry = NULL;
2774 
2775 	if (isshared)
2776 	{
2777 		if (PointerIsValid(shared))
2778 			tabentry = hash_search(shared->tables, &relid,
2779 								   HASH_FIND, NULL);
2780 	}
2781 	else if (PointerIsValid(dbentry))
2782 		tabentry = hash_search(dbentry->tables, &relid,
2783 							   HASH_FIND, NULL);
2784 
2785 	return tabentry;
2786 }
2787 
2788 /*
2789  * table_recheck_autovac
2790  *
2791  * Recheck whether a table still needs vacuum or analyze.  Return value is a
2792  * valid autovac_table pointer if it does, NULL otherwise.
2793  *
2794  * Note that the returned autovac_table does not have the name fields set.
2795  */
2796 static autovac_table *
table_recheck_autovac(Oid relid,HTAB * table_toast_map,TupleDesc pg_class_desc,int effective_multixact_freeze_max_age)2797 table_recheck_autovac(Oid relid, HTAB *table_toast_map,
2798 					  TupleDesc pg_class_desc,
2799 					  int effective_multixact_freeze_max_age)
2800 {
2801 	Form_pg_class classForm;
2802 	HeapTuple	classTup;
2803 	bool		dovacuum;
2804 	bool		doanalyze;
2805 	autovac_table *tab = NULL;
2806 	PgStat_StatTabEntry *tabentry;
2807 	PgStat_StatDBEntry *shared;
2808 	PgStat_StatDBEntry *dbentry;
2809 	bool		wraparound;
2810 	AutoVacOpts *avopts;
2811 
2812 	/* use fresh stats */
2813 	autovac_refresh_stats();
2814 
2815 	shared = pgstat_fetch_stat_dbentry(InvalidOid);
2816 	dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
2817 
2818 	/* fetch the relation's relcache entry */
2819 	classTup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
2820 	if (!HeapTupleIsValid(classTup))
2821 		return NULL;
2822 	classForm = (Form_pg_class) GETSTRUCT(classTup);
2823 
2824 	/*
2825 	 * Get the applicable reloptions.  If it is a TOAST table, try to get the
2826 	 * main table reloptions if the toast table itself doesn't have.
2827 	 */
2828 	avopts = extract_autovac_opts(classTup, pg_class_desc);
2829 	if (classForm->relkind == RELKIND_TOASTVALUE &&
2830 		avopts == NULL && table_toast_map != NULL)
2831 	{
2832 		av_relation *hentry;
2833 		bool		found;
2834 
2835 		hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found);
2836 		if (found && hentry->ar_hasrelopts)
2837 			avopts = &hentry->ar_reloptions;
2838 	}
2839 
2840 	/* fetch the pgstat table entry */
2841 	tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2842 										 shared, dbentry);
2843 
2844 	relation_needs_vacanalyze(relid, avopts, classForm, tabentry,
2845 							  effective_multixact_freeze_max_age,
2846 							  &dovacuum, &doanalyze, &wraparound);
2847 
2848 	/* ignore ANALYZE for toast tables */
2849 	if (classForm->relkind == RELKIND_TOASTVALUE)
2850 		doanalyze = false;
2851 
2852 	/* OK, it needs something done */
2853 	if (doanalyze || dovacuum)
2854 	{
2855 		int			freeze_min_age;
2856 		int			freeze_table_age;
2857 		int			multixact_freeze_min_age;
2858 		int			multixact_freeze_table_age;
2859 		int			vac_cost_limit;
2860 		int			vac_cost_delay;
2861 		int			log_min_duration;
2862 
2863 		/*
2864 		 * Calculate the vacuum cost parameters and the freeze ages.  If there
2865 		 * are options set in pg_class.reloptions, use them; in the case of a
2866 		 * toast table, try the main table too.  Otherwise use the GUC
2867 		 * defaults, autovacuum's own first and plain vacuum second.
2868 		 */
2869 
2870 		/* -1 in autovac setting means use plain vacuum_cost_delay */
2871 		vac_cost_delay = (avopts && avopts->vacuum_cost_delay >= 0)
2872 			? avopts->vacuum_cost_delay
2873 			: (autovacuum_vac_cost_delay >= 0)
2874 			? autovacuum_vac_cost_delay
2875 			: VacuumCostDelay;
2876 
2877 		/* 0 or -1 in autovac setting means use plain vacuum_cost_limit */
2878 		vac_cost_limit = (avopts && avopts->vacuum_cost_limit > 0)
2879 			? avopts->vacuum_cost_limit
2880 			: (autovacuum_vac_cost_limit > 0)
2881 			? autovacuum_vac_cost_limit
2882 			: VacuumCostLimit;
2883 
2884 		/* -1 in autovac setting means use log_autovacuum_min_duration */
2885 		log_min_duration = (avopts && avopts->log_min_duration >= 0)
2886 			? avopts->log_min_duration
2887 			: Log_autovacuum_min_duration;
2888 
2889 		/* these do not have autovacuum-specific settings */
2890 		freeze_min_age = (avopts && avopts->freeze_min_age >= 0)
2891 			? avopts->freeze_min_age
2892 			: default_freeze_min_age;
2893 
2894 		freeze_table_age = (avopts && avopts->freeze_table_age >= 0)
2895 			? avopts->freeze_table_age
2896 			: default_freeze_table_age;
2897 
2898 		multixact_freeze_min_age = (avopts &&
2899 									avopts->multixact_freeze_min_age >= 0)
2900 			? avopts->multixact_freeze_min_age
2901 			: default_multixact_freeze_min_age;
2902 
2903 		multixact_freeze_table_age = (avopts &&
2904 									  avopts->multixact_freeze_table_age >= 0)
2905 			? avopts->multixact_freeze_table_age
2906 			: default_multixact_freeze_table_age;
2907 
2908 		tab = palloc(sizeof(autovac_table));
2909 		tab->at_relid = relid;
2910 		tab->at_sharedrel = classForm->relisshared;
2911 		tab->at_vacoptions = VACOPT_SKIPTOAST |
2912 			(dovacuum ? VACOPT_VACUUM : 0) |
2913 			(doanalyze ? VACOPT_ANALYZE : 0) |
2914 			(!wraparound ? VACOPT_NOWAIT : 0);
2915 		tab->at_params.freeze_min_age = freeze_min_age;
2916 		tab->at_params.freeze_table_age = freeze_table_age;
2917 		tab->at_params.multixact_freeze_min_age = multixact_freeze_min_age;
2918 		tab->at_params.multixact_freeze_table_age = multixact_freeze_table_age;
2919 		tab->at_params.is_wraparound = wraparound;
2920 		tab->at_params.log_min_duration = log_min_duration;
2921 		tab->at_vacuum_cost_limit = vac_cost_limit;
2922 		tab->at_vacuum_cost_delay = vac_cost_delay;
2923 		tab->at_relname = NULL;
2924 		tab->at_nspname = NULL;
2925 		tab->at_datname = NULL;
2926 
2927 		/*
2928 		 * If any of the cost delay parameters has been set individually for
2929 		 * this table, disable the balancing algorithm.
2930 		 */
2931 		tab->at_dobalance =
2932 			!(avopts && (avopts->vacuum_cost_limit > 0 ||
2933 						 avopts->vacuum_cost_delay > 0));
2934 	}
2935 
2936 	heap_freetuple(classTup);
2937 
2938 	return tab;
2939 }
2940 
2941 /*
2942  * relation_needs_vacanalyze
2943  *
2944  * Check whether a relation needs to be vacuumed or analyzed; return each into
2945  * "dovacuum" and "doanalyze", respectively.  Also return whether the vacuum is
2946  * being forced because of Xid or multixact wraparound.
2947  *
2948  * relopts is a pointer to the AutoVacOpts options (either for itself in the
2949  * case of a plain table, or for either itself or its parent table in the case
2950  * of a TOAST table), NULL if none; tabentry is the pgstats entry, which can be
2951  * NULL.
2952  *
2953  * A table needs to be vacuumed if the number of dead tuples exceeds a
2954  * threshold.  This threshold is calculated as
2955  *
2956  * threshold = vac_base_thresh + vac_scale_factor * reltuples
2957  *
2958  * For analyze, the analysis done is that the number of tuples inserted,
2959  * deleted and updated since the last analyze exceeds a threshold calculated
2960  * in the same fashion as above.  Note that the collector actually stores
2961  * the number of tuples (both live and dead) that there were as of the last
2962  * analyze.  This is asymmetric to the VACUUM case.
2963  *
2964  * We also force vacuum if the table's relfrozenxid is more than freeze_max_age
2965  * transactions back, and if its relminmxid is more than
2966  * multixact_freeze_max_age multixacts back.
2967  *
2968  * A table whose autovacuum_enabled option is false is
2969  * automatically skipped (unless we have to vacuum it due to freeze_max_age).
2970  * Thus autovacuum can be disabled for specific tables. Also, when the stats
2971  * collector does not have data about a table, it will be skipped.
2972  *
2973  * A table whose vac_base_thresh value is < 0 takes the base value from the
2974  * autovacuum_vacuum_threshold GUC variable.  Similarly, a vac_scale_factor
2975  * value < 0 is substituted with the value of
2976  * autovacuum_vacuum_scale_factor GUC variable.  Ditto for analyze.
2977  */
2978 static void
relation_needs_vacanalyze(Oid relid,AutoVacOpts * relopts,Form_pg_class classForm,PgStat_StatTabEntry * tabentry,int effective_multixact_freeze_max_age,bool * dovacuum,bool * doanalyze,bool * wraparound)2979 relation_needs_vacanalyze(Oid relid,
2980 						  AutoVacOpts *relopts,
2981 						  Form_pg_class classForm,
2982 						  PgStat_StatTabEntry *tabentry,
2983 						  int effective_multixact_freeze_max_age,
2984  /* output params below */
2985 						  bool *dovacuum,
2986 						  bool *doanalyze,
2987 						  bool *wraparound)
2988 {
2989 	bool		force_vacuum;
2990 	bool		av_enabled;
2991 	float4		reltuples;		/* pg_class.reltuples */
2992 
2993 	/* constants from reloptions or GUC variables */
2994 	int			vac_base_thresh,
2995 				anl_base_thresh;
2996 	float4		vac_scale_factor,
2997 				anl_scale_factor;
2998 
2999 	/* thresholds calculated from above constants */
3000 	float4		vacthresh,
3001 				anlthresh;
3002 
3003 	/* number of vacuum (resp. analyze) tuples at this time */
3004 	float4		vactuples,
3005 				anltuples;
3006 
3007 	/* freeze parameters */
3008 	int			freeze_max_age;
3009 	int			multixact_freeze_max_age;
3010 	TransactionId xidForceLimit;
3011 	MultiXactId multiForceLimit;
3012 
3013 	AssertArg(classForm != NULL);
3014 	AssertArg(OidIsValid(relid));
3015 
3016 	/*
3017 	 * Determine vacuum/analyze equation parameters.  We have two possible
3018 	 * sources: the passed reloptions (which could be a main table or a toast
3019 	 * table), or the autovacuum GUC variables.
3020 	 */
3021 
3022 	/* -1 in autovac setting means use plain vacuum_cost_delay */
3023 	vac_scale_factor = (relopts && relopts->vacuum_scale_factor >= 0)
3024 		? relopts->vacuum_scale_factor
3025 		: autovacuum_vac_scale;
3026 
3027 	vac_base_thresh = (relopts && relopts->vacuum_threshold >= 0)
3028 		? relopts->vacuum_threshold
3029 		: autovacuum_vac_thresh;
3030 
3031 	anl_scale_factor = (relopts && relopts->analyze_scale_factor >= 0)
3032 		? relopts->analyze_scale_factor
3033 		: autovacuum_anl_scale;
3034 
3035 	anl_base_thresh = (relopts && relopts->analyze_threshold >= 0)
3036 		? relopts->analyze_threshold
3037 		: autovacuum_anl_thresh;
3038 
3039 	freeze_max_age = (relopts && relopts->freeze_max_age >= 0)
3040 		? Min(relopts->freeze_max_age, autovacuum_freeze_max_age)
3041 		: autovacuum_freeze_max_age;
3042 
3043 	multixact_freeze_max_age = (relopts && relopts->multixact_freeze_max_age >= 0)
3044 		? Min(relopts->multixact_freeze_max_age, effective_multixact_freeze_max_age)
3045 		: effective_multixact_freeze_max_age;
3046 
3047 	av_enabled = (relopts ? relopts->enabled : true);
3048 
3049 	/* Force vacuum if table is at risk of wraparound */
3050 	xidForceLimit = recentXid - freeze_max_age;
3051 	if (xidForceLimit < FirstNormalTransactionId)
3052 		xidForceLimit -= FirstNormalTransactionId;
3053 	force_vacuum = (TransactionIdIsNormal(classForm->relfrozenxid) &&
3054 					TransactionIdPrecedes(classForm->relfrozenxid,
3055 										  xidForceLimit));
3056 	if (!force_vacuum)
3057 	{
3058 		multiForceLimit = recentMulti - multixact_freeze_max_age;
3059 		if (multiForceLimit < FirstMultiXactId)
3060 			multiForceLimit -= FirstMultiXactId;
3061 		force_vacuum = MultiXactIdPrecedes(classForm->relminmxid,
3062 										   multiForceLimit);
3063 	}
3064 	*wraparound = force_vacuum;
3065 
3066 	/* User disabled it in pg_class.reloptions?  (But ignore if at risk) */
3067 	if (!av_enabled && !force_vacuum)
3068 	{
3069 		*doanalyze = false;
3070 		*dovacuum = false;
3071 		return;
3072 	}
3073 
3074 	/*
3075 	 * If we found the table in the stats hash, and autovacuum is currently
3076 	 * enabled, make a threshold-based decision whether to vacuum and/or
3077 	 * analyze.  If autovacuum is currently disabled, we must be here for
3078 	 * anti-wraparound vacuuming only, so don't vacuum (or analyze) anything
3079 	 * that's not being forced.
3080 	 */
3081 	if (PointerIsValid(tabentry) && AutoVacuumingActive())
3082 	{
3083 		reltuples = classForm->reltuples;
3084 		vactuples = tabentry->n_dead_tuples;
3085 		anltuples = tabentry->changes_since_analyze;
3086 
3087 		vacthresh = (float4) vac_base_thresh + vac_scale_factor * reltuples;
3088 		anlthresh = (float4) anl_base_thresh + anl_scale_factor * reltuples;
3089 
3090 		/*
3091 		 * Note that we don't need to take special consideration for stat
3092 		 * reset, because if that happens, the last vacuum and analyze counts
3093 		 * will be reset too.
3094 		 */
3095 		elog(DEBUG3, "%s: vac: %.0f (threshold %.0f), anl: %.0f (threshold %.0f)",
3096 			 NameStr(classForm->relname),
3097 			 vactuples, vacthresh, anltuples, anlthresh);
3098 
3099 		/* Determine if this table needs vacuum or analyze. */
3100 		*dovacuum = force_vacuum || (vactuples > vacthresh);
3101 		*doanalyze = (anltuples > anlthresh);
3102 	}
3103 	else
3104 	{
3105 		/*
3106 		 * Skip a table not found in stat hash, unless we have to force vacuum
3107 		 * for anti-wrap purposes.  If it's not acted upon, there's no need to
3108 		 * vacuum it.
3109 		 */
3110 		*dovacuum = force_vacuum;
3111 		*doanalyze = false;
3112 	}
3113 
3114 	/* ANALYZE refuses to work with pg_statistic */
3115 	if (relid == StatisticRelationId)
3116 		*doanalyze = false;
3117 }
3118 
3119 /*
3120  * autovacuum_do_vac_analyze
3121  *		Vacuum and/or analyze the specified table
3122  */
3123 static void
autovacuum_do_vac_analyze(autovac_table * tab,BufferAccessStrategy bstrategy)3124 autovacuum_do_vac_analyze(autovac_table *tab, BufferAccessStrategy bstrategy)
3125 {
3126 	RangeVar	rangevar;
3127 
3128 	/* Set up command parameters --- use local variables instead of palloc */
3129 	MemSet(&rangevar, 0, sizeof(rangevar));
3130 
3131 	rangevar.schemaname = tab->at_nspname;
3132 	rangevar.relname = tab->at_relname;
3133 	rangevar.location = -1;
3134 
3135 	/* Let pgstat know what we're doing */
3136 	autovac_report_activity(tab);
3137 
3138 	vacuum(tab->at_vacoptions, &rangevar, tab->at_relid, &tab->at_params, NIL,
3139 		   bstrategy, true);
3140 }
3141 
3142 /*
3143  * autovac_report_activity
3144  *		Report to pgstat what autovacuum is doing
3145  *
3146  * We send a SQL string corresponding to what the user would see if the
3147  * equivalent command was to be issued manually.
3148  *
3149  * Note we assume that we are going to report the next command as soon as we're
3150  * done with the current one, and exit right after the last one, so we don't
3151  * bother to report "<IDLE>" or some such.
3152  */
3153 static void
autovac_report_activity(autovac_table * tab)3154 autovac_report_activity(autovac_table *tab)
3155 {
3156 #define MAX_AUTOVAC_ACTIV_LEN (NAMEDATALEN * 2 + 56)
3157 	char		activity[MAX_AUTOVAC_ACTIV_LEN];
3158 	int			len;
3159 
3160 	/* Report the command and possible options */
3161 	if (tab->at_vacoptions & VACOPT_VACUUM)
3162 		snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
3163 				 "autovacuum: VACUUM%s",
3164 				 tab->at_vacoptions & VACOPT_ANALYZE ? " ANALYZE" : "");
3165 	else
3166 		snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
3167 				 "autovacuum: ANALYZE");
3168 
3169 	/*
3170 	 * Report the qualified name of the relation.
3171 	 */
3172 	len = strlen(activity);
3173 
3174 	snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len,
3175 			 " %s.%s%s", tab->at_nspname, tab->at_relname,
3176 			 tab->at_params.is_wraparound ? " (to prevent wraparound)" : "");
3177 
3178 	/* Set statement_timestamp() to current time for pg_stat_activity */
3179 	SetCurrentStatementStartTimestamp();
3180 
3181 	pgstat_report_activity(STATE_RUNNING, activity);
3182 }
3183 
3184 /*
3185  * autovac_report_workitem
3186  *		Report to pgstat that autovacuum is processing a work item
3187  */
3188 static void
autovac_report_workitem(AutoVacuumWorkItem * workitem,const char * nspname,const char * relname)3189 autovac_report_workitem(AutoVacuumWorkItem *workitem,
3190 						const char *nspname, const char *relname)
3191 {
3192 	char		activity[MAX_AUTOVAC_ACTIV_LEN + 12 + 2];
3193 	char		blk[12 + 2];
3194 	int			len;
3195 
3196 	switch (workitem->avw_type)
3197 	{
3198 		case AVW_BRINSummarizeRange:
3199 			snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
3200 					 "autovacuum: BRIN summarize");
3201 			break;
3202 	}
3203 
3204 	/*
3205 	 * Report the qualified name of the relation, and the block number if any
3206 	 */
3207 	len = strlen(activity);
3208 
3209 	if (BlockNumberIsValid(workitem->avw_blockNumber))
3210 		snprintf(blk, sizeof(blk), " %u", workitem->avw_blockNumber);
3211 	else
3212 		blk[0] = '\0';
3213 
3214 	snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len,
3215 			 " %s.%s%s", nspname, relname, blk);
3216 
3217 	/* Set statement_timestamp() to current time for pg_stat_activity */
3218 	SetCurrentStatementStartTimestamp();
3219 
3220 	pgstat_report_activity(STATE_RUNNING, activity);
3221 }
3222 
3223 /*
3224  * AutoVacuumingActive
3225  *		Check GUC vars and report whether the autovacuum process should be
3226  *		running.
3227  */
3228 bool
AutoVacuumingActive(void)3229 AutoVacuumingActive(void)
3230 {
3231 	if (!autovacuum_start_daemon || !pgstat_track_counts)
3232 		return false;
3233 	return true;
3234 }
3235 
3236 /*
3237  * Request one work item to the next autovacuum run processing our database.
3238  * Return false if the request can't be recorded.
3239  */
3240 bool
AutoVacuumRequestWork(AutoVacuumWorkItemType type,Oid relationId,BlockNumber blkno)3241 AutoVacuumRequestWork(AutoVacuumWorkItemType type, Oid relationId,
3242 					  BlockNumber blkno)
3243 {
3244 	int			i;
3245 	bool		result = false;
3246 
3247 	LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
3248 
3249 	/*
3250 	 * Locate an unused work item and fill it with the given data.
3251 	 */
3252 	for (i = 0; i < NUM_WORKITEMS; i++)
3253 	{
3254 		AutoVacuumWorkItem *workitem = &AutoVacuumShmem->av_workItems[i];
3255 
3256 		if (workitem->avw_used)
3257 			continue;
3258 
3259 		workitem->avw_used = true;
3260 		workitem->avw_active = false;
3261 		workitem->avw_type = type;
3262 		workitem->avw_database = MyDatabaseId;
3263 		workitem->avw_relation = relationId;
3264 		workitem->avw_blockNumber = blkno;
3265 		result = true;
3266 
3267 		/* done */
3268 		break;
3269 	}
3270 
3271 	LWLockRelease(AutovacuumLock);
3272 
3273 	return result;
3274 }
3275 
3276 /*
3277  * autovac_init
3278  *		This is called at postmaster initialization.
3279  *
3280  * All we do here is annoy the user if he got it wrong.
3281  */
3282 void
autovac_init(void)3283 autovac_init(void)
3284 {
3285 	if (autovacuum_start_daemon && !pgstat_track_counts)
3286 		ereport(WARNING,
3287 				(errmsg("autovacuum not started because of misconfiguration"),
3288 				 errhint("Enable the \"track_counts\" option.")));
3289 }
3290 
3291 /*
3292  * IsAutoVacuum functions
3293  *		Return whether this is either a launcher autovacuum process or a worker
3294  *		process.
3295  */
3296 bool
IsAutoVacuumLauncherProcess(void)3297 IsAutoVacuumLauncherProcess(void)
3298 {
3299 	return am_autovacuum_launcher;
3300 }
3301 
3302 bool
IsAutoVacuumWorkerProcess(void)3303 IsAutoVacuumWorkerProcess(void)
3304 {
3305 	return am_autovacuum_worker;
3306 }
3307 
3308 
3309 /*
3310  * AutoVacuumShmemSize
3311  *		Compute space needed for autovacuum-related shared memory
3312  */
3313 Size
AutoVacuumShmemSize(void)3314 AutoVacuumShmemSize(void)
3315 {
3316 	Size		size;
3317 
3318 	/*
3319 	 * Need the fixed struct and the array of WorkerInfoData.
3320 	 */
3321 	size = sizeof(AutoVacuumShmemStruct);
3322 	size = MAXALIGN(size);
3323 	size = add_size(size, mul_size(autovacuum_max_workers,
3324 								   sizeof(WorkerInfoData)));
3325 	return size;
3326 }
3327 
3328 /*
3329  * AutoVacuumShmemInit
3330  *		Allocate and initialize autovacuum-related shared memory
3331  */
3332 void
AutoVacuumShmemInit(void)3333 AutoVacuumShmemInit(void)
3334 {
3335 	bool		found;
3336 
3337 	AutoVacuumShmem = (AutoVacuumShmemStruct *)
3338 		ShmemInitStruct("AutoVacuum Data",
3339 						AutoVacuumShmemSize(),
3340 						&found);
3341 
3342 	if (!IsUnderPostmaster)
3343 	{
3344 		WorkerInfo	worker;
3345 		int			i;
3346 
3347 		Assert(!found);
3348 
3349 		AutoVacuumShmem->av_launcherpid = 0;
3350 		dlist_init(&AutoVacuumShmem->av_freeWorkers);
3351 		dlist_init(&AutoVacuumShmem->av_runningWorkers);
3352 		AutoVacuumShmem->av_startingWorker = NULL;
3353 		memset(AutoVacuumShmem->av_workItems, 0,
3354 			   sizeof(AutoVacuumWorkItem) * NUM_WORKITEMS);
3355 
3356 		worker = (WorkerInfo) ((char *) AutoVacuumShmem +
3357 							   MAXALIGN(sizeof(AutoVacuumShmemStruct)));
3358 
3359 		/* initialize the WorkerInfo free list */
3360 		for (i = 0; i < autovacuum_max_workers; i++)
3361 			dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
3362 							&worker[i].wi_links);
3363 	}
3364 	else
3365 		Assert(found);
3366 }
3367 
3368 /*
3369  * autovac_refresh_stats
3370  *		Refresh pgstats data for an autovacuum process
3371  *
3372  * Cause the next pgstats read operation to obtain fresh data, but throttle
3373  * such refreshing in the autovacuum launcher.  This is mostly to avoid
3374  * rereading the pgstats files too many times in quick succession when there
3375  * are many databases.
3376  *
3377  * Note: we avoid throttling in the autovac worker, as it would be
3378  * counterproductive in the recheck logic.
3379  */
3380 static void
autovac_refresh_stats(void)3381 autovac_refresh_stats(void)
3382 {
3383 	if (IsAutoVacuumLauncherProcess())
3384 	{
3385 		static TimestampTz last_read = 0;
3386 		TimestampTz current_time;
3387 
3388 		current_time = GetCurrentTimestamp();
3389 
3390 		if (!TimestampDifferenceExceeds(last_read, current_time,
3391 										STATS_READ_DELAY))
3392 			return;
3393 
3394 		last_read = current_time;
3395 	}
3396 
3397 	pgstat_clear_snapshot();
3398 }
3399