1 /*-------------------------------------------------------------------------
2  *
3  * autoprewarm.c
4  *		Periodically dump information about the blocks present in
5  *		shared_buffers, and reload them on server restart.
6  *
7  *		Due to locking considerations, we can't actually begin prewarming
8  *		until the server reaches a consistent state.  We need the catalogs
9  *		to be consistent so that we can figure out which relation to lock,
10  *		and we need to lock the relations so that we don't try to prewarm
11  *		pages from a relation that is in the process of being dropped.
12  *
13  *		While prewarming, autoprewarm will use two workers.  There's a
14  *		leader worker that reads and sorts the list of blocks to be
15  *		prewarmed and then launches a per-database worker for each
16  *		relevant database in turn.  The former keeps running after the
17  *		initial prewarm is complete to update the dump file periodically.
18  *
19  *	Copyright (c) 2016-2021, PostgreSQL Global Development Group
20  *
21  *	IDENTIFICATION
22  *		contrib/pg_prewarm/autoprewarm.c
23  *
24  *-------------------------------------------------------------------------
25  */
26 
27 #include "postgres.h"
28 
29 #include <unistd.h>
30 
31 #include "access/relation.h"
32 #include "access/xact.h"
33 #include "catalog/pg_class.h"
34 #include "catalog/pg_type.h"
35 #include "miscadmin.h"
36 #include "pgstat.h"
37 #include "postmaster/bgworker.h"
38 #include "postmaster/interrupt.h"
39 #include "storage/buf_internals.h"
40 #include "storage/dsm.h"
41 #include "storage/ipc.h"
42 #include "storage/latch.h"
43 #include "storage/lwlock.h"
44 #include "storage/proc.h"
45 #include "storage/procsignal.h"
46 #include "storage/shmem.h"
47 #include "storage/smgr.h"
48 #include "tcop/tcopprot.h"
49 #include "utils/acl.h"
50 #include "utils/datetime.h"
51 #include "utils/guc.h"
52 #include "utils/memutils.h"
53 #include "utils/rel.h"
54 #include "utils/relfilenodemap.h"
55 #include "utils/resowner.h"
56 
57 #define AUTOPREWARM_FILE "autoprewarm.blocks"
58 
/*
 * Metadata for each block we dump.
 *
 * A record names one buffered page by relation file (database, tablespace,
 * filenode), fork, and block number.  The dump file stores one record per
 * line as five comma-separated unsigned integers, in this field order.
 */
typedef struct BlockInfoRecord
{
	Oid			database;		/* database OID; InvalidOid marks global objects */
	Oid			tablespace;		/* tablespace OID */
	Oid			filenode;		/* relation's relfilenode */
	ForkNumber	forknum;		/* fork within the relation */
	BlockNumber blocknum;		/* block number within the fork */
} BlockInfoRecord;
68 
/*
 * Shared state information for autoprewarm bgworker.
 *
 * Allocated with ShmemInitStruct() under the name "autoprewarm".  Both PID
 * fields hold InvalidPid while nobody has claimed them.
 */
typedef struct AutoPrewarmSharedState
{
	LWLock		lock;			/* mutual exclusion */
	pid_t		bgworker_pid;	/* for main bgworker */
	pid_t		pid_using_dumpfile; /* for autoprewarm or block dump */

	/*
	 * Following items are for communication with per-database worker.  The
	 * leader fills them in before launching each worker; the worker walks
	 * records from prewarm_start_idx up to, but not including,
	 * prewarm_stop_idx.
	 */
	dsm_handle	block_info_handle;	/* DSM segment holding the records */
	Oid			database;		/* database the worker connects to */
	int			prewarm_start_idx;	/* first record index for the worker */
	int			prewarm_stop_idx;	/* one past the last record index */
	int			prewarmed_blocks;	/* count of blocks successfully read in */
} AutoPrewarmSharedState;
83 
/*
 * Non-static entry points: the module load callback and the two background
 * worker main functions, which are looked up by name through
 * bgw_function_name.
 */
void		_PG_init(void);
void		autoprewarm_main(Datum main_arg);
void		autoprewarm_database_main(Datum main_arg);

/* SQL-callable functions. */
PG_FUNCTION_INFO_V1(autoprewarm_start_worker);
PG_FUNCTION_INFO_V1(autoprewarm_dump_now);

/* File-local helpers. */
static void apw_load_buffers(void);
static int	apw_dump_now(bool is_bgworker, bool dump_unlogged);
static void apw_start_leader_worker(void);
static void apw_start_database_worker(void);
static bool apw_init_shmem(void);
static void apw_detach_shmem(int code, Datum arg);
static int	apw_compare_blockinfo(const void *p, const void *q);

/* Pointer to shared-memory state; set by apw_init_shmem(). */
static AutoPrewarmSharedState *apw_state = NULL;

/* GUC variables. */
static bool autoprewarm = true; /* start worker? */
static int	autoprewarm_interval;	/* dump interval, in seconds; 0 disables */
105 
106 /*
107  * Module load callback.
108  */
109 void
_PG_init(void)110 _PG_init(void)
111 {
112 	DefineCustomIntVariable("pg_prewarm.autoprewarm_interval",
113 							"Sets the interval between dumps of shared buffers",
114 							"If set to zero, time-based dumping is disabled.",
115 							&autoprewarm_interval,
116 							300,
117 							0, INT_MAX / 1000,
118 							PGC_SIGHUP,
119 							GUC_UNIT_S,
120 							NULL,
121 							NULL,
122 							NULL);
123 
124 	if (!process_shared_preload_libraries_in_progress)
125 		return;
126 
127 	/* can't define PGC_POSTMASTER variable after startup */
128 	DefineCustomBoolVariable("pg_prewarm.autoprewarm",
129 							 "Starts the autoprewarm worker.",
130 							 NULL,
131 							 &autoprewarm,
132 							 true,
133 							 PGC_POSTMASTER,
134 							 0,
135 							 NULL,
136 							 NULL,
137 							 NULL);
138 
139 	EmitWarningsOnPlaceholders("pg_prewarm");
140 
141 	RequestAddinShmemSpace(MAXALIGN(sizeof(AutoPrewarmSharedState)));
142 
143 	/* Register autoprewarm worker, if enabled. */
144 	if (autoprewarm)
145 		apw_start_leader_worker();
146 }
147 
148 /*
149  * Main entry point for the leader autoprewarm process.  Per-database workers
150  * have a separate entry point.
151  */
152 void
autoprewarm_main(Datum main_arg)153 autoprewarm_main(Datum main_arg)
154 {
155 	bool		first_time = true;
156 	bool		final_dump_allowed = true;
157 	TimestampTz last_dump_time = 0;
158 
159 	/* Establish signal handlers; once that's done, unblock signals. */
160 	pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
161 	pqsignal(SIGHUP, SignalHandlerForConfigReload);
162 	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
163 	BackgroundWorkerUnblockSignals();
164 
165 	/* Create (if necessary) and attach to our shared memory area. */
166 	if (apw_init_shmem())
167 		first_time = false;
168 
169 	/* Set on-detach hook so that our PID will be cleared on exit. */
170 	on_shmem_exit(apw_detach_shmem, 0);
171 
172 	/*
173 	 * Store our PID in the shared memory area --- unless there's already
174 	 * another worker running, in which case just exit.
175 	 */
176 	LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
177 	if (apw_state->bgworker_pid != InvalidPid)
178 	{
179 		LWLockRelease(&apw_state->lock);
180 		ereport(LOG,
181 				(errmsg("autoprewarm worker is already running under PID %lu",
182 						(unsigned long) apw_state->bgworker_pid)));
183 		return;
184 	}
185 	apw_state->bgworker_pid = MyProcPid;
186 	LWLockRelease(&apw_state->lock);
187 
188 	/*
189 	 * Preload buffers from the dump file only if we just created the shared
190 	 * memory region.  Otherwise, it's either already been done or shouldn't
191 	 * be done - e.g. because the old dump file has been overwritten since the
192 	 * server was started.
193 	 *
194 	 * There's not much point in performing a dump immediately after we finish
195 	 * preloading; so, if we do end up preloading, consider the last dump time
196 	 * to be equal to the current time.
197 	 *
198 	 * If apw_load_buffers() is terminated early by a shutdown request,
199 	 * prevent dumping out our state below the loop, because we'd effectively
200 	 * just truncate the saved state to however much we'd managed to preload.
201 	 */
202 	if (first_time)
203 	{
204 		apw_load_buffers();
205 		final_dump_allowed = !ShutdownRequestPending;
206 		last_dump_time = GetCurrentTimestamp();
207 	}
208 
209 	/* Periodically dump buffers until terminated. */
210 	while (!ShutdownRequestPending)
211 	{
212 		/* In case of a SIGHUP, just reload the configuration. */
213 		if (ConfigReloadPending)
214 		{
215 			ConfigReloadPending = false;
216 			ProcessConfigFile(PGC_SIGHUP);
217 		}
218 
219 		if (autoprewarm_interval <= 0)
220 		{
221 			/* We're only dumping at shutdown, so just wait forever. */
222 			(void) WaitLatch(MyLatch,
223 							 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
224 							 -1L,
225 							 PG_WAIT_EXTENSION);
226 		}
227 		else
228 		{
229 			TimestampTz next_dump_time;
230 			long		delay_in_ms;
231 
232 			/* Compute the next dump time. */
233 			next_dump_time =
234 				TimestampTzPlusMilliseconds(last_dump_time,
235 											autoprewarm_interval * 1000);
236 			delay_in_ms =
237 				TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
238 												next_dump_time);
239 
240 			/* Perform a dump if it's time. */
241 			if (delay_in_ms <= 0)
242 			{
243 				last_dump_time = GetCurrentTimestamp();
244 				apw_dump_now(true, false);
245 				continue;
246 			}
247 
248 			/* Sleep until the next dump time. */
249 			(void) WaitLatch(MyLatch,
250 							 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
251 							 delay_in_ms,
252 							 PG_WAIT_EXTENSION);
253 		}
254 
255 		/* Reset the latch, loop. */
256 		ResetLatch(MyLatch);
257 	}
258 
259 	/*
260 	 * Dump one last time.  We assume this is probably the result of a system
261 	 * shutdown, although it's possible that we've merely been terminated.
262 	 */
263 	if (final_dump_allowed)
264 		apw_dump_now(true, true);
265 }
266 
267 /*
268  * Read the dump file and launch per-database workers one at a time to
269  * prewarm the buffers found there.
270  */
271 static void
apw_load_buffers(void)272 apw_load_buffers(void)
273 {
274 	FILE	   *file = NULL;
275 	int			num_elements,
276 				i;
277 	BlockInfoRecord *blkinfo;
278 	dsm_segment *seg;
279 
280 	/*
281 	 * Skip the prewarm if the dump file is in use; otherwise, prevent any
282 	 * other process from writing it while we're using it.
283 	 */
284 	LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
285 	if (apw_state->pid_using_dumpfile == InvalidPid)
286 		apw_state->pid_using_dumpfile = MyProcPid;
287 	else
288 	{
289 		LWLockRelease(&apw_state->lock);
290 		ereport(LOG,
291 				(errmsg("skipping prewarm because block dump file is being written by PID %lu",
292 						(unsigned long) apw_state->pid_using_dumpfile)));
293 		return;
294 	}
295 	LWLockRelease(&apw_state->lock);
296 
297 	/*
298 	 * Open the block dump file.  Exit quietly if it doesn't exist, but report
299 	 * any other error.
300 	 */
301 	file = AllocateFile(AUTOPREWARM_FILE, "r");
302 	if (!file)
303 	{
304 		if (errno == ENOENT)
305 		{
306 			LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
307 			apw_state->pid_using_dumpfile = InvalidPid;
308 			LWLockRelease(&apw_state->lock);
309 			return;				/* No file to load. */
310 		}
311 		ereport(ERROR,
312 				(errcode_for_file_access(),
313 				 errmsg("could not read file \"%s\": %m",
314 						AUTOPREWARM_FILE)));
315 	}
316 
317 	/* First line of the file is a record count. */
318 	if (fscanf(file, "<<%d>>\n", &num_elements) != 1)
319 		ereport(ERROR,
320 				(errcode_for_file_access(),
321 				 errmsg("could not read from file \"%s\": %m",
322 						AUTOPREWARM_FILE)));
323 
324 	/* Allocate a dynamic shared memory segment to store the record data. */
325 	seg = dsm_create(sizeof(BlockInfoRecord) * num_elements, 0);
326 	blkinfo = (BlockInfoRecord *) dsm_segment_address(seg);
327 
328 	/* Read records, one per line. */
329 	for (i = 0; i < num_elements; i++)
330 	{
331 		unsigned	forknum;
332 
333 		if (fscanf(file, "%u,%u,%u,%u,%u\n", &blkinfo[i].database,
334 				   &blkinfo[i].tablespace, &blkinfo[i].filenode,
335 				   &forknum, &blkinfo[i].blocknum) != 5)
336 			ereport(ERROR,
337 					(errmsg("autoprewarm block dump file is corrupted at line %d",
338 							i + 1)));
339 		blkinfo[i].forknum = forknum;
340 	}
341 
342 	FreeFile(file);
343 
344 	/* Sort the blocks to be loaded. */
345 	pg_qsort(blkinfo, num_elements, sizeof(BlockInfoRecord),
346 			 apw_compare_blockinfo);
347 
348 	/* Populate shared memory state. */
349 	apw_state->block_info_handle = dsm_segment_handle(seg);
350 	apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0;
351 	apw_state->prewarmed_blocks = 0;
352 
353 	/* Get the info position of the first block of the next database. */
354 	while (apw_state->prewarm_start_idx < num_elements)
355 	{
356 		int			j = apw_state->prewarm_start_idx;
357 		Oid			current_db = blkinfo[j].database;
358 
359 		/*
360 		 * Advance the prewarm_stop_idx to the first BlockInfoRecord that does
361 		 * not belong to this database.
362 		 */
363 		j++;
364 		while (j < num_elements)
365 		{
366 			if (current_db != blkinfo[j].database)
367 			{
368 				/*
369 				 * Combine BlockInfoRecords for global objects with those of
370 				 * the database.
371 				 */
372 				if (current_db != InvalidOid)
373 					break;
374 				current_db = blkinfo[j].database;
375 			}
376 
377 			j++;
378 		}
379 
380 		/*
381 		 * If we reach this point with current_db == InvalidOid, then only
382 		 * BlockInfoRecords belonging to global objects exist.  We can't
383 		 * prewarm without a database connection, so just bail out.
384 		 */
385 		if (current_db == InvalidOid)
386 			break;
387 
388 		/* Configure stop point and database for next per-database worker. */
389 		apw_state->prewarm_stop_idx = j;
390 		apw_state->database = current_db;
391 		Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx);
392 
393 		/* If we've run out of free buffers, don't launch another worker. */
394 		if (!have_free_buffer())
395 			break;
396 
397 		/*
398 		 * Likewise, don't launch if we've already been told to shut down.
399 		 * (The launch would fail anyway, but we might as well skip it.)
400 		 */
401 		if (ShutdownRequestPending)
402 			break;
403 
404 		/*
405 		 * Start a per-database worker to load blocks for this database; this
406 		 * function will return once the per-database worker exits.
407 		 */
408 		apw_start_database_worker();
409 
410 		/* Prepare for next database. */
411 		apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx;
412 	}
413 
414 	/* Clean up. */
415 	dsm_detach(seg);
416 	LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
417 	apw_state->block_info_handle = DSM_HANDLE_INVALID;
418 	apw_state->pid_using_dumpfile = InvalidPid;
419 	LWLockRelease(&apw_state->lock);
420 
421 	/* Report our success, if we were able to finish. */
422 	if (!ShutdownRequestPending)
423 		ereport(LOG,
424 				(errmsg("autoprewarm successfully prewarmed %d of %d previously-loaded blocks",
425 						apw_state->prewarmed_blocks, num_elements)));
426 }
427 
/*
 * Prewarm all blocks for one database (and possibly also global objects, if
 * those got grouped with this database).
 *
 * Entry point of the per-database worker.  It attaches to the DSM segment
 * named by apw_state->block_info_handle, connects to apw_state->database,
 * and reads the blocks described by records prewarm_start_idx up to (not
 * including) prewarm_stop_idx.  Each successfully read block bumps
 * apw_state->prewarmed_blocks.
 *
 * main_arg is unused.
 */
void
autoprewarm_database_main(Datum main_arg)
{
	int			pos;
	BlockInfoRecord *block_info;
	Relation	rel = NULL;
	BlockNumber nblocks = 0;
	BlockInfoRecord *old_blk = NULL;
	dsm_segment *seg;

	/* Establish signal handlers; once that's done, unblock signals. */
	pqsignal(SIGTERM, die);
	BackgroundWorkerUnblockSignals();

	/* Connect to correct database and get block information. */
	apw_init_shmem();
	seg = dsm_attach(apw_state->block_info_handle);
	if (seg == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("could not map dynamic shared memory segment")));
	BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid, 0);
	block_info = (BlockInfoRecord *) dsm_segment_address(seg);
	pos = apw_state->prewarm_start_idx;

	/*
	 * Loop until we run out of blocks to prewarm or until we run out of free
	 * buffers.
	 *
	 * old_blk always points at the record handled by the previous iteration;
	 * every path through the loop body that does not break updates it.  rel
	 * and nblocks carry over between iterations so that the relation is
	 * opened once per filenode and the fork size is looked up once per fork.
	 */
	while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
	{
		BlockInfoRecord *blk = &block_info[pos++];
		Buffer		buf;

		CHECK_FOR_INTERRUPTS();

		/*
		 * Quit if we've reached records for another database. If previous
		 * blocks are of some global objects, then continue pre-warming.
		 */
		if (old_blk != NULL && old_blk->database != blk->database &&
			old_blk->database != 0)
			break;

		/*
		 * As soon as we encounter a block of a new relation, close the old
		 * relation. Note that rel will be NULL if try_relation_open failed
		 * previously; in that case, there is nothing to close.
		 */
		if (old_blk != NULL && old_blk->filenode != blk->filenode &&
			rel != NULL)
		{
			relation_close(rel, AccessShareLock);
			rel = NULL;
			CommitTransactionCommand();
		}

		/*
		 * Try to open each new relation, but only once, when we first
		 * encounter it. If it's been dropped, skip the associated blocks.
		 */
		if (old_blk == NULL || old_blk->filenode != blk->filenode)
		{
			Oid			reloid;

			Assert(rel == NULL);
			StartTransactionCommand();
			reloid = RelidByRelfilenode(blk->tablespace, blk->filenode);
			if (OidIsValid(reloid))
				rel = try_relation_open(reloid, AccessShareLock);

			/* Nothing was opened, so end the transaction right away. */
			if (!rel)
				CommitTransactionCommand();
		}
		if (!rel)
		{
			/* Relation is unavailable; skip this block. */
			old_blk = blk;
			continue;
		}

		/* Once per fork, check for fork existence and size. */
		if (old_blk == NULL ||
			old_blk->filenode != blk->filenode ||
			old_blk->forknum != blk->forknum)
		{
			RelationOpenSmgr(rel);

			/*
			 * smgrexists is not safe for illegal forknum, hence check whether
			 * the passed forknum is valid before using it in smgrexists.
			 */
			if (blk->forknum > InvalidForkNumber &&
				blk->forknum <= MAX_FORKNUM &&
				smgrexists(rel->rd_smgr, blk->forknum))
				nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
			else
				nblocks = 0;
		}

		/* Check whether blocknum is valid and within fork file size. */
		if (blk->blocknum >= nblocks)
		{
			/* Move to next forknum. */
			old_blk = blk;
			continue;
		}

		/* Prewarm buffer. */
		buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
								 NULL);
		if (BufferIsValid(buf))
		{
			apw_state->prewarmed_blocks++;
			ReleaseBuffer(buf);
		}

		old_blk = blk;
	}

	dsm_detach(seg);

	/* Release lock on previous relation. */
	if (rel)
	{
		relation_close(rel, AccessShareLock);
		CommitTransactionCommand();
	}
}
560 
561 /*
562  * Dump information on blocks in shared buffers.  We use a text format here
563  * so that it's easy to understand and even change the file contents if
564  * necessary.
565  * Returns the number of blocks dumped.
566  */
567 static int
apw_dump_now(bool is_bgworker,bool dump_unlogged)568 apw_dump_now(bool is_bgworker, bool dump_unlogged)
569 {
570 	int			num_blocks;
571 	int			i;
572 	int			ret;
573 	BlockInfoRecord *block_info_array;
574 	BufferDesc *bufHdr;
575 	FILE	   *file;
576 	char		transient_dump_file_path[MAXPGPATH];
577 	pid_t		pid;
578 
579 	LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
580 	pid = apw_state->pid_using_dumpfile;
581 	if (apw_state->pid_using_dumpfile == InvalidPid)
582 		apw_state->pid_using_dumpfile = MyProcPid;
583 	LWLockRelease(&apw_state->lock);
584 
585 	if (pid != InvalidPid)
586 	{
587 		if (!is_bgworker)
588 			ereport(ERROR,
589 					(errmsg("could not perform block dump because dump file is being used by PID %lu",
590 							(unsigned long) apw_state->pid_using_dumpfile)));
591 
592 		ereport(LOG,
593 				(errmsg("skipping block dump because it is already being performed by PID %lu",
594 						(unsigned long) apw_state->pid_using_dumpfile)));
595 		return 0;
596 	}
597 
598 	block_info_array =
599 		(BlockInfoRecord *) palloc(sizeof(BlockInfoRecord) * NBuffers);
600 
601 	for (num_blocks = 0, i = 0; i < NBuffers; i++)
602 	{
603 		uint32		buf_state;
604 
605 		CHECK_FOR_INTERRUPTS();
606 
607 		bufHdr = GetBufferDescriptor(i);
608 
609 		/* Lock each buffer header before inspecting. */
610 		buf_state = LockBufHdr(bufHdr);
611 
612 		/*
613 		 * Unlogged tables will be automatically truncated after a crash or
614 		 * unclean shutdown. In such cases we need not prewarm them. Dump them
615 		 * only if requested by caller.
616 		 */
617 		if (buf_state & BM_TAG_VALID &&
618 			((buf_state & BM_PERMANENT) || dump_unlogged))
619 		{
620 			block_info_array[num_blocks].database = bufHdr->tag.rnode.dbNode;
621 			block_info_array[num_blocks].tablespace = bufHdr->tag.rnode.spcNode;
622 			block_info_array[num_blocks].filenode = bufHdr->tag.rnode.relNode;
623 			block_info_array[num_blocks].forknum = bufHdr->tag.forkNum;
624 			block_info_array[num_blocks].blocknum = bufHdr->tag.blockNum;
625 			++num_blocks;
626 		}
627 
628 		UnlockBufHdr(bufHdr, buf_state);
629 	}
630 
631 	snprintf(transient_dump_file_path, MAXPGPATH, "%s.tmp", AUTOPREWARM_FILE);
632 	file = AllocateFile(transient_dump_file_path, "w");
633 	if (!file)
634 		ereport(ERROR,
635 				(errcode_for_file_access(),
636 				 errmsg("could not open file \"%s\": %m",
637 						transient_dump_file_path)));
638 
639 	ret = fprintf(file, "<<%d>>\n", num_blocks);
640 	if (ret < 0)
641 	{
642 		int			save_errno = errno;
643 
644 		FreeFile(file);
645 		unlink(transient_dump_file_path);
646 		errno = save_errno;
647 		ereport(ERROR,
648 				(errcode_for_file_access(),
649 				 errmsg("could not write to file \"%s\": %m",
650 						transient_dump_file_path)));
651 	}
652 
653 	for (i = 0; i < num_blocks; i++)
654 	{
655 		CHECK_FOR_INTERRUPTS();
656 
657 		ret = fprintf(file, "%u,%u,%u,%u,%u\n",
658 					  block_info_array[i].database,
659 					  block_info_array[i].tablespace,
660 					  block_info_array[i].filenode,
661 					  (uint32) block_info_array[i].forknum,
662 					  block_info_array[i].blocknum);
663 		if (ret < 0)
664 		{
665 			int			save_errno = errno;
666 
667 			FreeFile(file);
668 			unlink(transient_dump_file_path);
669 			errno = save_errno;
670 			ereport(ERROR,
671 					(errcode_for_file_access(),
672 					 errmsg("could not write to file \"%s\": %m",
673 							transient_dump_file_path)));
674 		}
675 	}
676 
677 	pfree(block_info_array);
678 
679 	/*
680 	 * Rename transient_dump_file_path to AUTOPREWARM_FILE to make things
681 	 * permanent.
682 	 */
683 	ret = FreeFile(file);
684 	if (ret != 0)
685 	{
686 		int			save_errno = errno;
687 
688 		unlink(transient_dump_file_path);
689 		errno = save_errno;
690 		ereport(ERROR,
691 				(errcode_for_file_access(),
692 				 errmsg("could not close file \"%s\": %m",
693 						transient_dump_file_path)));
694 	}
695 
696 	(void) durable_rename(transient_dump_file_path, AUTOPREWARM_FILE, ERROR);
697 	apw_state->pid_using_dumpfile = InvalidPid;
698 
699 	ereport(DEBUG1,
700 			(errmsg_internal("wrote block details for %d blocks", num_blocks)));
701 	return num_blocks;
702 }
703 
704 /*
705  * SQL-callable function to launch autoprewarm.
706  */
707 Datum
autoprewarm_start_worker(PG_FUNCTION_ARGS)708 autoprewarm_start_worker(PG_FUNCTION_ARGS)
709 {
710 	pid_t		pid;
711 
712 	if (!autoprewarm)
713 		ereport(ERROR,
714 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
715 				 errmsg("autoprewarm is disabled")));
716 
717 	apw_init_shmem();
718 	LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
719 	pid = apw_state->bgworker_pid;
720 	LWLockRelease(&apw_state->lock);
721 
722 	if (pid != InvalidPid)
723 		ereport(ERROR,
724 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
725 				 errmsg("autoprewarm worker is already running under PID %lu",
726 						(unsigned long) pid)));
727 
728 	apw_start_leader_worker();
729 
730 	PG_RETURN_VOID();
731 }
732 
733 /*
734  * SQL-callable function to perform an immediate block dump.
735  *
736  * Note: this is declared to return int8, as insurance against some
737  * very distant day when we might make NBuffers wider than int.
738  */
739 Datum
autoprewarm_dump_now(PG_FUNCTION_ARGS)740 autoprewarm_dump_now(PG_FUNCTION_ARGS)
741 {
742 	int			num_blocks;
743 
744 	apw_init_shmem();
745 
746 	PG_ENSURE_ERROR_CLEANUP(apw_detach_shmem, 0);
747 	{
748 		num_blocks = apw_dump_now(false, true);
749 	}
750 	PG_END_ENSURE_ERROR_CLEANUP(apw_detach_shmem, 0);
751 
752 	PG_RETURN_INT64((int64) num_blocks);
753 }
754 
755 /*
756  * Allocate and initialize autoprewarm related shared memory, if not already
757  * done, and set up backend-local pointer to that state.  Returns true if an
758  * existing shared memory segment was found.
759  */
760 static bool
apw_init_shmem(void)761 apw_init_shmem(void)
762 {
763 	bool		found;
764 
765 	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
766 	apw_state = ShmemInitStruct("autoprewarm",
767 								sizeof(AutoPrewarmSharedState),
768 								&found);
769 	if (!found)
770 	{
771 		/* First time through ... */
772 		LWLockInitialize(&apw_state->lock, LWLockNewTrancheId());
773 		apw_state->bgworker_pid = InvalidPid;
774 		apw_state->pid_using_dumpfile = InvalidPid;
775 	}
776 	LWLockRelease(AddinShmemInitLock);
777 
778 	LWLockRegisterTranche(apw_state->lock.tranche, "autoprewarm");
779 
780 	return found;
781 }
782 
783 /*
784  * Clear our PID from autoprewarm shared state.
785  */
786 static void
apw_detach_shmem(int code,Datum arg)787 apw_detach_shmem(int code, Datum arg)
788 {
789 	LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
790 	if (apw_state->pid_using_dumpfile == MyProcPid)
791 		apw_state->pid_using_dumpfile = InvalidPid;
792 	if (apw_state->bgworker_pid == MyProcPid)
793 		apw_state->bgworker_pid = InvalidPid;
794 	LWLockRelease(&apw_state->lock);
795 }
796 
797 /*
798  * Start autoprewarm leader worker process.
799  */
800 static void
apw_start_leader_worker(void)801 apw_start_leader_worker(void)
802 {
803 	BackgroundWorker worker;
804 	BackgroundWorkerHandle *handle;
805 	BgwHandleStatus status;
806 	pid_t		pid;
807 
808 	MemSet(&worker, 0, sizeof(BackgroundWorker));
809 	worker.bgw_flags = BGWORKER_SHMEM_ACCESS;
810 	worker.bgw_start_time = BgWorkerStart_ConsistentState;
811 	strcpy(worker.bgw_library_name, "pg_prewarm");
812 	strcpy(worker.bgw_function_name, "autoprewarm_main");
813 	strcpy(worker.bgw_name, "autoprewarm leader");
814 	strcpy(worker.bgw_type, "autoprewarm leader");
815 
816 	if (process_shared_preload_libraries_in_progress)
817 	{
818 		RegisterBackgroundWorker(&worker);
819 		return;
820 	}
821 
822 	/* must set notify PID to wait for startup */
823 	worker.bgw_notify_pid = MyProcPid;
824 
825 	if (!RegisterDynamicBackgroundWorker(&worker, &handle))
826 		ereport(ERROR,
827 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
828 				 errmsg("could not register background process"),
829 				 errhint("You may need to increase max_worker_processes.")));
830 
831 	status = WaitForBackgroundWorkerStartup(handle, &pid);
832 	if (status != BGWH_STARTED)
833 		ereport(ERROR,
834 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
835 				 errmsg("could not start background process"),
836 				 errhint("More details may be available in the server log.")));
837 }
838 
839 /*
840  * Start autoprewarm per-database worker process.
841  */
842 static void
apw_start_database_worker(void)843 apw_start_database_worker(void)
844 {
845 	BackgroundWorker worker;
846 	BackgroundWorkerHandle *handle;
847 
848 	MemSet(&worker, 0, sizeof(BackgroundWorker));
849 	worker.bgw_flags =
850 		BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
851 	worker.bgw_start_time = BgWorkerStart_ConsistentState;
852 	worker.bgw_restart_time = BGW_NEVER_RESTART;
853 	strcpy(worker.bgw_library_name, "pg_prewarm");
854 	strcpy(worker.bgw_function_name, "autoprewarm_database_main");
855 	strcpy(worker.bgw_name, "autoprewarm worker");
856 	strcpy(worker.bgw_type, "autoprewarm worker");
857 
858 	/* must set notify PID to wait for shutdown */
859 	worker.bgw_notify_pid = MyProcPid;
860 
861 	if (!RegisterDynamicBackgroundWorker(&worker, &handle))
862 		ereport(ERROR,
863 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
864 				 errmsg("registering dynamic bgworker autoprewarm failed"),
865 				 errhint("Consider increasing configuration parameter \"max_worker_processes\".")));
866 
867 	/*
868 	 * Ignore return value; if it fails, postmaster has died, but we have
869 	 * checks for that elsewhere.
870 	 */
871 	WaitForBackgroundWorkerShutdown(handle);
872 }
873 
874 /* Compare member elements to check whether they are not equal. */
875 #define cmp_member_elem(fld)	\
876 do { \
877 	if (a->fld < b->fld)		\
878 		return -1;				\
879 	else if (a->fld > b->fld)	\
880 		return 1;				\
881 } while(0)
882 
883 /*
884  * apw_compare_blockinfo
885  *
886  * We depend on all records for a particular database being consecutive
887  * in the dump file; each per-database worker will preload blocks until
888  * it sees a block for some other database.  Sorting by tablespace,
889  * filenode, forknum, and blocknum isn't critical for correctness, but
890  * helps us get a sequential I/O pattern.
891  */
892 static int
apw_compare_blockinfo(const void * p,const void * q)893 apw_compare_blockinfo(const void *p, const void *q)
894 {
895 	const BlockInfoRecord *a = (const BlockInfoRecord *) p;
896 	const BlockInfoRecord *b = (const BlockInfoRecord *) q;
897 
898 	cmp_member_elem(database);
899 	cmp_member_elem(tablespace);
900 	cmp_member_elem(filenode);
901 	cmp_member_elem(forknum);
902 	cmp_member_elem(blocknum);
903 
904 	return 0;
905 }
906