1 /*-------------------------------------------------------------------------
2  *
3  * xlog.c
4  *		PostgreSQL write-ahead log manager
5  *
6  *
7  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * src/backend/access/transam/xlog.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <math.h>
19 #include <time.h>
20 #include <fcntl.h>
21 #include <sys/stat.h>
22 #include <sys/time.h>
23 #include <unistd.h>
24 
25 #include "access/clog.h"
26 #include "access/commit_ts.h"
27 #include "access/heaptoast.h"
28 #include "access/multixact.h"
29 #include "access/rewriteheap.h"
30 #include "access/subtrans.h"
31 #include "access/timeline.h"
32 #include "access/transam.h"
33 #include "access/twophase.h"
34 #include "access/xact.h"
35 #include "access/xlog_internal.h"
36 #include "access/xlogarchive.h"
37 #include "access/xloginsert.h"
38 #include "access/xlogreader.h"
39 #include "access/xlogutils.h"
40 #include "catalog/catversion.h"
41 #include "catalog/pg_control.h"
42 #include "catalog/pg_database.h"
43 #include "commands/progress.h"
44 #include "commands/tablespace.h"
45 #include "common/controldata_utils.h"
46 #include "executor/instrument.h"
47 #include "miscadmin.h"
48 #include "pg_trace.h"
49 #include "pgstat.h"
50 #include "port/atomics.h"
51 #include "port/pg_iovec.h"
52 #include "postmaster/bgwriter.h"
53 #include "postmaster/startup.h"
54 #include "postmaster/walwriter.h"
55 #include "replication/basebackup.h"
56 #include "replication/logical.h"
57 #include "replication/origin.h"
58 #include "replication/slot.h"
59 #include "replication/snapbuild.h"
60 #include "replication/walreceiver.h"
61 #include "replication/walsender.h"
62 #include "storage/bufmgr.h"
63 #include "storage/fd.h"
64 #include "storage/ipc.h"
65 #include "storage/large_object.h"
66 #include "storage/latch.h"
67 #include "storage/pmsignal.h"
68 #include "storage/predicate.h"
69 #include "storage/proc.h"
70 #include "storage/procarray.h"
71 #include "storage/reinit.h"
72 #include "storage/smgr.h"
73 #include "storage/spin.h"
74 #include "storage/sync.h"
75 #include "utils/builtins.h"
76 #include "utils/guc.h"
77 #include "utils/memutils.h"
78 #include "utils/ps_status.h"
79 #include "utils/relmapper.h"
80 #include "utils/pg_rusage.h"
81 #include "utils/snapmgr.h"
82 #include "utils/timestamp.h"
83 
/* Checksum version chosen at initdb time; defined elsewhere (hence extern). */
extern uint32 bootstrap_data_checksum_version;

/* Unsupported old recovery command file names (relative to $PGDATA) */
#define RECOVERY_COMMAND_FILE	"recovery.conf"
#define RECOVERY_COMMAND_DONE	"recovery.done"

/*
 * User-settable parameters (GUCs).  The "_mb" suffix means the value is in
 * megabytes; full descriptions live with the GUC machinery (guc.c).
 */
int			max_wal_size_mb = 1024; /* 1 GB */
int			min_wal_size_mb = 80;	/* 80 MB */
int			wal_keep_size_mb = 0;
int			XLOGbuffers = -1;	/* NOTE(review): -1 presumably means auto-size; confirm in guc.c */
int			XLogArchiveTimeout = 0;
int			XLogArchiveMode = ARCHIVE_MODE_OFF;
char	   *XLogArchiveCommand = NULL;
bool		EnableHotStandby = false;
bool		fullPageWrites = true;
bool		wal_log_hints = false;
bool		wal_compression = false;
char	   *wal_consistency_checking_string = NULL;
bool	   *wal_consistency_checking = NULL;	/* per-rmgr flags parsed from the string above */
bool		wal_init_zero = true;
bool		wal_recycle = true;
bool		log_checkpoints = false;
int			sync_method = DEFAULT_SYNC_METHOD;
int			wal_level = WAL_LEVEL_MINIMAL;
int			CommitDelay = 0;	/* precommit delay in microseconds */
int			CommitSiblings = 5; /* # concurrent xacts needed to sleep */
int			wal_retrieve_retry_interval = 5000; /* NOTE(review): presumably milliseconds; confirm in guc.c */
int			max_slot_wal_keep_size_mb = -1;
bool		track_wal_io_timing = false;

#ifdef WAL_DEBUG
bool		XLOG_DEBUG = false;
#endif

int			wal_segment_size = DEFAULT_XLOG_SEG_SIZE;

/*
 * Number of WAL insertion locks to use. A higher value allows more insertions
 * to happen concurrently, but adds some CPU overhead to flushing the WAL,
 * which needs to iterate all the locks.
 */
#define NUM_XLOGINSERT_LOCKS  8

/*
 * Max distance from last checkpoint, before triggering a new xlog-based
 * checkpoint.
 */
int			CheckPointSegments;

/* Estimated distance between checkpoints, in bytes */
static double CheckPointDistanceEstimate = 0;
static double PrevCheckPointDistance = 0;
137 
/*
 * GUC support
 *
 * Allowed values for the wal_sync_method GUC.  Entries other than plain
 * "fsync" are compiled in only when the platform supports the corresponding
 * primitive, hence the #ifdef guards.
 */
const struct config_enum_entry sync_method_options[] = {
	{"fsync", SYNC_METHOD_FSYNC, false},
#ifdef HAVE_FSYNC_WRITETHROUGH
	{"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
#endif
#ifdef HAVE_FDATASYNC
	{"fdatasync", SYNC_METHOD_FDATASYNC, false},
#endif
#ifdef OPEN_SYNC_FLAG
	{"open_sync", SYNC_METHOD_OPEN, false},
#endif
#ifdef OPEN_DATASYNC_FLAG
	{"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
#endif
	{NULL, 0, false}			/* list terminator */
};
157 
158 
/*
 * Allowed values for the archive_mode GUC.
 *
 * Although only "on", "off", and "always" are documented,
 * we accept all the likely variants of "on" and "off".  The third field
 * marks an entry as a hidden (undocumented) alias.
 */
const struct config_enum_entry archive_mode_options[] = {
	{"always", ARCHIVE_MODE_ALWAYS, false},
	{"on", ARCHIVE_MODE_ON, false},
	{"off", ARCHIVE_MODE_OFF, false},
	{"true", ARCHIVE_MODE_ON, true},
	{"false", ARCHIVE_MODE_OFF, true},
	{"yes", ARCHIVE_MODE_ON, true},
	{"no", ARCHIVE_MODE_OFF, true},
	{"1", ARCHIVE_MODE_ON, true},
	{"0", ARCHIVE_MODE_OFF, true},
	{NULL, 0, false}			/* list terminator */
};
175 
/* Allowed values for the recovery_target_action GUC. */
const struct config_enum_entry recovery_target_action_options[] = {
	{"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
	{"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
	{"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
	{NULL, 0, false}			/* list terminator */
};
182 
/*
 * Statistics for current checkpoint are collected in this global struct.
 * Because only the checkpointer or a stand-alone backend can perform
 * checkpoints, this will be unused in normal backends.
 */
CheckpointStatsData CheckpointStats;

/*
 * ThisTimeLineID will be same in all backends --- it identifies current
 * WAL timeline for the database system.
 */
TimeLineID	ThisTimeLineID = 0;

/*
 * Are we doing recovery from XLOG?
 *
 * This is only ever true in the startup process; it should be read as meaning
 * "this process is replaying WAL records", rather than "the system is in
 * recovery mode".  It should be examined primarily by functions that need
 * to act differently when called from a WAL redo function (e.g., to skip WAL
 * logging).  To check whether the system is in recovery regardless of which
 * process you're running in, use RecoveryInProgress() but only after shared
 * memory startup and lock initialization.
 */
bool		InRecovery = false;

/* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
HotStandbyState standbyState = STANDBY_DISABLED;

/* NOTE(review): presumably the start LSN of the last record read/replayed; confirm at use sites */
static XLogRecPtr LastRec;

/* Local copy of WalRcv->flushedUpto */
static XLogRecPtr flushedUpto = 0;
static TimeLineID receiveTLI = 0;

/*
 * abortedRecPtr is the start pointer of a broken record at end of WAL when
 * recovery completes; missingContrecPtr is the location of the first
 * contrecord that went missing.  See CreateOverwriteContrecordRecord for
 * details.
 */
static XLogRecPtr abortedRecPtr;
static XLogRecPtr missingContrecPtr;

/*
 * During recovery, lastFullPageWrites keeps track of full_page_writes that
 * the replayed WAL records indicate. It's initialized with full_page_writes
 * that the recovery starting checkpoint record indicates, and then updated
 * each time XLOG_FPW_CHANGE record is replayed.
 */
static bool lastFullPageWrites;

/*
 * Local copy of the state tracked by SharedRecoveryState in shared memory.
 * It is false if SharedRecoveryState is RECOVERY_STATE_DONE.  True actually
 * means "not known, need to check the shared state".
 */
static bool LocalRecoveryInProgress = true;

/*
 * Local copy of SharedHotStandbyActive variable. False actually means "not
 * known, need to check the shared state".
 */
static bool LocalHotStandbyActive = false;

/*
 * Local copy of SharedPromoteIsTriggered variable. False actually means "not
 * known, need to check the shared state".
 */
static bool LocalPromoteIsTriggered = false;

/*
 * Local state for XLogInsertAllowed():
 *		1: unconditionally allowed to insert XLOG
 *		0: unconditionally not allowed to insert XLOG
 *		-1: must check RecoveryInProgress(); disallow until it is false
 * Most processes start with -1 and transition to 1 after seeing that recovery
 * is not in progress.  But we can also force the value for special cases.
 * The coding in XLogInsertAllowed() depends on the first two of these states
 * being numerically the same as bool true and false.
 */
static int	LocalXLogInsertAllowed = -1;
265 
/*
 * When ArchiveRecoveryRequested is set, archive recovery was requested,
 * ie. signal files were present. When InArchiveRecovery is set, we are
 * currently recovering using offline XLOG archives. These variables are only
 * valid in the startup process.
 *
 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
 * currently performing crash recovery using only XLOG files in pg_wal, but
 * will switch to using offline XLOG archives as soon as we reach the end of
 * WAL in pg_wal.
 */
bool		ArchiveRecoveryRequested = false;
bool		InArchiveRecovery = false;

/* Which of the recovery signal files (if any) were found at startup. */
static bool standby_signal_file_found = false;
static bool recovery_signal_file_found = false;

/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;

/* Buffers dedicated to consistency checks of size BLCKSZ */
static char *replay_image_masked = NULL;
static char *primary_image_masked = NULL;

/* options formerly taken from recovery.conf for archive recovery */
char	   *recoveryRestoreCommand = NULL;
char	   *recoveryEndCommand = NULL;
char	   *archiveCleanupCommand = NULL;
RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
bool		recoveryTargetInclusive = true;
int			recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
TransactionId recoveryTargetXid;
char	   *recovery_target_time_string;
static TimestampTz recoveryTargetTime;
const char *recoveryTargetName;
XLogRecPtr	recoveryTargetLSN;
int			recovery_min_apply_delay = 0;

/* options formerly taken from recovery.conf for XLOG streaming */
bool		StandbyModeRequested = false;
char	   *PrimaryConnInfo = NULL;
char	   *PrimarySlotName = NULL;
char	   *PromoteTriggerFile = NULL;
bool		wal_receiver_create_temp_slot = false;

/* are we currently in standby mode? */
bool		StandbyMode = false;

/*
 * if recoveryStopsBefore/After returns true, it saves information of the stop
 * point here
 */
static TransactionId recoveryStopXid;
static TimestampTz recoveryStopTime;
static XLogRecPtr recoveryStopLSN;
static char recoveryStopName[MAXFNAMELEN];
static bool recoveryStopAfter;

/*
 * During normal operation, the only timeline we care about is ThisTimeLineID.
 * During recovery, however, things are more complicated.  To simplify life
 * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 * scan through the WAL history (that is, it is the line that was active when
 * the currently-scanned WAL record was generated).  We also need these
 * timeline values:
 *
 * recoveryTargetTimeLineGoal: what the user requested, if any
 *
 * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
 *
 * recoveryTargetTLI: the currently understood target timeline; changes
 *
 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
 * its known parents, newest first (so recoveryTargetTLI is always the
 * first list member).  Only these TLIs are expected to be seen in the WAL
 * segments we read, and indeed only these TLIs will be considered as
 * candidate WAL files to open at all.
 *
 * curFileTLI: the TLI appearing in the name of the current input WAL file.
 * (This is not necessarily the same as ThisTimeLineID, because we could
 * be scanning data that was copied from an ancestor timeline when the current
 * file was created.)  During a sequential scan we do not allow this value
 * to decrease.
 */
RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
TimeLineID	recoveryTargetTLIRequested = 0;
TimeLineID	recoveryTargetTLI = 0;
static List *expectedTLEs;
static TimeLineID curFileTLI;
355 
/*
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 * end+1 of the last record, and is reset when we end a top-level transaction,
 * or start a new one; so it can be used to tell if the current transaction has
 * created any XLOG records.
 *
 * While in parallel mode, this may not be fully up to date.  When committing,
 * a transaction can assume this covers all xlog records written either by the
 * user backend or by any parallel worker which was present at any point during
 * the transaction.  But when aborting, or when still in parallel mode, other
 * parallel backends may have written WAL records at later LSNs than the value
 * stored here.  The parallel leader advances its own copy, when necessary,
 * in WaitForParallelWorkersToFinish.
 */
XLogRecPtr	ProcLastRecPtr = InvalidXLogRecPtr;
XLogRecPtr	XactLastRecEnd = InvalidXLogRecPtr;
XLogRecPtr	XactLastCommitEnd = InvalidXLogRecPtr;

/*
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 * (which is almost but not quite the same as a pointer to the most recent
 * CHECKPOINT record).  We update this from the shared-memory copy,
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 * hold an insertion lock).  See XLogInsertRecord for details.  We are also
 * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
 * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 * InitXLOGAccess.
 */
static XLogRecPtr RedoRecPtr;

/*
 * doPageWrites is this backend's local copy of (forcePageWrites ||
 * fullPageWrites).  It is used together with RedoRecPtr to decide whether
 * a full-page image of a page needs to be taken.
 */
static bool doPageWrites;

/* Has the recovery code requested a walreceiver wakeup? */
static bool doRequestWalReceiverReply;

/*
 * RedoStartLSN points to the checkpoint's REDO location which is specified
 * in a backup label file, backup history file or control file. In standby
 * mode, XLOG streaming usually starts from the position where an invalid
 * record was found. But if we fail to read even the initial checkpoint
 * record, we use the REDO location instead of the checkpoint location as
 * the start position of XLOG streaming. Otherwise we would have to jump
 * backwards to the REDO location after reading the checkpoint record,
 * because the REDO record can precede the checkpoint record.
 */
static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
408 
409 /*----------
410  * Shared-memory data structures for XLOG control
411  *
412  * LogwrtRqst indicates a byte position that we need to write and/or fsync
413  * the log up to (all records before that point must be written or fsynced).
414  * LogwrtResult indicates the byte positions we have already written/fsynced.
415  * These structs are identical but are declared separately to indicate their
416  * slightly different functions.
417  *
418  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
419  * WALWriteLock.  To update it, you need to hold both locks.  The point of
420  * this arrangement is that the value can be examined by code that already
421  * holds WALWriteLock without needing to grab info_lck as well.  In addition
422  * to the shared variable, each backend has a private copy of LogwrtResult,
423  * which is updated when convenient.
424  *
425  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
426  * (protected by info_lck), but we don't need to cache any copies of it.
427  *
428  * info_lck is only held long enough to read/update the protected variables,
429  * so it's a plain spinlock.  The other locks are held longer (potentially
430  * over I/O operations), so we use LWLocks for them.  These locks are:
431  *
432  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
433  * It is only held while initializing and changing the mapping.  If the
434  * contents of the buffer being replaced haven't been written yet, the mapping
435  * lock is released while the write is done, and reacquired afterwards.
436  *
437  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
438  * XLogFlush).
439  *
440  * ControlFileLock: must be held to read/update control file or create
441  * new log file.
442  *
443  *----------
444  */
445 
/* Pending write/flush request positions; see "Shared-memory data structures" above. */
typedef struct XLogwrtRqst
{
	XLogRecPtr	Write;			/* last byte + 1 to write out */
	XLogRecPtr	Flush;			/* last byte + 1 to flush */
} XLogwrtRqst;
451 
/* Completed write/flush positions; same shape as XLogwrtRqst by design. */
typedef struct XLogwrtResult
{
	XLogRecPtr	Write;			/* last byte + 1 written out */
	XLogRecPtr	Flush;			/* last byte + 1 flushed */
} XLogwrtResult;
457 
458 /*
459  * Inserting to WAL is protected by a small fixed number of WAL insertion
460  * locks. To insert to the WAL, you must hold one of the locks - it doesn't
 * matter which one. To lock out other concurrent insertions, you must hold
 * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
463  * indicator of how far the insertion has progressed (insertingAt).
464  *
465  * The insertingAt values are read when a process wants to flush WAL from
466  * the in-memory buffers to disk, to check that all the insertions to the
467  * region the process is about to write out have finished. You could simply
468  * wait for all currently in-progress insertions to finish, but the
469  * insertingAt indicator allows you to ignore insertions to later in the WAL,
470  * so that you only wait for the insertions that are modifying the buffers
471  * you're about to write out.
472  *
473  * This isn't just an optimization. If all the WAL buffers are dirty, an
474  * inserter that's holding a WAL insert lock might need to evict an old WAL
475  * buffer, which requires flushing the WAL. If it's possible for an inserter
476  * to block on another inserter unnecessarily, deadlock can arise when two
477  * inserters holding a WAL insert lock wait for each other to finish their
478  * insertion.
479  *
480  * Small WAL records that don't cross a page boundary never update the value,
481  * the WAL record is just copied to the page and the lock is released. But
482  * to avoid the deadlock-scenario explained above, the indicator is always
483  * updated before sleeping while holding an insertion lock.
484  *
485  * lastImportantAt contains the LSN of the last important WAL record inserted
486  * using a given lock. This value is used to detect if there has been
487  * important WAL activity since the last time some action, like a checkpoint,
488  * was performed - allowing to not repeat the action if not. The LSN is
489  * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
490  * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
491  * records.  Tracking the WAL activity directly in WALInsertLock has the
492  * advantage of not needing any additional locks to update the value.
493  */
/* One WAL insertion lock; see the large comment above for the protocol. */
typedef struct
{
	LWLock		lock;			/* the lock itself */
	XLogRecPtr	insertingAt;	/* how far the holder's insertion has progressed */
	XLogRecPtr	lastImportantAt;	/* LSN of last important record inserted under this lock */
} WALInsertLock;
500 
/*
 * All the WAL insertion locks are allocated as an array in shared memory. We
 * force the array stride to be a power of 2, which saves a few cycles in
 * indexing, but more importantly also ensures that individual slots don't
 * cross cache line boundaries. (Of course, we have to also ensure that the
 * array start address is suitably aligned.)
 */
typedef union WALInsertLockPadded
{
	WALInsertLock l;			/* the actual lock state */
	char		pad[PG_CACHE_LINE_SIZE];	/* pads the stride to a full cache line */
} WALInsertLockPadded;
513 
514 /*
515  * State of an exclusive backup, necessary to control concurrent activities
516  * across sessions when working on exclusive backups.
517  *
518  * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
519  * running, to be more precise pg_start_backup() is not being executed for
520  * an exclusive backup and there is no exclusive backup in progress.
521  * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
522  * exclusive backup.
523  * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
524  * running and an exclusive backup is in progress. pg_stop_backup() is
525  * needed to finish it.
526  * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
527  * exclusive backup.
528  */
typedef enum ExclusiveBackupState
{
	EXCLUSIVE_BACKUP_NONE = 0,	/* no exclusive backup running or starting */
	EXCLUSIVE_BACKUP_STARTING,	/* pg_start_backup() is starting one */
	EXCLUSIVE_BACKUP_IN_PROGRESS,	/* backup running; pg_stop_backup() needed */
	EXCLUSIVE_BACKUP_STOPPING	/* pg_stop_backup() is stopping one */
} ExclusiveBackupState;
536 
/*
 * Session status of running backup, used for sanity checks in SQL-callable
 * functions to start and stop backups.  This is per-backend state, as opposed
 * to the cluster-wide ExclusiveBackupState above.
 */
static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
542 
/*
 * Shared state data for WAL insertion.
 */
typedef struct XLogCtlInsert
{
	slock_t		insertpos_lck;	/* protects CurrBytePos and PrevBytePos */

	/*
	 * CurrBytePos is the end of reserved WAL. The next record will be
	 * inserted at that position. PrevBytePos is the start position of the
	 * previously inserted (or rather, reserved) record - it is copied to the
	 * prev-link of the next record. These are stored as "usable byte
	 * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
	 */
	uint64		CurrBytePos;
	uint64		PrevBytePos;

	/*
	 * Make sure the above heavily-contended spinlock and byte positions are
	 * on their own cache line. In particular, the RedoRecPtr and full page
	 * write variables below should be on a different cache line. They are
	 * read on every WAL insertion, but updated rarely, and we don't want
	 * those reads to steal the cache line containing Curr/PrevBytePos.
	 */
	char		pad[PG_CACHE_LINE_SIZE];

	/*
	 * fullPageWrites is the authoritative value used by all backends to
	 * determine whether to write full-page image to WAL. This shared value,
	 * instead of the process-local fullPageWrites, is required because, when
	 * full_page_writes is changed by SIGHUP, we must WAL-log it before it
	 * actually affects WAL-logging by backends.  Checkpointer sets at startup
	 * or after SIGHUP.
	 *
	 * To read these fields, you must hold an insertion lock. To modify them,
	 * you must hold ALL the locks.
	 */
	XLogRecPtr	RedoRecPtr;		/* current redo point for insertions */
	bool		forcePageWrites;	/* forcing full-page writes for PITR? */
	bool		fullPageWrites;

	/*
	 * exclusiveBackupState indicates the state of an exclusive backup (see
	 * comments of ExclusiveBackupState for more details). nonExclusiveBackups
	 * is a counter indicating the number of streaming base backups currently
	 * in progress. forcePageWrites is set to true when either of these is
	 * non-zero. lastBackupStart is the latest checkpoint redo location used
	 * as a starting point for an online backup.
	 */
	ExclusiveBackupState exclusiveBackupState;	/* state of exclusive backup */
	int			nonExclusiveBackups;	/* # of streaming base backups running */
	XLogRecPtr	lastBackupStart;	/* redo location of latest online backup */

	/*
	 * WAL insertion locks.
	 */
	WALInsertLockPadded *WALInsertLocks;
} XLogCtlInsert;
601 
/*
 * Total shared-memory state for XLOG.
 */
typedef struct XLogCtlData
{
	XLogCtlInsert Insert;

	/* Protected by info_lck: */
	XLogwrtRqst LogwrtRqst;
	XLogRecPtr	RedoRecPtr;		/* a recent copy of Insert->RedoRecPtr */
	FullTransactionId ckptFullXid;	/* nextXid of latest checkpoint */
	XLogRecPtr	asyncXactLSN;	/* LSN of newest async commit/abort */
	XLogRecPtr	replicationSlotMinLSN;	/* oldest LSN needed by any slot */

	XLogSegNo	lastRemovedSegNo;	/* latest removed/recycled XLOG segment */

	/* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
	XLogRecPtr	unloggedLSN;
	slock_t		ulsn_lck;

	/* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
	pg_time_t	lastSegSwitchTime;
	XLogRecPtr	lastSegSwitchLSN;

	/*
	 * Protected by info_lck and WALWriteLock (you must hold either lock to
	 * read it, but both to update)
	 */
	XLogwrtResult LogwrtResult;

	/*
	 * Latest initialized page in the cache (last byte position + 1).
	 *
	 * To change the identity of a buffer (and InitializedUpTo), you need to
	 * hold WALBufMappingLock.  To change the identity of a buffer that's
	 * still dirty, the old page needs to be written out first, and for that
	 * you need WALWriteLock, and you need to ensure that there are no
	 * in-progress insertions to the page by calling
	 * WaitXLogInsertionsToFinish().
	 */
	XLogRecPtr	InitializedUpTo;

	/*
	 * These values do not change after startup, although the pointed-to pages
	 * and xlblocks values certainly do.  xlblocks values are protected by
	 * WALBufMappingLock.
	 */
	char	   *pages;			/* buffers for unwritten XLOG pages */
	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + XLOG_BLCKSZ */
	int			XLogCacheBlck;	/* highest allocated xlog buffer index */

	/*
	 * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
	 * If we created a new timeline when the system was started up,
	 * PrevTimeLineID is the old timeline's ID that we forked off from.
	 * Otherwise it's equal to ThisTimeLineID.
	 */
	TimeLineID	ThisTimeLineID;
	TimeLineID	PrevTimeLineID;

	/*
	 * SharedRecoveryState indicates if we're still in crash or archive
	 * recovery.  Protected by info_lck.
	 */
	RecoveryState SharedRecoveryState;

	/*
	 * SharedHotStandbyActive indicates if we allow hot standby queries to be
	 * run.  Protected by info_lck.
	 */
	bool		SharedHotStandbyActive;

	/*
	 * SharedPromoteIsTriggered indicates if a standby promotion has been
	 * triggered.  Protected by info_lck.
	 */
	bool		SharedPromoteIsTriggered;

	/*
	 * WalWriterSleeping indicates whether the WAL writer is currently in
	 * low-power mode (and hence should be nudged if an async commit occurs).
	 * Protected by info_lck.
	 */
	bool		WalWriterSleeping;

	/*
	 * recoveryWakeupLatch is used to wake up the startup process to continue
	 * WAL replay, if it is waiting for WAL to arrive or failover trigger file
	 * to appear.
	 *
	 * Note that the startup process also uses another latch, its procLatch,
	 * to wait for recovery conflict. We could get rid of recoveryWakeupLatch
	 * and signal the startup process via its procLatch instead, which would
	 * comport better with possible generic signal handlers using that latch.
	 * But we should not do that, because the startup process doesn't assume
	 * that it's woken up by the walreceiver process or the SIGHUP signal
	 * handler while it's waiting for recovery conflict. The separate latches,
	 * recoveryWakeupLatch and procLatch, should be used for inter-process
	 * communication for WAL replay and recovery conflict, respectively.
	 */
	Latch		recoveryWakeupLatch;

	/*
	 * During recovery, we keep a copy of the latest checkpoint record here.
	 * lastCheckPointRecPtr points to start of checkpoint record and
	 * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
	 * checkpointer when it wants to create a restartpoint.
	 *
	 * Protected by info_lck.
	 */
	XLogRecPtr	lastCheckPointRecPtr;
	XLogRecPtr	lastCheckPointEndPtr;
	CheckPoint	lastCheckPoint;

	/*
	 * lastReplayedEndRecPtr points to end+1 of the last record successfully
	 * replayed. When we're currently replaying a record, ie. in a redo
	 * function, replayEndRecPtr points to the end+1 of the record being
	 * replayed, otherwise it's equal to lastReplayedEndRecPtr.
	 */
	XLogRecPtr	lastReplayedEndRecPtr;
	TimeLineID	lastReplayedTLI;
	XLogRecPtr	replayEndRecPtr;
	TimeLineID	replayEndTLI;
	/* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
	TimestampTz recoveryLastXTime;

	/*
	 * timestamp of when we started replaying the current chunk of WAL data,
	 * only relevant for replication or archive recovery
	 */
	TimestampTz currentChunkStartTime;
	/* Recovery pause state */
	RecoveryPauseState recoveryPauseState;
	ConditionVariable recoveryNotPausedCV;

	/*
	 * lastFpwDisableRecPtr points to the start of the last replayed
	 * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
	 */
	XLogRecPtr	lastFpwDisableRecPtr;

	slock_t		info_lck;		/* locks shared variables shown above */
} XLogCtlData;
746 
/* The shared XLOG control structure (see XLogCtlData above); lives in shared memory. */
static XLogCtlData *XLogCtl = NULL;

/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
static WALInsertLockPadded *WALInsertLocks = NULL;

/*
 * We maintain an image of pg_control in shared memory.
 */
static ControlFileData *ControlFile = NULL;
756 
/*
 * Calculate the amount of space left on the page after 'endptr'. Beware
 * multiple evaluation!  The argument is expanded twice, so pass only a
 * side-effect-free expression.
 */
#define INSERT_FREESPACE(endptr)	\
	(((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))

/* Macro to advance to next buffer index, wrapping around at the cache end. */
#define NextBufIdx(idx)		\
		(((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))

/*
 * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
 * would hold if it was in cache, the page containing 'recptr'.
 */
#define XLogRecPtrToBufIdx(recptr)	\
	(((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))

/*
 * These are the number of bytes in a WAL page usable for WAL data (i.e.
 * excluding the short page header).
 */
#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)

/*
 * Convert values of GUCs measured in megabytes to equiv. segment count.
 * Rounds down.
 */
#define ConvertToXSegs(x, segsize)	XLogMBVarToSegs((x), (segsize))

/* The number of bytes in a WAL segment usable for WAL data. */
static int	UsableBytesInSegment;
788 
/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above: this backend-local cache is refreshed from
 * XLogCtl->LogwrtResult when convenient, to avoid extra locking.
 */
static XLogwrtResult LogwrtResult = {0, 0};
794 
/*
 * Codes indicating where we got a WAL file from during recovery, or where
 * to attempt to get one.  Keep the order in sync with xlogSourceNames[]
 * below.
 */
typedef enum
{
	XLOG_FROM_ANY = 0,			/* request to read WAL from any source */
	XLOG_FROM_ARCHIVE,			/* restored using restore_command */
	XLOG_FROM_PG_WAL,			/* existing file in pg_wal */
	XLOG_FROM_STREAM			/* streamed from primary */
} XLogSource;
806 
807 /* human-readable names for XLogSources, for debugging output */
808 static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
809 
810 /*
811  * openLogFile is -1 or a kernel FD for an open log file segment.
812  * openLogSegNo identifies the segment.  These variables are only used to
813  * write the XLOG, and so will normally refer to the active segment.
814  * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
815  */
816 static int	openLogFile = -1;
817 static XLogSegNo openLogSegNo = 0;
818 
819 /*
820  * These variables are used similarly to the ones above, but for reading
821  * the XLOG.  readOff is the offset of the page just read, readLen
822  * indicates how much of it has been read into readBuf, and readSource
823  * indicates where we got the currently open file from.
824  * Note: we could use Reserve/ReleaseExternalFD to track consumption of
825  * this FD too; but it doesn't currently seem worthwhile, since the XLOG is
826  * not read by general-purpose sessions.
827  */
828 static int	readFile = -1;
829 static XLogSegNo readSegNo = 0;
830 static uint32 readOff = 0;
831 static uint32 readLen = 0;
832 static XLogSource readSource = XLOG_FROM_ANY;
833 
834 /*
835  * Keeps track of which source we're currently reading from. This is
836  * different from readSource in that this is always set, even when we don't
837  * currently have a WAL file open. If lastSourceFailed is set, our last
838  * attempt to read from currentSource failed, and we should try another source
839  * next.
840  *
841  * pendingWalRcvRestart is set when a config change occurs that requires a
842  * walreceiver restart.  This is only valid in XLOG_FROM_STREAM state.
843  */
844 static XLogSource currentSource = XLOG_FROM_ANY;
845 static bool lastSourceFailed = false;
846 static bool pendingWalRcvRestart = false;
847 
848 typedef struct XLogPageReadPrivate
849 {
850 	int			emode;
851 	bool		fetching_ckpt;	/* are we fetching a checkpoint record? */
852 	bool		randAccess;
853 } XLogPageReadPrivate;
854 
855 /*
856  * These variables track when we last obtained some WAL data to process,
857  * and where we got it from.  (XLogReceiptSource is initially the same as
858  * readSource, but readSource gets reset to zero when we don't have data
859  * to process right now.  It is also different from currentSource, which
860  * also changes when we try to read from a source and fail, while
861  * XLogReceiptSource tracks where we last successfully read some WAL.)
862  */
863 static TimestampTz XLogReceiptTime = 0;
864 static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
865 
866 /* State information for XLOG reading */
867 static XLogRecPtr ReadRecPtr;	/* start of last record read */
868 static XLogRecPtr EndRecPtr;	/* end+1 of last record read */
869 
870 /*
871  * Local copies of equivalent fields in the control file.  When running
872  * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we
873  * expect to replay all the WAL available, and updateMinRecoveryPoint is
874  * switched to false to prevent any updates while replaying records.
875  * Those values are kept consistent as long as crash recovery runs.
876  */
877 static XLogRecPtr minRecoveryPoint;
878 static TimeLineID minRecoveryPointTLI;
879 static bool updateMinRecoveryPoint = true;
880 
881 /*
882  * Have we reached a consistent database state? In crash recovery, we have
883  * to replay all the WAL, so reachedConsistency is never set. During archive
884  * recovery, the database is consistent once minRecoveryPoint is reached.
885  */
886 bool		reachedConsistency = false;
887 
888 static bool InRedo = false;
889 
890 /* Have we launched bgwriter during recovery? */
891 static bool bgwriterLaunched = false;
892 
893 /* For WALInsertLockAcquire/Release functions */
894 static int	MyLockNo = 0;
895 static bool holdingAllLocks = false;
896 
897 #ifdef WAL_DEBUG
898 static MemoryContext walDebugCxt = NULL;
899 #endif
900 
901 static void readRecoverySignalFile(void);
902 static void validateRecoveryParameters(void);
903 static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
904 static bool recoveryStopsBefore(XLogReaderState *record);
905 static bool recoveryStopsAfter(XLogReaderState *record);
906 static void ConfirmRecoveryPaused(void);
907 static void recoveryPausesHere(bool endOfRecovery);
908 static bool recoveryApplyDelay(XLogReaderState *record);
909 static void SetLatestXTime(TimestampTz xtime);
910 static void SetCurrentChunkStartTime(TimestampTz xtime);
911 static void CheckRequiredParameterValues(void);
912 static void XLogReportParameters(void);
913 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
914 								TimeLineID prevTLI);
915 static void VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec,
916 									  XLogReaderState *state);
917 static void LocalSetXLogInsertAllowed(void);
918 static void CreateEndOfRecoveryRecord(void);
919 static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn);
920 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
921 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
922 static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
923 
924 static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
925 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
926 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
927 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
928 								   bool find_free, XLogSegNo max_segno,
929 								   bool use_lock);
930 static int	XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
931 						 XLogSource source, bool notfoundOk);
932 static int	XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
933 static int	XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
934 						 int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
935 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
936 										bool fetching_ckpt, XLogRecPtr tliRecPtr);
937 static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
938 static void XLogFileClose(void);
939 static void PreallocXlogFiles(XLogRecPtr endptr);
940 static void RemoveTempXlogFiles(void);
941 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr);
942 static void RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo,
943 						   XLogSegNo *endlogSegNo);
944 static void UpdateLastRemovedPtr(char *filename);
945 static void ValidateXLOGDirectoryStructure(void);
946 static void CleanupBackupHistory(void);
947 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
948 static XLogRecord *ReadRecord(XLogReaderState *xlogreader,
949 							  int emode, bool fetching_ckpt);
950 static void CheckRecoveryConsistency(void);
951 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
952 										XLogRecPtr RecPtr, int whichChkpt, bool report);
953 static bool rescanLatestTimeLine(void);
954 static void InitControlFile(uint64 sysidentifier);
955 static void WriteControlFile(void);
956 static void ReadControlFile(void);
957 static char *str_time(pg_time_t tnow);
958 static void SetPromoteIsTriggered(void);
959 static bool CheckForStandbyTrigger(void);
960 
961 #ifdef WAL_DEBUG
962 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
963 #endif
964 static void xlog_block_info(StringInfo buf, XLogReaderState *record);
965 static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
966 static void pg_start_backup_callback(int code, Datum arg);
967 static void pg_stop_backup_callback(int code, Datum arg);
968 static bool read_backup_label(XLogRecPtr *checkPointLoc,
969 							  bool *backupEndRequired, bool *backupFromStandby);
970 static bool read_tablespace_map(List **tablespaces);
971 
972 static void rm_redo_error_callback(void *arg);
973 static int	get_sync_bit(int method);
974 
975 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
976 								XLogRecData *rdata,
977 								XLogRecPtr StartPos, XLogRecPtr EndPos);
978 static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
979 									  XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
980 static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
981 							  XLogRecPtr *PrevPtr);
982 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
983 static char *GetXLogBuffer(XLogRecPtr ptr);
984 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
985 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
986 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
987 static void checkXLogConsistency(XLogReaderState *record);
988 
989 static void WALInsertLockAcquire(void);
990 static void WALInsertLockAcquireExclusive(void);
991 static void WALInsertLockRelease(void);
992 static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
993 
994 /*
995  * Insert an XLOG record represented by an already-constructed chain of data
996  * chunks.  This is a low-level routine; to construct the WAL record header
997  * and data, use the higher-level routines in xloginsert.c.
998  *
999  * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
1000  * WAL record applies to, that were not included in the record as full page
1001  * images.  If fpw_lsn <= RedoRecPtr, the function does not perform the
1002  * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
1003  * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
1004  * record is always inserted.
1005  *
1006  * 'flags' gives more in-depth control on the record being inserted. See
1007  * XLogSetRecordFlags() for details.
1008  *
1009  * The first XLogRecData in the chain must be for the record header, and its
1010  * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
1011  * xl_crc fields in the header, the rest of the header must already be filled
1012  * by the caller.
1013  *
1014  * Returns XLOG pointer to end of record (beginning of next record).
1015  * This can be used as LSN for data pages affected by the logged action.
1016  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
1017  * before the data page can be written out.  This implements the basic
1018  * WAL rule "write the log before the data".)
1019  */
XLogRecPtr
XLogInsertRecord(XLogRecData *rdata,
				 XLogRecPtr fpw_lsn,
				 uint8 flags,
				 int num_fpi)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	pg_crc32c	rdata_crc;
	bool		inserted;
	XLogRecord *rechdr = (XLogRecord *) rdata->data;
	uint8		info = rechdr->xl_info & ~XLR_INFO_MASK;
	bool		isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
							   info == XLOG_SWITCH);
	XLogRecPtr	StartPos;
	XLogRecPtr	EndPos;
	bool		prevDoPageWrites = doPageWrites;

	/* we assume that all of the record header is in the first chunk */
	Assert(rdata->len >= SizeOfXLogRecord);

	/* cross-check on whether we should be here or not */
	if (!XLogInsertAllowed())
		elog(ERROR, "cannot make new WAL entries during recovery");

	/*----------
	 *
	 * We have now done all the preparatory work we can without holding a
	 * lock or modifying shared state. From here on, inserting the new WAL
	 * record to the shared WAL buffer cache is a two-step process:
	 *
	 * 1. Reserve the right amount of space from the WAL. The current head of
	 *	  reserved space is kept in Insert->CurrBytePos, and is protected by
	 *	  insertpos_lck.
	 *
	 * 2. Copy the record to the reserved WAL space. This involves finding the
	 *	  correct WAL buffer containing the reserved space, and copying the
	 *	  record in place. This can be done concurrently in multiple processes.
	 *
	 * To keep track of which insertions are still in-progress, each concurrent
	 * inserter acquires an insertion lock. In addition to just indicating that
	 * an insertion is in progress, the lock tells others how far the inserter
	 * has progressed. There is a small fixed number of insertion locks,
	 * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
	 * boundary, it updates the value stored in the lock to the how far it has
	 * inserted, to allow the previous buffer to be flushed.
	 *
	 * Holding onto an insertion lock also protects RedoRecPtr and
	 * fullPageWrites from changing until the insertion is finished.
	 *
	 * Step 2 can usually be done completely in parallel. If the required WAL
	 * page is not initialized yet, you have to grab WALBufMappingLock to
	 * initialize it, but the WAL writer tries to do that ahead of insertions
	 * to avoid that from happening in the critical path.
	 *
	 *----------
	 */
	START_CRIT_SECTION();
	/* An xlog-switch needs exclusive access: it consumes the whole segment */
	if (isLogSwitch)
		WALInsertLockAcquireExclusive();
	else
		WALInsertLockAcquire();

	/*
	 * Check to see if my copy of RedoRecPtr is out of date. If so, may have
	 * to go back and have the caller recompute everything. This can only
	 * happen just after a checkpoint, so it's better to be slow in this case
	 * and fast otherwise.
	 *
	 * Also check to see if fullPageWrites or forcePageWrites was just turned
	 * on; if we weren't already doing full-page writes then go back and
	 * recompute.
	 *
	 * If we aren't doing full-page writes then RedoRecPtr doesn't actually
	 * affect the contents of the XLOG record, so we'll update our local copy
	 * but not force a recomputation.  (If doPageWrites was just turned off,
	 * we could recompute the record without full pages, but we choose not to
	 * bother.)
	 */
	if (RedoRecPtr != Insert->RedoRecPtr)
	{
		Assert(RedoRecPtr < Insert->RedoRecPtr);
		RedoRecPtr = Insert->RedoRecPtr;
	}
	/* refresh our cached copy of the full-page-writes setting */
	doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);

	if (doPageWrites &&
		(!prevDoPageWrites ||
		 (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
	{
		/*
		 * Oops, some buffer now needs to be backed up that the caller didn't
		 * back up.  Start over.
		 */
		WALInsertLockRelease();
		END_CRIT_SECTION();
		return InvalidXLogRecPtr;
	}

	/*
	 * Reserve space for the record in the WAL. This also sets the xl_prev
	 * pointer.
	 */
	if (isLogSwitch)
		inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
	else
	{
		ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
								  &rechdr->xl_prev);
		inserted = true;
	}

	if (inserted)
	{
		/*
		 * Now that xl_prev has been filled in, calculate CRC of the record
		 * header.  (The CRC of the data was already accumulated into
		 * xl_crc by the caller; see the header comment above.)
		 */
		rdata_crc = rechdr->xl_crc;
		COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
		FIN_CRC32C(rdata_crc);
		rechdr->xl_crc = rdata_crc;

		/*
		 * All the record data, including the header, is now ready to be
		 * inserted. Copy the record in the space reserved.
		 */
		CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
							StartPos, EndPos);

		/*
		 * Unless record is flagged as not important, update LSN of last
		 * important record in the current slot. When holding all locks, just
		 * update the first one.
		 */
		if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
		{
			int			lockno = holdingAllLocks ? 0 : MyLockNo;

			WALInsertLocks[lockno].l.lastImportantAt = StartPos;
		}
	}
	else
	{
		/*
		 * This was an xlog-switch record, but the current insert location was
		 * already exactly at the beginning of a segment, so there was no need
		 * to do anything.
		 */
	}

	/*
	 * Done! Let others know that we're finished.
	 */
	WALInsertLockRelease();

	MarkCurrentTransactionIdLoggedIfAny();

	END_CRIT_SECTION();

	/*
	 * Update shared LogwrtRqst.Write, if we crossed page boundary.
	 */
	if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		/* advance global request to include new block(s) */
		if (XLogCtl->LogwrtRqst.Write < EndPos)
			XLogCtl->LogwrtRqst.Write = EndPos;
		/* update local result copy while I have the chance */
		LogwrtResult = XLogCtl->LogwrtResult;
		SpinLockRelease(&XLogCtl->info_lck);
	}

	/*
	 * If this was an XLOG_SWITCH record, flush the record and the empty
	 * padding space that fills the rest of the segment, and perform
	 * end-of-segment actions (eg, notifying archiver).
	 */
	if (isLogSwitch)
	{
		TRACE_POSTGRESQL_WAL_SWITCH();
		XLogFlush(EndPos);

		/*
		 * Even though we reserved the rest of the segment for us, which is
		 * reflected in EndPos, we return a pointer to just the end of the
		 * xlog-switch record.
		 */
		if (inserted)
		{
			EndPos = StartPos + SizeOfXLogRecord;
			if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
			{
				uint64		offset = XLogSegmentOffset(EndPos, wal_segment_size);

				/* account for the page header at the page boundary we crossed */
				if (offset == EndPos % XLOG_BLCKSZ)
					EndPos += SizeOfXLogLongPHD;
				else
					EndPos += SizeOfXLogShortPHD;
			}
		}
	}

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
	{
		static XLogReaderState *debug_reader = NULL;
		StringInfoData buf;
		StringInfoData recordBuf;
		char	   *errormsg = NULL;
		MemoryContext oldCxt;

		oldCxt = MemoryContextSwitchTo(walDebugCxt);

		initStringInfo(&buf);
		appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos));

		/*
		 * We have to piece together the WAL record data from the XLogRecData
		 * entries, so that we can pass it to the rm_desc function as one
		 * contiguous chunk.  (The chain has already been copied into the WAL
		 * buffers above, so consuming 'rdata' here is harmless.)
		 */
		initStringInfo(&recordBuf);
		for (; rdata != NULL; rdata = rdata->next)
			appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);

		if (!debug_reader)
			debug_reader = XLogReaderAllocate(wal_segment_size, NULL,
											  XL_ROUTINE(), NULL);

		if (!debug_reader)
		{
			appendStringInfoString(&buf, "error decoding record: out of memory");
		}
		else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
								   &errormsg))
		{
			appendStringInfo(&buf, "error decoding record: %s",
							 errormsg ? errormsg : "no error message");
		}
		else
		{
			appendStringInfoString(&buf, " - ");
			xlog_outdesc(&buf, debug_reader);
		}
		elog(LOG, "%s", buf.data);

		pfree(buf.data);
		pfree(recordBuf.data);
		MemoryContextSwitchTo(oldCxt);
	}
#endif

	/*
	 * Update our global variables
	 */
	ProcLastRecPtr = StartPos;
	XactLastRecEnd = EndPos;

	/* Report WAL traffic to the instrumentation. */
	if (inserted)
	{
		pgWalUsage.wal_bytes += rechdr->xl_tot_len;
		pgWalUsage.wal_records++;
		pgWalUsage.wal_fpi += num_fpi;
	}

	return EndPos;
}
1289 
1290 /*
1291  * Reserves the right amount of space for a record of given size from the WAL.
1292  * *StartPos is set to the beginning of the reserved section, *EndPos to
1293  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1294  * used to set the xl_prev of this record.
1295  *
1296  * This is the performance critical part of XLogInsert that must be serialized
1297  * across backends. The rest can happen mostly in parallel. Try to keep this
1298  * section as short as possible, insertpos_lck can be heavily contended on a
1299  * busy system.
1300  *
1301  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1302  * where we actually copy the record to the reserved space.
1303  */
1304 static void
ReserveXLogInsertLocation(int size,XLogRecPtr * StartPos,XLogRecPtr * EndPos,XLogRecPtr * PrevPtr)1305 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1306 						  XLogRecPtr *PrevPtr)
1307 {
1308 	XLogCtlInsert *Insert = &XLogCtl->Insert;
1309 	uint64		startbytepos;
1310 	uint64		endbytepos;
1311 	uint64		prevbytepos;
1312 
1313 	size = MAXALIGN(size);
1314 
1315 	/* All (non xlog-switch) records should contain data. */
1316 	Assert(size > SizeOfXLogRecord);
1317 
1318 	/*
1319 	 * The duration the spinlock needs to be held is minimized by minimizing
1320 	 * the calculations that have to be done while holding the lock. The
1321 	 * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1322 	 * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1323 	 * page headers. The mapping between "usable" byte positions and physical
1324 	 * positions (XLogRecPtrs) can be done outside the locked region, and
1325 	 * because the usable byte position doesn't include any headers, reserving
1326 	 * X bytes from WAL is almost as simple as "CurrBytePos += X".
1327 	 */
1328 	SpinLockAcquire(&Insert->insertpos_lck);
1329 
1330 	startbytepos = Insert->CurrBytePos;
1331 	endbytepos = startbytepos + size;
1332 	prevbytepos = Insert->PrevBytePos;
1333 	Insert->CurrBytePos = endbytepos;
1334 	Insert->PrevBytePos = startbytepos;
1335 
1336 	SpinLockRelease(&Insert->insertpos_lck);
1337 
1338 	*StartPos = XLogBytePosToRecPtr(startbytepos);
1339 	*EndPos = XLogBytePosToEndRecPtr(endbytepos);
1340 	*PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1341 
1342 	/*
1343 	 * Check that the conversions between "usable byte positions" and
1344 	 * XLogRecPtrs work consistently in both directions.
1345 	 */
1346 	Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1347 	Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1348 	Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1349 }
1350 
1351 /*
1352  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1353  *
1354  * A log-switch record is handled slightly differently. The rest of the
1355  * segment will be reserved for this insertion, as indicated by the returned
1356  * *EndPos value. However, if we are already at the beginning of the current
1357  * segment, *StartPos and *EndPos are set to the current location without
1358  * reserving any space, and the function returns false.
1359 */
static bool
ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	uint64		startbytepos;
	uint64		endbytepos;
	uint64		prevbytepos;
	uint32		size = MAXALIGN(SizeOfXLogRecord);	/* switch record is header-only */
	XLogRecPtr	ptr;
	uint32		segleft;

	/*
	 * These calculations are a bit heavy-weight to be done while holding a
	 * spinlock, but since we're holding all the WAL insertion locks, there
	 * are no other inserters competing for it. GetXLogInsertRecPtr() does
	 * compete for it, but that's not called very frequently.
	 */
	SpinLockAcquire(&Insert->insertpos_lck);

	startbytepos = Insert->CurrBytePos;

	/* Already exactly at a segment boundary?  Then there's nothing to do. */
	ptr = XLogBytePosToEndRecPtr(startbytepos);
	if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
	{
		SpinLockRelease(&Insert->insertpos_lck);
		*EndPos = *StartPos = ptr;
		return false;
	}

	endbytepos = startbytepos + size;
	prevbytepos = Insert->PrevBytePos;

	*StartPos = XLogBytePosToRecPtr(startbytepos);
	*EndPos = XLogBytePosToEndRecPtr(endbytepos);

	/* Extend the reservation to cover the remainder of the segment. */
	segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
	if (segleft != wal_segment_size)
	{
		/* consume the rest of the segment */
		*EndPos += segleft;
		endbytepos = XLogRecPtrToBytePos(*EndPos);
	}
	Insert->CurrBytePos = endbytepos;
	Insert->PrevBytePos = startbytepos;

	SpinLockRelease(&Insert->insertpos_lck);

	*PrevPtr = XLogBytePosToRecPtr(prevbytepos);

	/* The reservation must end exactly on a segment boundary. */
	Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
	Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
	Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
	Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);

	return true;
}
1416 
1417 /*
1418  * Checks whether the current buffer page and backup page stored in the
1419  * WAL record are consistent or not. Before comparing the two pages, a
1420  * masking can be applied to the pages to ignore certain areas like hint bits,
1421  * unused space between pd_lower and pd_upper among other things. This
1422  * function should be called once WAL replay has been completed for a
1423  * given record.
1424  */
1425 static void
checkXLogConsistency(XLogReaderState * record)1426 checkXLogConsistency(XLogReaderState *record)
1427 {
1428 	RmgrId		rmid = XLogRecGetRmid(record);
1429 	RelFileNode rnode;
1430 	ForkNumber	forknum;
1431 	BlockNumber blkno;
1432 	int			block_id;
1433 
1434 	/* Records with no backup blocks have no need for consistency checks. */
1435 	if (!XLogRecHasAnyBlockRefs(record))
1436 		return;
1437 
1438 	Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
1439 
1440 	for (block_id = 0; block_id <= record->max_block_id; block_id++)
1441 	{
1442 		Buffer		buf;
1443 		Page		page;
1444 
1445 		if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
1446 		{
1447 			/*
1448 			 * WAL record doesn't contain a block reference with the given id.
1449 			 * Do nothing.
1450 			 */
1451 			continue;
1452 		}
1453 
1454 		Assert(XLogRecHasBlockImage(record, block_id));
1455 
1456 		if (XLogRecBlockImageApply(record, block_id))
1457 		{
1458 			/*
1459 			 * WAL record has already applied the page, so bypass the
1460 			 * consistency check as that would result in comparing the full
1461 			 * page stored in the record with itself.
1462 			 */
1463 			continue;
1464 		}
1465 
1466 		/*
1467 		 * Read the contents from the current buffer and store it in a
1468 		 * temporary page.
1469 		 */
1470 		buf = XLogReadBufferExtended(rnode, forknum, blkno,
1471 									 RBM_NORMAL_NO_LOG);
1472 		if (!BufferIsValid(buf))
1473 			continue;
1474 
1475 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1476 		page = BufferGetPage(buf);
1477 
1478 		/*
1479 		 * Take a copy of the local page where WAL has been applied to have a
1480 		 * comparison base before masking it...
1481 		 */
1482 		memcpy(replay_image_masked, page, BLCKSZ);
1483 
1484 		/* No need for this page anymore now that a copy is in. */
1485 		UnlockReleaseBuffer(buf);
1486 
1487 		/*
1488 		 * If the block LSN is already ahead of this WAL record, we can't
1489 		 * expect contents to match.  This can happen if recovery is
1490 		 * restarted.
1491 		 */
1492 		if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
1493 			continue;
1494 
1495 		/*
1496 		 * Read the contents from the backup copy, stored in WAL record and
1497 		 * store it in a temporary page. There is no need to allocate a new
1498 		 * page here, a local buffer is fine to hold its contents and a mask
1499 		 * can be directly applied on it.
1500 		 */
1501 		if (!RestoreBlockImage(record, block_id, primary_image_masked))
1502 			elog(ERROR, "failed to restore block image");
1503 
1504 		/*
1505 		 * If masking function is defined, mask both the primary and replay
1506 		 * images
1507 		 */
1508 		if (RmgrTable[rmid].rm_mask != NULL)
1509 		{
1510 			RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
1511 			RmgrTable[rmid].rm_mask(primary_image_masked, blkno);
1512 		}
1513 
1514 		/* Time to compare the primary and replay images. */
1515 		if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
1516 		{
1517 			elog(FATAL,
1518 				 "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
1519 				 rnode.spcNode, rnode.dbNode, rnode.relNode,
1520 				 forknum, blkno);
1521 		}
1522 	}
1523 }
1524 
1525 /*
1526  * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
1527  * area in the WAL.
1528  */
1529 static void
CopyXLogRecordToWAL(int write_len,bool isLogSwitch,XLogRecData * rdata,XLogRecPtr StartPos,XLogRecPtr EndPos)1530 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1531 					XLogRecPtr StartPos, XLogRecPtr EndPos)
1532 {
1533 	char	   *currpos;
1534 	int			freespace;
1535 	int			written;
1536 	XLogRecPtr	CurrPos;
1537 	XLogPageHeader pagehdr;
1538 
1539 	/*
1540 	 * Get a pointer to the right place in the right WAL buffer to start
1541 	 * inserting to.
1542 	 */
1543 	CurrPos = StartPos;
1544 	currpos = GetXLogBuffer(CurrPos);
1545 	freespace = INSERT_FREESPACE(CurrPos);
1546 
1547 	/*
1548 	 * there should be enough space for at least the first field (xl_tot_len)
1549 	 * on this page.
1550 	 */
1551 	Assert(freespace >= sizeof(uint32));
1552 
1553 	/* Copy record data */
1554 	written = 0;
1555 	while (rdata != NULL)
1556 	{
1557 		char	   *rdata_data = rdata->data;
1558 		int			rdata_len = rdata->len;
1559 
1560 		while (rdata_len > freespace)
1561 		{
1562 			/*
1563 			 * Write what fits on this page, and continue on the next page.
1564 			 */
1565 			Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1566 			memcpy(currpos, rdata_data, freespace);
1567 			rdata_data += freespace;
1568 			rdata_len -= freespace;
1569 			written += freespace;
1570 			CurrPos += freespace;
1571 
1572 			/*
1573 			 * Get pointer to beginning of next page, and set the xlp_rem_len
1574 			 * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1575 			 *
1576 			 * It's safe to set the contrecord flag and xlp_rem_len without a
1577 			 * lock on the page. All the other flags were already set when the
1578 			 * page was initialized, in AdvanceXLInsertBuffer, and we're the
1579 			 * only backend that needs to set the contrecord flag.
1580 			 */
1581 			currpos = GetXLogBuffer(CurrPos);
1582 			pagehdr = (XLogPageHeader) currpos;
1583 			pagehdr->xlp_rem_len = write_len - written;
1584 			pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1585 
1586 			/* skip over the page header */
1587 			if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
1588 			{
1589 				CurrPos += SizeOfXLogLongPHD;
1590 				currpos += SizeOfXLogLongPHD;
1591 			}
1592 			else
1593 			{
1594 				CurrPos += SizeOfXLogShortPHD;
1595 				currpos += SizeOfXLogShortPHD;
1596 			}
1597 			freespace = INSERT_FREESPACE(CurrPos);
1598 		}
1599 
1600 		Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1601 		memcpy(currpos, rdata_data, rdata_len);
1602 		currpos += rdata_len;
1603 		CurrPos += rdata_len;
1604 		freespace -= rdata_len;
1605 		written += rdata_len;
1606 
1607 		rdata = rdata->next;
1608 	}
1609 	Assert(written == write_len);
1610 
1611 	/*
1612 	 * If this was an xlog-switch, it's not enough to write the switch record,
1613 	 * we also have to consume all the remaining space in the WAL segment.  We
1614 	 * have already reserved that space, but we need to actually fill it.
1615 	 */
1616 	if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
1617 	{
1618 		/* An xlog-switch record doesn't contain any data besides the header */
1619 		Assert(write_len == SizeOfXLogRecord);
1620 
1621 		/* Assert that we did reserve the right amount of space */
1622 		Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
1623 
1624 		/* Use up all the remaining space on the current page */
1625 		CurrPos += freespace;
1626 
1627 		/*
1628 		 * Cause all remaining pages in the segment to be flushed, leaving the
1629 		 * XLog position where it should be, at the start of the next segment.
1630 		 * We do this one page at a time, to make sure we don't deadlock
1631 		 * against ourselves if wal_buffers < wal_segment_size.
1632 		 */
1633 		while (CurrPos < EndPos)
1634 		{
1635 			/*
1636 			 * The minimal action to flush the page would be to call
1637 			 * WALInsertLockUpdateInsertingAt(CurrPos) followed by
1638 			 * AdvanceXLInsertBuffer(...).  The page would be left initialized
1639 			 * mostly to zeros, except for the page header (always the short
1640 			 * variant, as this is never a segment's first page).
1641 			 *
1642 			 * The large vistas of zeros are good for compressibility, but the
1643 			 * headers interrupting them every XLOG_BLCKSZ (with values that
1644 			 * differ from page to page) are not.  The effect varies with
1645 			 * compression tool, but bzip2 for instance compresses about an
1646 			 * order of magnitude worse if those headers are left in place.
1647 			 *
1648 			 * Rather than complicating AdvanceXLInsertBuffer itself (which is
1649 			 * called in heavily-loaded circumstances as well as this lightly-
1650 			 * loaded one) with variant behavior, we just use GetXLogBuffer
1651 			 * (which itself calls the two methods we need) to get the pointer
1652 			 * and zero most of the page.  Then we just zero the page header.
1653 			 */
1654 			currpos = GetXLogBuffer(CurrPos);
1655 			MemSet(currpos, 0, SizeOfXLogShortPHD);
1656 
1657 			CurrPos += XLOG_BLCKSZ;
1658 		}
1659 	}
1660 	else
1661 	{
1662 		/* Align the end position, so that the next record starts aligned */
1663 		CurrPos = MAXALIGN64(CurrPos);
1664 	}
1665 
1666 	if (CurrPos != EndPos)
1667 		elog(PANIC, "space reserved for WAL record does not match what was written");
1668 }
1669 
1670 /*
1671  * Acquire a WAL insertion lock, for inserting to WAL.
1672  */
1673 static void
WALInsertLockAcquire(void)1674 WALInsertLockAcquire(void)
1675 {
1676 	bool		immed;
1677 
1678 	/*
1679 	 * It doesn't matter which of the WAL insertion locks we acquire, so try
1680 	 * the one we used last time.  If the system isn't particularly busy, it's
1681 	 * a good bet that it's still available, and it's good to have some
1682 	 * affinity to a particular lock so that you don't unnecessarily bounce
1683 	 * cache lines between processes when there's no contention.
1684 	 *
1685 	 * If this is the first time through in this backend, pick a lock
1686 	 * (semi-)randomly.  This allows the locks to be used evenly if you have a
1687 	 * lot of very short connections.
1688 	 */
1689 	static int	lockToTry = -1;
1690 
1691 	if (lockToTry == -1)
1692 		lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
1693 	MyLockNo = lockToTry;
1694 
1695 	/*
1696 	 * The insertingAt value is initially set to 0, as we don't know our
1697 	 * insert location yet.
1698 	 */
1699 	immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
1700 	if (!immed)
1701 	{
1702 		/*
1703 		 * If we couldn't get the lock immediately, try another lock next
1704 		 * time.  On a system with more insertion locks than concurrent
1705 		 * inserters, this causes all the inserters to eventually migrate to a
1706 		 * lock that no-one else is using.  On a system with more inserters
1707 		 * than locks, it still helps to distribute the inserters evenly
1708 		 * across the locks.
1709 		 */
1710 		lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
1711 	}
1712 }
1713 
1714 /*
1715  * Acquire all WAL insertion locks, to prevent other backends from inserting
1716  * to WAL.
1717  */
1718 static void
WALInsertLockAcquireExclusive(void)1719 WALInsertLockAcquireExclusive(void)
1720 {
1721 	int			i;
1722 
1723 	/*
1724 	 * When holding all the locks, all but the last lock's insertingAt
1725 	 * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
1726 	 * XLogRecPtr value, to make sure that no-one blocks waiting on those.
1727 	 */
1728 	for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
1729 	{
1730 		LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1731 		LWLockUpdateVar(&WALInsertLocks[i].l.lock,
1732 						&WALInsertLocks[i].l.insertingAt,
1733 						PG_UINT64_MAX);
1734 	}
1735 	/* Variable value reset to 0 at release */
1736 	LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1737 
1738 	holdingAllLocks = true;
1739 }
1740 
1741 /*
1742  * Release our insertion lock (or locks, if we're holding them all).
1743  *
1744  * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1745  * next time the lock is acquired.
1746  */
1747 static void
WALInsertLockRelease(void)1748 WALInsertLockRelease(void)
1749 {
1750 	if (holdingAllLocks)
1751 	{
1752 		int			i;
1753 
1754 		for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1755 			LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
1756 								  &WALInsertLocks[i].l.insertingAt,
1757 								  0);
1758 
1759 		holdingAllLocks = false;
1760 	}
1761 	else
1762 	{
1763 		LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
1764 							  &WALInsertLocks[MyLockNo].l.insertingAt,
1765 							  0);
1766 	}
1767 }
1768 
1769 /*
1770  * Update our insertingAt value, to let others know that we've finished
1771  * inserting up to that point.
1772  */
1773 static void
WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)1774 WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1775 {
1776 	if (holdingAllLocks)
1777 	{
1778 		/*
1779 		 * We use the last lock to mark our actual position, see comments in
1780 		 * WALInsertLockAcquireExclusive.
1781 		 */
1782 		LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
1783 						&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1784 						insertingAt);
1785 	}
1786 	else
1787 		LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1788 						&WALInsertLocks[MyLockNo].l.insertingAt,
1789 						insertingAt);
1790 }
1791 
1792 /*
1793  * Wait for any WAL insertions < upto to finish.
1794  *
1795  * Returns the location of the oldest insertion that is still in-progress.
1796  * Any WAL prior to that point has been fully copied into WAL buffers, and
1797  * can be flushed out to disk. Because this waits for any insertions older
1798  * than 'upto' to finish, the return value is always >= 'upto'.
1799  *
1800  * Note: When you are about to write out WAL, you must call this function
1801  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1802  * need to wait for an insertion to finish (or at least advance to next
1803  * uninitialized page), and the inserter might need to evict an old WAL buffer
1804  * to make room for a new one, which in turn requires WALWriteLock.
1805  */
static XLogRecPtr
WaitXLogInsertionsToFinish(XLogRecPtr upto)
{
	uint64		bytepos;
	XLogRecPtr	reservedUpto;
	XLogRecPtr	finishedUpto;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	int			i;

	/* LWLockWaitForVar below sleeps on our PGPROC semaphore */
	if (MyProc == NULL)
		elog(PANIC, "cannot wait without a PGPROC structure");

	/* Read the current insert position */
	SpinLockAcquire(&Insert->insertpos_lck);
	bytepos = Insert->CurrBytePos;
	SpinLockRelease(&Insert->insertpos_lck);
	reservedUpto = XLogBytePosToEndRecPtr(bytepos);

	/*
	 * No-one should request to flush a piece of WAL that hasn't even been
	 * reserved yet. However, it can happen if there is a block with a bogus
	 * LSN on disk, for example. XLogFlush checks for that situation and
	 * complains, but only after the flush. Here we just assume that to mean
	 * that all WAL that has been reserved needs to be finished. In this
	 * corner-case, the return value can be smaller than 'upto' argument.
	 */
	if (upto > reservedUpto)
	{
		ereport(LOG,
				(errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X",
						LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto))));
		upto = reservedUpto;
	}

	/*
	 * Loop through all the locks, sleeping on any in-progress insert older
	 * than 'upto'.
	 *
	 * finishedUpto is our return value, indicating the point upto which all
	 * the WAL insertions have been finished. Initialize it to the head of
	 * reserved WAL, and as we iterate through the insertion locks, back it
	 * out for any insertion that's still in progress.
	 */
	finishedUpto = reservedUpto;
	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
	{
		XLogRecPtr	insertingat = InvalidXLogRecPtr;

		/*
		 * insertingat starts as InvalidXLogRecPtr (0), which compares less
		 * than any valid 'upto', so we always make at least one
		 * LWLockWaitForVar call per lock.
		 */
		do
		{
			/*
			 * See if this insertion is in progress.  LWLockWaitForVar will
			 * wait for the lock to be released, or for the 'value' to be set
			 * by a LWLockUpdateVar call.  When a lock is initially acquired,
			 * its value is 0 (InvalidXLogRecPtr), which means that we don't
			 * know where it's inserting yet.  We will have to wait for it. If
			 * it's a small insertion, the record will most likely fit on the
			 * same page and the inserter will release the lock without ever
			 * calling LWLockUpdateVar.  But if it has to sleep, it will
			 * advertise the insertion point with LWLockUpdateVar before
			 * sleeping.
			 */
			if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
								 &WALInsertLocks[i].l.insertingAt,
								 insertingat, &insertingat))
			{
				/* the lock was free, so no insertion in progress */
				insertingat = InvalidXLogRecPtr;
				break;
			}

			/*
			 * This insertion is still in progress. Have to wait, unless the
			 * inserter has proceeded past 'upto'.
			 */
		} while (insertingat < upto);

		/* Back finishedUpto out for this still-in-progress insertion */
		if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
			finishedUpto = insertingat;
	}
	return finishedUpto;
}
1888 
1889 /*
1890  * Get a pointer to the right location in the WAL buffer containing the
1891  * given XLogRecPtr.
1892  *
1893  * If the page is not initialized yet, it is initialized. That might require
1894  * evicting an old dirty buffer from the buffer cache, which means I/O.
1895  *
1896  * The caller must ensure that the page containing the requested location
1897  * isn't evicted yet, and won't be evicted. The way to ensure that is to
1898  * hold onto a WAL insertion lock with the insertingAt position set to
1899  * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1900  * to evict an old page from the buffer. (This means that once you call
1901  * GetXLogBuffer() with a given 'ptr', you must not access anything before
1902  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1903  * later, because older buffers might be recycled already)
1904  */
static char *
GetXLogBuffer(XLogRecPtr ptr)
{
	int			idx;
	XLogRecPtr	endptr;

	/* One-entry, per-backend cache of the last page lookup we resolved */
	static uint64 cachedPage = 0;
	static char *cachedPos = NULL;
	XLogRecPtr	expectedEndPtr;

	/*
	 * Fast path for the common case that we need to access again the same
	 * page as last time.
	 */
	if (ptr / XLOG_BLCKSZ == cachedPage)
	{
		Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
		Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
		return cachedPos + ptr % XLOG_BLCKSZ;
	}

	/*
	 * The XLog buffer cache is organized so that a page is always loaded to a
	 * particular buffer.  That way we can easily calculate the buffer a given
	 * page must be loaded into, from the XLogRecPtr alone.
	 */
	idx = XLogRecPtrToBufIdx(ptr);

	/*
	 * See what page is loaded in the buffer at the moment. It could be the
	 * page we're looking for, or something older. It can't be anything newer
	 * - that would imply the page we're looking for has already been written
	 * out to disk and evicted, and the caller is responsible for making sure
	 * that doesn't happen.
	 *
	 * However, we don't hold a lock while we read the value. If someone has
	 * just initialized the page, it's possible that we get a "torn read" of
	 * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
	 * that case we will see a bogus value. That's ok, we'll grab the mapping
	 * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
	 * the page we're looking for. But it means that when we do this unlocked
	 * read, we might see a value that appears to be ahead of the page we're
	 * looking for. Don't PANIC on that, until we've verified the value while
	 * holding the lock.
	 */
	expectedEndPtr = ptr;
	expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;

	endptr = XLogCtl->xlblocks[idx];
	if (expectedEndPtr != endptr)
	{
		XLogRecPtr	initializedUpto;

		/*
		 * Before calling AdvanceXLInsertBuffer(), which can block, let others
		 * know how far we're finished with inserting the record.
		 *
		 * NB: If 'ptr' points to just after the page header, advertise a
		 * position at the beginning of the page rather than 'ptr' itself. If
		 * there are no other insertions running, someone might try to flush
		 * up to our advertised location. If we advertised a position after
		 * the page header, someone might try to flush the page header, even
		 * though page might actually not be initialized yet. As the first
		 * inserter on the page, we are effectively responsible for making
		 * sure that it's initialized, before we let insertingAt to move past
		 * the page header.
		 */
		if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
			XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
			initializedUpto = ptr - SizeOfXLogShortPHD;
		else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
				 XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
			initializedUpto = ptr - SizeOfXLogLongPHD;
		else
			initializedUpto = ptr;

		WALInsertLockUpdateInsertingAt(initializedUpto);

		AdvanceXLInsertBuffer(ptr, false);

		/* Re-read under the assumption AdvanceXLInsertBuffer set it up */
		endptr = XLogCtl->xlblocks[idx];

		if (expectedEndPtr != endptr)
			elog(PANIC, "could not find WAL buffer for %X/%X",
				 LSN_FORMAT_ARGS(ptr));
	}
	else
	{
		/*
		 * Make sure the initialization of the page is visible to us, and
		 * won't arrive later to overwrite the WAL data we write on the page.
		 */
		pg_memory_barrier();
	}

	/*
	 * Found the buffer holding this page. Return a pointer to the right
	 * offset within the page, and remember the lookup for the fast path
	 * next time.
	 */
	cachedPage = ptr / XLOG_BLCKSZ;
	cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;

	Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
	Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));

	return cachedPos + ptr % XLOG_BLCKSZ;
}
2010 
2011 /*
2012  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
2013  * is the position starting from the beginning of WAL, excluding all WAL
2014  * page headers.
2015  */
2016 static XLogRecPtr
XLogBytePosToRecPtr(uint64 bytepos)2017 XLogBytePosToRecPtr(uint64 bytepos)
2018 {
2019 	uint64		fullsegs;
2020 	uint64		fullpages;
2021 	uint64		bytesleft;
2022 	uint32		seg_offset;
2023 	XLogRecPtr	result;
2024 
2025 	fullsegs = bytepos / UsableBytesInSegment;
2026 	bytesleft = bytepos % UsableBytesInSegment;
2027 
2028 	if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2029 	{
2030 		/* fits on first page of segment */
2031 		seg_offset = bytesleft + SizeOfXLogLongPHD;
2032 	}
2033 	else
2034 	{
2035 		/* account for the first page on segment with long header */
2036 		seg_offset = XLOG_BLCKSZ;
2037 		bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2038 
2039 		fullpages = bytesleft / UsableBytesInPage;
2040 		bytesleft = bytesleft % UsableBytesInPage;
2041 
2042 		seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2043 	}
2044 
2045 	XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
2046 
2047 	return result;
2048 }
2049 
2050 /*
2051  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
2052  * returns a pointer to the beginning of the page (ie. before page header),
2053  * not to where the first xlog record on that page would go to. This is used
2054  * when converting a pointer to the end of a record.
2055  */
2056 static XLogRecPtr
XLogBytePosToEndRecPtr(uint64 bytepos)2057 XLogBytePosToEndRecPtr(uint64 bytepos)
2058 {
2059 	uint64		fullsegs;
2060 	uint64		fullpages;
2061 	uint64		bytesleft;
2062 	uint32		seg_offset;
2063 	XLogRecPtr	result;
2064 
2065 	fullsegs = bytepos / UsableBytesInSegment;
2066 	bytesleft = bytepos % UsableBytesInSegment;
2067 
2068 	if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2069 	{
2070 		/* fits on first page of segment */
2071 		if (bytesleft == 0)
2072 			seg_offset = 0;
2073 		else
2074 			seg_offset = bytesleft + SizeOfXLogLongPHD;
2075 	}
2076 	else
2077 	{
2078 		/* account for the first page on segment with long header */
2079 		seg_offset = XLOG_BLCKSZ;
2080 		bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2081 
2082 		fullpages = bytesleft / UsableBytesInPage;
2083 		bytesleft = bytesleft % UsableBytesInPage;
2084 
2085 		if (bytesleft == 0)
2086 			seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
2087 		else
2088 			seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2089 	}
2090 
2091 	XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
2092 
2093 	return result;
2094 }
2095 
2096 /*
2097  * Convert an XLogRecPtr to a "usable byte position".
2098  */
2099 static uint64
XLogRecPtrToBytePos(XLogRecPtr ptr)2100 XLogRecPtrToBytePos(XLogRecPtr ptr)
2101 {
2102 	uint64		fullsegs;
2103 	uint32		fullpages;
2104 	uint32		offset;
2105 	uint64		result;
2106 
2107 	XLByteToSeg(ptr, fullsegs, wal_segment_size);
2108 
2109 	fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
2110 	offset = ptr % XLOG_BLCKSZ;
2111 
2112 	if (fullpages == 0)
2113 	{
2114 		result = fullsegs * UsableBytesInSegment;
2115 		if (offset > 0)
2116 		{
2117 			Assert(offset >= SizeOfXLogLongPHD);
2118 			result += offset - SizeOfXLogLongPHD;
2119 		}
2120 	}
2121 	else
2122 	{
2123 		result = fullsegs * UsableBytesInSegment +
2124 			(XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
2125 			(fullpages - 1) * UsableBytesInPage;	/* full pages */
2126 		if (offset > 0)
2127 		{
2128 			Assert(offset >= SizeOfXLogShortPHD);
2129 			result += offset - SizeOfXLogShortPHD;
2130 		}
2131 	}
2132 
2133 	return result;
2134 }
2135 
2136 /*
2137  * Initialize XLOG buffers, writing out old buffers if they still contain
2138  * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
2139  * true, initialize as many pages as we can without having to write out
2140  * unwritten data. Any new pages are initialized to zeros, with pages headers
2141  * initialized properly.
2142  */
static void
AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	int			nextidx;
	XLogRecPtr	OldPageRqstPtr;
	XLogwrtRqst WriteRqst;
	XLogRecPtr	NewPageEndPtr = InvalidXLogRecPtr;
	XLogRecPtr	NewPageBeginPtr;
	XLogPageHeader NewPage;
	int			npages = 0;

	LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);

	/*
	 * Now that we have the lock, check if someone initialized the page
	 * already.  In opportunistic mode we keep going until we would have to
	 * evict a not-yet-written page.
	 */
	while (upto >= XLogCtl->InitializedUpTo || opportunistic)
	{
		/* buffer slot that the next uninitialized page maps to */
		nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);

		/*
		 * Get ending-offset of the buffer page we need to replace (this may
		 * be zero if the buffer hasn't been used yet).  Fall through if it's
		 * already written out.
		 */
		OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
		if (LogwrtResult.Write < OldPageRqstPtr)
		{
			/*
			 * Nope, got work to do. If we just want to pre-initialize as much
			 * as we can without flushing, give up now.
			 */
			if (opportunistic)
				break;

			/* Before waiting, get info_lck and update LogwrtResult */
			SpinLockAcquire(&XLogCtl->info_lck);
			if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
				XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
			LogwrtResult = XLogCtl->LogwrtResult;
			SpinLockRelease(&XLogCtl->info_lck);

			/*
			 * Now that we have an up-to-date LogwrtResult value, see if we
			 * still need to write it or if someone else already did.
			 */
			if (LogwrtResult.Write < OldPageRqstPtr)
			{
				/*
				 * Must acquire write lock. Release WALBufMappingLock first,
				 * to make sure that all insertions that we need to wait for
				 * can finish (up to this same position). Otherwise we risk
				 * deadlock.
				 */
				LWLockRelease(WALBufMappingLock);

				WaitXLogInsertionsToFinish(OldPageRqstPtr);

				LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);

				LogwrtResult = XLogCtl->LogwrtResult;
				if (LogwrtResult.Write >= OldPageRqstPtr)
				{
					/* OK, someone wrote it already */
					LWLockRelease(WALWriteLock);
				}
				else
				{
					/* Have to write it ourselves */
					TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
					WriteRqst.Write = OldPageRqstPtr;
					WriteRqst.Flush = 0;
					XLogWrite(WriteRqst, false);
					LWLockRelease(WALWriteLock);
					/* count the write we were forced to do for lack of buffers */
					WalStats.m_wal_buffers_full++;
					TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
				}
				/* Re-acquire WALBufMappingLock and retry */
				LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
				continue;
			}
		}

		/*
		 * Now the next buffer slot is free and we can set it up to be the
		 * next output page.
		 */
		NewPageBeginPtr = XLogCtl->InitializedUpTo;
		NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;

		Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);

		NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);

		/*
		 * Be sure to re-zero the buffer so that bytes beyond what we've
		 * written will look like zeroes and not valid XLOG records...
		 */
		MemSet((char *) NewPage, 0, XLOG_BLCKSZ);

		/*
		 * Fill the new page's header
		 */
		NewPage->xlp_magic = XLOG_PAGE_MAGIC;

		/* NewPage->xlp_info = 0; */	/* done by memset */
		NewPage->xlp_tli = ThisTimeLineID;
		NewPage->xlp_pageaddr = NewPageBeginPtr;

		/* NewPage->xlp_rem_len = 0; */	/* done by memset */

		/*
		 * If online backup is not in progress, mark the header to indicate
		 * that WAL records beginning in this page have removable backup
		 * blocks.  This allows the WAL archiver to know whether it is safe to
		 * compress archived WAL data by transforming full-block records into
		 * the non-full-block format.  It is sufficient to record this at the
		 * page level because we force a page switch (in fact a segment
		 * switch) when starting a backup, so the flag will be off before any
		 * records can be written during the backup.  At the end of a backup,
		 * the last page will be marked as all unsafe when perhaps only part
		 * is unsafe, but at worst the archiver would miss the opportunity to
		 * compress a few records.
		 */
		if (!Insert->forcePageWrites)
			NewPage->xlp_info |= XLP_BKP_REMOVABLE;

		/*
		 * If a record was found to be broken at the end of recovery, and
		 * we're going to write on the page where its first contrecord was
		 * lost, set the XLP_FIRST_IS_OVERWRITE_CONTRECORD flag on the page
		 * header.  See CreateOverwriteContrecordRecord().
		 */
		if (missingContrecPtr == NewPageBeginPtr)
		{
			NewPage->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
			missingContrecPtr = InvalidXLogRecPtr;
		}

		/*
		 * If first page of an XLOG segment file, make it a long header.
		 */
		if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
		{
			XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;

			NewLongPage->xlp_sysid = ControlFile->system_identifier;
			NewLongPage->xlp_seg_size = wal_segment_size;
			NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
			NewPage->xlp_info |= XLP_LONG_HEADER;
		}

		/*
		 * Make sure the initialization of the page becomes visible to others
		 * before the xlblocks update. GetXLogBuffer() reads xlblocks without
		 * holding a lock.
		 */
		pg_write_barrier();

		*((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;

		XLogCtl->InitializedUpTo = NewPageEndPtr;

		npages++;
	}
	LWLockRelease(WALBufMappingLock);

#ifdef WAL_DEBUG
	if (XLOG_DEBUG && npages > 0)
	{
		elog(DEBUG1, "initialized %d pages, up to %X/%X",
			 npages, LSN_FORMAT_ARGS(NewPageEndPtr));
	}
#endif
}
2320 
2321 /*
2322  * Calculate CheckPointSegments based on max_wal_size_mb and
2323  * checkpoint_completion_target.
2324  */
2325 static void
CalculateCheckpointSegments(void)2326 CalculateCheckpointSegments(void)
2327 {
2328 	double		target;
2329 
2330 	/*-------
2331 	 * Calculate the distance at which to trigger a checkpoint, to avoid
2332 	 * exceeding max_wal_size_mb. This is based on two assumptions:
2333 	 *
2334 	 * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
2335 	 *    WAL for two checkpoint cycles to allow us to recover from the
2336 	 *    secondary checkpoint if the first checkpoint failed, though we
2337 	 *    only did this on the primary anyway, not on standby. Keeping just
2338 	 *    one checkpoint simplifies processing and reduces disk space in
2339 	 *    many smaller databases.)
2340 	 * b) during checkpoint, we consume checkpoint_completion_target *
2341 	 *	  number of segments consumed between checkpoints.
2342 	 *-------
2343 	 */
2344 	target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
2345 		(1.0 + CheckPointCompletionTarget);
2346 
2347 	/* round down */
2348 	CheckPointSegments = (int) target;
2349 
2350 	if (CheckPointSegments < 1)
2351 		CheckPointSegments = 1;
2352 }
2353 
/*
 * GUC assign hook for max_wal_size: store the new value and recompute the
 * derived checkpoint-trigger distance.
 */
void
assign_max_wal_size(int newval, void *extra)
{
	max_wal_size_mb = newval;
	CalculateCheckpointSegments();
}
2360 
/*
 * GUC assign hook for checkpoint_completion_target: store the new value and
 * recompute the derived checkpoint-trigger distance.
 */
void
assign_checkpoint_completion_target(double newval, void *extra)
{
	CheckPointCompletionTarget = newval;
	CalculateCheckpointSegments();
}
2367 
2368 /*
2369  * At a checkpoint, how many WAL segments to recycle as preallocated future
2370  * XLOG segments? Returns the highest segment that should be preallocated.
2371  */
2372 static XLogSegNo
XLOGfileslop(XLogRecPtr lastredoptr)2373 XLOGfileslop(XLogRecPtr lastredoptr)
2374 {
2375 	XLogSegNo	minSegNo;
2376 	XLogSegNo	maxSegNo;
2377 	double		distance;
2378 	XLogSegNo	recycleSegNo;
2379 
2380 	/*
2381 	 * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
2382 	 * correspond to. Always recycle enough segments to meet the minimum, and
2383 	 * remove enough segments to stay below the maximum.
2384 	 */
2385 	minSegNo = lastredoptr / wal_segment_size +
2386 		ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
2387 	maxSegNo = lastredoptr / wal_segment_size +
2388 		ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
2389 
2390 	/*
2391 	 * Between those limits, recycle enough segments to get us through to the
2392 	 * estimated end of next checkpoint.
2393 	 *
2394 	 * To estimate where the next checkpoint will finish, assume that the
2395 	 * system runs steadily consuming CheckPointDistanceEstimate bytes between
2396 	 * every checkpoint.
2397 	 */
2398 	distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2399 	/* add 10% for good measure. */
2400 	distance *= 1.10;
2401 
2402 	recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
2403 									wal_segment_size);
2404 
2405 	if (recycleSegNo < minSegNo)
2406 		recycleSegNo = minSegNo;
2407 	if (recycleSegNo > maxSegNo)
2408 		recycleSegNo = maxSegNo;
2409 
2410 	return recycleSegNo;
2411 }
2412 
2413 /*
2414  * Check whether we've consumed enough xlog space that a checkpoint is needed.
2415  *
2416  * new_segno indicates a log file that has just been filled up (or read
2417  * during recovery). We measure the distance from RedoRecPtr to new_segno
2418  * and see if that exceeds CheckPointSegments.
2419  *
2420  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2421  */
2422 static bool
XLogCheckpointNeeded(XLogSegNo new_segno)2423 XLogCheckpointNeeded(XLogSegNo new_segno)
2424 {
2425 	XLogSegNo	old_segno;
2426 
2427 	XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
2428 
2429 	if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2430 		return true;
2431 	return false;
2432 }
2433 
/*
 * Write and/or fsync the log at least as far as WriteRqst indicates.
 *
 * If flexible == true, we don't have to write as far as WriteRqst, but
 * may stop at any convenient boundary (such as a cache or logfile boundary).
 * This option allows us to avoid uselessly issuing multiple writes when a
 * single one would do.
 *
 * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
 * must be called before grabbing the lock, to make sure the data is ready to
 * write.
 */
static void
XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
{
	bool		ispartialpage;	/* does WriteRqst.Write end mid-page? */
	bool		last_iteration; /* will this be the final loop pass? */
	bool		finishing_seg;	/* did we just fill a whole segment? */
	bool		use_existent;
	int			curridx;		/* cache buffer index under consideration */
	int			npages;			/* number of pages gathered for one write() */
	int			startidx;		/* cache index of first gathered page */
	uint32		startoffset;	/* file offset of first gathered page */

	/* We should always be inside a critical section here */
	Assert(CritSectionCount > 0);

	/*
	 * Update local LogwrtResult (caller probably did this already, but...)
	 */
	LogwrtResult = XLogCtl->LogwrtResult;

	/*
	 * Since successive pages in the xlog cache are consecutively allocated,
	 * we can usually gather multiple pages together and issue just one
	 * write() call.  npages is the number of pages we have determined can be
	 * written together; startidx is the cache block index of the first one,
	 * and startoffset is the file offset at which it should go. The latter
	 * two variables are only valid when npages > 0, but we must initialize
	 * all of them to keep the compiler quiet.
	 */
	npages = 0;
	startidx = 0;
	startoffset = 0;

	/*
	 * Within the loop, curridx is the cache block index of the page to
	 * consider writing.  Begin at the buffer containing the next unwritten
	 * page, or last partially written page.
	 */
	curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);

	while (LogwrtResult.Write < WriteRqst.Write)
	{
		/*
		 * Make sure we're not ahead of the insert process.  This could happen
		 * if we're passed a bogus WriteRqst.Write that is past the end of the
		 * last page that's been initialized by AdvanceXLInsertBuffer.
		 */
		XLogRecPtr	EndPtr = XLogCtl->xlblocks[curridx];

		if (LogwrtResult.Write >= EndPtr)
			elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
				 LSN_FORMAT_ARGS(LogwrtResult.Write),
				 LSN_FORMAT_ARGS(EndPtr));

		/* Advance LogwrtResult.Write to end of current buffer page */
		LogwrtResult.Write = EndPtr;
		ispartialpage = WriteRqst.Write < LogwrtResult.Write;

		if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
							 wal_segment_size))
		{
			/*
			 * Switch to new logfile segment.  We cannot have any pending
			 * pages here (since we dump what we have at segment end).
			 */
			Assert(npages == 0);
			if (openLogFile >= 0)
				XLogFileClose();
			XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
							wal_segment_size);

			/* create/use new log file */
			use_existent = true;
			openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
			/* account for the file descriptor just opened */
			ReserveExternalFD();
		}

		/* Make sure we have the current logfile open */
		if (openLogFile < 0)
		{
			XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
							wal_segment_size);
			openLogFile = XLogFileOpen(openLogSegNo);
			ReserveExternalFD();
		}

		/* Add current page to the set of pending pages-to-dump */
		if (npages == 0)
		{
			/* first of group */
			startidx = curridx;
			startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
											wal_segment_size);
		}
		npages++;

		/*
		 * Dump the set if this will be the last loop iteration, or if we are
		 * at the last page of the cache area (since the next page won't be
		 * contiguous in memory), or if we are at the end of the logfile
		 * segment.
		 */
		last_iteration = WriteRqst.Write <= LogwrtResult.Write;

		finishing_seg = !ispartialpage &&
			(startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;

		if (last_iteration ||
			curridx == XLogCtl->XLogCacheBlck ||
			finishing_seg)
		{
			char	   *from;
			Size		nbytes;
			Size		nleft;
			int			written;
			instr_time	start;

			/* OK to write the page(s) */
			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
			nbytes = npages * (Size) XLOG_BLCKSZ;
			nleft = nbytes;
			do
			{
				errno = 0;

				/* Measure I/O timing to write WAL data */
				if (track_wal_io_timing)
					INSTR_TIME_SET_CURRENT(start);

				pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
				written = pg_pwrite(openLogFile, from, nleft, startoffset);
				pgstat_report_wait_end();

				/*
				 * Increment the I/O timing and the number of times WAL data
				 * were written out to disk.
				 */
				if (track_wal_io_timing)
				{
					instr_time	duration;

					INSTR_TIME_SET_CURRENT(duration);
					INSTR_TIME_SUBTRACT(duration, start);
					WalStats.m_wal_write_time += INSTR_TIME_GET_MICROSEC(duration);
				}

				WalStats.m_wal_write++;

				if (written <= 0)
				{
					char		xlogfname[MAXFNAMELEN];
					int			save_errno;

					/* a signal-interrupted write is not an error; retry */
					if (errno == EINTR)
						continue;

					/* XLogFileName could clobber the errno needed for %m */
					save_errno = errno;
					XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo,
								 wal_segment_size);
					errno = save_errno;
					ereport(PANIC,
							(errcode_for_file_access(),
							 errmsg("could not write to log file %s "
									"at offset %u, length %zu: %m",
									xlogfname, startoffset, nleft)));
				}
				/* on a short write, advance pointers and write the rest */
				nleft -= written;
				from += written;
				startoffset += written;
			} while (nleft > 0);

			npages = 0;

			/*
			 * If we just wrote the whole last page of a logfile segment,
			 * fsync the segment immediately.  This avoids having to go back
			 * and re-open prior segments when an fsync request comes along
			 * later. Doing it here ensures that one and only one backend will
			 * perform this fsync.
			 *
			 * This is also the right place to notify the Archiver that the
			 * segment is ready to copy to archival storage, and to update the
			 * timer for archive_timeout, and to signal for a checkpoint if
			 * too many logfile segments have been used since the last
			 * checkpoint.
			 */
			if (finishing_seg)
			{
				issue_xlog_fsync(openLogFile, openLogSegNo);

				/* signal that we need to wakeup walsenders later */
				WalSndWakeupRequest();

				LogwrtResult.Flush = LogwrtResult.Write;	/* end of page */

				if (XLogArchivingActive())
					XLogArchiveNotifySeg(openLogSegNo);

				XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
				XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;

				/*
				 * Request a checkpoint if we've consumed too much xlog since
				 * the last one.  For speed, we first check using the local
				 * copy of RedoRecPtr, which might be out of date; if it looks
				 * like a checkpoint is needed, forcibly update RedoRecPtr and
				 * recheck.
				 */
				if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
				{
					(void) GetRedoRecPtr();
					if (XLogCheckpointNeeded(openLogSegNo))
						RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
				}
			}
		}

		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
		curridx = NextBufIdx(curridx);

		/* If flexible, break out of loop as soon as we wrote something */
		if (flexible && npages == 0)
			break;
	}

	Assert(npages == 0);

	/*
	 * If asked to flush, do so
	 */
	if (LogwrtResult.Flush < WriteRqst.Flush &&
		LogwrtResult.Flush < LogwrtResult.Write)

	{
		/*
		 * Could get here without iterating above loop, in which case we might
		 * have no open file or the wrong one.  However, we do not need to
		 * fsync more than one file.
		 */
		if (sync_method != SYNC_METHOD_OPEN &&
			sync_method != SYNC_METHOD_OPEN_DSYNC)
		{
			if (openLogFile >= 0 &&
				!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
								 wal_segment_size))
				XLogFileClose();
			if (openLogFile < 0)
			{
				XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
								wal_segment_size);
				openLogFile = XLogFileOpen(openLogSegNo);
				ReserveExternalFD();
			}

			issue_xlog_fsync(openLogFile, openLogSegNo);
		}

		/* signal that we need to wakeup walsenders later */
		WalSndWakeupRequest();

		LogwrtResult.Flush = LogwrtResult.Write;
	}

	/*
	 * Update shared-memory status
	 *
	 * We make sure that the shared 'request' values do not fall behind the
	 * 'result' values.  This is not absolutely essential, but it saves some
	 * code in a couple of places.
	 */
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		XLogCtl->LogwrtResult = LogwrtResult;
		if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
			XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
		if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
			XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
		SpinLockRelease(&XLogCtl->info_lck);
	}
}
2731 
2732 /*
2733  * Record the LSN for an asynchronous transaction commit/abort
2734  * and nudge the WALWriter if there is work for it to do.
2735  * (This should not be called for synchronous commits.)
2736  */
2737 void
XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)2738 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2739 {
2740 	XLogRecPtr	WriteRqstPtr = asyncXactLSN;
2741 	bool		sleeping;
2742 
2743 	SpinLockAcquire(&XLogCtl->info_lck);
2744 	LogwrtResult = XLogCtl->LogwrtResult;
2745 	sleeping = XLogCtl->WalWriterSleeping;
2746 	if (XLogCtl->asyncXactLSN < asyncXactLSN)
2747 		XLogCtl->asyncXactLSN = asyncXactLSN;
2748 	SpinLockRelease(&XLogCtl->info_lck);
2749 
2750 	/*
2751 	 * If the WALWriter is sleeping, we should kick it to make it come out of
2752 	 * low-power mode.  Otherwise, determine whether there's a full page of
2753 	 * WAL available to write.
2754 	 */
2755 	if (!sleeping)
2756 	{
2757 		/* back off to last completed page boundary */
2758 		WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2759 
2760 		/* if we have already flushed that far, we're done */
2761 		if (WriteRqstPtr <= LogwrtResult.Flush)
2762 			return;
2763 	}
2764 
2765 	/*
2766 	 * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2767 	 * to come out of low-power mode so that this async commit will reach disk
2768 	 * within the expected amount of time.
2769 	 */
2770 	if (ProcGlobal->walwriterLatch)
2771 		SetLatch(ProcGlobal->walwriterLatch);
2772 }
2773 
/*
 * Record the LSN up to which we can remove WAL because it's not required by
 * any replication slot.
 *
 * The value is stored in shared memory (XLogCtl->replicationSlotMinLSN)
 * under info_lck, and read back via XLogGetReplicationSlotMinimumLSN().
 */
void
XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
{
	/* spinlock guards concurrent readers/writers of the shared field */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->replicationSlotMinLSN = lsn;
	SpinLockRelease(&XLogCtl->info_lck);
}
2785 
2786 
2787 /*
2788  * Return the oldest LSN we must retain to satisfy the needs of some
2789  * replication slot.
2790  */
2791 static XLogRecPtr
XLogGetReplicationSlotMinimumLSN(void)2792 XLogGetReplicationSlotMinimumLSN(void)
2793 {
2794 	XLogRecPtr	retval;
2795 
2796 	SpinLockAcquire(&XLogCtl->info_lck);
2797 	retval = XLogCtl->replicationSlotMinLSN;
2798 	SpinLockRelease(&XLogCtl->info_lck);
2799 
2800 	return retval;
2801 }
2802 
/*
 * Advance minRecoveryPoint in control file.
 *
 * If we crash during recovery, we must reach this point again before the
 * database is consistent.
 *
 * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
 * is only updated if it's not already greater than or equal to 'lsn'.
 *
 * Takes ControlFileLock exclusively while reading and updating the control
 * file; the local copies minRecoveryPoint / minRecoveryPointTLI are
 * refreshed as a side effect.
 */
static void
UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
{
	/* Quick check using our local copy of the variable */
	if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
		return;

	/*
	 * An invalid minRecoveryPoint means that we need to recover all the WAL,
	 * i.e., we're doing crash recovery.  We never modify the control file's
	 * value in that case, so we can short-circuit future checks here too. The
	 * local values of minRecoveryPoint and minRecoveryPointTLI should not be
	 * updated until crash recovery finishes.  We only do this for the startup
	 * process as it should not update its own reference of minRecoveryPoint
	 * until it has finished crash recovery to make sure that all WAL
	 * available is replayed in this case.  This also saves from extra locks
	 * taken on the control file from the startup process.
	 */
	if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
	{
		updateMinRecoveryPoint = false;
		return;
	}

	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	/* update local copy */
	minRecoveryPoint = ControlFile->minRecoveryPoint;
	minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;

	/* invalid on-disk value: stop trying to update (crash recovery) */
	if (XLogRecPtrIsInvalid(minRecoveryPoint))
		updateMinRecoveryPoint = false;
	else if (force || minRecoveryPoint < lsn)
	{
		XLogRecPtr	newMinRecoveryPoint;
		TimeLineID	newMinRecoveryPointTLI;

		/*
		 * To avoid having to update the control file too often, we update it
		 * all the way to the last record being replayed, even though 'lsn'
		 * would suffice for correctness.  This also allows the 'force' case
		 * to not need a valid 'lsn' value.
		 *
		 * Another important reason for doing it this way is that the passed
		 * 'lsn' value could be bogus, i.e., past the end of available WAL, if
		 * the caller got it from a corrupted heap page.  Accepting such a
		 * value as the min recovery point would prevent us from coming up at
		 * all.  Instead, we just log a warning and continue with recovery.
		 * (See also the comments about corrupt LSNs in XLogFlush.)
		 */
		SpinLockAcquire(&XLogCtl->info_lck);
		newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
		newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
		SpinLockRelease(&XLogCtl->info_lck);

		/* warn (but proceed) if the request is past the replay position */
		if (!force && newMinRecoveryPoint < lsn)
			elog(WARNING,
				 "xlog min recovery request %X/%X is past current point %X/%X",
				 LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint));

		/* update control file */
		if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
		{
			ControlFile->minRecoveryPoint = newMinRecoveryPoint;
			ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
			UpdateControlFile();
			minRecoveryPoint = newMinRecoveryPoint;
			minRecoveryPointTLI = newMinRecoveryPointTLI;

			ereport(DEBUG2,
					(errmsg_internal("updated min recovery point to %X/%X on timeline %u",
									 LSN_FORMAT_ARGS(minRecoveryPoint),
									 newMinRecoveryPointTLI)));
		}
	}
	LWLockRelease(ControlFileLock);
}
2889 
/*
 * Ensure that all XLOG data through the given position is flushed to disk.
 *
 * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
 * already held, and we try to avoid acquiring it if possible.
 */
void
XLogFlush(XLogRecPtr record)
{
	XLogRecPtr	WriteRqstPtr;
	XLogwrtRqst WriteRqst;

	/*
	 * During REDO, we are reading not writing WAL.  Therefore, instead of
	 * trying to flush the WAL, we should update minRecoveryPoint instead. We
	 * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
	 * to act this way too, and because when it tries to write the
	 * end-of-recovery checkpoint, it should indeed flush.
	 */
	if (!XLogInsertAllowed())
	{
		UpdateMinRecoveryPoint(record, false);
		return;
	}

	/* Quick exit if already known flushed */
	if (record <= LogwrtResult.Flush)
		return;

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
			 LSN_FORMAT_ARGS(record),
			 LSN_FORMAT_ARGS(LogwrtResult.Write),
			 LSN_FORMAT_ARGS(LogwrtResult.Flush));
#endif

	START_CRIT_SECTION();

	/*
	 * Since fsync is usually a horribly expensive operation, we try to
	 * piggyback as much data as we can on each fsync: if we see any more data
	 * entered into the xlog buffer, we'll write and fsync that too, so that
	 * the final value of LogwrtResult.Flush is as large as possible. This
	 * gives us some chance of avoiding another fsync immediately after.
	 */

	/* initialize to given target; may increase below */
	WriteRqstPtr = record;

	/*
	 * Now wait until we get the write lock, or someone else does the flush
	 * for us.
	 */
	for (;;)
	{
		XLogRecPtr	insertpos;

		/* read LogwrtResult and update local state */
		SpinLockAcquire(&XLogCtl->info_lck);
		/* extend our request to cover anything others already asked for */
		if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
			WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
		LogwrtResult = XLogCtl->LogwrtResult;
		SpinLockRelease(&XLogCtl->info_lck);

		/* done already? */
		if (record <= LogwrtResult.Flush)
			break;

		/*
		 * Before actually performing the write, wait for all in-flight
		 * insertions to the pages we're about to write to finish.
		 */
		insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);

		/*
		 * Try to get the write lock. If we can't get it immediately, wait
		 * until it's released, and recheck if we still need to do the flush
		 * or if the backend that held the lock did it for us already. This
		 * helps to maintain a good rate of group committing when the system
		 * is bottlenecked by the speed of fsyncing.
		 */
		if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
		{
			/*
			 * The lock is now free, but we didn't acquire it yet. Before we
			 * do, loop back to check if someone else flushed the record for
			 * us already.
			 */
			continue;
		}

		/* Got the lock; recheck whether request is satisfied */
		LogwrtResult = XLogCtl->LogwrtResult;
		if (record <= LogwrtResult.Flush)
		{
			LWLockRelease(WALWriteLock);
			break;
		}

		/*
		 * Sleep before flush! By adding a delay here, we may give further
		 * backends the opportunity to join the backlog of group commit
		 * followers; this can significantly improve transaction throughput,
		 * at the risk of increasing transaction latency.
		 *
		 * We do not sleep if enableFsync is not turned on, nor if there are
		 * fewer than CommitSiblings other backends with active transactions.
		 */
		if (CommitDelay > 0 && enableFsync &&
			MinimumActiveBackends(CommitSiblings))
		{
			pg_usleep(CommitDelay);

			/*
			 * Re-check how far we can now flush the WAL. It's generally not
			 * safe to call WaitXLogInsertionsToFinish while holding
			 * WALWriteLock, because an in-progress insertion might need to
			 * also grab WALWriteLock to make progress. But we know that all
			 * the insertions up to insertpos have already finished, because
			 * that's what the earlier WaitXLogInsertionsToFinish() returned.
			 * We're only calling it again to allow insertpos to be moved
			 * further forward, not to actually wait for anyone.
			 */
			insertpos = WaitXLogInsertionsToFinish(insertpos);
		}

		/* try to write/flush later additions to XLOG as well */
		WriteRqst.Write = insertpos;
		WriteRqst.Flush = insertpos;

		/* flexible = false: write/flush everything up to insertpos */
		XLogWrite(WriteRqst, false);

		LWLockRelease(WALWriteLock);
		/* done */
		break;
	}

	END_CRIT_SECTION();

	/* wake up walsenders now that we've released heavily contended locks */
	WalSndWakeupProcessRequests();

	/*
	 * If we still haven't flushed to the request point then we have a
	 * problem; most likely, the requested flush point is past end of XLOG.
	 * This has been seen to occur when a disk page has a corrupted LSN.
	 *
	 * Formerly we treated this as a PANIC condition, but that hurts the
	 * system's robustness rather than helping it: we do not want to take down
	 * the whole system due to corruption on one data page.  In particular, if
	 * the bad page is encountered again during recovery then we would be
	 * unable to restart the database at all!  (This scenario actually
	 * happened in the field several times with 7.1 releases.)	As of 8.4, bad
	 * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
	 * the only time we can reach here during recovery is while flushing the
	 * end-of-recovery checkpoint record, and we don't expect that to have a
	 * bad LSN.
	 *
	 * Note that for calls from xact.c, the ERROR will be promoted to PANIC
	 * since xact.c calls this routine inside a critical section.  However,
	 * calls from bufmgr.c are not within critical sections and so we will not
	 * force a restart for a bad LSN on a data page.
	 */
	if (LogwrtResult.Flush < record)
		elog(ERROR,
			 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
			 LSN_FORMAT_ARGS(record),
			 LSN_FORMAT_ARGS(LogwrtResult.Flush));
}
3060 
/*
 * Write & flush xlog, but without specifying exactly where to.
 *
 * We normally write only completed blocks; but if there is nothing to do on
 * that basis, we check for unwritten async commits in the current incomplete
 * block, and write through the latest one of those.  Thus, if async commits
 * are not being used, we will write complete blocks only.
 *
 * If, based on the above, there's anything to write we do so immediately. But
 * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
 * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
 * more than wal_writer_flush_after unflushed blocks.
 *
 * We can guarantee that async commits reach disk after at most three
 * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
 * to write "flexibly", meaning it can stop at the end of the buffer ring;
 * this makes a difference only with very high load or long wal_writer_delay,
 * but imposes one extra cycle for the worst case for async commits.)
 *
 * This routine is invoked periodically by the background walwriter process.
 *
 * Returns true if there was any work to do, even if we skipped flushing due
 * to wal_writer_delay/wal_writer_flush_after.
 */
bool
XLogBackgroundFlush(void)
{
	XLogwrtRqst WriteRqst;
	bool		flexible = true;
	static TimestampTz lastflush;	/* persists across calls; 0 on first */
	TimestampTz now;
	int			flushbytes;		/* actually a count of unflushed blocks */

	/* XLOG doesn't need flushing during recovery */
	if (RecoveryInProgress())
		return false;

	/* read LogwrtResult and update local state */
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	WriteRqst = XLogCtl->LogwrtRqst;
	SpinLockRelease(&XLogCtl->info_lck);

	/* back off to last completed page boundary */
	WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;

	/* if we have already flushed that far, consider async commit records */
	if (WriteRqst.Write <= LogwrtResult.Flush)
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		WriteRqst.Write = XLogCtl->asyncXactLSN;
		SpinLockRelease(&XLogCtl->info_lck);
		flexible = false;		/* ensure it all gets written */
	}

	/*
	 * If already known flushed, we're done. Just need to check if we are
	 * holding an open file handle to a logfile that's no longer in use,
	 * preventing the file from being deleted.
	 */
	if (WriteRqst.Write <= LogwrtResult.Flush)
	{
		if (openLogFile >= 0)
		{
			if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
								 wal_segment_size))
			{
				XLogFileClose();
			}
		}
		return false;
	}

	/*
	 * Determine how far to flush WAL, based on the wal_writer_delay and
	 * wal_writer_flush_after GUCs.
	 */
	now = GetCurrentTimestamp();
	/* number of completed-but-unflushed blocks */
	flushbytes =
		WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;

	if (WalWriterFlushAfter == 0 || lastflush == 0)
	{
		/* first call, or block based limits disabled */
		WriteRqst.Flush = WriteRqst.Write;
		lastflush = now;
	}
	else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
	{
		/*
		 * Flush the writes at least every WalWriterDelay ms. This is
		 * important to bound the amount of time it takes for an asynchronous
		 * commit to hit disk.
		 */
		WriteRqst.Flush = WriteRqst.Write;
		lastflush = now;
	}
	else if (flushbytes >= WalWriterFlushAfter)
	{
		/* exceeded wal_writer_flush_after blocks, flush */
		WriteRqst.Flush = WriteRqst.Write;
		lastflush = now;
	}
	else
	{
		/* no flushing, this time round */
		WriteRqst.Flush = 0;
	}

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
			 LSN_FORMAT_ARGS(WriteRqst.Write),
			 LSN_FORMAT_ARGS(WriteRqst.Flush),
			 LSN_FORMAT_ARGS(LogwrtResult.Write),
			 LSN_FORMAT_ARGS(LogwrtResult.Flush));
#endif

	START_CRIT_SECTION();

	/* now wait for any in-progress insertions to finish and get write lock */
	WaitXLogInsertionsToFinish(WriteRqst.Write);
	LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
	/* recheck under the lock; someone may have done our work meanwhile */
	LogwrtResult = XLogCtl->LogwrtResult;
	if (WriteRqst.Write > LogwrtResult.Write ||
		WriteRqst.Flush > LogwrtResult.Flush)
	{
		XLogWrite(WriteRqst, flexible);
	}
	LWLockRelease(WALWriteLock);

	END_CRIT_SECTION();

	/* wake up walsenders now that we've released heavily contended locks */
	WalSndWakeupProcessRequests();

	/*
	 * Great, done. To take some work off the critical path, try to initialize
	 * as many of the no-longer-needed WAL buffers for future use as we can.
	 */
	AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);

	/*
	 * If we determined that we need to write data, but somebody else
	 * wrote/flushed already, it should be considered as being active, to
	 * avoid hibernating too early.
	 */
	return true;
}
3210 
3211 /*
3212  * Test whether XLOG data has been flushed up to (at least) the given position.
3213  *
3214  * Returns true if a flush is still needed.  (It may be that someone else
3215  * is already in process of flushing that far, however.)
3216  */
3217 bool
XLogNeedsFlush(XLogRecPtr record)3218 XLogNeedsFlush(XLogRecPtr record)
3219 {
3220 	/*
3221 	 * During recovery, we don't flush WAL but update minRecoveryPoint
3222 	 * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3223 	 * would need to be updated.
3224 	 */
3225 	if (RecoveryInProgress())
3226 	{
3227 		/*
3228 		 * An invalid minRecoveryPoint means that we need to recover all the
3229 		 * WAL, i.e., we're doing crash recovery.  We never modify the control
3230 		 * file's value in that case, so we can short-circuit future checks
3231 		 * here too.  This triggers a quick exit path for the startup process,
3232 		 * which cannot update its local copy of minRecoveryPoint as long as
3233 		 * it has not replayed all WAL available when doing crash recovery.
3234 		 */
3235 		if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
3236 			updateMinRecoveryPoint = false;
3237 
3238 		/* Quick exit if already known to be updated or cannot be updated */
3239 		if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3240 			return false;
3241 
3242 		/*
3243 		 * Update local copy of minRecoveryPoint. But if the lock is busy,
3244 		 * just return a conservative guess.
3245 		 */
3246 		if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3247 			return true;
3248 		minRecoveryPoint = ControlFile->minRecoveryPoint;
3249 		minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3250 		LWLockRelease(ControlFileLock);
3251 
3252 		/*
3253 		 * Check minRecoveryPoint for any other process than the startup
3254 		 * process doing crash recovery, which should not update the control
3255 		 * file value if crash recovery is still running.
3256 		 */
3257 		if (XLogRecPtrIsInvalid(minRecoveryPoint))
3258 			updateMinRecoveryPoint = false;
3259 
3260 		/* check again */
3261 		if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3262 			return false;
3263 		else
3264 			return true;
3265 	}
3266 
3267 	/* Quick exit if already known flushed */
3268 	if (record <= LogwrtResult.Flush)
3269 		return false;
3270 
3271 	/* read LogwrtResult and update local state */
3272 	SpinLockAcquire(&XLogCtl->info_lck);
3273 	LogwrtResult = XLogCtl->LogwrtResult;
3274 	SpinLockRelease(&XLogCtl->info_lck);
3275 
3276 	/* check again */
3277 	if (record <= LogwrtResult.Flush)
3278 		return false;
3279 
3280 	return true;
3281 }
3282 
3283 /*
3284  * Create a new XLOG file segment, or open a pre-existing one.
3285  *
3286  * logsegno: identify segment to be created/opened.
3287  *
3288  * *use_existent: if true, OK to use a pre-existing file (else, any
3289  * pre-existing file will be deleted).  On return, true if a pre-existing
3290  * file was used.
3291  *
3292  * use_lock: if true, acquire ControlFileLock while moving file into
3293  * place.  This should be true except during bootstrap log creation.  The
3294  * caller must *not* hold the lock at call.
3295  *
3296  * Returns FD of opened file.
3297  *
3298  * Note: errors here are ERROR not PANIC because we might or might not be
3299  * inside a critical section (eg, during checkpoint there is no reason to
3300  * take down the system on failure).  They will promote to PANIC if we are
3301  * in a critical section.
3302  */
int
XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
{
	char		path[MAXPGPATH];	/* final name of the segment */
	char		tmppath[MAXPGPATH]; /* temp name used while zero-filling */
	PGAlignedXLogBlock zbuffer;
	XLogSegNo	installed_segno;
	XLogSegNo	max_segno;
	int			fd;
	int			save_errno;

	XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size);

	/*
	 * Try to use existent file (checkpoint maker may have created it already)
	 */
	if (*use_existent)
	{
		fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
		if (fd < 0)
		{
			/* ENOENT just means we must create the segment ourselves below */
			if (errno != ENOENT)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not open file \"%s\": %m", path)));
		}
		else
			return fd;
	}

	/*
	 * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
	 * another process is doing the same thing.  If so, we will end up
	 * pre-creating an extra log segment.  That seems OK, and better than
	 * holding the lock throughout this lengthy process.
	 */
	elog(DEBUG2, "creating and filling new WAL file");

	/* Build under a temp name so a partially-filled file is never visible */
	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());

	unlink(tmppath);

	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tmppath)));

	memset(zbuffer.data, 0, XLOG_BLCKSZ);

	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
	save_errno = 0;
	if (wal_init_zero)
	{
		struct iovec iov[PG_IOV_MAX];
		int			blocks;

		/*
		 * Zero-fill the file.  With this setting, we do this the hard way to
		 * ensure that all the file space has really been allocated.  On
		 * platforms that allow "holes" in files, just seeking to the end
		 * doesn't allocate intermediate space.  This way, we know that we
		 * have all the space and (after the fsync below) that all the
		 * indirect blocks are down on disk.  Therefore, fdatasync(2) or
		 * O_DSYNC will be sufficient to sync future writes to the log file.
		 */

		/* Prepare to write out a lot of copies of our zero buffer at once. */
		for (int i = 0; i < lengthof(iov); ++i)
		{
			iov[i].iov_base = zbuffer.data;
			iov[i].iov_len = XLOG_BLCKSZ;
		}

		/* Loop, writing as many blocks as we can for each system call. */
		blocks = wal_segment_size / XLOG_BLCKSZ;
		for (int i = 0; i < blocks;)
		{
			int			iovcnt = Min(blocks - i, lengthof(iov));
			off_t		offset = i * XLOG_BLCKSZ;

			if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0)
			{
				/* remember the failure; it is reported after the wait ends */
				save_errno = errno;
				break;
			}

			i += iovcnt;
		}
	}
	else
	{
		/*
		 * Otherwise, seeking to the end and writing a solitary byte is
		 * enough.
		 */
		errno = 0;
		if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
		{
			/* if write didn't set errno, assume no disk space */
			save_errno = errno ? errno : ENOSPC;
		}
	}
	pgstat_report_wait_end();

	if (save_errno)
	{
		/*
		 * If we fail to make the file, delete it to release disk space
		 */
		unlink(tmppath);

		close(fd);

		errno = save_errno;

		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m", tmppath)));
	}

	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
	if (pg_fsync(fd) != 0)
	{
		int			save_errno = errno;

		close(fd);
		errno = save_errno;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", tmppath)));
	}
	pgstat_report_wait_end();

	if (close(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));

	/*
	 * Now move the segment into place with its final name.
	 *
	 * If caller didn't want to use a pre-existing file, get rid of any
	 * pre-existing file.  Otherwise, cope with possibility that someone else
	 * has created the file while we were filling ours: if so, use ours to
	 * pre-create a future log segment.
	 */
	installed_segno = logsegno;

	/*
	 * XXX: What should we use as max_segno? We used to use XLOGfileslop when
	 * that was a constant, but that was always a bit dubious: normally, at a
	 * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
	 * here, it was the offset from the insert location. We can't do the
	 * normal XLOGfileslop calculation here because we don't have access to
	 * the prior checkpoint's redo location. So somewhat arbitrarily, just use
	 * CheckPointSegments.
	 */
	max_segno = logsegno + CheckPointSegments;
	if (!InstallXLogFileSegment(&installed_segno, tmppath,
								*use_existent, max_segno,
								use_lock))
	{
		/*
		 * No need for any more future segments, or InstallXLogFileSegment()
		 * failed to rename the file into place. If the rename failed, opening
		 * the file below will fail.
		 */
		unlink(tmppath);
	}

	/* Set flag to tell caller there was no existent file */
	*use_existent = false;

	/* Now open original target segment (might not be file I just made) */
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));

	elog(DEBUG2, "done creating and filling new WAL file");

	return fd;
}
3489 
3490 /*
3491  * Create a new XLOG file segment by copying a pre-existing one.
3492  *
3493  * destsegno: identify segment to be created.
3494  *
3495  * srcTLI, srcsegno: identify segment to be copied (could be from
3496  *		a different timeline)
3497  *
3498  * upto: how much of the source file to copy (the rest is filled with
3499  *		zeros)
3500  *
3501  * Currently this is only used during recovery, and so there are no locking
3502  * considerations.  But we should be just as tense as XLogFileInit to avoid
3503  * emplacing a bogus file.
3504  */
3505 static void
XLogFileCopy(XLogSegNo destsegno,TimeLineID srcTLI,XLogSegNo srcsegno,int upto)3506 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
3507 			 int upto)
3508 {
3509 	char		path[MAXPGPATH];
3510 	char		tmppath[MAXPGPATH];
3511 	PGAlignedXLogBlock buffer;
3512 	int			srcfd;
3513 	int			fd;
3514 	int			nbytes;
3515 
3516 	/*
3517 	 * Open the source file
3518 	 */
3519 	XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
3520 	srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
3521 	if (srcfd < 0)
3522 		ereport(ERROR,
3523 				(errcode_for_file_access(),
3524 				 errmsg("could not open file \"%s\": %m", path)));
3525 
3526 	/*
3527 	 * Copy into a temp file name.
3528 	 */
3529 	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3530 
3531 	unlink(tmppath);
3532 
3533 	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
3534 	fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
3535 	if (fd < 0)
3536 		ereport(ERROR,
3537 				(errcode_for_file_access(),
3538 				 errmsg("could not create file \"%s\": %m", tmppath)));
3539 
3540 	/*
3541 	 * Do the data copying.
3542 	 */
3543 	for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
3544 	{
3545 		int			nread;
3546 
3547 		nread = upto - nbytes;
3548 
3549 		/*
3550 		 * The part that is not read from the source file is filled with
3551 		 * zeros.
3552 		 */
3553 		if (nread < sizeof(buffer))
3554 			memset(buffer.data, 0, sizeof(buffer));
3555 
3556 		if (nread > 0)
3557 		{
3558 			int			r;
3559 
3560 			if (nread > sizeof(buffer))
3561 				nread = sizeof(buffer);
3562 			pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
3563 			r = read(srcfd, buffer.data, nread);
3564 			if (r != nread)
3565 			{
3566 				if (r < 0)
3567 					ereport(ERROR,
3568 							(errcode_for_file_access(),
3569 							 errmsg("could not read file \"%s\": %m",
3570 									path)));
3571 				else
3572 					ereport(ERROR,
3573 							(errcode(ERRCODE_DATA_CORRUPTED),
3574 							 errmsg("could not read file \"%s\": read %d of %zu",
3575 									path, r, (Size) nread)));
3576 			}
3577 			pgstat_report_wait_end();
3578 		}
3579 		errno = 0;
3580 		pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
3581 		if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
3582 		{
3583 			int			save_errno = errno;
3584 
3585 			/*
3586 			 * If we fail to make the file, delete it to release disk space
3587 			 */
3588 			unlink(tmppath);
3589 			/* if write didn't set errno, assume problem is no disk space */
3590 			errno = save_errno ? save_errno : ENOSPC;
3591 
3592 			ereport(ERROR,
3593 					(errcode_for_file_access(),
3594 					 errmsg("could not write to file \"%s\": %m", tmppath)));
3595 		}
3596 		pgstat_report_wait_end();
3597 	}
3598 
3599 	pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
3600 	if (pg_fsync(fd) != 0)
3601 		ereport(data_sync_elevel(ERROR),
3602 				(errcode_for_file_access(),
3603 				 errmsg("could not fsync file \"%s\": %m", tmppath)));
3604 	pgstat_report_wait_end();
3605 
3606 	if (CloseTransientFile(fd) != 0)
3607 		ereport(ERROR,
3608 				(errcode_for_file_access(),
3609 				 errmsg("could not close file \"%s\": %m", tmppath)));
3610 
3611 	if (CloseTransientFile(srcfd) != 0)
3612 		ereport(ERROR,
3613 				(errcode_for_file_access(),
3614 				 errmsg("could not close file \"%s\": %m", path)));
3615 
3616 	/*
3617 	 * Now move the segment into place with its final name.
3618 	 */
3619 	if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
3620 		elog(ERROR, "InstallXLogFileSegment should not have failed");
3621 }
3622 
/*
 * Install a new XLOG segment file as a current or future log segment.
 *
 * This is used both to install a newly-created segment (which has a temp
 * filename while it's being created) and to recycle an old segment.
 *
 * *segno: identify segment to install as (or first possible target).
 * When find_free is true, this is modified on return to indicate the
 * actual installation location or last segment searched.
 *
 * tmppath: initial name of file to install.  It will be renamed into place.
 *
 * find_free: if true, install the new segment at the first empty segno
 * number at or after the passed numbers.  If false, install the new segment
 * exactly where specified, deleting any existing segment file there.
 *
 * max_segno: maximum segment number to install the new file as.  Fail if no
 * free slot is found between *segno and max_segno. (Ignored when find_free
 * is false.)
 *
 * use_lock: if true, acquire ControlFileLock while moving file into
 * place.  This should be true except during bootstrap log creation.  The
 * caller must *not* hold the lock at call.
 *
 * Returns true if the file was installed successfully.  false indicates that
 * max_segno limit was exceeded, or an error occurred while renaming the
 * file into place.
 */
static bool
InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
					   bool find_free, XLogSegNo max_segno,
					   bool use_lock)
{
	char		path[MAXPGPATH];
	struct stat stat_buf;

	XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);

	/*
	 * We want to be sure that only one process does this at a time.
	 */
	if (use_lock)
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	if (!find_free)
	{
		/* Force installation: get rid of any pre-existing segment file */
		durable_unlink(path, DEBUG1);
	}
	else
	{
		/* Find a free slot to put it in: advance until stat() fails */
		while (stat(path, &stat_buf) == 0)
		{
			if ((*segno) >= max_segno)
			{
				/* Failed to find a free slot within specified range */
				if (use_lock)
					LWLockRelease(ControlFileLock);
				return false;
			}
			(*segno)++;
			XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
		}
	}

	/*
	 * Perform the rename using link if available, paranoidly trying to avoid
	 * overwriting an existing file (there shouldn't be one).
	 */
	if (durable_rename_excl(tmppath, path, LOG) != 0)
	{
		if (use_lock)
			LWLockRelease(ControlFileLock);
		/* durable_rename_excl already emitted log message */
		return false;
	}

	if (use_lock)
		LWLockRelease(ControlFileLock);

	return true;
}
3706 
3707 /*
3708  * Open a pre-existing logfile segment for writing.
3709  */
3710 int
XLogFileOpen(XLogSegNo segno)3711 XLogFileOpen(XLogSegNo segno)
3712 {
3713 	char		path[MAXPGPATH];
3714 	int			fd;
3715 
3716 	XLogFilePath(path, ThisTimeLineID, segno, wal_segment_size);
3717 
3718 	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3719 	if (fd < 0)
3720 		ereport(PANIC,
3721 				(errcode_for_file_access(),
3722 				 errmsg("could not open file \"%s\": %m", path)));
3723 
3724 	return fd;
3725 }
3726 
/*
 * Open a logfile segment for reading (during recovery).
 *
 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
 * Otherwise, it's assumed to be already available in pg_wal.
 *
 * segno/tli identify the segment to open.  'emode' is currently unused
 * here (callers pass their intended error level).  If notfoundOk is true,
 * ENOENT results in returning -1 instead of PANICking.
 *
 * On success, also updates recovery state: curFileTLI, readSource,
 * XLogReceiptSource, and (except for XLOG_FROM_STREAM) XLogReceiptTime.
 */
static int
XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
			 XLogSource source, bool notfoundOk)
{
	char		xlogfname[MAXFNAMELEN];
	char		activitymsg[MAXFNAMELEN + 16];
	char		path[MAXPGPATH];
	int			fd;

	XLogFileName(xlogfname, tli, segno, wal_segment_size);

	switch (source)
	{
		case XLOG_FROM_ARCHIVE:
			/* Report recovery progress in PS display */
			snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
					 xlogfname);
			set_ps_display(activitymsg);

			restoredFromArchive = RestoreArchivedFile(path, xlogfname,
													  "RECOVERYXLOG",
													  wal_segment_size,
													  InRedo);
			if (!restoredFromArchive)
				return -1;
			break;

		case XLOG_FROM_PG_WAL:
		case XLOG_FROM_STREAM:
			XLogFilePath(path, tli, segno, wal_segment_size);
			restoredFromArchive = false;
			break;

		default:
			elog(ERROR, "invalid XLogFileRead source %d", source);
	}

	/*
	 * If the segment was fetched from archival storage, replace the existing
	 * xlog segment (if any) with the archival version.
	 */
	if (source == XLOG_FROM_ARCHIVE)
	{
		KeepFileRestoredFromArchive(path, xlogfname);

		/*
		 * Set path to point at the new file in pg_wal.
		 */
		snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
	}

	fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
	if (fd >= 0)
	{
		/* Success! */
		curFileTLI = tli;

		/* Report recovery progress in PS display */
		snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
				 xlogfname);
		set_ps_display(activitymsg);

		/* Track source of data in assorted state variables */
		readSource = source;
		XLogReceiptSource = source;
		/* In FROM_STREAM case, caller tracks receipt time, not me */
		if (source != XLOG_FROM_STREAM)
			XLogReceiptTime = GetCurrentTimestamp();

		return fd;
	}
	if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
	return -1;
}
3810 
/*
 * Open a logfile segment for reading (during recovery).
 *
 * This version searches for the segment with any TLI listed in expectedTLEs.
 *
 * Returns an open file descriptor, or -1 (after reporting at 'emode') if
 * the segment could not be found under any acceptable timeline.
 */
static int
XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
{
	char		path[MAXPGPATH];
	ListCell   *cell;
	int			fd;
	List	   *tles;

	/*
	 * Loop looking for a suitable timeline ID: we might need to read any of
	 * the timelines listed in expectedTLEs.
	 *
	 * We expect curFileTLI on entry to be the TLI of the preceding file in
	 * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
	 * to go backwards; this prevents us from picking up the wrong file when a
	 * parent timeline extends to higher segment numbers than the child we
	 * want to read.
	 *
	 * If we haven't read the timeline history file yet, read it now, so that
	 * we know which TLIs to scan.  We don't save the list in expectedTLEs,
	 * however, unless we actually find a valid segment.  That way if there is
	 * neither a timeline history file nor a WAL segment in the archive, and
	 * streaming replication is set up, we'll read the timeline history file
	 * streamed from the primary when we start streaming, instead of
	 * recovering with a dummy history generated here.
	 */
	if (expectedTLEs)
		tles = expectedTLEs;
	else
		tles = readTimeLineHistory(recoveryTargetTLI);

	/* tles is ordered newest timeline first */
	foreach(cell, tles)
	{
		TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
		TimeLineID	tli = hent->tli;

		if (tli < curFileTLI)
			break;				/* don't bother looking at too-old TLIs */

		/*
		 * Skip scanning the timeline ID that the logfile segment to read
		 * doesn't belong to
		 */
		if (hent->begin != InvalidXLogRecPtr)
		{
			XLogSegNo	beginseg = 0;

			XLByteToSeg(hent->begin, beginseg, wal_segment_size);

			/*
			 * The logfile segment that doesn't belong to the timeline is
			 * older or newer than the segment that the timeline started or
			 * ended at, respectively. It's sufficient to check only the
			 * starting segment of the timeline here. Since the timelines are
			 * scanned in descending order in this loop, any segments newer
			 * than the ending segment should belong to newer timeline and
			 * have already been read before. So it's not necessary to check
			 * the ending segment of the timeline here.
			 */
			if (segno < beginseg)
				continue;
		}

		/* Try the archive first, then pg_wal, as permitted by 'source' */
		if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
		{
			fd = XLogFileRead(segno, emode, tli,
							  XLOG_FROM_ARCHIVE, true);
			if (fd != -1)
			{
				elog(DEBUG1, "got WAL segment from archive");
				if (!expectedTLEs)
					expectedTLEs = tles;
				return fd;
			}
		}

		if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
		{
			fd = XLogFileRead(segno, emode, tli,
							  XLOG_FROM_PG_WAL, true);
			if (fd != -1)
			{
				if (!expectedTLEs)
					expectedTLEs = tles;
				return fd;
			}
		}
	}

	/* Couldn't find it.  For simplicity, complain about front timeline */
	XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
	errno = ENOENT;
	ereport(emode,
			(errcode_for_file_access(),
			 errmsg("could not open file \"%s\": %m", path)));
	return -1;
}
3913 
/*
 * Close the current logfile segment for writing.
 *
 * Closes openLogFile (which must be open) and resets it to -1,
 * releasing the reserved external file descriptor.
 */
static void
XLogFileClose(void)
{
	Assert(openLogFile >= 0);

	/*
	 * WAL segment files will not be re-read in normal operation, so we advise
	 * the OS to release any cached pages.  But do not do so if WAL archiving
	 * or streaming is active, because archiver and walsender process could
	 * use the cache to read the WAL segment.
	 */
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	if (!XLogIsNeeded())
		(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
#endif

	if (close(openLogFile) != 0)
	{
		char		xlogfname[MAXFNAMELEN];
		int			save_errno = errno;

		/* preserve close()'s errno across XLogFileName() for the %m below */
		XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, wal_segment_size);
		errno = save_errno;
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", xlogfname)));
	}

	openLogFile = -1;
	ReleaseExternalFD();
}
3948 
3949 /*
3950  * Preallocate log files beyond the specified log endpoint.
3951  *
3952  * XXX this is currently extremely conservative, since it forces only one
3953  * future log segment to exist, and even that only if we are 75% done with
3954  * the current one.  This is only appropriate for very low-WAL-volume systems.
3955  * High-volume systems will be OK once they've built up a sufficient set of
3956  * recycled log segments, but the startup transient is likely to include
3957  * a lot of segment creations by foreground processes, which is not so good.
3958  */
3959 static void
PreallocXlogFiles(XLogRecPtr endptr)3960 PreallocXlogFiles(XLogRecPtr endptr)
3961 {
3962 	XLogSegNo	_logSegNo;
3963 	int			lf;
3964 	bool		use_existent;
3965 	uint64		offset;
3966 
3967 	XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
3968 	offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
3969 	if (offset >= (uint32) (0.75 * wal_segment_size))
3970 	{
3971 		_logSegNo++;
3972 		use_existent = true;
3973 		lf = XLogFileInit(_logSegNo, &use_existent, true);
3974 		close(lf);
3975 		if (!use_existent)
3976 			CheckpointStats.ckpt_segs_added++;
3977 	}
3978 }
3979 
3980 /*
3981  * Throws an error if the given log segment has already been removed or
3982  * recycled. The caller should only pass a segment that it knows to have
3983  * existed while the server has been running, as this function always
3984  * succeeds if no WAL segments have been removed since startup.
3985  * 'tli' is only used in the error message.
3986  *
3987  * Note: this function guarantees to keep errno unchanged on return.
3988  * This supports callers that use this to possibly deliver a better
3989  * error message about a missing file, while still being able to throw
3990  * a normal file-access error afterwards, if this does return.
3991  */
3992 void
CheckXLogRemoved(XLogSegNo segno,TimeLineID tli)3993 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3994 {
3995 	int			save_errno = errno;
3996 	XLogSegNo	lastRemovedSegNo;
3997 
3998 	SpinLockAcquire(&XLogCtl->info_lck);
3999 	lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
4000 	SpinLockRelease(&XLogCtl->info_lck);
4001 
4002 	if (segno <= lastRemovedSegNo)
4003 	{
4004 		char		filename[MAXFNAMELEN];
4005 
4006 		XLogFileName(filename, tli, segno, wal_segment_size);
4007 		errno = save_errno;
4008 		ereport(ERROR,
4009 				(errcode_for_file_access(),
4010 				 errmsg("requested WAL segment %s has already been removed",
4011 						filename)));
4012 	}
4013 	errno = save_errno;
4014 }
4015 
4016 /*
4017  * Return the last WAL segment removed, or 0 if no segment has been removed
4018  * since startup.
4019  *
4020  * NB: the result can be out of date arbitrarily fast, the caller has to deal
4021  * with that.
4022  */
4023 XLogSegNo
XLogGetLastRemovedSegno(void)4024 XLogGetLastRemovedSegno(void)
4025 {
4026 	XLogSegNo	lastRemovedSegNo;
4027 
4028 	SpinLockAcquire(&XLogCtl->info_lck);
4029 	lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
4030 	SpinLockRelease(&XLogCtl->info_lck);
4031 
4032 	return lastRemovedSegNo;
4033 }
4034 
4035 
4036 /*
4037  * Update the last removed segno pointer in shared memory, to reflect that the
4038  * given XLOG file has been removed.
4039  */
4040 static void
UpdateLastRemovedPtr(char * filename)4041 UpdateLastRemovedPtr(char *filename)
4042 {
4043 	uint32		tli;
4044 	XLogSegNo	segno;
4045 
4046 	XLogFromFileName(filename, &tli, &segno, wal_segment_size);
4047 
4048 	SpinLockAcquire(&XLogCtl->info_lck);
4049 	if (segno > XLogCtl->lastRemovedSegNo)
4050 		XLogCtl->lastRemovedSegNo = segno;
4051 	SpinLockRelease(&XLogCtl->info_lck);
4052 }
4053 
4054 /*
4055  * Remove all temporary log files in pg_wal
4056  *
4057  * This is called at the beginning of recovery after a previous crash,
4058  * at a point where no other processes write fresh WAL data.
4059  */
4060 static void
RemoveTempXlogFiles(void)4061 RemoveTempXlogFiles(void)
4062 {
4063 	DIR		   *xldir;
4064 	struct dirent *xlde;
4065 
4066 	elog(DEBUG2, "removing all temporary WAL segments");
4067 
4068 	xldir = AllocateDir(XLOGDIR);
4069 	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4070 	{
4071 		char		path[MAXPGPATH];
4072 
4073 		if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
4074 			continue;
4075 
4076 		snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4077 		unlink(path);
4078 		elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
4079 	}
4080 	FreeDir(xldir);
4081 }
4082 
/*
 * Recycle or remove all log files older or equal to passed segno.
 *
 * endptr is current (or recent) end of xlog, and lastredoptr is the
 * redo pointer of the last checkpoint. These are used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
 */
static void
RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr)
{
	DIR		   *xldir;
	struct dirent *xlde;
	char		lastoff[MAXFNAMELEN];
	XLogSegNo	endlogSegNo;
	XLogSegNo	recycleSegNo;

	/* Initialize info about where to try to recycle to */
	XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
	recycleSegNo = XLOGfileslop(lastredoptr);

	/*
	 * Construct a filename of the last segment to be kept. The timeline ID
	 * doesn't matter, we ignore that in the comparison. (During recovery,
	 * ThisTimeLineID isn't set, so we can't use that.)
	 */
	XLogFileName(lastoff, 0, segno, wal_segment_size);

	elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
		 lastoff);

	xldir = AllocateDir(XLOGDIR);

	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
	{
		/* Ignore files that are not XLOG segments */
		if (!IsXLogFileName(xlde->d_name) &&
			!IsPartialXLogFileName(xlde->d_name))
			continue;

		/*
		 * We ignore the timeline part of the XLOG segment identifiers in
		 * deciding whether a segment is still needed.  This ensures that we
		 * won't prematurely remove a segment from a parent timeline. We could
		 * probably be a little more proactive about removing segments of
		 * non-parent timelines, but that would be a whole lot more
		 * complicated.
		 *
		 * We use the alphanumeric sorting property of the filenames to decide
		 * which ones are earlier than the lastoff segment.
		 *
		 * (Skipping the first 8 characters skips the timeline part of the
		 * 24-character segment file name.)
		 */
		if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
		{
			/* Only remove files already archived (or not needing archival) */
			if (XLogArchiveCheckDone(xlde->d_name))
			{
				/* Update the last removed location in shared memory first */
				UpdateLastRemovedPtr(xlde->d_name);

				RemoveXlogFile(xlde->d_name, recycleSegNo, &endlogSegNo);
			}
		}
	}

	FreeDir(xldir);
}
4147 
/*
 * Remove WAL files that are not part of the given timeline's history.
 *
 * This is called during recovery, whenever we switch to follow a new
 * timeline, and at the end of recovery when we create a new timeline. We
 * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
 * might be leftover pre-allocated or recycled WAL segments on the old timeline
 * that we haven't used yet, and contain garbage. If we just leave them in
 * pg_wal, they will eventually be archived, and we can't let that happen.
 * Files that belong to our timeline history are valid, because we have
 * successfully replayed them, but from others we can't be sure.
 *
 * 'switchpoint' is the current point in WAL where we switch to new timeline,
 * and 'newTLI' is the new timeline we switch to.
 */
static void
RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
{
	DIR		   *xldir;
	struct dirent *xlde;
	char		switchseg[MAXFNAMELEN];
	XLogSegNo	endLogSegNo;
	XLogSegNo	switchLogSegNo;
	XLogSegNo	recycleSegNo;

	/*
	 * Initialize info about where to begin the work.  This will recycle,
	 * somewhat arbitrarily, 10 future segments.
	 */
	XLByteToPrevSeg(switchpoint, switchLogSegNo, wal_segment_size);
	XLByteToSeg(switchpoint, endLogSegNo, wal_segment_size);
	recycleSegNo = endLogSegNo + 10;

	/*
	 * Construct a filename of the last segment to be kept.
	 */
	XLogFileName(switchseg, newTLI, switchLogSegNo, wal_segment_size);

	elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
		 switchseg);

	xldir = AllocateDir(XLOGDIR);

	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
	{
		/* Ignore files that are not XLOG segments */
		if (!IsXLogFileName(xlde->d_name))
			continue;

		/*
		 * Remove files that are on a timeline older than the new one we're
		 * switching to, but with a segment number >= the first segment on the
		 * new timeline.
		 *
		 * (The first 8 characters of the file name are the timeline; the
		 * rest is the segment number, so these two strcmp calls compare
		 * those parts separately.)
		 */
		if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
			strcmp(xlde->d_name + 8, switchseg + 8) > 0)
		{
			/*
			 * If the file has already been marked as .ready, however, don't
			 * remove it yet. It should be OK to remove it - files that are
			 * not part of our timeline history are not required for recovery
			 * - but seems safer to let them be archived and removed later.
			 */
			if (!XLogArchiveIsReady(xlde->d_name))
				RemoveXlogFile(xlde->d_name, recycleSegNo, &endLogSegNo);
		}
	}

	FreeDir(xldir);
}
4218 
/*
 * Recycle or remove a log file that's no longer needed.
 *
 * segname is the name of the segment to recycle or remove.  recycleSegNo
 * is the segment number to recycle up to.  endlogSegNo is the segment
 * number of the current (or recent) end of WAL.
 *
 * endlogSegNo gets incremented if the segment is recycled so as it is not
 * checked again with future callers of this function.
 */
static void
RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo,
			   XLogSegNo *endlogSegNo)
{
	char		path[MAXPGPATH];
#ifdef WIN32
	char		newpath[MAXPGPATH];
#endif
	struct stat statbuf;

	snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);

	/*
	 * Before deleting the file, see if it can be recycled as a future log
	 * segment. Only recycle normal files, because we don't want to recycle
	 * symbolic links pointing to a separate archive directory.
	 */
	if (wal_recycle &&
		*endlogSegNo <= recycleSegNo &&
		lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
		InstallXLogFileSegment(endlogSegNo, path,
							   true, recycleSegNo, true))
	{
		ereport(DEBUG2,
				(errmsg_internal("recycled write-ahead log file \"%s\"",
								 segname)));
		CheckpointStats.ckpt_segs_recycled++;
		/* Needn't recheck that slot on future iterations */
		(*endlogSegNo)++;
	}
	else
	{
		/* No need for any more future segments... */
		int			rc;

		ereport(DEBUG2,
				(errmsg_internal("removing write-ahead log file \"%s\"",
								 segname)));

#ifdef WIN32

		/*
		 * On Windows, if another process (e.g another backend) holds the file
		 * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
		 * will still show up in directory listing until the last handle is
		 * closed. To avoid confusing the lingering deleted file for a live
		 * WAL file that needs to be archived, rename it before deleting it.
		 *
		 * If another process holds the file open without FILE_SHARE_DELETE
		 * flag, rename will fail. We'll try again at the next checkpoint.
		 */
		snprintf(newpath, MAXPGPATH, "%s.deleted", path);
		if (rename(path, newpath) != 0)
		{
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not rename file \"%s\": %m",
							path)));
			return;
		}
		rc = durable_unlink(newpath, LOG);
#else
		rc = durable_unlink(path, LOG);
#endif
		if (rc != 0)
		{
			/* Message already logged by durable_unlink() */
			return;
		}
		CheckpointStats.ckpt_segs_removed++;
	}

	/* Drop any .ready/.done archive-status files for the old name */
	XLogArchiveCleanup(segname);
}
4303 
4304 /*
4305  * Verify whether pg_wal and pg_wal/archive_status exist.
4306  * If the latter does not exist, recreate it.
4307  *
4308  * It is not the goal of this function to verify the contents of these
4309  * directories, but to help in cases where someone has performed a cluster
4310  * copy for PITR purposes but omitted pg_wal from the copy.
4311  *
4312  * We could also recreate pg_wal if it doesn't exist, but a deliberate
4313  * policy decision was made not to.  It is fairly common for pg_wal to be
4314  * a symlink, and if that was the DBA's intent then automatically making a
4315  * plain directory would result in degraded performance with no notice.
4316  */
4317 static void
ValidateXLOGDirectoryStructure(void)4318 ValidateXLOGDirectoryStructure(void)
4319 {
4320 	char		path[MAXPGPATH];
4321 	struct stat stat_buf;
4322 
4323 	/* Check for pg_wal; if it doesn't exist, error out */
4324 	if (stat(XLOGDIR, &stat_buf) != 0 ||
4325 		!S_ISDIR(stat_buf.st_mode))
4326 		ereport(FATAL,
4327 				(errmsg("required WAL directory \"%s\" does not exist",
4328 						XLOGDIR)));
4329 
4330 	/* Check for archive_status */
4331 	snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4332 	if (stat(path, &stat_buf) == 0)
4333 	{
4334 		/* Check for weird cases where it exists but isn't a directory */
4335 		if (!S_ISDIR(stat_buf.st_mode))
4336 			ereport(FATAL,
4337 					(errmsg("required WAL directory \"%s\" does not exist",
4338 							path)));
4339 	}
4340 	else
4341 	{
4342 		ereport(LOG,
4343 				(errmsg("creating missing WAL directory \"%s\"", path)));
4344 		if (MakePGDirectory(path) < 0)
4345 			ereport(FATAL,
4346 					(errmsg("could not create missing directory \"%s\": %m",
4347 							path)));
4348 	}
4349 }
4350 
4351 /*
4352  * Remove previous backup history files.  This also retries creation of
4353  * .ready files for any backup history files for which XLogArchiveNotify
4354  * failed earlier.
4355  */
4356 static void
CleanupBackupHistory(void)4357 CleanupBackupHistory(void)
4358 {
4359 	DIR		   *xldir;
4360 	struct dirent *xlde;
4361 	char		path[MAXPGPATH + sizeof(XLOGDIR)];
4362 
4363 	xldir = AllocateDir(XLOGDIR);
4364 
4365 	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4366 	{
4367 		if (IsBackupHistoryFileName(xlde->d_name))
4368 		{
4369 			if (XLogArchiveCheckDone(xlde->d_name))
4370 			{
4371 				elog(DEBUG2, "removing WAL backup history file \"%s\"",
4372 					 xlde->d_name);
4373 				snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
4374 				unlink(path);
4375 				XLogArchiveCleanup(xlde->d_name);
4376 			}
4377 		}
4378 	}
4379 
4380 	FreeDir(xldir);
4381 }
4382 
4383 /*
4384  * Attempt to read the next XLOG record.
4385  *
4386  * Before first call, the reader needs to be positioned to the first record
4387  * by calling XLogBeginRead().
4388  *
4389  * If no valid record is available, returns NULL, or fails if emode is PANIC.
 * (emode must be either PANIC or LOG). In standby mode, retries until a valid
4391  * record is available.
4392  */
static XLogRecord *
ReadRecord(XLogReaderState *xlogreader, int emode,
		   bool fetching_ckpt)
{
	XLogRecord *record;
	XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;

	/* Pass through parameters to XLogPageRead */
	private->fetching_ckpt = fetching_ckpt;
	private->emode = emode;
	private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);

	/* This is the first attempt to read this page. */
	lastSourceFailed = false;

	/* Loop until we obtain a valid record, or decide to give up. */
	for (;;)
	{
		char	   *errormsg;

		record = XLogReadRecord(xlogreader, &errormsg);
		/* Keep the process-global read/end pointers in sync with the reader */
		ReadRecPtr = xlogreader->ReadRecPtr;
		EndRecPtr = xlogreader->EndRecPtr;
		if (record == NULL)
		{
			/*
			 * If, while not in standby mode, we find that WAL ends in an
			 * incomplete record, keep track of that record.  After recovery
			 * is done, we'll write a record to tell downstream WAL readers
			 * that that portion is to be ignored.
			 */
			if (!StandbyMode &&
				!XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
			{
				abortedRecPtr = xlogreader->abortedRecPtr;
				missingContrecPtr = xlogreader->missingContrecPtr;
			}

			/* Close the segment we were reading from, if any. */
			if (readFile >= 0)
			{
				close(readFile);
				readFile = -1;
			}

			/*
			 * We only end up here without a message when XLogPageRead()
			 * failed - in that case we already logged something. In
			 * StandbyMode that only happens if we have been triggered, so we
			 * shouldn't loop anymore in that case.
			 */
			if (errormsg)
				ereport(emode_for_corrupt_record(emode, EndRecPtr),
						(errmsg_internal("%s", errormsg) /* already translated */ ));
		}

		/*
		 * Check page TLI is one of the expected values.
		 */
		else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
		{
			char		fname[MAXFNAMELEN];
			XLogSegNo	segno;
			int32		offset;

			/* Identify the offending segment/offset for the error message */
			XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
			offset = XLogSegmentOffset(xlogreader->latestPagePtr,
									   wal_segment_size);
			XLogFileName(fname, xlogreader->seg.ws_tli, segno,
						 wal_segment_size);
			ereport(emode_for_corrupt_record(emode, EndRecPtr),
					(errmsg("unexpected timeline ID %u in log segment %s, offset %u",
							xlogreader->latestPageTLI,
							fname,
							offset)));
			/* Treat a bad-TLI page the same as a failed read */
			record = NULL;
		}

		if (record)
		{
			/* Great, got a record */
			return record;
		}
		else
		{
			/* No valid record available from this source */
			lastSourceFailed = true;

			/*
			 * If archive recovery was requested, but we were still doing
			 * crash recovery, switch to archive recovery and retry using the
			 * offline archive. We have now replayed all the valid WAL in
			 * pg_wal, so we are presumably now consistent.
			 *
			 * We require that there's at least some valid WAL present in
			 * pg_wal, however (!fetching_ckpt).  We could recover using the
			 * WAL from the archive, even if pg_wal is completely empty, but
			 * we'd have no idea how far we'd have to replay to reach
			 * consistency.  So err on the safe side and give up.
			 */
			if (!InArchiveRecovery && ArchiveRecoveryRequested &&
				!fetching_ckpt)
			{
				ereport(DEBUG1,
						(errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
				InArchiveRecovery = true;
				if (StandbyModeRequested)
					StandbyMode = true;

				/* initialize minRecoveryPoint to this record */
				LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
				ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
				if (ControlFile->minRecoveryPoint < EndRecPtr)
				{
					ControlFile->minRecoveryPoint = EndRecPtr;
					ControlFile->minRecoveryPointTLI = ThisTimeLineID;
				}
				/* update local copy */
				minRecoveryPoint = ControlFile->minRecoveryPoint;
				minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;

				/*
				 * The startup process can update its local copy of
				 * minRecoveryPoint from this point.
				 */
				updateMinRecoveryPoint = true;

				UpdateControlFile();

				/*
				 * We update SharedRecoveryState while holding the lock on
				 * ControlFileLock so both states are consistent in shared
				 * memory.
				 */
				SpinLockAcquire(&XLogCtl->info_lck);
				XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
				SpinLockRelease(&XLogCtl->info_lck);

				LWLockRelease(ControlFileLock);

				CheckRecoveryConsistency();

				/*
				 * Before we retry, reset lastSourceFailed and currentSource
				 * so that we will check the archive next.
				 */
				lastSourceFailed = false;
				currentSource = XLOG_FROM_ANY;

				continue;
			}

			/* In standby mode, loop back to retry. Otherwise, give up. */
			if (StandbyMode && !CheckForStandbyTrigger())
				continue;
			else
				return NULL;
		}
	}
}
4551 
4552 /*
4553  * Scan for new timelines that might have appeared in the archive since we
4554  * started recovery.
4555  *
4556  * If there are any, the function changes recovery target TLI to the latest
4557  * one and returns 'true'.
4558  */
4559 static bool
rescanLatestTimeLine(void)4560 rescanLatestTimeLine(void)
4561 {
4562 	List	   *newExpectedTLEs;
4563 	bool		found;
4564 	ListCell   *cell;
4565 	TimeLineID	newtarget;
4566 	TimeLineID	oldtarget = recoveryTargetTLI;
4567 	TimeLineHistoryEntry *currentTle = NULL;
4568 
4569 	newtarget = findNewestTimeLine(recoveryTargetTLI);
4570 	if (newtarget == recoveryTargetTLI)
4571 	{
4572 		/* No new timelines found */
4573 		return false;
4574 	}
4575 
4576 	/*
4577 	 * Determine the list of expected TLIs for the new TLI
4578 	 */
4579 
4580 	newExpectedTLEs = readTimeLineHistory(newtarget);
4581 
4582 	/*
4583 	 * If the current timeline is not part of the history of the new timeline,
4584 	 * we cannot proceed to it.
4585 	 */
4586 	found = false;
4587 	foreach(cell, newExpectedTLEs)
4588 	{
4589 		currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4590 
4591 		if (currentTle->tli == recoveryTargetTLI)
4592 		{
4593 			found = true;
4594 			break;
4595 		}
4596 	}
4597 	if (!found)
4598 	{
4599 		ereport(LOG,
4600 				(errmsg("new timeline %u is not a child of database system timeline %u",
4601 						newtarget,
4602 						ThisTimeLineID)));
4603 		return false;
4604 	}
4605 
4606 	/*
4607 	 * The current timeline was found in the history file, but check that the
4608 	 * next timeline was forked off from it *after* the current recovery
4609 	 * location.
4610 	 */
4611 	if (currentTle->end < EndRecPtr)
4612 	{
4613 		ereport(LOG,
4614 				(errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4615 						newtarget,
4616 						ThisTimeLineID,
4617 						LSN_FORMAT_ARGS(EndRecPtr))));
4618 		return false;
4619 	}
4620 
4621 	/* The new timeline history seems valid. Switch target */
4622 	recoveryTargetTLI = newtarget;
4623 	list_free_deep(expectedTLEs);
4624 	expectedTLEs = newExpectedTLEs;
4625 
4626 	/*
4627 	 * As in StartupXLOG(), try to ensure we have all the history files
4628 	 * between the old target and new target in pg_wal.
4629 	 */
4630 	restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4631 
4632 	ereport(LOG,
4633 			(errmsg("new target timeline is %u",
4634 					recoveryTargetTLI)));
4635 
4636 	return true;
4637 }
4638 
4639 /*
4640  * I/O routines for pg_control
4641  *
4642  * *ControlFile is a buffer in shared memory that holds an image of the
4643  * contents of pg_control.  WriteControlFile() initializes pg_control
4644  * given a preloaded buffer, ReadControlFile() loads the buffer from
4645  * the pg_control file (during postmaster or standalone-backend startup),
4646  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4647  * InitControlFile() fills the buffer with initial values.
4648  *
4649  * For simplicity, WriteControlFile() initializes the fields of pg_control
4650  * that are related to checking backend/database compatibility, and
4651  * ReadControlFile() verifies they are correct.  We could split out the
4652  * I/O and compatibility-check functions, but there seems no need currently.
4653  */
4654 
4655 static void
InitControlFile(uint64 sysidentifier)4656 InitControlFile(uint64 sysidentifier)
4657 {
4658 	char		mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
4659 
4660 	/*
4661 	 * Generate a random nonce. This is used for authentication requests that
4662 	 * will fail because the user does not exist. The nonce is used to create
4663 	 * a genuine-looking password challenge for the non-existent user, in lieu
4664 	 * of an actual stored password.
4665 	 */
4666 	if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
4667 		ereport(PANIC,
4668 				(errcode(ERRCODE_INTERNAL_ERROR),
4669 				 errmsg("could not generate secret authorization token")));
4670 
4671 	memset(ControlFile, 0, sizeof(ControlFileData));
4672 	/* Initialize pg_control status fields */
4673 	ControlFile->system_identifier = sysidentifier;
4674 	memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
4675 	ControlFile->state = DB_SHUTDOWNED;
4676 	ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
4677 
4678 	/* Set important parameter values for use when replaying WAL */
4679 	ControlFile->MaxConnections = MaxConnections;
4680 	ControlFile->max_worker_processes = max_worker_processes;
4681 	ControlFile->max_wal_senders = max_wal_senders;
4682 	ControlFile->max_prepared_xacts = max_prepared_xacts;
4683 	ControlFile->max_locks_per_xact = max_locks_per_xact;
4684 	ControlFile->wal_level = wal_level;
4685 	ControlFile->wal_log_hints = wal_log_hints;
4686 	ControlFile->track_commit_timestamp = track_commit_timestamp;
4687 	ControlFile->data_checksum_version = bootstrap_data_checksum_version;
4688 }
4689 
static void
WriteControlFile(void)
{
	int			fd;
	char		buffer[PG_CONTROL_FILE_SIZE];	/* need not be aligned */

	/*
	 * Ensure that the size of the pg_control data structure is sane.  See the
	 * comments for these symbols in pg_control.h.
	 */
	StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
					 "pg_control is too large for atomic disk writes");
	StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
					 "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");

	/*
	 * Initialize version and compatibility-check fields; ReadControlFile()
	 * verifies these against the values compiled into the server.
	 */
	ControlFile->pg_control_version = PG_CONTROL_VERSION;
	ControlFile->catalog_version_no = CATALOG_VERSION_NO;

	ControlFile->maxAlign = MAXIMUM_ALIGNOF;
	ControlFile->floatFormat = FLOATFORMAT_VALUE;

	ControlFile->blcksz = BLCKSZ;
	ControlFile->relseg_size = RELSEG_SIZE;
	ControlFile->xlog_blcksz = XLOG_BLCKSZ;
	ControlFile->xlog_seg_size = wal_segment_size;

	ControlFile->nameDataLen = NAMEDATALEN;
	ControlFile->indexMaxKeys = INDEX_MAX_KEYS;

	ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
	ControlFile->loblksize = LOBLKSIZE;

	ControlFile->float8ByVal = FLOAT8PASSBYVAL;

	/* Contents are protected with a CRC (covering everything up to crc) */
	INIT_CRC32C(ControlFile->crc);
	COMP_CRC32C(ControlFile->crc,
				(char *) ControlFile,
				offsetof(ControlFileData, crc));
	FIN_CRC32C(ControlFile->crc);

	/*
	 * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
	 * the excess over sizeof(ControlFileData).  This reduces the odds of
	 * premature-EOF errors when reading pg_control.  We'll still fail when we
	 * check the contents of the file, but hopefully with a more specific
	 * error than "couldn't read pg_control".
	 */
	memset(buffer, 0, PG_CONTROL_FILE_SIZE);
	memcpy(buffer, ControlFile, sizeof(ControlFileData));

	/* O_EXCL: creating pg_control afresh; fail rather than clobber one */
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
	if (fd < 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m",
						XLOG_CONTROL_FILE)));

	errno = 0;
	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
	if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m",
						XLOG_CONTROL_FILE)));
	}
	pgstat_report_wait_end();

	/* Force the new control file contents to stable storage */
	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
	if (pg_fsync(fd) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m",
						XLOG_CONTROL_FILE)));
	pgstat_report_wait_end();

	if (close(fd) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m",
						XLOG_CONTROL_FILE)));
}
4780 
static void
ReadControlFile(void)
{
	pg_crc32c	crc;
	int			fd;
	static char wal_segsz_str[20];
	int			r;

	/*
	 * Read data...
	 */
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | PG_BINARY);
	if (fd < 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m",
						XLOG_CONTROL_FILE)));

	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
	r = read(fd, ControlFile, sizeof(ControlFileData));
	if (r != sizeof(ControlFileData))
	{
		/* distinguish a read error from a short (truncated) file */
		if (r < 0)
			ereport(PANIC,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							XLOG_CONTROL_FILE)));
		else
			ereport(PANIC,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("could not read file \"%s\": read %d of %zu",
							XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
	}
	pgstat_report_wait_end();

	close(fd);

	/*
	 * Check for expected pg_control format version.  If this is wrong, the
	 * CRC check will likely fail because we'll be checking the wrong number
	 * of bytes.  Complaining about wrong version will probably be more
	 * enlightening than complaining about wrong CRC.
	 */

	/*
	 * A version whose low 16 bits are zero but high 16 bits are not looks
	 * like a valid version read with swapped byte order, so report that
	 * case with a byte-ordering hint.
	 */
	if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
						   " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
						   ControlFile->pg_control_version, ControlFile->pg_control_version,
						   PG_CONTROL_VERSION, PG_CONTROL_VERSION),
				 errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));

	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
						   " but the server was compiled with PG_CONTROL_VERSION %d.",
						   ControlFile->pg_control_version, PG_CONTROL_VERSION),
				 errhint("It looks like you need to initdb.")));

	/* Now check the CRC (computed over everything up to the crc field). */
	INIT_CRC32C(crc);
	COMP_CRC32C(crc,
				(char *) ControlFile,
				offsetof(ControlFileData, crc));
	FIN_CRC32C(crc);

	if (!EQ_CRC32C(crc, ControlFile->crc))
		ereport(FATAL,
				(errmsg("incorrect checksum in control file")));

	/*
	 * Do compatibility checking immediately.  If the database isn't
	 * compatible with the backend executable, we want to abort before we can
	 * possibly do any damage.
	 */
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
						   " but the server was compiled with CATALOG_VERSION_NO %d.",
						   ControlFile->catalog_version_no, CATALOG_VERSION_NO),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with MAXALIGN %d,"
						   " but the server was compiled with MAXALIGN %d.",
						   ControlFile->maxAlign, MAXIMUM_ALIGNOF),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->blcksz != BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with BLCKSZ %d,"
						   " but the server was compiled with BLCKSZ %d.",
						   ControlFile->blcksz, BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->relseg_size != RELSEG_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
						   " but the server was compiled with RELSEG_SIZE %d.",
						   ControlFile->relseg_size, RELSEG_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
						   " but the server was compiled with XLOG_BLCKSZ %d.",
						   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->nameDataLen != NAMEDATALEN)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
						   " but the server was compiled with NAMEDATALEN %d.",
						   ControlFile->nameDataLen, NAMEDATALEN),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
						   " but the server was compiled with INDEX_MAX_KEYS %d.",
						   ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
						   " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
						   ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->loblksize != LOBLKSIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with LOBLKSIZE %d,"
						   " but the server was compiled with LOBLKSIZE %d.",
						   ControlFile->loblksize, (int) LOBLKSIZE),
				 errhint("It looks like you need to recompile or initdb.")));

#ifdef USE_FLOAT8_BYVAL
	if (ControlFile->float8ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
						   " but the server was compiled with USE_FLOAT8_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float8ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
						   " but the server was compiled without USE_FLOAT8_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#endif

	/* The WAL segment size is determined by initdb and stored in pg_control */
	wal_segment_size = ControlFile->xlog_seg_size;

	if (!IsValidWalSegSize(wal_segment_size))
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
									  "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
									  wal_segment_size,
									  wal_segment_size)));

	/* Expose wal_segment_size as a (read-only) GUC */
	snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
	SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
					PGC_S_OVERRIDE);

	/* check and update variables dependent on wal_segment_size */
	if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\"")));

	if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\"")));

	/*
	 * Usable payload bytes per segment: per-page usable bytes, minus the
	 * extra header overhead of the segment's first (long-header) page.
	 */
	UsableBytesInSegment =
		(wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
		(SizeOfXLogLongPHD - SizeOfXLogShortPHD);

	CalculateCheckpointSegments();

	/* Make the initdb settings visible as GUC variables, too */
	SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
					PGC_INTERNAL, PGC_S_OVERRIDE);
}
4976 
4977 /*
4978  * Utility wrapper to update the control file.  Note that the control
4979  * file gets flushed.
4980  */
void
UpdateControlFile(void)
{
	/* Rewrite pg_control from the shared image; 'true' requests an fsync. */
	update_controlfile(DataDir, ControlFile, true);
}
4986 
4987 /*
4988  * Returns the unique system identifier from control file.
4989  */
uint64
GetSystemIdentifier(void)
{
	/* Control file must have been read into memory already */
	Assert(ControlFile != NULL);
	return ControlFile->system_identifier;
}
4996 
4997 /*
4998  * Returns the random nonce from control file.
4999  */
char *
GetMockAuthenticationNonce(void)
{
	/* Control file must have been read into memory already */
	Assert(ControlFile != NULL);
	return ControlFile->mock_authentication_nonce;
}
5006 
5007 /*
5008  * Are checksums enabled for data pages?
5009  */
5010 bool
DataChecksumsEnabled(void)5011 DataChecksumsEnabled(void)
5012 {
5013 	Assert(ControlFile != NULL);
5014 	return (ControlFile->data_checksum_version > 0);
5015 }
5016 
5017 /*
5018  * Returns a fake LSN for unlogged relations.
5019  *
5020  * Each call generates an LSN that is greater than any previous value
5021  * returned. The current counter value is saved and restored across clean
5022  * shutdowns, but like unlogged relations, does not survive a crash. This can
5023  * be used in lieu of real LSN values returned by XLogInsert, if you need an
5024  * LSN-like increasing sequence of numbers without writing any WAL.
5025  */
5026 XLogRecPtr
GetFakeLSNForUnloggedRel(void)5027 GetFakeLSNForUnloggedRel(void)
5028 {
5029 	XLogRecPtr	nextUnloggedLSN;
5030 
5031 	/* increment the unloggedLSN counter, need SpinLock */
5032 	SpinLockAcquire(&XLogCtl->ulsn_lck);
5033 	nextUnloggedLSN = XLogCtl->unloggedLSN++;
5034 	SpinLockRelease(&XLogCtl->ulsn_lck);
5035 
5036 	return nextUnloggedLSN;
5037 }
5038 
5039 /*
5040  * Auto-tune the number of XLOG buffers.
5041  *
5042  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
5043  * a maximum of one XLOG segment (there is little reason to think that more
5044  * is helpful, at least so long as we force an fsync when switching log files)
5045  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
5046  * 9.1, when auto-tuning was added).
5047  *
5048  * This should not be called until NBuffers has received its final value.
5049  */
5050 static int
XLOGChooseNumBuffers(void)5051 XLOGChooseNumBuffers(void)
5052 {
5053 	int			xbuffers;
5054 
5055 	xbuffers = NBuffers / 32;
5056 	if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
5057 		xbuffers = (wal_segment_size / XLOG_BLCKSZ);
5058 	if (xbuffers < 8)
5059 		xbuffers = 8;
5060 	return xbuffers;
5061 }
5062 
5063 /*
5064  * GUC check_hook for wal_buffers
5065  */
5066 bool
check_wal_buffers(int * newval,void ** extra,GucSource source)5067 check_wal_buffers(int *newval, void **extra, GucSource source)
5068 {
5069 	/*
5070 	 * -1 indicates a request for auto-tune.
5071 	 */
5072 	if (*newval == -1)
5073 	{
5074 		/*
5075 		 * If we haven't yet changed the boot_val default of -1, just let it
5076 		 * be.  We'll fix it when XLOGShmemSize is called.
5077 		 */
5078 		if (XLOGbuffers == -1)
5079 			return true;
5080 
5081 		/* Otherwise, substitute the auto-tune value */
5082 		*newval = XLOGChooseNumBuffers();
5083 	}
5084 
5085 	/*
5086 	 * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
5087 	 * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
5088 	 * the case, we just silently treat such values as a request for the
5089 	 * minimum.  (We could throw an error instead, but that doesn't seem very
5090 	 * helpful.)
5091 	 */
5092 	if (*newval < 4)
5093 		*newval = 4;
5094 
5095 	return true;
5096 }
5097 
5098 /*
5099  * Read the control file, set respective GUCs.
5100  *
5101  * This is to be called during startup, including a crash recovery cycle,
5102  * unless in bootstrap mode, where no control file yet exists.  As there's no
5103  * usable shared memory yet (its sizing can depend on the contents of the
5104  * control file!), first store the contents in local memory. XLOGShmemInit()
5105  * will then copy it to shared memory later.
5106  *
5107  * reset just controls whether previous contents are to be expected (in the
5108  * reset case, there's a dangling pointer into old shared memory), or not.
5109  */
void
LocalProcessControlFile(bool reset)
{
	/* Except in the reset case, no control file image should exist yet. */
	Assert(reset || ControlFile == NULL);
	/*
	 * Allocate in local memory: shared memory isn't usable yet (its sizing
	 * can depend on this file's contents).  XLOGShmemInit() copies the
	 * contents to shared memory later.
	 */
	ControlFile = palloc(sizeof(ControlFileData));
	ReadControlFile();
}
5117 
5118 /*
5119  * Initialization of shared memory for XLOG
5120  */
Size
XLOGShmemSize(void)
{
	Size		size;

	/*
	 * If the value of wal_buffers is -1, use the preferred auto-tune value.
	 * This isn't an amazingly clean place to do this, but we must wait till
	 * NBuffers has received its final value, and must do it before using the
	 * value of XLOGbuffers to do anything important.
	 */
	if (XLOGbuffers == -1)
	{
		char		buf[32];

		snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
		SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
	}
	Assert(XLOGbuffers > 0);

	/* XLogCtl */
	size = sizeof(XLogCtlData);

	/* WAL insertion locks, plus alignment (hence the +1 lock's worth) */
	size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
	/* xlblocks array: one end-pointer per WAL buffer page */
	size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
	/* extra alignment padding for XLOG I/O buffers */
	size = add_size(size, XLOG_BLCKSZ);
	/* and the buffers themselves */
	size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));

	/*
	 * Note: we don't count ControlFileData, it comes out of the "slop factor"
	 * added by CreateSharedMemoryAndSemaphores.  This lets us use this
	 * routine again below to compute the actual allocation size.
	 */

	return size;
}
5161 
void
XLOGShmemInit(void)
{
	bool		foundCFile,
				foundXLog;
	char	   *allocptr;
	int			i;
	ControlFileData *localControlFile;

#ifdef WAL_DEBUG

	/*
	 * Create a memory context for WAL debugging that's exempt from the normal
	 * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
	 * an allocation fails, but wal_debug is not for production use anyway.
	 */
	if (walDebugCxt == NULL)
	{
		walDebugCxt = AllocSetContextCreate(TopMemoryContext,
											"WAL Debug",
											ALLOCSET_DEFAULT_SIZES);
		MemoryContextAllowInCriticalSection(walDebugCxt, true);
	}
#endif


	XLogCtl = (XLogCtlData *)
		ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);

	/*
	 * Save the pointer to the locally-allocated pg_control copy (made by
	 * LocalProcessControlFile, if any) before repointing ControlFile at the
	 * shared-memory struct.
	 */
	localControlFile = ControlFile;
	ControlFile = (ControlFileData *)
		ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);

	if (foundCFile || foundXLog)
	{
		/* both should be present or neither */
		Assert(foundCFile && foundXLog);

		/* Initialize local copy of WALInsertLocks */
		WALInsertLocks = XLogCtl->Insert.WALInsertLocks;

		/* The shared copy already exists; the local copy is no longer needed. */
		if (localControlFile)
			pfree(localControlFile);
		return;
	}
	memset(XLogCtl, 0, sizeof(XLogCtlData));

	/*
	 * Already have read control file locally, unless in bootstrap mode. Move
	 * contents into shared memory.
	 */
	if (localControlFile)
	{
		memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
		pfree(localControlFile);
	}

	/*
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
	 * multiple of the alignment for same, so no extra alignment padding is
	 * needed here.
	 */
	allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
	XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
	allocptr += sizeof(XLogRecPtr) * XLOGbuffers;


	/* WAL insertion locks. Ensure they're aligned to the full padded size */
	allocptr += sizeof(WALInsertLockPadded) -
		((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
	WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
		(WALInsertLockPadded *) allocptr;
	allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;

	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
	{
		LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
		WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
		WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
	}

	/*
	 * Align the start of the page buffers to a full xlog block size boundary.
	 * This simplifies some calculations in XLOG insertion. It is also
	 * required for O_DIRECT.
	 */
	allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
	XLogCtl->pages = allocptr;
	memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);

	/*
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
	 * in additional info.)
	 */
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
	XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
	XLogCtl->SharedHotStandbyActive = false;
	XLogCtl->SharedPromoteIsTriggered = false;
	XLogCtl->WalWriterSleeping = false;

	SpinLockInit(&XLogCtl->Insert.insertpos_lck);
	SpinLockInit(&XLogCtl->info_lck);
	SpinLockInit(&XLogCtl->ulsn_lck);
	InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
	ConditionVariableInit(&XLogCtl->recoveryNotPausedCV);
}
5269 
5270 /*
5271  * This func must be called ONCE on system install.  It creates pg_control
5272  * and the initial XLOG segment.
5273  */
void
BootStrapXLOG(void)
{
	CheckPoint	checkPoint;
	char	   *buffer;
	XLogPageHeader page;
	XLogLongPageHeader longpage;
	XLogRecord *record;
	char	   *recptr;
	bool		use_existent;
	uint64		sysidentifier;
	struct timeval tv;
	pg_crc32c	crc;

	/*
	 * Select a hopefully-unique system identifier code for this installation.
	 * We use the result of gettimeofday(), including the fractional seconds
	 * field, as being about as unique as we can easily get.  (Think not to
	 * use random(), since it hasn't been seeded and there's no portable way
	 * to seed it other than the system clock value...)  The upper half of the
	 * uint64 value is just the tv_sec part, while the lower half contains the
	 * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
	 * PID for a little extra uniqueness.  A person knowing this encoding can
	 * determine the initialization time of the installation, which could
	 * perhaps be useful sometimes.
	 */
	gettimeofday(&tv, NULL);
	sysidentifier = ((uint64) tv.tv_sec) << 32;
	sysidentifier |= ((uint64) tv.tv_usec) << 12;
	sysidentifier |= getpid() & 0xFFF;

	/* First timeline ID is always 1 */
	ThisTimeLineID = 1;

	/* page buffer must be aligned suitably for O_DIRECT */
	buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
	page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
	memset(page, 0, XLOG_BLCKSZ);

	/*
	 * Set up information for the initial checkpoint record
	 *
	 * The initial checkpoint record is written to the beginning of the WAL
	 * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
	 * used, so that we can use 0/0 to mean "before any valid WAL segment".
	 */
	checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
	checkPoint.ThisTimeLineID = ThisTimeLineID;
	checkPoint.PrevTimeLineID = ThisTimeLineID;
	checkPoint.fullPageWrites = fullPageWrites;
	checkPoint.nextXid =
		FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
	checkPoint.nextOid = FirstBootstrapObjectId;
	checkPoint.nextMulti = FirstMultiXactId;
	checkPoint.nextMultiOffset = 0;
	checkPoint.oldestXid = FirstNormalTransactionId;
	checkPoint.oldestXidDB = TemplateDbOid;
	checkPoint.oldestMulti = FirstMultiXactId;
	checkPoint.oldestMultiDB = TemplateDbOid;
	checkPoint.oldestCommitTsXid = InvalidTransactionId;
	checkPoint.newestCommitTsXid = InvalidTransactionId;
	checkPoint.time = (pg_time_t) time(NULL);
	checkPoint.oldestActiveXid = InvalidTransactionId;

	/* Seed the in-memory transaction state from the initial checkpoint. */
	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
	ShmemVariableCache->oidCount = 0;
	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
	AdvanceOldestClogXid(checkPoint.oldestXid);
	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
	SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
	SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);

	/* Set up the XLOG page header */
	page->xlp_magic = XLOG_PAGE_MAGIC;
	page->xlp_info = XLP_LONG_HEADER;
	page->xlp_tli = ThisTimeLineID;
	page->xlp_pageaddr = wal_segment_size;
	longpage = (XLogLongPageHeader) page;
	longpage->xlp_sysid = sysidentifier;
	longpage->xlp_seg_size = wal_segment_size;
	longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;

	/*
	 * Insert the initial checkpoint record.  The record is constructed by
	 * hand here (rather than via XLogInsert) because the WAL machinery is
	 * not running yet.
	 */
	recptr = ((char *) page + SizeOfXLogLongPHD);
	record = (XLogRecord *) recptr;
	record->xl_prev = 0;
	record->xl_xid = InvalidTransactionId;
	record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
	record->xl_rmid = RM_XLOG_ID;
	recptr += SizeOfXLogRecord;
	/* fill the XLogRecordDataHeaderShort struct */
	*(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
	*(recptr++) = sizeof(checkPoint);
	memcpy(recptr, &checkPoint, sizeof(checkPoint));
	recptr += sizeof(checkPoint);
	Assert(recptr - (char *) record == record->xl_tot_len);

	/* Compute the record CRC: payload first, then header up to xl_crc. */
	INIT_CRC32C(crc);
	COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
	COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
	FIN_CRC32C(crc);
	record->xl_crc = crc;

	/* Create first XLOG segment file */
	use_existent = false;
	openLogFile = XLogFileInit(1, &use_existent, false);

	/*
	 * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
	 * close the file again in a moment.
	 */

	/* Write the first page with the initial record */
	errno = 0;
	pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
	if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not write bootstrap write-ahead log file: %m")));
	}
	pgstat_report_wait_end();

	pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
	if (pg_fsync(openLogFile) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not fsync bootstrap write-ahead log file: %m")));
	pgstat_report_wait_end();

	if (close(openLogFile) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close bootstrap write-ahead log file: %m")));

	openLogFile = -1;

	/* Now create pg_control */
	InitControlFile(sysidentifier);
	ControlFile->time = checkPoint.time;
	ControlFile->checkPoint = checkPoint.redo;
	ControlFile->checkPointCopy = checkPoint;

	/* some additional ControlFile fields are set in WriteControlFile() */
	WriteControlFile();

	/* Bootstrap the commit log, too */
	BootStrapCLOG();
	BootStrapCommitTs();
	BootStrapSUBTRANS();
	BootStrapMultiXact();

	pfree(buffer);

	/*
	 * Force control file to be read - in contrast to normal processing we'd
	 * otherwise never run the checks and GUC related initializations therein.
	 */
	ReadControlFile();
}
5439 
5440 static char *
str_time(pg_time_t tnow)5441 str_time(pg_time_t tnow)
5442 {
5443 	static char buf[128];
5444 
5445 	pg_strftime(buf, sizeof(buf),
5446 				"%Y-%m-%d %H:%M:%S %Z",
5447 				pg_localtime(&tnow, log_timezone));
5448 
5449 	return buf;
5450 }
5451 
5452 /*
5453  * See if there are any recovery signal files and if so, set state for
5454  * recovery.
5455  *
5456  * See if there is a recovery command file (recovery.conf), and if so
5457  * throw an ERROR since as of PG12 we no longer recognize that.
5458  */
5459 static void
readRecoverySignalFile(void)5460 readRecoverySignalFile(void)
5461 {
5462 	struct stat stat_buf;
5463 
5464 	if (IsBootstrapProcessingMode())
5465 		return;
5466 
5467 	/*
5468 	 * Check for old recovery API file: recovery.conf
5469 	 */
5470 	if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
5471 		ereport(FATAL,
5472 				(errcode_for_file_access(),
5473 				 errmsg("using recovery command file \"%s\" is not supported",
5474 						RECOVERY_COMMAND_FILE)));
5475 
5476 	/*
5477 	 * Remove unused .done file, if present. Ignore if absent.
5478 	 */
5479 	unlink(RECOVERY_COMMAND_DONE);
5480 
5481 	/*
5482 	 * Check for recovery signal files and if found, fsync them since they
5483 	 * represent server state information.  We don't sweat too much about the
5484 	 * possibility of fsync failure, however.
5485 	 *
5486 	 * If present, standby signal file takes precedence. If neither is present
5487 	 * then we won't enter archive recovery.
5488 	 */
5489 	if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
5490 	{
5491 		int			fd;
5492 
5493 		fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
5494 							   S_IRUSR | S_IWUSR);
5495 		if (fd >= 0)
5496 		{
5497 			(void) pg_fsync(fd);
5498 			close(fd);
5499 		}
5500 		standby_signal_file_found = true;
5501 	}
5502 	else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
5503 	{
5504 		int			fd;
5505 
5506 		fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
5507 							   S_IRUSR | S_IWUSR);
5508 		if (fd >= 0)
5509 		{
5510 			(void) pg_fsync(fd);
5511 			close(fd);
5512 		}
5513 		recovery_signal_file_found = true;
5514 	}
5515 
5516 	StandbyModeRequested = false;
5517 	ArchiveRecoveryRequested = false;
5518 	if (standby_signal_file_found)
5519 	{
5520 		StandbyModeRequested = true;
5521 		ArchiveRecoveryRequested = true;
5522 	}
5523 	else if (recovery_signal_file_found)
5524 	{
5525 		StandbyModeRequested = false;
5526 		ArchiveRecoveryRequested = true;
5527 	}
5528 	else
5529 		return;
5530 
5531 	/*
5532 	 * We don't support standby mode in standalone backends; that requires
5533 	 * other processes such as the WAL receiver to be alive.
5534 	 */
5535 	if (StandbyModeRequested && !IsUnderPostmaster)
5536 		ereport(FATAL,
5537 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
5538 				 errmsg("standby mode is not supported by single-user servers")));
5539 }
5540 
/*
 * Sanity-check the recovery-related GUC settings, and resolve the recovery
 * target timeline.  Raises FATAL on unusable combinations.  Does nothing
 * unless archive recovery has been requested.
 */
static void
validateRecoveryParameters(void)
{
	if (!ArchiveRecoveryRequested)
		return;

	/*
	 * Check for compulsory parameters
	 */
	if (StandbyModeRequested)
	{
		/*
		 * A standby with neither a connection nor a restore command can only
		 * poll pg_wal; that's legal but probably unintended, so warn.
		 */
		if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
			(recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
			ereport(WARNING,
					(errmsg("specified neither primary_conninfo nor restore_command"),
					 errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
	}
	else
	{
		/* Plain archive recovery cannot work without a restore_command. */
		if (recoveryRestoreCommand == NULL ||
			strcmp(recoveryRestoreCommand, "") == 0)
			ereport(FATAL,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("must specify restore_command when standby mode is not enabled")));
	}

	/*
	 * Override any inconsistent requests. Note that this is a change of
	 * behaviour in 9.5; prior to this we simply ignored a request to pause if
	 * hot_standby = off, which was surprising behaviour.
	 */
	if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
		!EnableHotStandby)
		recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;

	/*
	 * Final parsing of recovery_target_time string; see also
	 * check_recovery_target_time().
	 */
	if (recoveryTarget == RECOVERY_TARGET_TIME)
	{
		recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
																	 CStringGetDatum(recovery_target_time_string),
																	 ObjectIdGetDatum(InvalidOid),
																	 Int32GetDatum(-1)));
	}

	/*
	 * If user specified recovery_target_timeline, validate it or compute the
	 * "latest" value.  We can't do this until after we've gotten the restore
	 * command and set InArchiveRecovery, because we need to fetch timeline
	 * history files from the archive.
	 */
	if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
	{
		TimeLineID	rtli = recoveryTargetTLIRequested;

		/* Timeline 1 does not have a history file, all else should */
		if (rtli != 1 && !existsTimeLineHistory(rtli))
			ereport(FATAL,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("recovery target timeline %u does not exist",
							rtli)));
		recoveryTargetTLI = rtli;
	}
	else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
	{
		/* We start the "latest" search from pg_control's timeline */
		recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
	}
	else
	{
		/*
		 * else we just use the recoveryTargetTLI as already read from
		 * ControlFile
		 */
		Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
	}
}
5620 
5621 /*
5622  * Exit archive-recovery state
5623  */
static void
exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
{
	char		xlogfname[MAXFNAMELEN];
	XLogSegNo	endLogSegNo;
	XLogSegNo	startLogSegNo;

	/* we always switch to a new timeline after archive recovery */
	Assert(endTLI != ThisTimeLineID);

	/*
	 * We are no longer in archive recovery state.
	 */
	InArchiveRecovery = false;

	/*
	 * Update min recovery point one last time.
	 */
	UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);

	/*
	 * If the ending log segment is still open, close it (to avoid problems on
	 * Windows with trying to rename or delete an open file).
	 */
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}

	/*
	 * Calculate the last segment on the old timeline, and the first segment
	 * on the new timeline. If the switch happens in the middle of a segment,
	 * they are the same, but if the switch happens exactly at a segment
	 * boundary, startLogSegNo will be endLogSegNo + 1.
	 */
	XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
	XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);

	/*
	 * Initialize the starting WAL segment for the new timeline. If the switch
	 * happens in the middle of a segment, copy data from the last WAL segment
	 * of the old timeline up to the switch point, to the starting WAL segment
	 * on the new timeline.
	 */
	if (endLogSegNo == startLogSegNo)
	{
		/*
		 * Make a copy of the file on the new timeline.
		 *
		 * Writing WAL isn't allowed yet, so there are no locking
		 * considerations. But we should be just as tense as XLogFileInit to
		 * avoid emplacing a bogus file.
		 */
		XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
					 XLogSegmentOffset(endOfLog, wal_segment_size));
	}
	else
	{
		/*
		 * The switch happened at a segment boundary, so just create the next
		 * segment on the new timeline.
		 */
		bool		use_existent = true;
		int			fd;

		fd = XLogFileInit(startLogSegNo, &use_existent, true);

		if (close(fd) != 0)
		{
			/* shadows the outer xlogfname; used only for this error report */
			char		xlogfname[MAXFNAMELEN];
			int			save_errno = errno;

			XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo,
						 wal_segment_size);
			/* restore errno so %m reports the close() failure */
			errno = save_errno;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not close file \"%s\": %m", xlogfname)));
		}
	}

	/*
	 * Let's just make real sure there are not .ready or .done flags posted
	 * for the new segment.
	 */
	XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, wal_segment_size);
	XLogArchiveCleanup(xlogfname);

	/*
	 * Remove the signal files out of the way, so that we don't accidentally
	 * re-enter archive recovery mode in a subsequent crash.
	 */
	if (standby_signal_file_found)
		durable_unlink(STANDBY_SIGNAL_FILE, FATAL);

	if (recovery_signal_file_found)
		durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);

	ereport(LOG,
			(errmsg("archive recovery complete")));
}
5726 
5727 /*
5728  * Extract timestamp from WAL record.
5729  *
5730  * If the record contains a timestamp, returns true, and saves the timestamp
5731  * in *recordXtime. If the record type has no timestamp, returns false.
5732  * Currently, only transaction commit/abort records and restore points contain
5733  * timestamps.
5734  */
5735 static bool
getRecordTimestamp(XLogReaderState * record,TimestampTz * recordXtime)5736 getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
5737 {
5738 	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5739 	uint8		xact_info = info & XLOG_XACT_OPMASK;
5740 	uint8		rmid = XLogRecGetRmid(record);
5741 
5742 	if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5743 	{
5744 		*recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5745 		return true;
5746 	}
5747 	if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
5748 							   xact_info == XLOG_XACT_COMMIT_PREPARED))
5749 	{
5750 		*recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5751 		return true;
5752 	}
5753 	if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
5754 							   xact_info == XLOG_XACT_ABORT_PREPARED))
5755 	{
5756 		*recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5757 		return true;
5758 	}
5759 	return false;
5760 }
5761 
5762 /*
5763  * For point-in-time recovery, this function decides whether we want to
5764  * stop applying the XLOG before the current record.
5765  *
5766  * Returns true if we are stopping, false otherwise. If stopping, some
5767  * information is saved in recoveryStopXid et al for use in annotating the
5768  * new timeline's history file.
5769  */
static bool
recoveryStopsBefore(XLogReaderState *record)
{
	bool		stopsHere = false;
	uint8		xact_info;
	bool		isCommit;
	TimestampTz recordXtime = 0;
	TransactionId recordXid;

	/*
	 * Ignore recovery target settings when not in archive recovery (meaning
	 * we are in crash recovery).
	 */
	if (!ArchiveRecoveryRequested)
		return false;

	/* Check if we should stop as soon as reaching consistency */
	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
	{
		ereport(LOG,
				(errmsg("recovery stopping after reaching consistency")));

		recoveryStopAfter = false;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopTime = 0;
		recoveryStopName[0] = '\0';
		return true;
	}

	/*
	 * Check if target LSN has been reached.  The exclusive (non-inclusive)
	 * LSN target is handled here, before applying the record; the inclusive
	 * case is handled by recoveryStopsAfter.
	 */
	if (recoveryTarget == RECOVERY_TARGET_LSN &&
		!recoveryTargetInclusive &&
		record->ReadRecPtr >= recoveryTargetLSN)
	{
		recoveryStopAfter = false;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = record->ReadRecPtr;
		recoveryStopTime = 0;
		recoveryStopName[0] = '\0';
		ereport(LOG,
				(errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
						LSN_FORMAT_ARGS(recoveryStopLSN))));
		return true;
	}

	/* Otherwise we only consider stopping before COMMIT or ABORT records. */
	if (XLogRecGetRmid(record) != RM_XACT_ID)
		return false;

	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;

	/*
	 * Classify the record and extract the XID.  For prepared-transaction
	 * records, the XID of interest is the two-phase transaction's XID, which
	 * is stored in the record body rather than the record header.
	 */
	if (xact_info == XLOG_XACT_COMMIT)
	{
		isCommit = true;
		recordXid = XLogRecGetXid(record);
	}
	else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
	{
		xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
		xl_xact_parsed_commit parsed;

		isCommit = true;
		ParseCommitRecord(XLogRecGetInfo(record),
						  xlrec,
						  &parsed);
		recordXid = parsed.twophase_xid;
	}
	else if (xact_info == XLOG_XACT_ABORT)
	{
		isCommit = false;
		recordXid = XLogRecGetXid(record);
	}
	else if (xact_info == XLOG_XACT_ABORT_PREPARED)
	{
		xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
		xl_xact_parsed_abort parsed;

		isCommit = false;
		ParseAbortRecord(XLogRecGetInfo(record),
						 xlrec,
						 &parsed);
		recordXid = parsed.twophase_xid;
	}
	else
		return false;

	if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
	{
		/*
		 * There can be only one transaction end record with this exact
		 * transactionid
		 *
		 * when testing for an xid, we MUST test for equality only, since
		 * transactions are numbered in the order they start, not the order
		 * they complete. A higher numbered xid will complete before you about
		 * 50% of the time...
		 */
		stopsHere = (recordXid == recoveryTargetXid);
	}

	if (recoveryTarget == RECOVERY_TARGET_TIME &&
		getRecordTimestamp(record, &recordXtime))
	{
		/*
		 * There can be many transactions that share the same commit time, so
		 * we stop after the last one, if we are inclusive, or stop at the
		 * first one if we are exclusive
		 */
		if (recoveryTargetInclusive)
			stopsHere = (recordXtime > recoveryTargetTime);
		else
			stopsHere = (recordXtime >= recoveryTargetTime);
	}

	if (stopsHere)
	{
		/* Save stop-point info for annotating the new timeline history file. */
		recoveryStopAfter = false;
		recoveryStopXid = recordXid;
		recoveryStopTime = recordXtime;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopName[0] = '\0';

		if (isCommit)
		{
			ereport(LOG,
					(errmsg("recovery stopping before commit of transaction %u, time %s",
							recoveryStopXid,
							timestamptz_to_str(recoveryStopTime))));
		}
		else
		{
			ereport(LOG,
					(errmsg("recovery stopping before abort of transaction %u, time %s",
							recoveryStopXid,
							timestamptz_to_str(recoveryStopTime))));
		}
	}

	return stopsHere;
}
5911 
5912 /*
5913  * Same as recoveryStopsBefore, but called after applying the record.
5914  *
5915  * We also track the timestamp of the latest applied COMMIT/ABORT
5916  * record in XLogCtl->recoveryLastXTime.
5917  */
static bool
recoveryStopsAfter(XLogReaderState *record)
{
	uint8		info;
	uint8		xact_info;
	uint8		rmid;
	TimestampTz recordXtime;

	/*
	 * Ignore recovery target settings when not in archive recovery (meaning
	 * we are in crash recovery).
	 */
	if (!ArchiveRecoveryRequested)
		return false;

	info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
	rmid = XLogRecGetRmid(record);

	/*
	 * There can be many restore points that share the same name; we stop at
	 * the first one.
	 */
	if (recoveryTarget == RECOVERY_TARGET_NAME &&
		rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
	{
		xl_restore_point *recordRestorePointData;

		recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);

		if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
		{
			recoveryStopAfter = true;
			recoveryStopXid = InvalidTransactionId;
			recoveryStopLSN = InvalidXLogRecPtr;
			(void) getRecordTimestamp(record, &recoveryStopTime);
			strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);

			ereport(LOG,
					(errmsg("recovery stopping at restore point \"%s\", time %s",
							recoveryStopName,
							timestamptz_to_str(recoveryStopTime))));
			return true;
		}
	}

	/*
	 * Check if the target LSN has been reached.  The inclusive case is
	 * handled here, after applying the record; the exclusive case is handled
	 * by recoveryStopsBefore.
	 */
	if (recoveryTarget == RECOVERY_TARGET_LSN &&
		recoveryTargetInclusive &&
		record->ReadRecPtr >= recoveryTargetLSN)
	{
		recoveryStopAfter = true;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = record->ReadRecPtr;
		recoveryStopTime = 0;
		recoveryStopName[0] = '\0';
		ereport(LOG,
				(errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
						LSN_FORMAT_ARGS(recoveryStopLSN))));
		return true;
	}

	if (rmid != RM_XACT_ID)
		return false;

	xact_info = info & XLOG_XACT_OPMASK;

	if (xact_info == XLOG_XACT_COMMIT ||
		xact_info == XLOG_XACT_COMMIT_PREPARED ||
		xact_info == XLOG_XACT_ABORT ||
		xact_info == XLOG_XACT_ABORT_PREPARED)
	{
		TransactionId recordXid;

		/* Update the last applied transaction timestamp */
		if (getRecordTimestamp(record, &recordXtime))
			SetLatestXTime(recordXtime);

		/*
		 * Extract the XID of the committed/aborted transaction.  For
		 * prepared-transaction records the XID lives in the record body, not
		 * the record header.
		 */
		if (xact_info == XLOG_XACT_COMMIT_PREPARED)
		{
			xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
			xl_xact_parsed_commit parsed;

			ParseCommitRecord(XLogRecGetInfo(record),
							  xlrec,
							  &parsed);
			recordXid = parsed.twophase_xid;
		}
		else if (xact_info == XLOG_XACT_ABORT_PREPARED)
		{
			xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
			xl_xact_parsed_abort parsed;

			ParseAbortRecord(XLogRecGetInfo(record),
							 xlrec,
							 &parsed);
			recordXid = parsed.twophase_xid;
		}
		else
			recordXid = XLogRecGetXid(record);

		/*
		 * There can be only one transaction end record with this exact
		 * transactionid
		 *
		 * when testing for an xid, we MUST test for equality only, since
		 * transactions are numbered in the order they start, not the order
		 * they complete. A higher numbered xid will complete before you about
		 * 50% of the time...
		 */
		if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
			recordXid == recoveryTargetXid)
		{
			recoveryStopAfter = true;
			recoveryStopXid = recordXid;
			recoveryStopTime = recordXtime;
			recoveryStopLSN = InvalidXLogRecPtr;
			recoveryStopName[0] = '\0';

			if (xact_info == XLOG_XACT_COMMIT ||
				xact_info == XLOG_XACT_COMMIT_PREPARED)
			{
				ereport(LOG,
						(errmsg("recovery stopping after commit of transaction %u, time %s",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
			}
			else if (xact_info == XLOG_XACT_ABORT ||
					 xact_info == XLOG_XACT_ABORT_PREPARED)
			{
				ereport(LOG,
						(errmsg("recovery stopping after abort of transaction %u, time %s",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
			}
			return true;
		}
	}

	/* Check if we should stop as soon as reaching consistency */
	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
	{
		ereport(LOG,
				(errmsg("recovery stopping after reaching consistency")));

		recoveryStopAfter = true;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopTime = 0;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopName[0] = '\0';
		return true;
	}

	return false;
}
6073 
/*
 * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
 *
 * endOfRecovery is true if the recovery target is reached and
 * the paused state starts at the end of recovery because of
 * recovery_target_action=pause, and false otherwise.
 *
 * Called from the startup process only.  Returns immediately if pausing
 * is not sensible (hot standby not active, or promotion already
 * triggered); otherwise loops until the pause is released or a promotion
 * request arrives.
 */
static void
recoveryPausesHere(bool endOfRecovery)
{
	/* Don't pause unless users can connect! */
	if (!LocalHotStandbyActive)
		return;

	/* Don't pause after standby promotion has been triggered */
	if (LocalPromoteIsTriggered)
		return;

	/*
	 * The hint differs by context: once the recovery target has been
	 * reached, resuming means promoting rather than continuing replay.
	 */
	if (endOfRecovery)
		ereport(LOG,
				(errmsg("pausing at the end of recovery"),
				 errhint("Execute pg_wal_replay_resume() to promote.")));
	else
		ereport(LOG,
				(errmsg("recovery has paused"),
				 errhint("Execute pg_wal_replay_resume() to continue.")));

	/* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
	while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
	{
		/* Allow shutdown requests etc. to be serviced while paused */
		HandleStartupProcInterrupts();
		if (CheckForStandbyTrigger())
			return;

		/*
		 * If recovery pause is requested then set it paused.  While we are in
		 * the loop, user might resume and pause again so set this every time.
		 */
		ConfirmRecoveryPaused();

		/*
		 * We wait on a condition variable that will wake us as soon as the
		 * pause ends, but we use a timeout so we can check the above exit
		 * condition periodically too.
		 */
		ConditionVariableTimedSleep(&XLogCtl->recoveryNotPausedCV, 1000,
									WAIT_EVENT_RECOVERY_PAUSE);
	}
	ConditionVariableCancelSleep();
}
6124 
6125 /*
6126  * Get the current state of the recovery pause request.
6127  */
6128 RecoveryPauseState
GetRecoveryPauseState(void)6129 GetRecoveryPauseState(void)
6130 {
6131 	RecoveryPauseState state;
6132 
6133 	SpinLockAcquire(&XLogCtl->info_lck);
6134 	state = XLogCtl->recoveryPauseState;
6135 	SpinLockRelease(&XLogCtl->info_lck);
6136 
6137 	return state;
6138 }
6139 
6140 /*
6141  * Set the recovery pause state.
6142  *
6143  * If recovery pause is requested then sets the recovery pause state to
6144  * 'pause requested' if it is not already 'paused'.  Otherwise, sets it
6145  * to 'not paused' to resume the recovery.  The recovery pause will be
6146  * confirmed by the ConfirmRecoveryPaused.
6147  */
6148 void
SetRecoveryPause(bool recoveryPause)6149 SetRecoveryPause(bool recoveryPause)
6150 {
6151 	SpinLockAcquire(&XLogCtl->info_lck);
6152 
6153 	if (!recoveryPause)
6154 		XLogCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
6155 	else if (XLogCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
6156 		XLogCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
6157 
6158 	SpinLockRelease(&XLogCtl->info_lck);
6159 
6160 	if (!recoveryPause)
6161 		ConditionVariableBroadcast(&XLogCtl->recoveryNotPausedCV);
6162 }
6163 
6164 /*
6165  * Confirm the recovery pause by setting the recovery pause state to
6166  * RECOVERY_PAUSED.
6167  */
6168 static void
ConfirmRecoveryPaused(void)6169 ConfirmRecoveryPaused(void)
6170 {
6171 	/* If recovery pause is requested then set it paused */
6172 	SpinLockAcquire(&XLogCtl->info_lck);
6173 	if (XLogCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
6174 		XLogCtl->recoveryPauseState = RECOVERY_PAUSED;
6175 	SpinLockRelease(&XLogCtl->info_lck);
6176 }
6177 
/*
 * When recovery_min_apply_delay is set, we wait long enough to make sure
 * certain record types are applied at least that interval behind the primary.
 *
 * Returns true if we waited.
 *
 * Note that the delay is calculated between the WAL record log time and
 * the current time on standby. We would prefer to keep track of when this
 * standby received each WAL record, which would allow a more consistent
 * approach and one not affected by time synchronisation issues, but that
 * is significantly more effort and complexity for little actual gain in
 * usability.
 */
static bool
recoveryApplyDelay(XLogReaderState *record)
{
	uint8		xact_info;
	TimestampTz xtime;
	TimestampTz delayUntil;
	long		msecs;

	/* nothing to do if no delay configured */
	if (recovery_min_apply_delay <= 0)
		return false;

	/* no delay is applied on a database not yet consistent */
	if (!reachedConsistency)
		return false;

	/* nothing to do if crash recovery is requested */
	if (!ArchiveRecoveryRequested)
		return false;

	/*
	 * Is it a COMMIT record?
	 *
	 * We deliberately choose not to delay aborts since they have no effect on
	 * MVCC. We already allow replay of records that don't have a timestamp,
	 * so there is already opportunity for issues caused by early conflicts on
	 * standbys.
	 */
	if (XLogRecGetRmid(record) != RM_XACT_ID)
		return false;

	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;

	if (xact_info != XLOG_XACT_COMMIT &&
		xact_info != XLOG_XACT_COMMIT_PREPARED)
		return false;

	/* Records without an extractable timestamp are replayed immediately */
	if (!getRecordTimestamp(record, &xtime))
		return false;

	delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);

	/*
	 * Exit without arming the latch if it's already past time to apply this
	 * record
	 */
	msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
	if (msecs <= 0)
		return false;

	while (true)
	{
		/*
		 * Reset the latch before re-checking conditions, so a wakeup that
		 * arrives between the checks and WaitLatch() is not lost.
		 */
		ResetLatch(&XLogCtl->recoveryWakeupLatch);

		/*
		 * This might change recovery_min_apply_delay or the trigger file's
		 * location.
		 */
		HandleStartupProcInterrupts();

		/* A promotion request cancels any remaining delay */
		if (CheckForStandbyTrigger())
			break;

		/*
		 * Recalculate delayUntil as recovery_min_apply_delay could have
		 * changed while waiting in this loop.
		 */
		delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);

		/*
		 * Wait for difference between GetCurrentTimestamp() and delayUntil.
		 */
		msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
												delayUntil);

		if (msecs <= 0)
			break;

		elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);

		(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
						 msecs,
						 WAIT_EVENT_RECOVERY_APPLY_DELAY);
	}
	return true;
}
6278 
6279 /*
6280  * Save timestamp of latest processed commit/abort record.
6281  *
6282  * We keep this in XLogCtl, not a simple static variable, so that it can be
6283  * seen by processes other than the startup process.  Note in particular
6284  * that CreateRestartPoint is executed in the checkpointer.
6285  */
6286 static void
SetLatestXTime(TimestampTz xtime)6287 SetLatestXTime(TimestampTz xtime)
6288 {
6289 	SpinLockAcquire(&XLogCtl->info_lck);
6290 	XLogCtl->recoveryLastXTime = xtime;
6291 	SpinLockRelease(&XLogCtl->info_lck);
6292 }
6293 
6294 /*
6295  * Fetch timestamp of latest processed commit/abort record.
6296  */
6297 TimestampTz
GetLatestXTime(void)6298 GetLatestXTime(void)
6299 {
6300 	TimestampTz xtime;
6301 
6302 	SpinLockAcquire(&XLogCtl->info_lck);
6303 	xtime = XLogCtl->recoveryLastXTime;
6304 	SpinLockRelease(&XLogCtl->info_lck);
6305 
6306 	return xtime;
6307 }
6308 
6309 /*
6310  * Save timestamp of the next chunk of WAL records to apply.
6311  *
6312  * We keep this in XLogCtl, not a simple static variable, so that it can be
6313  * seen by all backends.
6314  */
6315 static void
SetCurrentChunkStartTime(TimestampTz xtime)6316 SetCurrentChunkStartTime(TimestampTz xtime)
6317 {
6318 	SpinLockAcquire(&XLogCtl->info_lck);
6319 	XLogCtl->currentChunkStartTime = xtime;
6320 	SpinLockRelease(&XLogCtl->info_lck);
6321 }
6322 
/*
 * Fetch timestamp of the current chunk of WAL records being applied,
 * as stored by SetCurrentChunkStartTime().
 *
 * (The previous comment here — "latest processed commit/abort record" —
 * described GetLatestXTime(), not this function.)
 */
TimestampTz
GetCurrentChunkReplayStartTime(void)
{
	TimestampTz xtime;

	SpinLockAcquire(&XLogCtl->info_lck);
	xtime = XLogCtl->currentChunkStartTime;
	SpinLockRelease(&XLogCtl->info_lck);

	return xtime;
}
6338 
6339 /*
6340  * Returns time of receipt of current chunk of XLOG data, as well as
6341  * whether it was received from streaming replication or from archives.
6342  */
6343 void
GetXLogReceiptTime(TimestampTz * rtime,bool * fromStream)6344 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
6345 {
6346 	/*
6347 	 * This must be executed in the startup process, since we don't export the
6348 	 * relevant state to shared memory.
6349 	 */
6350 	Assert(InRecovery);
6351 
6352 	*rtime = XLogReceiptTime;
6353 	*fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
6354 }
6355 
/*
 * Verify that an integer recovery parameter is at least the value recorded
 * by the primary; if not, pause recovery (when hot standby is active) so an
 * administrator can intervene, then abort the startup process with FATAL.
 *
 * Note that text field supplied is a parameter name and does not require
 * translation
 */
static void
RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
{
	if (currValue < minValue)
	{
		if (LocalHotStandbyActive)
		{
			/* emit the promotion warning only once per pause */
			bool		warned_for_promote = false;

			ereport(WARNING,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("hot standby is not possible because of insufficient parameter settings"),
					 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
							   param_name,
							   currValue,
							   minValue)));

			/*
			 * Pause so connected hot-standby sessions survive until the admin
			 * reacts; unpausing (below) leads to the FATAL exit.
			 */
			SetRecoveryPause(true);

			ereport(LOG,
					(errmsg("recovery has paused"),
					 errdetail("If recovery is unpaused, the server will shut down."),
					 errhint("You can then restart the server after making the necessary configuration changes.")));

			while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
			{
				HandleStartupProcInterrupts();

				if (CheckForStandbyTrigger())
				{
					if (!warned_for_promote)
						ereport(WARNING,
								(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
								 errmsg("promotion is not possible because of insufficient parameter settings"),

						/*
						 * Repeat the detail from above so it's easy to find
						 * in the log.
						 */
								 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
										   param_name,
										   currValue,
										   minValue),
								 errhint("Restart the server after making the necessary configuration changes.")));
					warned_for_promote = true;
				}

				/*
				 * If recovery pause is requested then set it paused.  While
				 * we are in the loop, user might resume and pause again so
				 * set this every time.
				 */
				ConfirmRecoveryPaused();

				/*
				 * We wait on a condition variable that will wake us as soon
				 * as the pause ends, but we use a timeout so we can check the
				 * above conditions periodically too.
				 */
				ConditionVariableTimedSleep(&XLogCtl->recoveryNotPausedCV, 1000,
											WAIT_EVENT_RECOVERY_PAUSE);
			}
			ConditionVariableCancelSleep();
		}

		/* Reached on resume, or immediately when hot standby is inactive */
		ereport(FATAL,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("recovery aborted because of insufficient parameter settings"),
		/* Repeat the detail from above so it's easy to find in the log. */
				 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
						   param_name,
						   currValue,
						   minValue),
				 errhint("You can restart the server after making the necessary configuration changes.")));
	}
}
6436 
/*
 * Check to see if required parameters are set high enough on this server
 * for various aspects of recovery operation.
 *
 * Values recorded in pg_control (ControlFile) reflect the primary's
 * settings; each check compares the local setting against that minimum.
 *
 * Note that all the parameters which this function tests need to be
 * listed in Administrator's Overview section in high-availability.sgml.
 * If you change them, don't forget to update the list.
 */
static void
CheckRequiredParameterValues(void)
{
	/*
	 * For archive recovery, the WAL must be generated with at least 'replica'
	 * wal_level.
	 */
	if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
	{
		ereport(FATAL,
				(errmsg("WAL was generated with wal_level=minimal, cannot continue recovering"),
				 errdetail("This happens if you temporarily set wal_level=minimal on the server."),
				 errhint("Use a backup taken after setting wal_level to higher than minimal.")));
	}

	/*
	 * For Hot Standby, the WAL must be generated with 'replica' mode, and we
	 * must have at least as many backend slots as the primary.
	 */
	if (ArchiveRecoveryRequested && EnableHotStandby)
	{
		/* We ignore autovacuum_max_workers when we make this test. */
		RecoveryRequiresIntParameter("max_connections",
									 MaxConnections,
									 ControlFile->MaxConnections);
		RecoveryRequiresIntParameter("max_worker_processes",
									 max_worker_processes,
									 ControlFile->max_worker_processes);
		RecoveryRequiresIntParameter("max_wal_senders",
									 max_wal_senders,
									 ControlFile->max_wal_senders);
		RecoveryRequiresIntParameter("max_prepared_transactions",
									 max_prepared_xacts,
									 ControlFile->max_prepared_xacts);
		RecoveryRequiresIntParameter("max_locks_per_transaction",
									 max_locks_per_xact,
									 ControlFile->max_locks_per_xact);
	}
}
6484 
6485 /*
6486  * This must be called ONCE during postmaster or standalone-backend startup
6487  */
6488 void
StartupXLOG(void)6489 StartupXLOG(void)
6490 {
6491 	XLogCtlInsert *Insert;
6492 	CheckPoint	checkPoint;
6493 	bool		wasShutdown;
6494 	bool		reachedRecoveryTarget = false;
6495 	bool		haveBackupLabel = false;
6496 	bool		haveTblspcMap = false;
6497 	XLogRecPtr	RecPtr,
6498 				checkPointLoc,
6499 				EndOfLog;
6500 	TimeLineID	EndOfLogTLI;
6501 	TimeLineID	PrevTimeLineID;
6502 	XLogRecord *record;
6503 	TransactionId oldestActiveXID;
6504 	bool		backupEndRequired = false;
6505 	bool		backupFromStandby = false;
6506 	DBState		dbstate_at_startup;
6507 	XLogReaderState *xlogreader;
6508 	XLogPageReadPrivate private;
6509 	bool		promoted = false;
6510 	struct stat st;
6511 
6512 	/*
6513 	 * We should have an aux process resource owner to use, and we should not
6514 	 * be in a transaction that's installed some other resowner.
6515 	 */
6516 	Assert(AuxProcessResourceOwner != NULL);
6517 	Assert(CurrentResourceOwner == NULL ||
6518 		   CurrentResourceOwner == AuxProcessResourceOwner);
6519 	CurrentResourceOwner = AuxProcessResourceOwner;
6520 
6521 	/*
6522 	 * Check that contents look valid.
6523 	 */
6524 	if (!XRecOffIsValid(ControlFile->checkPoint))
6525 		ereport(FATAL,
6526 				(errmsg("control file contains invalid checkpoint location")));
6527 
6528 	switch (ControlFile->state)
6529 	{
6530 		case DB_SHUTDOWNED:
6531 
6532 			/*
6533 			 * This is the expected case, so don't be chatty in standalone
6534 			 * mode
6535 			 */
6536 			ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6537 					(errmsg("database system was shut down at %s",
6538 							str_time(ControlFile->time))));
6539 			break;
6540 
6541 		case DB_SHUTDOWNED_IN_RECOVERY:
6542 			ereport(LOG,
6543 					(errmsg("database system was shut down in recovery at %s",
6544 							str_time(ControlFile->time))));
6545 			break;
6546 
6547 		case DB_SHUTDOWNING:
6548 			ereport(LOG,
6549 					(errmsg("database system shutdown was interrupted; last known up at %s",
6550 							str_time(ControlFile->time))));
6551 			break;
6552 
6553 		case DB_IN_CRASH_RECOVERY:
6554 			ereport(LOG,
6555 					(errmsg("database system was interrupted while in recovery at %s",
6556 							str_time(ControlFile->time)),
6557 					 errhint("This probably means that some data is corrupted and"
6558 							 " you will have to use the last backup for recovery.")));
6559 			break;
6560 
6561 		case DB_IN_ARCHIVE_RECOVERY:
6562 			ereport(LOG,
6563 					(errmsg("database system was interrupted while in recovery at log time %s",
6564 							str_time(ControlFile->checkPointCopy.time)),
6565 					 errhint("If this has occurred more than once some data might be corrupted"
6566 							 " and you might need to choose an earlier recovery target.")));
6567 			break;
6568 
6569 		case DB_IN_PRODUCTION:
6570 			ereport(LOG,
6571 					(errmsg("database system was interrupted; last known up at %s",
6572 							str_time(ControlFile->time))));
6573 			break;
6574 
6575 		default:
6576 			ereport(FATAL,
6577 					(errmsg("control file contains invalid database cluster state")));
6578 	}
6579 
6580 	/* This is just to allow attaching to startup process with a debugger */
6581 #ifdef XLOG_REPLAY_DELAY
6582 	if (ControlFile->state != DB_SHUTDOWNED)
6583 		pg_usleep(60000000L);
6584 #endif
6585 
6586 	/*
6587 	 * Verify that pg_wal and pg_wal/archive_status exist.  In cases where
6588 	 * someone has performed a copy for PITR, these directories may have been
6589 	 * excluded and need to be re-created.
6590 	 */
6591 	ValidateXLOGDirectoryStructure();
6592 
6593 	/*----------
6594 	 * If we previously crashed, perform a couple of actions:
6595 	 *
6596 	 * - The pg_wal directory may still include some temporary WAL segments
6597 	 *   used when creating a new segment, so perform some clean up to not
6598 	 *   bloat this path.  This is done first as there is no point to sync
6599 	 *   this temporary data.
6600 	 *
6601 	 * - There might be data which we had written, intending to fsync it, but
6602 	 *   which we had not actually fsync'd yet.  Therefore, a power failure in
6603 	 *   the near future might cause earlier unflushed writes to be lost, even
6604 	 *   though more recent data written to disk from here on would be
6605 	 *   persisted.  To avoid that, fsync the entire data directory.
6606 	 */
6607 	if (ControlFile->state != DB_SHUTDOWNED &&
6608 		ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
6609 	{
6610 		RemoveTempXlogFiles();
6611 		SyncDataDirectory();
6612 	}
6613 
6614 	/*
6615 	 * Initialize on the assumption we want to recover to the latest timeline
6616 	 * that's active according to pg_control.
6617 	 */
6618 	if (ControlFile->minRecoveryPointTLI >
6619 		ControlFile->checkPointCopy.ThisTimeLineID)
6620 		recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6621 	else
6622 		recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6623 
6624 	/*
6625 	 * Check for signal files, and if so set up state for offline recovery
6626 	 */
6627 	readRecoverySignalFile();
6628 	validateRecoveryParameters();
6629 
6630 	if (ArchiveRecoveryRequested)
6631 	{
6632 		if (StandbyModeRequested)
6633 			ereport(LOG,
6634 					(errmsg("entering standby mode")));
6635 		else if (recoveryTarget == RECOVERY_TARGET_XID)
6636 			ereport(LOG,
6637 					(errmsg("starting point-in-time recovery to XID %u",
6638 							recoveryTargetXid)));
6639 		else if (recoveryTarget == RECOVERY_TARGET_TIME)
6640 			ereport(LOG,
6641 					(errmsg("starting point-in-time recovery to %s",
6642 							timestamptz_to_str(recoveryTargetTime))));
6643 		else if (recoveryTarget == RECOVERY_TARGET_NAME)
6644 			ereport(LOG,
6645 					(errmsg("starting point-in-time recovery to \"%s\"",
6646 							recoveryTargetName)));
6647 		else if (recoveryTarget == RECOVERY_TARGET_LSN)
6648 			ereport(LOG,
6649 					(errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
6650 							LSN_FORMAT_ARGS(recoveryTargetLSN))));
6651 		else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6652 			ereport(LOG,
6653 					(errmsg("starting point-in-time recovery to earliest consistent point")));
6654 		else
6655 			ereport(LOG,
6656 					(errmsg("starting archive recovery")));
6657 	}
6658 
6659 	/*
6660 	 * Take ownership of the wakeup latch if we're going to sleep during
6661 	 * recovery.
6662 	 */
6663 	if (ArchiveRecoveryRequested)
6664 		OwnLatch(&XLogCtl->recoveryWakeupLatch);
6665 
6666 	/* Set up XLOG reader facility */
6667 	MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6668 	xlogreader =
6669 		XLogReaderAllocate(wal_segment_size, NULL,
6670 						   XL_ROUTINE(.page_read = &XLogPageRead,
6671 									  .segment_open = NULL,
6672 									  .segment_close = wal_segment_close),
6673 						   &private);
6674 	if (!xlogreader)
6675 		ereport(ERROR,
6676 				(errcode(ERRCODE_OUT_OF_MEMORY),
6677 				 errmsg("out of memory"),
6678 				 errdetail("Failed while allocating a WAL reading processor.")));
6679 	xlogreader->system_identifier = ControlFile->system_identifier;
6680 
6681 	/*
6682 	 * Allocate two page buffers dedicated to WAL consistency checks.  We do
6683 	 * it this way, rather than just making static arrays, for two reasons:
6684 	 * (1) no need to waste the storage in most instantiations of the backend;
6685 	 * (2) a static char array isn't guaranteed to have any particular
6686 	 * alignment, whereas palloc() will provide MAXALIGN'd storage.
6687 	 */
6688 	replay_image_masked = (char *) palloc(BLCKSZ);
6689 	primary_image_masked = (char *) palloc(BLCKSZ);
6690 
6691 	if (read_backup_label(&checkPointLoc, &backupEndRequired,
6692 						  &backupFromStandby))
6693 	{
6694 		List	   *tablespaces = NIL;
6695 
6696 		/*
6697 		 * Archive recovery was requested, and thanks to the backup label
6698 		 * file, we know how far we need to replay to reach consistency. Enter
6699 		 * archive recovery directly.
6700 		 */
6701 		InArchiveRecovery = true;
6702 		if (StandbyModeRequested)
6703 			StandbyMode = true;
6704 
6705 		/*
6706 		 * When a backup_label file is present, we want to roll forward from
6707 		 * the checkpoint it identifies, rather than using pg_control.
6708 		 */
6709 		record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6710 		if (record != NULL)
6711 		{
6712 			memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6713 			wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6714 			ereport(DEBUG1,
6715 					(errmsg_internal("checkpoint record is at %X/%X",
6716 									 LSN_FORMAT_ARGS(checkPointLoc))));
6717 			InRecovery = true;	/* force recovery even if SHUTDOWNED */
6718 
6719 			/*
6720 			 * Make sure that REDO location exists. This may not be the case
6721 			 * if there was a crash during an online backup, which left a
6722 			 * backup_label around that references a WAL segment that's
6723 			 * already been archived.
6724 			 */
6725 			if (checkPoint.redo < checkPointLoc)
6726 			{
6727 				XLogBeginRead(xlogreader, checkPoint.redo);
6728 				if (!ReadRecord(xlogreader, LOG, false))
6729 					ereport(FATAL,
6730 							(errmsg("could not find redo location referenced by checkpoint record"),
6731 							 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
6732 									 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
6733 									 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
6734 									 DataDir, DataDir, DataDir)));
6735 			}
6736 		}
6737 		else
6738 		{
6739 			ereport(FATAL,
6740 					(errmsg("could not locate required checkpoint record"),
6741 					 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
6742 							 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
6743 							 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
6744 							 DataDir, DataDir, DataDir)));
6745 			wasShutdown = false;	/* keep compiler quiet */
6746 		}
6747 
6748 		/* read the tablespace_map file if present and create symlinks. */
6749 		if (read_tablespace_map(&tablespaces))
6750 		{
6751 			ListCell   *lc;
6752 
6753 			foreach(lc, tablespaces)
6754 			{
6755 				tablespaceinfo *ti = lfirst(lc);
6756 				char	   *linkloc;
6757 
6758 				linkloc = psprintf("pg_tblspc/%s", ti->oid);
6759 
6760 				/*
6761 				 * Remove the existing symlink if any and Create the symlink
6762 				 * under PGDATA.
6763 				 */
6764 				remove_tablespace_symlink(linkloc);
6765 
6766 				if (symlink(ti->path, linkloc) < 0)
6767 					ereport(ERROR,
6768 							(errcode_for_file_access(),
6769 							 errmsg("could not create symbolic link \"%s\": %m",
6770 									linkloc)));
6771 
6772 				pfree(ti->oid);
6773 				pfree(ti->path);
6774 				pfree(ti);
6775 			}
6776 
6777 			/* set flag to delete it later */
6778 			haveTblspcMap = true;
6779 		}
6780 
6781 		/* set flag to delete it later */
6782 		haveBackupLabel = true;
6783 	}
6784 	else
6785 	{
6786 		/*
6787 		 * If tablespace_map file is present without backup_label file, there
6788 		 * is no use of such file.  There is no harm in retaining it, but it
6789 		 * is better to get rid of the map file so that we don't have any
6790 		 * redundant file in data directory and it will avoid any sort of
6791 		 * confusion.  It seems prudent though to just rename the file out of
6792 		 * the way rather than delete it completely, also we ignore any error
6793 		 * that occurs in rename operation as even if map file is present
6794 		 * without backup_label file, it is harmless.
6795 		 */
6796 		if (stat(TABLESPACE_MAP, &st) == 0)
6797 		{
6798 			unlink(TABLESPACE_MAP_OLD);
6799 			if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
6800 				ereport(LOG,
6801 						(errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6802 								TABLESPACE_MAP, BACKUP_LABEL_FILE),
6803 						 errdetail("File \"%s\" was renamed to \"%s\".",
6804 								   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6805 			else
6806 				ereport(LOG,
6807 						(errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6808 								TABLESPACE_MAP, BACKUP_LABEL_FILE),
6809 						 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
6810 								   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6811 		}
6812 
6813 		/*
6814 		 * It's possible that archive recovery was requested, but we don't
6815 		 * know how far we need to replay the WAL before we reach consistency.
6816 		 * This can happen for example if a base backup is taken from a
6817 		 * running server using an atomic filesystem snapshot, without calling
6818 		 * pg_start/stop_backup. Or if you just kill a running primary server
6819 		 * and put it into archive recovery by creating a recovery signal
6820 		 * file.
6821 		 *
6822 		 * Our strategy in that case is to perform crash recovery first,
6823 		 * replaying all the WAL present in pg_wal, and only enter archive
6824 		 * recovery after that.
6825 		 *
6826 		 * But usually we already know how far we need to replay the WAL (up
6827 		 * to minRecoveryPoint, up to backupEndPoint, or until we see an
6828 		 * end-of-backup record), and we can enter archive recovery directly.
6829 		 */
6830 		if (ArchiveRecoveryRequested &&
6831 			(ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6832 			 ControlFile->backupEndRequired ||
6833 			 ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6834 			 ControlFile->state == DB_SHUTDOWNED))
6835 		{
6836 			InArchiveRecovery = true;
6837 			if (StandbyModeRequested)
6838 				StandbyMode = true;
6839 		}
6840 
6841 		/* Get the last valid checkpoint record. */
6842 		checkPointLoc = ControlFile->checkPoint;
6843 		RedoStartLSN = ControlFile->checkPointCopy.redo;
6844 		record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6845 		if (record != NULL)
6846 		{
6847 			ereport(DEBUG1,
6848 					(errmsg_internal("checkpoint record is at %X/%X",
6849 									 LSN_FORMAT_ARGS(checkPointLoc))));
6850 		}
6851 		else
6852 		{
6853 			/*
6854 			 * We used to attempt to go back to a secondary checkpoint record
6855 			 * here, but only when not in standby mode. We now just fail if we
6856 			 * can't read the last checkpoint because this allows us to
6857 			 * simplify processing around checkpoints.
6858 			 */
6859 			ereport(PANIC,
6860 					(errmsg("could not locate a valid checkpoint record")));
6861 		}
6862 		memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6863 		wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6864 	}
6865 
6866 	/*
6867 	 * Clear out any old relcache cache files.  This is *necessary* if we do
6868 	 * any WAL replay, since that would probably result in the cache files
6869 	 * being out of sync with database reality.  In theory we could leave them
6870 	 * in place if the database had been cleanly shut down, but it seems
6871 	 * safest to just remove them always and let them be rebuilt during the
6872 	 * first backend startup.  These files needs to be removed from all
6873 	 * directories including pg_tblspc, however the symlinks are created only
6874 	 * after reading tablespace_map file in case of archive recovery from
6875 	 * backup, so needs to clear old relcache files here after creating
6876 	 * symlinks.
6877 	 */
6878 	RelationCacheInitFileRemove();
6879 
6880 	/*
6881 	 * If the location of the checkpoint record is not on the expected
6882 	 * timeline in the history of the requested timeline, we cannot proceed:
6883 	 * the backup is not part of the history of the requested timeline.
6884 	 */
6885 	Assert(expectedTLEs);		/* was initialized by reading checkpoint
6886 								 * record */
6887 	if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6888 		checkPoint.ThisTimeLineID)
6889 	{
6890 		XLogRecPtr	switchpoint;
6891 
6892 		/*
6893 		 * tliSwitchPoint will throw an error if the checkpoint's timeline is
6894 		 * not in expectedTLEs at all.
6895 		 */
6896 		switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6897 		ereport(FATAL,
6898 				(errmsg("requested timeline %u is not a child of this server's history",
6899 						recoveryTargetTLI),
6900 				 errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6901 						   LSN_FORMAT_ARGS(ControlFile->checkPoint),
6902 						   ControlFile->checkPointCopy.ThisTimeLineID,
6903 						   LSN_FORMAT_ARGS(switchpoint))));
6904 	}
6905 
6906 	/*
6907 	 * The min recovery point should be part of the requested timeline's
6908 	 * history, too.
6909 	 */
6910 	if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6911 		tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6912 		ControlFile->minRecoveryPointTLI)
6913 		ereport(FATAL,
6914 				(errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6915 						recoveryTargetTLI,
6916 						LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
6917 						ControlFile->minRecoveryPointTLI)));
6918 
6919 	LastRec = RecPtr = checkPointLoc;
6920 
6921 	ereport(DEBUG1,
6922 			(errmsg_internal("redo record is at %X/%X; shutdown %s",
6923 							 LSN_FORMAT_ARGS(checkPoint.redo),
6924 							 wasShutdown ? "true" : "false")));
6925 	ereport(DEBUG1,
6926 			(errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
6927 							 U64FromFullTransactionId(checkPoint.nextXid),
6928 							 checkPoint.nextOid)));
6929 	ereport(DEBUG1,
6930 			(errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
6931 							 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6932 	ereport(DEBUG1,
6933 			(errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
6934 							 checkPoint.oldestXid, checkPoint.oldestXidDB)));
6935 	ereport(DEBUG1,
6936 			(errmsg_internal("oldest MultiXactId: %u, in database %u",
6937 							 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6938 	ereport(DEBUG1,
6939 			(errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
6940 							 checkPoint.oldestCommitTsXid,
6941 							 checkPoint.newestCommitTsXid)));
6942 	if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
6943 		ereport(PANIC,
6944 				(errmsg("invalid next transaction ID")));
6945 
6946 	/* initialize shared memory variables from the checkpoint record */
6947 	ShmemVariableCache->nextXid = checkPoint.nextXid;
6948 	ShmemVariableCache->nextOid = checkPoint.nextOid;
6949 	ShmemVariableCache->oidCount = 0;
6950 	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6951 	AdvanceOldestClogXid(checkPoint.oldestXid);
6952 	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6953 	SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
6954 	SetCommitTsLimit(checkPoint.oldestCommitTsXid,
6955 					 checkPoint.newestCommitTsXid);
6956 	XLogCtl->ckptFullXid = checkPoint.nextXid;
6957 
6958 	/*
6959 	 * Initialize replication slots, before there's a chance to remove
6960 	 * required resources.
6961 	 */
6962 	StartupReplicationSlots();
6963 
6964 	/*
6965 	 * Startup logical state, needs to be setup now so we have proper data
6966 	 * during crash recovery.
6967 	 */
6968 	StartupReorderBuffer();
6969 
6970 	/*
6971 	 * Startup CLOG. This must be done after ShmemVariableCache->nextXid has
6972 	 * been initialized and before we accept connections or begin WAL replay.
6973 	 */
6974 	StartupCLOG();
6975 
6976 	/*
6977 	 * Startup MultiXact. We need to do this early to be able to replay
6978 	 * truncations.
6979 	 */
6980 	StartupMultiXact();
6981 
6982 	/*
6983 	 * Ditto for commit timestamps.  Activate the facility if the setting is
6984 	 * enabled in the control file, as there should be no tracking of commit
6985 	 * timestamps done when the setting was disabled.  This facility can be
6986 	 * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
6987 	 */
6988 	if (ControlFile->track_commit_timestamp)
6989 		StartupCommitTs();
6990 
6991 	/*
6992 	 * Recover knowledge about replay progress of known replication partners.
6993 	 */
6994 	StartupReplicationOrigin();
6995 
6996 	/*
6997 	 * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6998 	 * control file. On recovery, all unlogged relations are blown away, so
6999 	 * the unlogged LSN counter can be reset too.
7000 	 */
7001 	if (ControlFile->state == DB_SHUTDOWNED)
7002 		XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
7003 	else
7004 		XLogCtl->unloggedLSN = FirstNormalUnloggedLSN;
7005 
7006 	/*
7007 	 * We must replay WAL entries using the same TimeLineID they were created
7008 	 * under, so temporarily adopt the TLI indicated by the checkpoint (see
7009 	 * also xlog_redo()).
7010 	 */
7011 	ThisTimeLineID = checkPoint.ThisTimeLineID;
7012 
7013 	/*
7014 	 * Copy any missing timeline history files between 'now' and the recovery
7015 	 * target timeline from archive to pg_wal. While we don't need those files
7016 	 * ourselves - the history file of the recovery target timeline covers all
7017 	 * the previous timelines in the history too - a cascading standby server
7018 	 * might be interested in them. Or, if you archive the WAL from this
7019 	 * server to a different archive than the primary, it'd be good for all
7020 	 * the history files to get archived there after failover, so that you can
7021 	 * use one of the old timelines as a PITR target. Timeline history files
7022 	 * are small, so it's better to copy them unnecessarily than not copy them
7023 	 * and regret later.
7024 	 */
7025 	restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
7026 
7027 	/*
7028 	 * Before running in recovery, scan pg_twophase and fill in its status to
7029 	 * be able to work on entries generated by redo.  Doing a scan before
7030 	 * taking any recovery action has the merit to discard any 2PC files that
7031 	 * are newer than the first record to replay, saving from any conflicts at
7032 	 * replay.  This avoids as well any subsequent scans when doing recovery
7033 	 * of the on-disk two-phase data.
7034 	 */
7035 	restoreTwoPhaseData();
7036 
7037 	lastFullPageWrites = checkPoint.fullPageWrites;
7038 
7039 	RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
7040 	doPageWrites = lastFullPageWrites;
7041 
7042 	if (RecPtr < checkPoint.redo)
7043 		ereport(PANIC,
7044 				(errmsg("invalid redo in checkpoint record")));
7045 
7046 	/*
7047 	 * Check whether we need to force recovery from WAL.  If it appears to
7048 	 * have been a clean shutdown and we did not have a recovery signal file,
7049 	 * then assume no recovery needed.
7050 	 */
7051 	if (checkPoint.redo < RecPtr)
7052 	{
7053 		if (wasShutdown)
7054 			ereport(PANIC,
7055 					(errmsg("invalid redo record in shutdown checkpoint")));
7056 		InRecovery = true;
7057 	}
7058 	else if (ControlFile->state != DB_SHUTDOWNED)
7059 		InRecovery = true;
7060 	else if (ArchiveRecoveryRequested)
7061 	{
7062 		/* force recovery due to presence of recovery signal file */
7063 		InRecovery = true;
7064 	}
7065 
7066 	/*
7067 	 * Start recovery assuming that the final record isn't lost.
7068 	 */
7069 	abortedRecPtr = InvalidXLogRecPtr;
7070 	missingContrecPtr = InvalidXLogRecPtr;
7071 
7072 	/* REDO */
7073 	if (InRecovery)
7074 	{
7075 		int			rmid;
7076 
7077 		/*
7078 		 * Update pg_control to show that we are recovering and to show the
7079 		 * selected checkpoint as the place we are starting from. We also mark
7080 		 * pg_control with any minimum recovery stop point obtained from a
7081 		 * backup history file.
7082 		 */
7083 		dbstate_at_startup = ControlFile->state;
7084 		if (InArchiveRecovery)
7085 		{
7086 			ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
7087 
7088 			SpinLockAcquire(&XLogCtl->info_lck);
7089 			XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
7090 			SpinLockRelease(&XLogCtl->info_lck);
7091 		}
7092 		else
7093 		{
7094 			ereport(LOG,
7095 					(errmsg("database system was not properly shut down; "
7096 							"automatic recovery in progress")));
7097 			if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
7098 				ereport(LOG,
7099 						(errmsg("crash recovery starts in timeline %u "
7100 								"and has target timeline %u",
7101 								ControlFile->checkPointCopy.ThisTimeLineID,
7102 								recoveryTargetTLI)));
7103 			ControlFile->state = DB_IN_CRASH_RECOVERY;
7104 
7105 			SpinLockAcquire(&XLogCtl->info_lck);
7106 			XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
7107 			SpinLockRelease(&XLogCtl->info_lck);
7108 		}
7109 		ControlFile->checkPoint = checkPointLoc;
7110 		ControlFile->checkPointCopy = checkPoint;
7111 		if (InArchiveRecovery)
7112 		{
7113 			/* initialize minRecoveryPoint if not set yet */
7114 			if (ControlFile->minRecoveryPoint < checkPoint.redo)
7115 			{
7116 				ControlFile->minRecoveryPoint = checkPoint.redo;
7117 				ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
7118 			}
7119 		}
7120 
7121 		/*
7122 		 * Set backupStartPoint if we're starting recovery from a base backup.
7123 		 *
7124 		 * Also set backupEndPoint and use minRecoveryPoint as the backup end
7125 		 * location if we're starting recovery from a base backup which was
7126 		 * taken from a standby. In this case, the database system status in
7127 		 * pg_control must indicate that the database was already in recovery.
7128 		 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
7129 		 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
7130 		 * before reaching this point; e.g. because restore_command or
7131 		 * primary_conninfo were faulty.
7132 		 *
7133 		 * Any other state indicates that the backup somehow became corrupted
7134 		 * and we can't sensibly continue with recovery.
7135 		 */
7136 		if (haveBackupLabel)
7137 		{
7138 			ControlFile->backupStartPoint = checkPoint.redo;
7139 			ControlFile->backupEndRequired = backupEndRequired;
7140 
7141 			if (backupFromStandby)
7142 			{
7143 				if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
7144 					dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
7145 					ereport(FATAL,
7146 							(errmsg("backup_label contains data inconsistent with control file"),
7147 							 errhint("This means that the backup is corrupted and you will "
7148 									 "have to use another backup for recovery.")));
7149 				ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
7150 			}
7151 		}
7152 		ControlFile->time = (pg_time_t) time(NULL);
7153 		/* No need to hold ControlFileLock yet, we aren't up far enough */
7154 		UpdateControlFile();
7155 
7156 		/*
7157 		 * Initialize our local copy of minRecoveryPoint.  When doing crash
7158 		 * recovery we want to replay up to the end of WAL.  Particularly, in
7159 		 * the case of a promoted standby minRecoveryPoint value in the
7160 		 * control file is only updated after the first checkpoint.  However,
7161 		 * if the instance crashes before the first post-recovery checkpoint
7162 		 * is completed then recovery will use a stale location causing the
7163 		 * startup process to think that there are still invalid page
7164 		 * references when checking for data consistency.
7165 		 */
7166 		if (InArchiveRecovery)
7167 		{
7168 			minRecoveryPoint = ControlFile->minRecoveryPoint;
7169 			minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
7170 		}
7171 		else
7172 		{
7173 			minRecoveryPoint = InvalidXLogRecPtr;
7174 			minRecoveryPointTLI = 0;
7175 		}
7176 
7177 		/*
7178 		 * Reset pgstat data, because it may be invalid after recovery.
7179 		 */
7180 		pgstat_reset_all();
7181 
7182 		/*
7183 		 * If there was a backup label file, it's done its job and the info
7184 		 * has now been propagated into pg_control.  We must get rid of the
7185 		 * label file so that if we crash during recovery, we'll pick up at
7186 		 * the latest recovery restartpoint instead of going all the way back
7187 		 * to the backup start point.  It seems prudent though to just rename
7188 		 * the file out of the way rather than delete it completely.
7189 		 */
7190 		if (haveBackupLabel)
7191 		{
7192 			unlink(BACKUP_LABEL_OLD);
7193 			durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
7194 		}
7195 
7196 		/*
7197 		 * If there was a tablespace_map file, it's done its job and the
7198 		 * symlinks have been created.  We must get rid of the map file so
7199 		 * that if we crash during recovery, we don't create symlinks again.
7200 		 * It seems prudent though to just rename the file out of the way
7201 		 * rather than delete it completely.
7202 		 */
7203 		if (haveTblspcMap)
7204 		{
7205 			unlink(TABLESPACE_MAP_OLD);
7206 			durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
7207 		}
7208 
7209 		/* Check that the GUCs used to generate the WAL allow recovery */
7210 		CheckRequiredParameterValues();
7211 
7212 		/*
7213 		 * We're in recovery, so unlogged relations may be trashed and must be
7214 		 * reset.  This should be done BEFORE allowing Hot Standby
7215 		 * connections, so that read-only backends don't try to read whatever
7216 		 * garbage is left over from before.
7217 		 */
7218 		ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
7219 
7220 		/*
7221 		 * Likewise, delete any saved transaction snapshot files that got left
7222 		 * behind by crashed backends.
7223 		 */
7224 		DeleteAllExportedSnapshotFiles();
7225 
7226 		/*
7227 		 * Initialize for Hot Standby, if enabled. We won't let backends in
7228 		 * yet, not until we've reached the min recovery point specified in
7229 		 * control file and we've established a recovery snapshot from a
7230 		 * running-xacts WAL record.
7231 		 */
7232 		if (ArchiveRecoveryRequested && EnableHotStandby)
7233 		{
7234 			TransactionId *xids;
7235 			int			nxids;
7236 
7237 			ereport(DEBUG1,
7238 					(errmsg_internal("initializing for hot standby")));
7239 
7240 			InitRecoveryTransactionEnvironment();
7241 
7242 			if (wasShutdown)
7243 				oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
7244 			else
7245 				oldestActiveXID = checkPoint.oldestActiveXid;
7246 			Assert(TransactionIdIsValid(oldestActiveXID));
7247 
7248 			/* Tell procarray about the range of xids it has to deal with */
7249 			ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextXid));
7250 
7251 			/*
7252 			 * Startup subtrans only.  CLOG, MultiXact and commit timestamp
7253 			 * have already been started up and other SLRUs are not maintained
7254 			 * during recovery and need not be started yet.
7255 			 */
7256 			StartupSUBTRANS(oldestActiveXID);
7257 
7258 			/*
7259 			 * If we're beginning at a shutdown checkpoint, we know that
7260 			 * nothing was running on the primary at this point. So fake-up an
7261 			 * empty running-xacts record and use that here and now. Recover
7262 			 * additional standby state for prepared transactions.
7263 			 */
7264 			if (wasShutdown)
7265 			{
7266 				RunningTransactionsData running;
7267 				TransactionId latestCompletedXid;
7268 
7269 				/*
7270 				 * Construct a RunningTransactions snapshot representing a
7271 				 * shut down server, with only prepared transactions still
7272 				 * alive. We're never overflowed at this point because all
7273 				 * subxids are listed with their parent prepared transactions.
7274 				 */
7275 				running.xcnt = nxids;
7276 				running.subxcnt = 0;
7277 				running.subxid_overflow = false;
7278 				running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
7279 				running.oldestRunningXid = oldestActiveXID;
7280 				latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
7281 				TransactionIdRetreat(latestCompletedXid);
7282 				Assert(TransactionIdIsNormal(latestCompletedXid));
7283 				running.latestCompletedXid = latestCompletedXid;
7284 				running.xids = xids;
7285 
7286 				ProcArrayApplyRecoveryInfo(&running);
7287 
7288 				StandbyRecoverPreparedTransactions();
7289 			}
7290 		}
7291 
7292 		/* Initialize resource managers */
7293 		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7294 		{
7295 			if (RmgrTable[rmid].rm_startup != NULL)
7296 				RmgrTable[rmid].rm_startup();
7297 		}
7298 
7299 		/*
7300 		 * Initialize shared variables for tracking progress of WAL replay, as
7301 		 * if we had just replayed the record before the REDO location (or the
7302 		 * checkpoint record itself, if it's a shutdown checkpoint).
7303 		 */
7304 		SpinLockAcquire(&XLogCtl->info_lck);
7305 		if (checkPoint.redo < RecPtr)
7306 			XLogCtl->replayEndRecPtr = checkPoint.redo;
7307 		else
7308 			XLogCtl->replayEndRecPtr = EndRecPtr;
7309 		XLogCtl->replayEndTLI = ThisTimeLineID;
7310 		XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
7311 		XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
7312 		XLogCtl->recoveryLastXTime = 0;
7313 		XLogCtl->currentChunkStartTime = 0;
7314 		XLogCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
7315 		SpinLockRelease(&XLogCtl->info_lck);
7316 
7317 		/* Also ensure XLogReceiptTime has a sane value */
7318 		XLogReceiptTime = GetCurrentTimestamp();
7319 
7320 		/*
7321 		 * Let postmaster know we've started redo now, so that it can launch
7322 		 * checkpointer to perform restartpoints.  We don't bother during
7323 		 * crash recovery as restartpoints can only be performed during
7324 		 * archive recovery.  And we'd like to keep crash recovery simple, to
7325 		 * avoid introducing bugs that could affect you when recovering after
7326 		 * crash.
7327 		 *
7328 		 * After this point, we can no longer assume that we're the only
7329 		 * process in addition to postmaster!  Also, fsync requests are
7330 		 * subsequently to be handled by the checkpointer, not locally.
7331 		 */
7332 		if (ArchiveRecoveryRequested && IsUnderPostmaster)
7333 		{
7334 			PublishStartupProcessInformation();
7335 			EnableSyncRequestForwarding();
7336 			SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
7337 			bgwriterLaunched = true;
7338 		}
7339 
7340 		/*
7341 		 * Allow read-only connections immediately if we're consistent
7342 		 * already.
7343 		 */
7344 		CheckRecoveryConsistency();
7345 
7346 		/*
7347 		 * Find the first record that logically follows the checkpoint --- it
7348 		 * might physically precede it, though.
7349 		 */
7350 		if (checkPoint.redo < RecPtr)
7351 		{
7352 			/* back up to find the record */
7353 			XLogBeginRead(xlogreader, checkPoint.redo);
7354 			record = ReadRecord(xlogreader, PANIC, false);
7355 		}
7356 		else
7357 		{
7358 			/* just have to read next record after CheckPoint */
7359 			record = ReadRecord(xlogreader, LOG, false);
7360 		}
7361 
7362 		if (record != NULL)
7363 		{
7364 			ErrorContextCallback errcallback;
7365 			TimestampTz xtime;
7366 			PGRUsage	ru0;
7367 
7368 			pg_rusage_init(&ru0);
7369 
7370 			InRedo = true;
7371 
7372 			ereport(LOG,
7373 					(errmsg("redo starts at %X/%X",
7374 							LSN_FORMAT_ARGS(ReadRecPtr))));
7375 
7376 			/*
7377 			 * main redo apply loop
7378 			 */
7379 			do
7380 			{
7381 				bool		switchedTLI = false;
7382 
7383 #ifdef WAL_DEBUG
7384 				if (XLOG_DEBUG ||
7385 					(rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
7386 					(rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
7387 				{
7388 					StringInfoData buf;
7389 
7390 					initStringInfo(&buf);
7391 					appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
7392 									 LSN_FORMAT_ARGS(ReadRecPtr),
7393 									 LSN_FORMAT_ARGS(EndRecPtr));
7394 					xlog_outrec(&buf, xlogreader);
7395 					appendStringInfoString(&buf, " - ");
7396 					xlog_outdesc(&buf, xlogreader);
7397 					elog(LOG, "%s", buf.data);
7398 					pfree(buf.data);
7399 				}
7400 #endif
7401 
7402 				/* Handle interrupt signals of startup process */
7403 				HandleStartupProcInterrupts();
7404 
7405 				/*
7406 				 * Pause WAL replay, if requested by a hot-standby session via
7407 				 * SetRecoveryPause().
7408 				 *
7409 				 * Note that we intentionally don't take the info_lck spinlock
7410 				 * here.  We might therefore read a slightly stale value of
7411 				 * the recoveryPause flag, but it can't be very stale (no
7412 				 * worse than the last spinlock we did acquire).  Since a
7413 				 * pause request is a pretty asynchronous thing anyway,
7414 				 * possibly responding to it one WAL record later than we
7415 				 * otherwise would is a minor issue, so it doesn't seem worth
7416 				 * adding another spinlock cycle to prevent that.
7417 				 */
7418 				if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState !=
7419 					RECOVERY_NOT_PAUSED)
7420 					recoveryPausesHere(false);
7421 
7422 				/*
7423 				 * Have we reached our recovery target?
7424 				 */
7425 				if (recoveryStopsBefore(xlogreader))
7426 				{
7427 					reachedRecoveryTarget = true;
7428 					break;
7429 				}
7430 
7431 				/*
7432 				 * If we've been asked to lag the primary, wait on latch until
7433 				 * enough time has passed.
7434 				 */
7435 				if (recoveryApplyDelay(xlogreader))
7436 				{
7437 					/*
7438 					 * We test for paused recovery again here. If user sets
7439 					 * delayed apply, it may be because they expect to pause
7440 					 * recovery in case of problems, so we must test again
7441 					 * here otherwise pausing during the delay-wait wouldn't
7442 					 * work.
7443 					 */
7444 					if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState !=
7445 						RECOVERY_NOT_PAUSED)
7446 						recoveryPausesHere(false);
7447 				}
7448 
7449 				/* Setup error traceback support for ereport() */
7450 				errcallback.callback = rm_redo_error_callback;
7451 				errcallback.arg = (void *) xlogreader;
7452 				errcallback.previous = error_context_stack;
7453 				error_context_stack = &errcallback;
7454 
7455 				/*
7456 				 * ShmemVariableCache->nextXid must be beyond record's xid.
7457 				 */
7458 				AdvanceNextFullTransactionIdPastXid(record->xl_xid);
7459 
7460 				/*
7461 				 * Before replaying this record, check if this record causes
7462 				 * the current timeline to change. The record is already
7463 				 * considered to be part of the new timeline, so we update
7464 				 * ThisTimeLineID before replaying it. That's important so
7465 				 * that replayEndTLI, which is recorded as the minimum
7466 				 * recovery point's TLI if recovery stops after this record,
7467 				 * is set correctly.
7468 				 */
7469 				if (record->xl_rmid == RM_XLOG_ID)
7470 				{
7471 					TimeLineID	newTLI = ThisTimeLineID;
7472 					TimeLineID	prevTLI = ThisTimeLineID;
7473 					uint8		info = record->xl_info & ~XLR_INFO_MASK;
7474 
7475 					if (info == XLOG_CHECKPOINT_SHUTDOWN)
7476 					{
7477 						CheckPoint	checkPoint;
7478 
7479 						memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
7480 						newTLI = checkPoint.ThisTimeLineID;
7481 						prevTLI = checkPoint.PrevTimeLineID;
7482 					}
7483 					else if (info == XLOG_END_OF_RECOVERY)
7484 					{
7485 						xl_end_of_recovery xlrec;
7486 
7487 						memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
7488 						newTLI = xlrec.ThisTimeLineID;
7489 						prevTLI = xlrec.PrevTimeLineID;
7490 					}
7491 
7492 					if (newTLI != ThisTimeLineID)
7493 					{
7494 						/* Check that it's OK to switch to this TLI */
7495 						checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
7496 
7497 						/* Following WAL records should be run with new TLI */
7498 						ThisTimeLineID = newTLI;
7499 						switchedTLI = true;
7500 					}
7501 				}
7502 
7503 				/*
7504 				 * Update shared replayEndRecPtr before replaying this record,
7505 				 * so that XLogFlush will update minRecoveryPoint correctly.
7506 				 */
7507 				SpinLockAcquire(&XLogCtl->info_lck);
7508 				XLogCtl->replayEndRecPtr = EndRecPtr;
7509 				XLogCtl->replayEndTLI = ThisTimeLineID;
7510 				SpinLockRelease(&XLogCtl->info_lck);
7511 
7512 				/*
7513 				 * If we are attempting to enter Hot Standby mode, process
7514 				 * XIDs we see
7515 				 */
7516 				if (standbyState >= STANDBY_INITIALIZED &&
7517 					TransactionIdIsValid(record->xl_xid))
7518 					RecordKnownAssignedTransactionIds(record->xl_xid);
7519 
7520 				/* Now apply the WAL record itself */
7521 				RmgrTable[record->xl_rmid].rm_redo(xlogreader);
7522 
7523 				/*
7524 				 * After redo, check whether the backup pages associated with
7525 				 * the WAL record are consistent with the existing pages. This
7526 				 * check is done only if consistency check is enabled for this
7527 				 * record.
7528 				 */
7529 				if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
7530 					checkXLogConsistency(xlogreader);
7531 
7532 				/* Pop the error context stack */
7533 				error_context_stack = errcallback.previous;
7534 
7535 				/*
7536 				 * Update lastReplayedEndRecPtr after this record has been
7537 				 * successfully replayed.
7538 				 */
7539 				SpinLockAcquire(&XLogCtl->info_lck);
7540 				XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
7541 				XLogCtl->lastReplayedTLI = ThisTimeLineID;
7542 				SpinLockRelease(&XLogCtl->info_lck);
7543 
7544 				/*
7545 				 * If rm_redo called XLogRequestWalReceiverReply, then we wake
7546 				 * up the receiver so that it notices the updated
7547 				 * lastReplayedEndRecPtr and sends a reply to the primary.
7548 				 */
7549 				if (doRequestWalReceiverReply)
7550 				{
7551 					doRequestWalReceiverReply = false;
7552 					WalRcvForceReply();
7553 				}
7554 
7555 				/* Remember this record as the last-applied one */
7556 				LastRec = ReadRecPtr;
7557 
7558 				/* Allow read-only connections if we're consistent now */
7559 				CheckRecoveryConsistency();
7560 
7561 				/* Is this a timeline switch? */
7562 				if (switchedTLI)
7563 				{
7564 					/*
7565 					 * Before we continue on the new timeline, clean up any
7566 					 * (possibly bogus) future WAL segments on the old
7567 					 * timeline.
7568 					 */
7569 					RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
7570 
7571 					/*
7572 					 * Wake up any walsenders to notice that we are on a new
7573 					 * timeline.
7574 					 */
7575 					if (AllowCascadeReplication())
7576 						WalSndWakeup();
7577 				}
7578 
7579 				/* Exit loop if we reached inclusive recovery target */
7580 				if (recoveryStopsAfter(xlogreader))
7581 				{
7582 					reachedRecoveryTarget = true;
7583 					break;
7584 				}
7585 
7586 				/* Else, try to fetch the next WAL record */
7587 				record = ReadRecord(xlogreader, LOG, false);
7588 			} while (record != NULL);
7589 
7590 			/*
7591 			 * end of main redo apply loop
7592 			 */
7593 
7594 			if (reachedRecoveryTarget)
7595 			{
7596 				if (!reachedConsistency)
7597 					ereport(FATAL,
7598 							(errmsg("requested recovery stop point is before consistent recovery point")));
7599 
7600 				/*
7601 				 * This is the last point where we can restart recovery with a
7602 				 * new recovery target, if we shutdown and begin again. After
7603 				 * this, Resource Managers may choose to do permanent
7604 				 * corrective actions at end of recovery.
7605 				 */
7606 				switch (recoveryTargetAction)
7607 				{
7608 					case RECOVERY_TARGET_ACTION_SHUTDOWN:
7609 
7610 						/*
7611 						 * exit with special return code to request shutdown
7612 						 * of postmaster.  Log messages issued from
7613 						 * postmaster.
7614 						 */
7615 						proc_exit(3);
7616 
7617 					case RECOVERY_TARGET_ACTION_PAUSE:
7618 						SetRecoveryPause(true);
7619 						recoveryPausesHere(true);
7620 
7621 						/* drop into promote */
7622 
7623 					case RECOVERY_TARGET_ACTION_PROMOTE:
7624 						break;
7625 				}
7626 			}
7627 
7628 			/* Allow resource managers to do any required cleanup. */
7629 			for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7630 			{
7631 				if (RmgrTable[rmid].rm_cleanup != NULL)
7632 					RmgrTable[rmid].rm_cleanup();
7633 			}
7634 
7635 			ereport(LOG,
7636 					(errmsg("redo done at %X/%X system usage: %s",
7637 							LSN_FORMAT_ARGS(ReadRecPtr),
7638 							pg_rusage_show(&ru0))));
7639 			xtime = GetLatestXTime();
7640 			if (xtime)
7641 				ereport(LOG,
7642 						(errmsg("last completed transaction was at log time %s",
7643 								timestamptz_to_str(xtime))));
7644 
7645 			InRedo = false;
7646 		}
7647 		else
7648 		{
7649 			/* there are no WAL records following the checkpoint */
7650 			ereport(LOG,
7651 					(errmsg("redo is not required")));
7652 
7653 		}
7654 
7655 		/*
7656 		 * This check is intentionally after the above log messages that
7657 		 * indicate how far recovery went.
7658 		 */
7659 		if (ArchiveRecoveryRequested &&
7660 			recoveryTarget != RECOVERY_TARGET_UNSET &&
7661 			!reachedRecoveryTarget)
7662 			ereport(FATAL,
7663 					(errmsg("recovery ended before configured recovery target was reached")));
7664 	}
7665 
7666 	/*
7667 	 * Kill WAL receiver, if it's still running, before we continue to write
7668 	 * the startup checkpoint and aborted-contrecord records. It will trump
7669 	 * over these records and subsequent ones if it's still alive when we
7670 	 * start writing WAL.
7671 	 */
7672 	ShutdownWalRcv();
7673 
7674 	/*
7675 	 * Reset unlogged relations to the contents of their INIT fork. This is
7676 	 * done AFTER recovery is complete so as to include any unlogged relations
7677 	 * created during recovery, but BEFORE recovery is marked as having
7678 	 * completed successfully. Otherwise we'd not retry if any of the post
7679 	 * end-of-recovery steps fail.
7680 	 */
7681 	if (InRecovery)
7682 		ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7683 
7684 	/*
7685 	 * We don't need the latch anymore. It's not strictly necessary to disown
7686 	 * it, but let's do it for the sake of tidiness.
7687 	 */
7688 	if (ArchiveRecoveryRequested)
7689 		DisownLatch(&XLogCtl->recoveryWakeupLatch);
7690 
7691 	/*
7692 	 * We are now done reading the xlog from stream. Turn off streaming
7693 	 * recovery to force fetching the files (which would be required at end of
7694 	 * recovery, e.g., timeline history file) from archive or pg_wal.
7695 	 *
7696 	 * Note that standby mode must be turned off after killing WAL receiver,
7697 	 * i.e., calling ShutdownWalRcv().
7698 	 */
7699 	Assert(!WalRcvStreaming());
7700 	StandbyMode = false;
7701 
7702 	/*
7703 	 * Determine where to start writing WAL next.
7704 	 *
7705 	 * When recovery ended in an incomplete record, write a WAL record about
7706 	 * that and continue after it.  In all other cases, re-fetch the last
7707 	 * valid or last applied record, so we can identify the exact endpoint of
7708 	 * what we consider the valid portion of WAL.
7709 	 */
7710 	XLogBeginRead(xlogreader, LastRec);
7711 	record = ReadRecord(xlogreader, PANIC, false);
7712 	EndOfLog = EndRecPtr;
7713 
7714 	/*
7715 	 * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
7716 	 * the end-of-log. It could be different from the timeline that EndOfLog
7717 	 * nominally belongs to, if there was a timeline switch in that segment,
7718 	 * and we were reading the old WAL from a segment belonging to a higher
7719 	 * timeline.
7720 	 */
7721 	EndOfLogTLI = xlogreader->seg.ws_tli;
7722 
7723 	/*
7724 	 * Complain if we did not roll forward far enough to render the backup
7725 	 * dump consistent.  Note: it is indeed okay to look at the local variable
7726 	 * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7727 	 * be further ahead --- ControlFile->minRecoveryPoint cannot have been
7728 	 * advanced beyond the WAL we processed.
7729 	 */
7730 	if (InRecovery &&
7731 		(EndOfLog < minRecoveryPoint ||
7732 		 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7733 	{
7734 		/*
7735 		 * Ran off end of WAL before reaching end-of-backup WAL record, or
7736 		 * minRecoveryPoint. That's usually a bad sign, indicating that you
7737 		 * tried to recover from an online backup but never called
7738 		 * pg_stop_backup(), or you didn't archive all the WAL up to that
7739 		 * point. However, this also happens in crash recovery, if the system
7740 		 * crashes while an online backup is in progress. We must not treat
7741 		 * that as an error, or the database will refuse to start up.
7742 		 */
7743 		if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
7744 		{
7745 			if (ControlFile->backupEndRequired)
7746 				ereport(FATAL,
7747 						(errmsg("WAL ends before end of online backup"),
7748 						 errhint("All WAL generated while online backup was taken must be available at recovery.")));
7749 			else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7750 				ereport(FATAL,
7751 						(errmsg("WAL ends before end of online backup"),
7752 						 errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7753 			else
7754 				ereport(FATAL,
7755 						(errmsg("WAL ends before consistent recovery point")));
7756 		}
7757 	}
7758 
7759 	/*
7760 	 * Pre-scan prepared transactions to find out the range of XIDs present.
7761 	 * This information is not quite needed yet, but it is positioned here so
7762 	 * as potential problems are detected before any on-disk change is done.
7763 	 */
7764 	oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7765 
7766 	/*
7767 	 * Consider whether we need to assign a new timeline ID.
7768 	 *
7769 	 * If we are doing an archive recovery, we always assign a new ID.  This
7770 	 * handles a couple of issues.  If we stopped short of the end of WAL
7771 	 * during recovery, then we are clearly generating a new timeline and must
7772 	 * assign it a unique new ID.  Even if we ran to the end, modifying the
7773 	 * current last segment is problematic because it may result in trying to
7774 	 * overwrite an already-archived copy of that segment, and we encourage
7775 	 * DBAs to make their archive_commands reject that.  We can dodge the
7776 	 * problem by making the new active segment have a new timeline ID.
7777 	 *
7778 	 * In a normal crash recovery, we can just extend the timeline we were in.
7779 	 */
7780 	PrevTimeLineID = ThisTimeLineID;
7781 	if (ArchiveRecoveryRequested)
7782 	{
7783 		char		reason[200];
7784 		char		recoveryPath[MAXPGPATH];
7785 
7786 		Assert(InArchiveRecovery);
7787 
7788 		ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
7789 		ereport(LOG,
7790 				(errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7791 
7792 		/*
7793 		 * Create a comment for the history file to explain why and where
7794 		 * timeline changed.
7795 		 */
7796 		if (recoveryTarget == RECOVERY_TARGET_XID)
7797 			snprintf(reason, sizeof(reason),
7798 					 "%s transaction %u",
7799 					 recoveryStopAfter ? "after" : "before",
7800 					 recoveryStopXid);
7801 		else if (recoveryTarget == RECOVERY_TARGET_TIME)
7802 			snprintf(reason, sizeof(reason),
7803 					 "%s %s\n",
7804 					 recoveryStopAfter ? "after" : "before",
7805 					 timestamptz_to_str(recoveryStopTime));
7806 		else if (recoveryTarget == RECOVERY_TARGET_LSN)
7807 			snprintf(reason, sizeof(reason),
7808 					 "%s LSN %X/%X\n",
7809 					 recoveryStopAfter ? "after" : "before",
7810 					 LSN_FORMAT_ARGS(recoveryStopLSN));
7811 		else if (recoveryTarget == RECOVERY_TARGET_NAME)
7812 			snprintf(reason, sizeof(reason),
7813 					 "at restore point \"%s\"",
7814 					 recoveryStopName);
7815 		else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
7816 			snprintf(reason, sizeof(reason), "reached consistency");
7817 		else
7818 			snprintf(reason, sizeof(reason), "no recovery target specified");
7819 
7820 		/*
7821 		 * We are now done reading the old WAL.  Turn off archive fetching if
7822 		 * it was active, and make a writable copy of the last WAL segment.
7823 		 * (Note that we also have a copy of the last block of the old WAL in
7824 		 * readBuf; we will use that below.)
7825 		 */
7826 		exitArchiveRecovery(EndOfLogTLI, EndOfLog);
7827 
7828 		/*
7829 		 * Write the timeline history file, and have it archived. After this
7830 		 * point (or rather, as soon as the file is archived), the timeline
7831 		 * will appear as "taken" in the WAL archive and to any standby
7832 		 * servers.  If we crash before actually switching to the new
7833 		 * timeline, standby servers will nevertheless think that we switched
7834 		 * to the new timeline, and will try to connect to the new timeline.
7835 		 * To minimize the window for that, try to do as little as possible
7836 		 * between here and writing the end-of-recovery record.
7837 		 */
7838 		writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7839 							 EndRecPtr, reason);
7840 
7841 		/*
7842 		 * Since there might be a partial WAL segment named RECOVERYXLOG, get
7843 		 * rid of it.
7844 		 */
7845 		snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
7846 		unlink(recoveryPath);	/* ignore any error */
7847 
7848 		/* Get rid of any remaining recovered timeline-history file, too */
7849 		snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
7850 		unlink(recoveryPath);	/* ignore any error */
7851 	}
7852 
7853 	/* Save the selected TimeLineID in shared memory, too */
7854 	XLogCtl->ThisTimeLineID = ThisTimeLineID;
7855 	XLogCtl->PrevTimeLineID = PrevTimeLineID;
7856 
7857 	/*
7858 	 * Actually, if WAL ended in an incomplete record, skip the parts that
7859 	 * made it through and start writing after the portion that persisted.
7860 	 * (It's critical to first write an OVERWRITE_CONTRECORD message, which
7861 	 * we'll do as soon as we're open for writing new WAL.)
7862 	 */
7863 	if (!XLogRecPtrIsInvalid(missingContrecPtr))
7864 	{
7865 		Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
7866 		EndOfLog = missingContrecPtr;
7867 	}
7868 
7869 	/*
7870 	 * Prepare to write WAL starting at EndOfLog location, and init xlog
7871 	 * buffer cache using the block containing the last record from the
7872 	 * previous incarnation.
7873 	 */
7874 	Insert = &XLogCtl->Insert;
7875 	Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7876 	Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7877 
7878 	/*
7879 	 * Tricky point here: readBuf contains the *last* block that the LastRec
7880 	 * record spans, not the one it starts in.  The last block is indeed the
7881 	 * one we want to use.
7882 	 */
7883 	if (EndOfLog % XLOG_BLCKSZ != 0)
7884 	{
7885 		char	   *page;
7886 		int			len;
7887 		int			firstIdx;
7888 		XLogRecPtr	pageBeginPtr;
7889 
7890 		pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7891 		Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
7892 
7893 		firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7894 
7895 		/* Copy the valid part of the last block, and zero the rest */
7896 		page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7897 		len = EndOfLog % XLOG_BLCKSZ;
7898 		memcpy(page, xlogreader->readBuf, len);
7899 		memset(page + len, 0, XLOG_BLCKSZ - len);
7900 
7901 		XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7902 		XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7903 	}
7904 	else
7905 	{
7906 		/*
7907 		 * There is no partial block to copy. Just set InitializedUpTo, and
7908 		 * let the first attempt to insert a log record to initialize the next
7909 		 * buffer.
7910 		 */
7911 		XLogCtl->InitializedUpTo = EndOfLog;
7912 	}
7913 
7914 	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7915 
7916 	XLogCtl->LogwrtResult = LogwrtResult;
7917 
7918 	XLogCtl->LogwrtRqst.Write = EndOfLog;
7919 	XLogCtl->LogwrtRqst.Flush = EndOfLog;
7920 
7921 	LocalSetXLogInsertAllowed();
7922 
7923 	/* If necessary, write overwrite-contrecord before doing anything else */
7924 	if (!XLogRecPtrIsInvalid(abortedRecPtr))
7925 	{
7926 		Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
7927 		CreateOverwriteContrecordRecord(abortedRecPtr);
7928 		abortedRecPtr = InvalidXLogRecPtr;
7929 		missingContrecPtr = InvalidXLogRecPtr;
7930 	}
7931 
7932 	/*
7933 	 * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7934 	 * record before resource manager writes cleanup WAL records or checkpoint
7935 	 * record is written.
7936 	 */
7937 	Insert->fullPageWrites = lastFullPageWrites;
7938 	UpdateFullPageWrites();
7939 	LocalXLogInsertAllowed = -1;
7940 
7941 	if (InRecovery)
7942 	{
7943 		/*
7944 		 * Perform a checkpoint to update all our recovery activity to disk.
7945 		 *
7946 		 * Note that we write a shutdown checkpoint rather than an on-line
7947 		 * one. This is not particularly critical, but since we may be
7948 		 * assigning a new TLI, using a shutdown checkpoint allows us to have
7949 		 * the rule that TLI only changes in shutdown checkpoints, which
7950 		 * allows some extra error checking in xlog_redo.
7951 		 *
7952 		 * In promotion, only create a lightweight end-of-recovery record
7953 		 * instead of a full checkpoint. A checkpoint is requested later,
7954 		 * after we're fully out of recovery mode and already accepting
7955 		 * queries.
7956 		 */
7957 		if (bgwriterLaunched)
7958 		{
7959 			if (LocalPromoteIsTriggered)
7960 			{
7961 				checkPointLoc = ControlFile->checkPoint;
7962 
7963 				/*
7964 				 * Confirm the last checkpoint is available for us to recover
7965 				 * from if we fail.
7966 				 */
7967 				record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7968 				if (record != NULL)
7969 				{
7970 					promoted = true;
7971 
7972 					/*
7973 					 * Insert a special WAL record to mark the end of
7974 					 * recovery, since we aren't doing a checkpoint. That
7975 					 * means that the checkpointer process may likely be in
7976 					 * the middle of a time-smoothed restartpoint and could
7977 					 * continue to be for minutes after this. That sounds
7978 					 * strange, but the effect is roughly the same and it
7979 					 * would be stranger to try to come out of the
7980 					 * restartpoint and then checkpoint. We request a
7981 					 * checkpoint later anyway, just for safety.
7982 					 */
7983 					CreateEndOfRecoveryRecord();
7984 				}
7985 			}
7986 
7987 			if (!promoted)
7988 				RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7989 								  CHECKPOINT_IMMEDIATE |
7990 								  CHECKPOINT_WAIT);
7991 		}
7992 		else
7993 			CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7994 	}
7995 
7996 	if (ArchiveRecoveryRequested)
7997 	{
7998 		/*
7999 		 * And finally, execute the recovery_end_command, if any.
8000 		 */
8001 		if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
8002 			ExecuteRecoveryCommand(recoveryEndCommand,
8003 								   "recovery_end_command",
8004 								   true);
8005 
8006 		/*
8007 		 * We switched to a new timeline. Clean up segments on the old
8008 		 * timeline.
8009 		 *
8010 		 * If there are any higher-numbered segments on the old timeline,
8011 		 * remove them. They might contain valid WAL, but they might also be
8012 		 * pre-allocated files containing garbage. In any case, they are not
8013 		 * part of the new timeline's history so we don't need them.
8014 		 */
8015 		RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
8016 
8017 		/*
8018 		 * If the switch happened in the middle of a segment, what to do with
8019 		 * the last, partial segment on the old timeline? If we don't archive
8020 		 * it, and the server that created the WAL never archives it either
8021 		 * (e.g. because it was hit by a meteor), it will never make it to the
8022 		 * archive. That's OK from our point of view, because the new segment
8023 		 * that we created with the new TLI contains all the WAL from the old
8024 		 * timeline up to the switch point. But if you later try to do PITR to
8025 		 * the "missing" WAL on the old timeline, recovery won't find it in
8026 		 * the archive. It's physically present in the new file with new TLI,
8027 		 * but recovery won't look there when it's recovering to the older
8028 		 * timeline. On the other hand, if we archive the partial segment, and
8029 		 * the original server on that timeline is still running and archives
8030 		 * the completed version of the same segment later, it will fail. (We
8031 		 * used to do that in 9.4 and below, and it caused such problems).
8032 		 *
8033 		 * As a compromise, we rename the last segment with the .partial
8034 		 * suffix, and archive it. Archive recovery will never try to read
8035 		 * .partial segments, so they will normally go unused. But in the odd
8036 		 * PITR case, the administrator can copy them manually to the pg_wal
8037 		 * directory (removing the suffix). They can be useful in debugging,
8038 		 * too.
8039 		 *
8040 		 * If a .done or .ready file already exists for the old timeline,
8041 		 * however, we had already determined that the segment is complete, so
8042 		 * we can let it be archived normally. (In particular, if it was
8043 		 * restored from the archive to begin with, it's expected to have a
8044 		 * .done file).
8045 		 */
8046 		if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
8047 			XLogArchivingActive())
8048 		{
8049 			char		origfname[MAXFNAMELEN];
8050 			XLogSegNo	endLogSegNo;
8051 
8052 			XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
8053 			XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
8054 
8055 			if (!XLogArchiveIsReadyOrDone(origfname))
8056 			{
8057 				char		origpath[MAXPGPATH];
8058 				char		partialfname[MAXFNAMELEN];
8059 				char		partialpath[MAXPGPATH];
8060 
8061 				XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
8062 				snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
8063 				snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
8064 
8065 				/*
8066 				 * Make sure there's no .done or .ready file for the .partial
8067 				 * file.
8068 				 */
8069 				XLogArchiveCleanup(partialfname);
8070 
8071 				durable_rename(origpath, partialpath, ERROR);
8072 				XLogArchiveNotify(partialfname);
8073 			}
8074 		}
8075 	}
8076 
8077 	/*
8078 	 * Preallocate additional log files, if wanted.
8079 	 */
8080 	PreallocXlogFiles(EndOfLog);
8081 
8082 	/*
8083 	 * Okay, we're officially UP.
8084 	 */
8085 	InRecovery = false;
8086 
8087 	/* start the archive_timeout timer and LSN running */
8088 	XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
8089 	XLogCtl->lastSegSwitchLSN = EndOfLog;
8090 
8091 	/* also initialize latestCompletedXid, to nextXid - 1 */
8092 	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
8093 	ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
8094 	FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid);
8095 	LWLockRelease(ProcArrayLock);
8096 
8097 	/*
8098 	 * Start up subtrans, if not already done for hot standby.  (commit
8099 	 * timestamps are started below, if necessary.)
8100 	 */
8101 	if (standbyState == STANDBY_DISABLED)
8102 		StartupSUBTRANS(oldestActiveXID);
8103 
8104 	/*
8105 	 * Perform end of recovery actions for any SLRUs that need it.
8106 	 */
8107 	TrimCLOG();
8108 	TrimMultiXact();
8109 
8110 	/* Reload shared-memory state for prepared transactions */
8111 	RecoverPreparedTransactions();
8112 
8113 	/* Shut down xlogreader */
8114 	if (readFile >= 0)
8115 	{
8116 		close(readFile);
8117 		readFile = -1;
8118 	}
8119 	XLogReaderFree(xlogreader);
8120 
8121 	/*
8122 	 * If any of the critical GUCs have changed, log them before we allow
8123 	 * backends to write WAL.
8124 	 */
8125 	LocalSetXLogInsertAllowed();
8126 	XLogReportParameters();
8127 
8128 	/*
8129 	 * Local WAL inserts enabled, so it's time to finish initialization of
8130 	 * commit timestamp.
8131 	 */
8132 	CompleteCommitTsInitialization();
8133 
8134 	/*
8135 	 * All done with end-of-recovery actions.
8136 	 *
8137 	 * Now allow backends to write WAL and update the control file status in
8138 	 * consequence.  SharedRecoveryState, that controls if backends can write
8139 	 * WAL, is updated while holding ControlFileLock to prevent other backends
8140 	 * to look at an inconsistent state of the control file in shared memory.
8141 	 * There is still a small window during which backends can write WAL and
8142 	 * the control file is still referring to a system not in DB_IN_PRODUCTION
8143 	 * state while looking at the on-disk control file.
8144 	 *
8145 	 * Also, we use info_lck to update SharedRecoveryState to ensure that
8146 	 * there are no race conditions concerning visibility of other recent
8147 	 * updates to shared memory.
8148 	 */
8149 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8150 	ControlFile->state = DB_IN_PRODUCTION;
8151 	ControlFile->time = (pg_time_t) time(NULL);
8152 
8153 	SpinLockAcquire(&XLogCtl->info_lck);
8154 	XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
8155 	SpinLockRelease(&XLogCtl->info_lck);
8156 
8157 	UpdateControlFile();
8158 	LWLockRelease(ControlFileLock);
8159 
8160 	/*
8161 	 * Shutdown the recovery environment.  This must occur after
8162 	 * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
8163 	 * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as
8164 	 * any session building a snapshot will not rely on KnownAssignedXids as
8165 	 * RecoveryInProgress() would return false at this stage.  This is
8166 	 * particularly critical for prepared 2PC transactions, that would still
8167 	 * need to be included in snapshots once recovery has ended.
8168 	 */
8169 	if (standbyState != STANDBY_DISABLED)
8170 		ShutdownRecoveryTransactionEnvironment();
8171 
8172 	/*
8173 	 * If there were cascading standby servers connected to us, nudge any wal
8174 	 * sender processes to notice that we've been promoted.
8175 	 */
8176 	WalSndWakeup();
8177 
8178 	/*
8179 	 * If this was a promotion, request an (online) checkpoint now. This isn't
8180 	 * required for consistency, but the last restartpoint might be far back,
8181 	 * and in case of a crash, recovering from it might take a longer than is
8182 	 * appropriate now that we're not in standby mode anymore.
8183 	 */
8184 	if (promoted)
8185 		RequestCheckpoint(CHECKPOINT_FORCE);
8186 }
8187 
/*
 * Checks if recovery has reached a consistent state. When consistency is
 * reached and we have a valid starting standby snapshot, tell postmaster
 * that it can start accepting read-only connections.
 *
 * Called repeatedly by the startup process during WAL replay.  It has three
 * duties, performed in order:
 *	1. Once replay has passed the end of the base backup, clear the backup
 *	   start/end markers in pg_control and bump minRecoveryPoint.
 *	2. Once minRecoveryPoint has been replayed (and no backup-end record is
 *	   still pending), mark reachedConsistency and verify there are no
 *	   unresolved references to uninitialized pages.
 *	3. Once consistent and holding a valid standby snapshot, signal the
 *	   postmaster to start accepting read-only connections.
 */
static void
CheckRecoveryConsistency(void)
{
	XLogRecPtr	lastReplayedEndRecPtr;

	/*
	 * During crash recovery, we don't reach a consistent state until we've
	 * replayed all the WAL.
	 */
	if (XLogRecPtrIsInvalid(minRecoveryPoint))
		return;

	Assert(InArchiveRecovery);

	/*
	 * assume that we are called in the startup process, and hence don't need
	 * a lock to read lastReplayedEndRecPtr
	 */
	lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;

	/*
	 * Have we reached the point where our base backup was completed?
	 */
	if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
		ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
	{
		/*
		 * We have reached the end of base backup, as indicated by pg_control.
		 * The data on disk is now consistent. Reset backupStartPoint and
		 * backupEndPoint, and update minRecoveryPoint to make sure we don't
		 * allow starting up at an earlier point even if recovery is stopped
		 * and restarted soon after this.
		 */
		elog(DEBUG1, "end of backup reached");

		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

		/* minRecoveryPoint must never move backwards */
		if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
			ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;

		ControlFile->backupStartPoint = InvalidXLogRecPtr;
		ControlFile->backupEndPoint = InvalidXLogRecPtr;
		ControlFile->backupEndRequired = false;
		UpdateControlFile();

		LWLockRelease(ControlFileLock);
	}

	/*
	 * Have we passed our safe starting point? Note that minRecoveryPoint is
	 * known to be incorrectly set if ControlFile->backupEndRequired, until
	 * the XLOG_BACKUP_END arrives to advise us of the correct
	 * minRecoveryPoint. All we know prior to that is that we're not
	 * consistent yet.
	 */
	if (!reachedConsistency && !ControlFile->backupEndRequired &&
		minRecoveryPoint <= lastReplayedEndRecPtr &&
		XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
	{
		/*
		 * Check to see if the XLOG sequence contained any unresolved
		 * references to uninitialized pages.
		 */
		XLogCheckInvalidPages();

		reachedConsistency = true;
		ereport(LOG,
				(errmsg("consistent recovery state reached at %X/%X",
						LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
	}

	/*
	 * Have we got a valid starting snapshot that will allow queries to be
	 * run? If so, we can tell postmaster that the database is consistent now,
	 * enabling connections.
	 */
	if (standbyState == STANDBY_SNAPSHOT_READY &&
		!LocalHotStandbyActive &&
		reachedConsistency &&
		IsUnderPostmaster)
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		XLogCtl->SharedHotStandbyActive = true;
		SpinLockRelease(&XLogCtl->info_lck);

		LocalHotStandbyActive = true;

		SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
	}
}
8283 
/*
 * Is the system still in recovery?
 *
 * Unlike testing InRecovery, this works in any process that's connected to
 * shared memory.
 *
 * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
 * variables the first time we see that recovery is finished.
 *
 * Returns true while recovery is in progress.  Note the answer may be stale
 * by the time the caller acts on it: recovery can end immediately after we
 * return true, so callers must not treat "true" as a stable guarantee.
 */
bool
RecoveryInProgress(void)
{
	/*
	 * We check shared state each time only until we leave recovery mode. We
	 * can't re-enter recovery, so there's no need to keep checking after the
	 * shared variable has once been seen false.
	 */
	if (!LocalRecoveryInProgress)
		return false;
	else
	{
		/*
		 * use volatile pointer to make sure we make a fresh read of the
		 * shared variable.
		 */
		volatile XLogCtlData *xlogctl = XLogCtl;

		LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);

		/*
		 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
		 * is finished. InitPostgres() relies upon this behaviour to ensure
		 * that InitXLOGAccess() is called at backend startup.  (If you change
		 * this, see also LocalSetXLogInsertAllowed.)
		 */
		if (!LocalRecoveryInProgress)
		{
			/*
			 * If we just exited recovery, make sure we read TimeLineID and
			 * RedoRecPtr after SharedRecoveryState (for machines with weak
			 * memory ordering).
			 */
			pg_memory_barrier();
			InitXLOGAccess();
		}

		/*
		 * Note: We don't need a memory barrier when we're still in recovery.
		 * We might exit recovery immediately after return, so the caller
		 * can't rely on 'true' meaning that we're still in recovery anyway.
		 */

		return LocalRecoveryInProgress;
	}
}
8339 
8340 /*
8341  * Returns current recovery state from shared memory.
8342  *
8343  * This returned state is kept consistent with the contents of the control
8344  * file.  See details about the possible values of RecoveryState in xlog.h.
8345  */
8346 RecoveryState
GetRecoveryState(void)8347 GetRecoveryState(void)
8348 {
8349 	RecoveryState retval;
8350 
8351 	SpinLockAcquire(&XLogCtl->info_lck);
8352 	retval = XLogCtl->SharedRecoveryState;
8353 	SpinLockRelease(&XLogCtl->info_lck);
8354 
8355 	return retval;
8356 }
8357 
8358 /*
8359  * Is HotStandby active yet? This is only important in special backends
8360  * since normal backends won't ever be able to connect until this returns
8361  * true. Postmaster knows this by way of signal, not via shared memory.
8362  *
8363  * Unlike testing standbyState, this works in any process that's connected to
8364  * shared memory.  (And note that standbyState alone doesn't tell the truth
8365  * anyway.)
8366  */
8367 bool
HotStandbyActive(void)8368 HotStandbyActive(void)
8369 {
8370 	/*
8371 	 * We check shared state each time only until Hot Standby is active. We
8372 	 * can't de-activate Hot Standby, so there's no need to keep checking
8373 	 * after the shared variable has once been seen true.
8374 	 */
8375 	if (LocalHotStandbyActive)
8376 		return true;
8377 	else
8378 	{
8379 		/* spinlock is essential on machines with weak memory ordering! */
8380 		SpinLockAcquire(&XLogCtl->info_lck);
8381 		LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
8382 		SpinLockRelease(&XLogCtl->info_lck);
8383 
8384 		return LocalHotStandbyActive;
8385 	}
8386 }
8387 
/*
 * Like HotStandbyActive(), but to be used only in WAL replay code,
 * where we don't need to ask any other process what the state is.
 *
 * Reads only the process-local flag, so no locking is required; that is
 * safe only in the startup process (or a standalone backend), which is
 * exactly what the assertion enforces.
 */
bool
HotStandbyActiveInReplay(void)
{
	Assert(AmStartupProcess() || !IsPostmasterEnvironment);
	return LocalHotStandbyActive;
}
8398 
8399 /*
8400  * Is this process allowed to insert new WAL records?
8401  *
8402  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
8403  * But we also have provisions for forcing the result "true" or "false"
8404  * within specific processes regardless of the global state.
8405  */
8406 bool
XLogInsertAllowed(void)8407 XLogInsertAllowed(void)
8408 {
8409 	/*
8410 	 * If value is "unconditionally true" or "unconditionally false", just
8411 	 * return it.  This provides the normal fast path once recovery is known
8412 	 * done.
8413 	 */
8414 	if (LocalXLogInsertAllowed >= 0)
8415 		return (bool) LocalXLogInsertAllowed;
8416 
8417 	/*
8418 	 * Else, must check to see if we're still in recovery.
8419 	 */
8420 	if (RecoveryInProgress())
8421 		return false;
8422 
8423 	/*
8424 	 * On exit from recovery, reset to "unconditionally true", since there is
8425 	 * no need to keep checking.
8426 	 */
8427 	LocalXLogInsertAllowed = 1;
8428 	return true;
8429 }
8430 
/*
 * Make XLogInsertAllowed() return true in the current process only.
 *
 * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
 * and even call LocalSetXLogInsertAllowed() again after that.
 */
static void
LocalSetXLogInsertAllowed(void)
{
	/* must not already be in the forced-true state */
	Assert(LocalXLogInsertAllowed == -1);
	LocalXLogInsertAllowed = 1;

	/* Initialize as RecoveryInProgress() would do when switching state */
	InitXLOGAccess();
}
8446 
8447 /*
8448  * Subroutine to try to fetch and validate a prior checkpoint record.
8449  *
8450  * whichChkpt identifies the checkpoint (merely for reporting purposes).
8451  * 1 for "primary", 0 for "other" (backup_label)
8452  */
8453 static XLogRecord *
ReadCheckpointRecord(XLogReaderState * xlogreader,XLogRecPtr RecPtr,int whichChkpt,bool report)8454 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
8455 					 int whichChkpt, bool report)
8456 {
8457 	XLogRecord *record;
8458 	uint8		info;
8459 
8460 	if (!XRecOffIsValid(RecPtr))
8461 	{
8462 		if (!report)
8463 			return NULL;
8464 
8465 		switch (whichChkpt)
8466 		{
8467 			case 1:
8468 				ereport(LOG,
8469 						(errmsg("invalid primary checkpoint link in control file")));
8470 				break;
8471 			default:
8472 				ereport(LOG,
8473 						(errmsg("invalid checkpoint link in backup_label file")));
8474 				break;
8475 		}
8476 		return NULL;
8477 	}
8478 
8479 	XLogBeginRead(xlogreader, RecPtr);
8480 	record = ReadRecord(xlogreader, LOG, true);
8481 
8482 	if (record == NULL)
8483 	{
8484 		if (!report)
8485 			return NULL;
8486 
8487 		switch (whichChkpt)
8488 		{
8489 			case 1:
8490 				ereport(LOG,
8491 						(errmsg("invalid primary checkpoint record")));
8492 				break;
8493 			default:
8494 				ereport(LOG,
8495 						(errmsg("invalid checkpoint record")));
8496 				break;
8497 		}
8498 		return NULL;
8499 	}
8500 	if (record->xl_rmid != RM_XLOG_ID)
8501 	{
8502 		switch (whichChkpt)
8503 		{
8504 			case 1:
8505 				ereport(LOG,
8506 						(errmsg("invalid resource manager ID in primary checkpoint record")));
8507 				break;
8508 			default:
8509 				ereport(LOG,
8510 						(errmsg("invalid resource manager ID in checkpoint record")));
8511 				break;
8512 		}
8513 		return NULL;
8514 	}
8515 	info = record->xl_info & ~XLR_INFO_MASK;
8516 	if (info != XLOG_CHECKPOINT_SHUTDOWN &&
8517 		info != XLOG_CHECKPOINT_ONLINE)
8518 	{
8519 		switch (whichChkpt)
8520 		{
8521 			case 1:
8522 				ereport(LOG,
8523 						(errmsg("invalid xl_info in primary checkpoint record")));
8524 				break;
8525 			default:
8526 				ereport(LOG,
8527 						(errmsg("invalid xl_info in checkpoint record")));
8528 				break;
8529 		}
8530 		return NULL;
8531 	}
8532 	if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
8533 	{
8534 		switch (whichChkpt)
8535 		{
8536 			case 1:
8537 				ereport(LOG,
8538 						(errmsg("invalid length of primary checkpoint record")));
8539 				break;
8540 			default:
8541 				ereport(LOG,
8542 						(errmsg("invalid length of checkpoint record")));
8543 				break;
8544 		}
8545 		return NULL;
8546 	}
8547 	return record;
8548 }
8549 
/*
 * This must be called in a backend process before creating WAL records
 * (except in a standalone backend, which does StartupXLOG instead).  We need
 * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
 *
 * Note: before Postgres 8.0, we went to some effort to keep the postmaster
 * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
 * unnecessary however, since the postmaster itself never touches XLOG anyway.
 */
void
InitXLOGAccess(void)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;

	/* ThisTimeLineID doesn't change so we need no lock to copy it */
	ThisTimeLineID = XLogCtl->ThisTimeLineID;
	Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());

	/* set wal_segment_size */
	wal_segment_size = ControlFile->xlog_seg_size;

	/* Use GetRedoRecPtr to copy the RedoRecPtr safely */
	(void) GetRedoRecPtr();
	/* Also update our copy of doPageWrites. */
	doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);

	/* Also initialize the working areas for constructing WAL records */
	InitXLogInsert();
}
8579 
8580 /*
8581  * Return the current Redo pointer from shared memory.
8582  *
8583  * As a side-effect, the local RedoRecPtr copy is updated.
8584  */
8585 XLogRecPtr
GetRedoRecPtr(void)8586 GetRedoRecPtr(void)
8587 {
8588 	XLogRecPtr	ptr;
8589 
8590 	/*
8591 	 * The possibly not up-to-date copy in XlogCtl is enough. Even if we
8592 	 * grabbed a WAL insertion lock to read the authoritative value in
8593 	 * Insert->RedoRecPtr, someone might update it just after we've released
8594 	 * the lock.
8595 	 */
8596 	SpinLockAcquire(&XLogCtl->info_lck);
8597 	ptr = XLogCtl->RedoRecPtr;
8598 	SpinLockRelease(&XLogCtl->info_lck);
8599 
8600 	if (RedoRecPtr < ptr)
8601 		RedoRecPtr = ptr;
8602 
8603 	return RedoRecPtr;
8604 }
8605 
/*
 * Return information needed to decide whether a modified block needs a
 * full-page image to be included in the WAL record.
 *
 * The returned values are cached copies from backend-private memory, and
 * possibly out-of-date.  XLogInsertRecord will re-check them against
 * up-to-date values, while holding the WAL insert lock.
 *
 * No locking needed: we only read process-local variables here.
 */
void
GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
{
	*RedoRecPtr_p = RedoRecPtr;
	*doPageWrites_p = doPageWrites;
}
8620 
8621 /*
8622  * GetInsertRecPtr -- Returns the current insert position.
8623  *
8624  * NOTE: The value *actually* returned is the position of the last full
8625  * xlog page. It lags behind the real insert position by at most 1 page.
8626  * For that, we don't need to scan through WAL insertion locks, and an
8627  * approximation is enough for the current usage of this function.
8628  */
8629 XLogRecPtr
GetInsertRecPtr(void)8630 GetInsertRecPtr(void)
8631 {
8632 	XLogRecPtr	recptr;
8633 
8634 	SpinLockAcquire(&XLogCtl->info_lck);
8635 	recptr = XLogCtl->LogwrtRqst.Write;
8636 	SpinLockRelease(&XLogCtl->info_lck);
8637 
8638 	return recptr;
8639 }
8640 
/*
 * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
 * position known to be fsync'd to disk.
 *
 * As a side effect, refreshes this process's cached LogwrtResult from the
 * shared copy, so both Write and Flush become current.
 */
XLogRecPtr
GetFlushRecPtr(void)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	SpinLockRelease(&XLogCtl->info_lck);

	return LogwrtResult.Flush;
}
8654 
8655 /*
8656  * GetLastImportantRecPtr -- Returns the LSN of the last important record
8657  * inserted. All records not explicitly marked as unimportant are considered
8658  * important.
8659  *
8660  * The LSN is determined by computing the maximum of
8661  * WALInsertLocks[i].lastImportantAt.
8662  */
8663 XLogRecPtr
GetLastImportantRecPtr(void)8664 GetLastImportantRecPtr(void)
8665 {
8666 	XLogRecPtr	res = InvalidXLogRecPtr;
8667 	int			i;
8668 
8669 	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
8670 	{
8671 		XLogRecPtr	last_important;
8672 
8673 		/*
8674 		 * Need to take a lock to prevent torn reads of the LSN, which are
8675 		 * possible on some of the supported platforms. WAL insert locks only
8676 		 * support exclusive mode, so we have to use that.
8677 		 */
8678 		LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
8679 		last_important = WALInsertLocks[i].l.lastImportantAt;
8680 		LWLockRelease(&WALInsertLocks[i].l.lock);
8681 
8682 		if (res < last_important)
8683 			res = last_important;
8684 	}
8685 
8686 	return res;
8687 }
8688 
8689 /*
8690  * Get the time and LSN of the last xlog segment switch
8691  */
pg_time_t
GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
{
	pg_time_t	result;

	/*
	 * Need WALWriteLock, but shared lock is sufficient: we only read the two
	 * fields, and taking the lock once keeps time and LSN mutually
	 * consistent.
	 */
	LWLockAcquire(WALWriteLock, LW_SHARED);
	result = XLogCtl->lastSegSwitchTime;
	*lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
	LWLockRelease(WALWriteLock);

	return result;
}
8705 
8706 /*
8707  * This must be called ONCE during postmaster or standalone-backend shutdown
8708  */
void
ShutdownXLOG(int code, Datum arg)
{
	/*
	 * We should have an aux process resource owner to use, and we should not
	 * be in a transaction that's installed some other resowner.
	 */
	Assert(AuxProcessResourceOwner != NULL);
	Assert(CurrentResourceOwner == NULL ||
		   CurrentResourceOwner == AuxProcessResourceOwner);
	CurrentResourceOwner = AuxProcessResourceOwner;

	/* Don't be chatty in standalone mode */
	ereport(IsPostmasterEnvironment ? LOG : NOTICE,
			(errmsg("shutting down")));

	/*
	 * Signal walsenders to move to stopping state.
	 */
	WalSndInitStopping();

	/*
	 * Wait for WAL senders to be in stopping state.  This prevents commands
	 * from writing new WAL.
	 */
	WalSndWaitStopping();

	/*
	 * On a standby we can only write a restartpoint; a primary writes a full
	 * shutdown checkpoint.  Both are requested immediate so shutdown isn't
	 * throttled by checkpoint_completion_target.
	 */
	if (RecoveryInProgress())
		CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	else
	{
		/*
		 * If archiving is enabled, rotate the last XLOG file so that all the
		 * remaining records are archived (postmaster wakes up the archiver
		 * process one more time at the end of shutdown). The checkpoint
		 * record will go to the next XLOG file and won't be archived (yet).
		 */
		if (XLogArchivingActive() && XLogArchiveCommandSet())
			RequestXLogSwitch(false);

		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	}
}
8752 
8753 /*
8754  * Log start of a checkpoint.
8755  */
static void
LogCheckpointStart(int flags, bool restartpoint)
{
	/*
	 * The two branches intentionally carry separate, fully-spelled-out format
	 * strings (rather than assembling one from pieces) so each message can be
	 * extracted and translated as a single unit.
	 */
	if (restartpoint)
		ereport(LOG,
		/* translator: the placeholders show checkpoint options */
				(errmsg("restartpoint starting:%s%s%s%s%s%s%s%s",
						(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
						(flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
						(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
						(flags & CHECKPOINT_FORCE) ? " force" : "",
						(flags & CHECKPOINT_WAIT) ? " wait" : "",
						(flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
						(flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
						(flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
	else
		ereport(LOG,
		/* translator: the placeholders show checkpoint options */
				(errmsg("checkpoint starting:%s%s%s%s%s%s%s%s",
						(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
						(flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
						(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
						(flags & CHECKPOINT_FORCE) ? " force" : "",
						(flags & CHECKPOINT_WAIT) ? " wait" : "",
						(flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
						(flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
						(flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
}
8784 
8785 /*
8786  * Log end of a checkpoint.
8787  */
static void
LogCheckpointEnd(bool restartpoint)
{
	long		write_msecs,	/* buffer-write phase duration */
				sync_msecs,		/* fsync phase duration */
				total_msecs,	/* whole checkpoint duration */
				longest_msecs,	/* slowest single-file sync */
				average_msecs;	/* mean per-file sync time */
	uint64		average_sync_time;

	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();

	/* Write phase runs from ckpt_write_t until the sync phase started. */
	write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
												  CheckpointStats.ckpt_sync_t);

	sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
												 CheckpointStats.ckpt_sync_end_t);

	/* Accumulate checkpoint timing summary data, in milliseconds. */
	BgWriterStats.m_checkpoint_write_time += write_msecs;
	BgWriterStats.m_checkpoint_sync_time += sync_msecs;

	/*
	 * All of the published timing statistics are accounted for.  Only
	 * continue if a log message is to be written.
	 */
	if (!log_checkpoints)
		return;

	total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
												  CheckpointStats.ckpt_end_t);

	/*
	 * Timing values returned from CheckpointStats are in microseconds.
	 * Convert to milliseconds for consistent printing.  The "+ 999" rounds
	 * up, so sub-millisecond syncs don't report as zero.
	 */
	longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);

	/* Guard against division by zero when no relations were synced. */
	average_sync_time = 0;
	if (CheckpointStats.ckpt_sync_rels > 0)
		average_sync_time = CheckpointStats.ckpt_agg_sync_time /
			CheckpointStats.ckpt_sync_rels;
	average_msecs = (long) ((average_sync_time + 999) / 1000);

	/*
	 * Two parallel messages (restartpoint vs. checkpoint) are kept as whole
	 * format strings so each translates as a single unit.
	 */
	if (restartpoint)
		ereport(LOG,
				(errmsg("restartpoint complete: wrote %d buffers (%.1f%%); "
						"%d WAL file(s) added, %d removed, %d recycled; "
						"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
						"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
						"distance=%d kB, estimate=%d kB",
						CheckpointStats.ckpt_bufs_written,
						(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
						CheckpointStats.ckpt_segs_added,
						CheckpointStats.ckpt_segs_removed,
						CheckpointStats.ckpt_segs_recycled,
						write_msecs / 1000, (int) (write_msecs % 1000),
						sync_msecs / 1000, (int) (sync_msecs % 1000),
						total_msecs / 1000, (int) (total_msecs % 1000),
						CheckpointStats.ckpt_sync_rels,
						longest_msecs / 1000, (int) (longest_msecs % 1000),
						average_msecs / 1000, (int) (average_msecs % 1000),
						(int) (PrevCheckPointDistance / 1024.0),
						(int) (CheckPointDistanceEstimate / 1024.0))));
	else
		ereport(LOG,
				(errmsg("checkpoint complete: wrote %d buffers (%.1f%%); "
						"%d WAL file(s) added, %d removed, %d recycled; "
						"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
						"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
						"distance=%d kB, estimate=%d kB",
						CheckpointStats.ckpt_bufs_written,
						(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
						CheckpointStats.ckpt_segs_added,
						CheckpointStats.ckpt_segs_removed,
						CheckpointStats.ckpt_segs_recycled,
						write_msecs / 1000, (int) (write_msecs % 1000),
						sync_msecs / 1000, (int) (sync_msecs % 1000),
						total_msecs / 1000, (int) (total_msecs % 1000),
						CheckpointStats.ckpt_sync_rels,
						longest_msecs / 1000, (int) (longest_msecs % 1000),
						average_msecs / 1000, (int) (average_msecs % 1000),
						(int) (PrevCheckPointDistance / 1024.0),
						(int) (CheckPointDistanceEstimate / 1024.0))));
}
8873 
8874 /*
8875  * Update the estimate of distance between checkpoints.
8876  *
8877  * The estimate is used to calculate the number of WAL segments to keep
8878  * preallocated, see XLOGfileslop().
8879  */
8880 static void
UpdateCheckPointDistanceEstimate(uint64 nbytes)8881 UpdateCheckPointDistanceEstimate(uint64 nbytes)
8882 {
8883 	/*
8884 	 * To estimate the number of segments consumed between checkpoints, keep a
8885 	 * moving average of the amount of WAL generated in previous checkpoint
8886 	 * cycles. However, if the load is bursty, with quiet periods and busy
8887 	 * periods, we want to cater for the peak load. So instead of a plain
8888 	 * moving average, let the average decline slowly if the previous cycle
8889 	 * used less WAL than estimated, but bump it up immediately if it used
8890 	 * more.
8891 	 *
8892 	 * When checkpoints are triggered by max_wal_size, this should converge to
8893 	 * CheckpointSegments * wal_segment_size,
8894 	 *
8895 	 * Note: This doesn't pay any attention to what caused the checkpoint.
8896 	 * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
8897 	 * starting a base backup, are counted the same as those created
8898 	 * automatically. The slow-decline will largely mask them out, if they are
8899 	 * not frequent. If they are frequent, it seems reasonable to count them
8900 	 * in as any others; if you issue a manual checkpoint every 5 minutes and
8901 	 * never let a timed checkpoint happen, it makes sense to base the
8902 	 * preallocation on that 5 minute interval rather than whatever
8903 	 * checkpoint_timeout is set to.
8904 	 */
8905 	PrevCheckPointDistance = nbytes;
8906 	if (CheckPointDistanceEstimate < nbytes)
8907 		CheckPointDistanceEstimate = nbytes;
8908 	else
8909 		CheckPointDistanceEstimate =
8910 			(0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
8911 }
8912 
8913 /*
8914  * Update the ps display for a process running a checkpoint.  Note that
8915  * this routine should not do any allocations so as it can be called
8916  * from a critical section.
8917  */
8918 static void
update_checkpoint_display(int flags,bool restartpoint,bool reset)8919 update_checkpoint_display(int flags, bool restartpoint, bool reset)
8920 {
8921 	/*
8922 	 * The status is reported only for end-of-recovery and shutdown
8923 	 * checkpoints or shutdown restartpoints.  Updating the ps display is
8924 	 * useful in those situations as it may not be possible to rely on
8925 	 * pg_stat_activity to see the status of the checkpointer or the startup
8926 	 * process.
8927 	 */
8928 	if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0)
8929 		return;
8930 
8931 	if (reset)
8932 		set_ps_display("");
8933 	else
8934 	{
8935 		char		activitymsg[128];
8936 
8937 		snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s",
8938 				 (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "",
8939 				 (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "",
8940 				 restartpoint ? "restartpoint" : "checkpoint");
8941 		set_ps_display(activitymsg);
8942 	}
8943 }
8944 
8945 
8946 /*
8947  * Perform a checkpoint --- either during shutdown, or on-the-fly
8948  *
8949  * flags is a bitwise OR of the following:
8950  *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
8951  *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
8952  *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
8953  *		ignoring checkpoint_completion_target parameter.
8954  *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
8955  *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
8956  *		CHECKPOINT_END_OF_RECOVERY).
8957  *	CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
8958  *
8959  * Note: flags contains other bits, of interest here only for logging purposes.
8960  * In particular note that this routine is synchronous and does not pay
8961  * attention to CHECKPOINT_WAIT.
8962  *
8963  * If !shutdown then we are writing an online checkpoint. This is a very special
8964  * kind of operation and WAL record because the checkpoint action occurs over
8965  * a period of time yet logically occurs at just a single LSN. The logical
8966  * position of the WAL record (redo ptr) is the same or earlier than the
8967  * physical position. When we replay WAL we locate the checkpoint via its
8968  * physical position then read the redo ptr and actually start replay at the
8969  * earlier logical position. Note that we don't write *anything* to WAL at
8970  * the logical position, so that location could be any other kind of WAL record.
8971  * All of this mechanism allows us to continue working while we checkpoint.
8972  * As a result, timing of actions is critical here and be careful to note that
8973  * this function will likely take minutes to execute on a busy system.
8974  */
void
CreateCheckPoint(int flags)
{
	bool		shutdown;
	CheckPoint	checkPoint;
	XLogRecPtr	recptr;
	XLogSegNo	_logSegNo;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	uint32		freespace;
	XLogRecPtr	PriorRedoPtr;
	XLogRecPtr	curInsert;
	XLogRecPtr	last_important_lsn;
	VirtualTransactionId *vxids;
	int			nvxids;

	/*
	 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
	 * issued at a different time.
	 */
	if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
		shutdown = true;
	else
		shutdown = false;

	/* sanity check */
	if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
		elog(ERROR, "can't create a checkpoint during recovery");

	/*
	 * Initialize InitXLogInsert working areas before entering the critical
	 * section.  Normally, this is done by the first call to
	 * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
	 * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
	 * done below in a critical section, and InitXLogInsert cannot be called
	 * in a critical section.
	 */
	InitXLogInsert();

	/*
	 * Prepare to accumulate statistics.
	 *
	 * Note: because it is possible for log_checkpoints to change while a
	 * checkpoint proceeds, we always accumulate stats, even if
	 * log_checkpoints is currently off.
	 */
	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

	/*
	 * Use a critical section to force system panic if we have trouble.
	 */
	START_CRIT_SECTION();

	/* Record in pg_control that a shutdown checkpoint has begun. */
	if (shutdown)
	{
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
		ControlFile->state = DB_SHUTDOWNING;
		ControlFile->time = (pg_time_t) time(NULL);
		UpdateControlFile();
		LWLockRelease(ControlFileLock);
	}

	/*
	 * Let smgr prepare for checkpoint; this has to happen before we determine
	 * the REDO pointer.  Note that smgr must not do anything that'd have to
	 * be undone if we decide no checkpoint is needed.
	 */
	SyncPreCheckpoint();

	/* Begin filling in the checkpoint WAL record */
	MemSet(&checkPoint, 0, sizeof(checkPoint));
	checkPoint.time = (pg_time_t) time(NULL);

	/*
	 * For Hot Standby, derive the oldestActiveXid before we fix the redo
	 * pointer. This allows us to begin accumulating changes to assemble our
	 * starting snapshot of locks and transactions.
	 */
	if (!shutdown && XLogStandbyInfoActive())
		checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
	else
		checkPoint.oldestActiveXid = InvalidTransactionId;

	/*
	 * Get location of last important record before acquiring insert locks (as
	 * GetLastImportantRecPtr() also locks WAL locks).
	 */
	last_important_lsn = GetLastImportantRecPtr();

	/*
	 * We must block concurrent insertions while examining insert state to
	 * determine the checkpoint REDO pointer.
	 */
	WALInsertLockAcquireExclusive();
	curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);

	/*
	 * If this isn't a shutdown or forced checkpoint, and if there has been no
	 * WAL activity requiring a checkpoint, skip it.  The idea here is to
	 * avoid inserting duplicate checkpoints when the system is idle.
	 */
	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
				  CHECKPOINT_FORCE)) == 0)
	{
		/*
		 * "Idle" means the last important record is the previous checkpoint
		 * itself: nothing worth checkpointing has been inserted since.
		 */
		if (last_important_lsn == ControlFile->checkPoint)
		{
			WALInsertLockRelease();
			END_CRIT_SECTION();
			ereport(DEBUG1,
					(errmsg_internal("checkpoint skipped because system is idle")));
			return;
		}
	}

	/*
	 * An end-of-recovery checkpoint is created before anyone is allowed to
	 * write WAL. To allow us to write the checkpoint record, temporarily
	 * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
	 * initialized, which we need here and in AdvanceXLInsertBuffer.)
	 */
	if (flags & CHECKPOINT_END_OF_RECOVERY)
		LocalSetXLogInsertAllowed();

	checkPoint.ThisTimeLineID = ThisTimeLineID;
	if (flags & CHECKPOINT_END_OF_RECOVERY)
		checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
	else
		checkPoint.PrevTimeLineID = ThisTimeLineID;

	/* Record whether full-page writes are in effect at the REDO point. */
	checkPoint.fullPageWrites = Insert->fullPageWrites;

	/*
	 * Compute new REDO record ptr = location of next XLOG record.
	 *
	 * NB: this is NOT necessarily where the checkpoint record itself will be,
	 * since other backends may insert more XLOG records while we're off doing
	 * the buffer flush work.  Those XLOG records are logically after the
	 * checkpoint, even though physically before it.  Got that?
	 */
	freespace = INSERT_FREESPACE(curInsert);
	if (freespace == 0)
	{
		/* At a page boundary, skip over the (long or short) page header. */
		if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
			curInsert += SizeOfXLogLongPHD;
		else
			curInsert += SizeOfXLogShortPHD;
	}
	checkPoint.redo = curInsert;

	/*
	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
	 * must be done while holding all the insertion locks.
	 *
	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
	 * pointing past where it really needs to point.  This is okay; the only
	 * consequence is that XLogInsert might back up whole buffers that it
	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
	 * XLogInserts that happen while we are dumping buffers must assume that
	 * their buffer changes are not included in the checkpoint.
	 */
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;

	/*
	 * Now we can release the WAL insertion locks, allowing other xacts to
	 * proceed while we are flushing disk buffers.
	 */
	WALInsertLockRelease();

	/* Update the info_lck-protected copy of RedoRecPtr as well */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->RedoRecPtr = checkPoint.redo;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * If enabled, log checkpoint start.  We postpone this until now so as not
	 * to log anything if we decided to skip the checkpoint.
	 */
	if (log_checkpoints)
		LogCheckpointStart(flags, false);

	/* Update the process title */
	update_checkpoint_display(flags, false, false);

	TRACE_POSTGRESQL_CHECKPOINT_START(flags);

	/*
	 * Get the other info we need for the checkpoint record.
	 *
	 * We don't need to save oldestClogXid in the checkpoint, it only matters
	 * for the short period in which clog is being truncated, and if we crash
	 * during that we'll redo the clog truncation and fix up oldestClogXid
	 * there.
	 */
	LWLockAcquire(XidGenLock, LW_SHARED);
	checkPoint.nextXid = ShmemVariableCache->nextXid;
	checkPoint.oldestXid = ShmemVariableCache->oldestXid;
	checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
	LWLockRelease(XidGenLock);

	LWLockAcquire(CommitTsLock, LW_SHARED);
	checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
	checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
	LWLockRelease(CommitTsLock);

	LWLockAcquire(OidGenLock, LW_SHARED);
	checkPoint.nextOid = ShmemVariableCache->nextOid;
	if (!shutdown)
		checkPoint.nextOid += ShmemVariableCache->oidCount;
	LWLockRelease(OidGenLock);

	MultiXactGetCheckptMulti(shutdown,
							 &checkPoint.nextMulti,
							 &checkPoint.nextMultiOffset,
							 &checkPoint.oldestMulti,
							 &checkPoint.oldestMultiDB);

	/*
	 * Having constructed the checkpoint record, ensure all shmem disk buffers
	 * and commit-log buffers are flushed to disk.
	 *
	 * This I/O could fail for various reasons.  If so, we will fail to
	 * complete the checkpoint, but there is no reason to force a system
	 * panic. Accordingly, exit critical section while doing it.
	 */
	END_CRIT_SECTION();

	/*
	 * In some cases there are groups of actions that must all occur on one
	 * side or the other of a checkpoint record. Before flushing the
	 * checkpoint record we must explicitly wait for any backend currently
	 * performing those groups of actions.
	 *
	 * One example is end of transaction, so we must wait for any transactions
	 * that are currently in commit critical sections.  If an xact inserted
	 * its commit record into XLOG just before the REDO point, then a crash
	 * restart from the REDO point would not replay that record, which means
	 * that our flushing had better include the xact's update of pg_xact.  So
	 * we wait till he's out of his commit critical section before proceeding.
	 * See notes in RecordTransactionCommit().
	 *
	 * Because we've already released the insertion locks, this test is a bit
	 * fuzzy: it is possible that we will wait for xacts we didn't really need
	 * to wait for.  But the delay should be short and it seems better to make
	 * checkpoint take a bit longer than to hold off insertions longer than
	 * necessary. (In fact, the whole reason we have this issue is that xact.c
	 * does commit record XLOG insertion and clog update as two separate steps
	 * protected by different locks, but again that seems best on grounds of
	 * minimizing lock contention.)
	 *
	 * A transaction that has not yet set delayChkpt when we look cannot be at
	 * risk, since he's not inserted his commit record yet; and one that's
	 * already cleared it is not at risk either, since he's done fixing clog
	 * and we will correctly flush the update below.  So we cannot miss any
	 * xacts we need to wait for.
	 */
	vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
	if (nvxids > 0)
	{
		/* Poll until none of the captured vxids is still delaying us. */
		do
		{
			pg_usleep(10000L);	/* wait for 10 msec */
		} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
	}
	pfree(vxids);

	CheckPointGuts(checkPoint.redo, flags);

	/*
	 * Take a snapshot of running transactions and write this to WAL. This
	 * allows us to reconstruct the state of running transactions during
	 * archive recovery, if required. Skip, if this info disabled.
	 *
	 * If we are shutting down, or Startup process is completing crash
	 * recovery we don't need to write running xact data.
	 */
	if (!shutdown && XLogStandbyInfoActive())
		LogStandbySnapshot();

	START_CRIT_SECTION();

	/*
	 * Now insert the checkpoint record into XLOG.
	 */
	XLogBeginInsert();
	XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
	recptr = XLogInsert(RM_XLOG_ID,
						shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
						XLOG_CHECKPOINT_ONLINE);

	XLogFlush(recptr);

	/*
	 * We mustn't write any new WAL after a shutdown checkpoint, or it will be
	 * overwritten at next startup.  No-one should even try, this just allows
	 * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
	 * to just temporarily disable writing until the system has exited
	 * recovery.
	 */
	if (shutdown)
	{
		if (flags & CHECKPOINT_END_OF_RECOVERY)
			LocalXLogInsertAllowed = -1;	/* return to "check" state */
		else
			LocalXLogInsertAllowed = 0; /* never again write WAL */
	}

	/*
	 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
	 * = end of actual checkpoint record.
	 */
	if (shutdown && checkPoint.redo != ProcLastRecPtr)
		ereport(PANIC,
				(errmsg("concurrent write-ahead log activity while database system is shutting down")));

	/*
	 * Remember the prior checkpoint's redo ptr for
	 * UpdateCheckPointDistanceEstimate()
	 */
	PriorRedoPtr = ControlFile->checkPointCopy.redo;

	/*
	 * Update the control file.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	if (shutdown)
		ControlFile->state = DB_SHUTDOWNED;
	ControlFile->checkPoint = ProcLastRecPtr;
	ControlFile->checkPointCopy = checkPoint;
	ControlFile->time = (pg_time_t) time(NULL);
	/* crash recovery should always recover to the end of WAL */
	ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
	ControlFile->minRecoveryPointTLI = 0;

	/*
	 * Persist unloggedLSN value. It's reset on crash recovery, so this goes
	 * unused on non-shutdown checkpoints, but seems useful to store it always
	 * for debugging purposes.
	 */
	SpinLockAcquire(&XLogCtl->ulsn_lck);
	ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
	SpinLockRelease(&XLogCtl->ulsn_lck);

	UpdateControlFile();
	LWLockRelease(ControlFileLock);

	/* Update shared-memory copy of checkpoint XID/epoch */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->ckptFullXid = checkPoint.nextXid;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * We are now done with critical updates; no need for system panic if we
	 * have trouble while fooling with old log segments.
	 */
	END_CRIT_SECTION();

	/*
	 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
	 */
	SyncPostCheckpoint();

	/*
	 * Update the average distance between checkpoints if the prior checkpoint
	 * exists.
	 */
	if (PriorRedoPtr != InvalidXLogRecPtr)
		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);

	/*
	 * Delete old log files, those no longer needed for last checkpoint to
	 * prevent the disk holding the xlog from growing full.
	 */
	XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
	KeepLogSeg(recptr, &_logSegNo);
	if (InvalidateObsoleteReplicationSlots(_logSegNo))
	{
		/*
		 * Some slots have been invalidated; recalculate the old-segment
		 * horizon, starting again from RedoRecPtr.
		 */
		XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
		KeepLogSeg(recptr, &_logSegNo);
	}
	/* Step back one so the segment holding the redo pointer is retained. */
	_logSegNo--;
	RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);

	/*
	 * Make more log segments if needed.  (Do this after recycling old log
	 * segments, since that may supply some of the needed files.)
	 */
	if (!shutdown)
		PreallocXlogFiles(recptr);

	/*
	 * Truncate pg_subtrans if possible.  We can throw away all data before
	 * the oldest XMIN of any running transaction.  No future transaction will
	 * attempt to reference any pg_subtrans entry older than that (see Asserts
	 * in subtrans.c).  During recovery, though, we mustn't do this because
	 * StartupSUBTRANS hasn't been called yet.
	 */
	if (!RecoveryInProgress())
		TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());

	/* Real work is done; log and update stats. */
	LogCheckpointEnd(false);

	/* Reset the process title */
	update_checkpoint_display(flags, false, true);

	TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
									 NBuffers,
									 CheckpointStats.ckpt_segs_added,
									 CheckpointStats.ckpt_segs_removed,
									 CheckpointStats.ckpt_segs_recycled);
}
9390 
9391 /*
9392  * Mark the end of recovery in WAL though without running a full checkpoint.
9393  * We can expect that a restartpoint is likely to be in progress as we
9394  * do this, though we are unwilling to wait for it to complete.
9395  *
9396  * CreateRestartPoint() allows for the case where recovery may end before
9397  * the restartpoint completes so there is no concern of concurrent behaviour.
9398  */
static void
CreateEndOfRecoveryRecord(void)
{
	xl_end_of_recovery xlrec;
	XLogRecPtr	recptr;

	/* sanity check */
	if (!RecoveryInProgress())
		elog(ERROR, "can only be used to end recovery");

	xlrec.end_time = GetCurrentTimestamp();

	/*
	 * Read the timeline IDs under the insertion locks so we capture a
	 * consistent pair.
	 */
	WALInsertLockAcquireExclusive();
	xlrec.ThisTimeLineID = ThisTimeLineID;
	xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
	WALInsertLockRelease();

	/* Temporarily permit WAL insertion while still "in recovery". */
	LocalSetXLogInsertAllowed();

	START_CRIT_SECTION();

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
	recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);

	XLogFlush(recptr);

	/*
	 * Update the control file so that crash recovery can follow the timeline
	 * changes to this point.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	ControlFile->time = (pg_time_t) time(NULL);
	ControlFile->minRecoveryPoint = recptr;
	ControlFile->minRecoveryPointTLI = ThisTimeLineID;
	UpdateControlFile();
	LWLockRelease(ControlFileLock);

	END_CRIT_SECTION();

	LocalXLogInsertAllowed = -1;	/* return to "check" state */
}
9441 
9442 /*
9443  * Write an OVERWRITE_CONTRECORD message.
9444  *
9445  * When on WAL replay we expect a continuation record at the start of a page
9446  * that is not there, recovery ends and WAL writing resumes at that point.
9447  * But it's wrong to resume writing new WAL back at the start of the record
9448  * that was broken, because downstream consumers of that WAL (physical
9449  * replicas) are not prepared to "rewind".  So the first action after
9450  * finishing replay of all valid WAL must be to write a record of this type
9451  * at the point where the contrecord was missing; to support xlogreader
9452  * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
9453  * to the page header where the record occurs.  xlogreader has an ad-hoc
9454  * mechanism to report metadata about the broken record, which is what we
9455  * use here.
9456  *
9457  * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
9458  * skip the record it was reading, and pass back the LSN of the skipped
9459  * record, so that its caller can verify (on "replay" of that record) that the
9460  * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
9461  */
static XLogRecPtr
CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn)
{
	xl_overwrite_contrecord xlrec;
	XLogRecPtr	recptr;

	/* sanity check */
	if (!RecoveryInProgress())
		elog(ERROR, "can only be used at end of recovery");

	/* Record which (partial) record is being overwritten, and when */
	xlrec.overwritten_lsn = aborted_lsn;
	xlrec.overwrite_time = GetCurrentTimestamp();

	/* Insert and flush must complete as a unit */
	START_CRIT_SECTION();

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord));

	recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);

	/* Make the record durable before normal WAL writing resumes */
	XLogFlush(recptr);

	END_CRIT_SECTION();

	return recptr;
}
9488 
9489 /*
9490  * Flush all data in shared memory to disk, and fsync
9491  *
9492  * This is the common code shared between regular checkpoints and
9493  * recovery restartpoints.
9494  */
static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
	/* Flush per-subsystem state kept outside the SLRUs and buffer pool */
	CheckPointRelationMap();
	CheckPointReplicationSlots();
	CheckPointSnapBuild();
	CheckPointLogicalRewriteHeap();
	CheckPointReplicationOrigin();

	/* Write out all dirty data in SLRUs and the main buffer pool */
	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
	CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
	CheckPointCLOG();
	CheckPointCommitTs();
	CheckPointSUBTRANS();
	CheckPointMultiXact();
	CheckPointPredicate();
	CheckPointBuffers(flags);

	/* Perform all queued up fsyncs */
	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
	CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
	ProcessSyncRequests();
	CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();

	/* We deliberately delay 2PC checkpointing as long as possible */
	CheckPointTwoPhase(checkPointRedo);
}
9524 
9525 /*
9526  * Save a checkpoint for recovery restart if appropriate
9527  *
9528  * This function is called each time a checkpoint record is read from XLOG.
9529  * It must determine whether the checkpoint represents a safe restartpoint or
9530  * not.  If so, the checkpoint record is stashed in shared memory so that
9531  * CreateRestartPoint can consult it.  (Note that the latter function is
9532  * executed by the checkpointer, while this one will be executed by the
9533  * startup process.)
9534  */
static void
RecoveryRestartPoint(const CheckPoint *checkPoint)
{
	/*
	 * Also refrain from creating a restartpoint if we have seen any
	 * references to non-existent pages. Restarting recovery from the
	 * restartpoint would not see the references, so we would lose the
	 * cross-check that the pages belonged to a relation that was dropped
	 * later.
	 */
	if (XLogHaveInvalidPages())
	{
		elog(trace_recovery(DEBUG2),
			 "could not record restart point at %X/%X because there "
			 "are unresolved references to invalid pages",
			 LSN_FORMAT_ARGS(checkPoint->redo));
		return;
	}

	/*
	 * Copy the checkpoint record to shared memory, so that checkpointer can
	 * work out the next time it wants to perform a restartpoint.
	 *
	 * The info_lck spinlock protects these three fields as a unit; the
	 * checkpointer reads them back under the same lock in
	 * CreateRestartPoint().
	 */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
	XLogCtl->lastCheckPointEndPtr = EndRecPtr;
	XLogCtl->lastCheckPoint = *checkPoint;
	SpinLockRelease(&XLogCtl->info_lck);
}
9564 
9565 /*
9566  * Establish a restartpoint if possible.
9567  *
9568  * This is similar to CreateCheckPoint, but is used during WAL recovery
9569  * to establish a point from which recovery can roll forward without
9570  * replaying the entire recovery log.
9571  *
9572  * Returns true if a new restartpoint was established. We can only establish
9573  * a restartpoint if we have replayed a safe checkpoint record since last
9574  * restartpoint.
9575  */
bool
CreateRestartPoint(int flags)
{
	XLogRecPtr	lastCheckPointRecPtr;
	XLogRecPtr	lastCheckPointEndPtr;
	CheckPoint	lastCheckPoint;
	XLogRecPtr	PriorRedoPtr;
	XLogRecPtr	receivePtr;
	XLogRecPtr	replayPtr;
	TimeLineID	replayTLI;
	XLogRecPtr	endptr;
	XLogSegNo	_logSegNo;
	TimestampTz xtime;

	/* Get a local copy of the last safe checkpoint record. */
	SpinLockAcquire(&XLogCtl->info_lck);
	lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
	lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
	lastCheckPoint = XLogCtl->lastCheckPoint;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * Check that we're still in recovery mode. It's ok if we exit recovery
	 * mode after this check, the restart point is valid anyway.
	 */
	if (!RecoveryInProgress())
	{
		ereport(DEBUG2,
				(errmsg_internal("skipping restartpoint, recovery has already ended")));
		return false;
	}

	/*
	 * If the last checkpoint record we've replayed is already our last
	 * restartpoint, we can't perform a new restart point. We still update
	 * minRecoveryPoint in that case, so that if this is a shutdown restart
	 * point, we won't start up earlier than before. That's not strictly
	 * necessary, but when hot standby is enabled, it would be rather weird if
	 * the database opened up for read-only connections at a point-in-time
	 * before the last shutdown. Such time travel is still possible in case of
	 * immediate shutdown, though.
	 *
	 * We don't explicitly advance minRecoveryPoint when we do create a
	 * restartpoint. It's assumed that flushing the buffers will do that as a
	 * side-effect.
	 */
	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
		lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
	{
		ereport(DEBUG2,
				(errmsg_internal("skipping restartpoint, already performed at %X/%X",
								 LSN_FORMAT_ARGS(lastCheckPoint.redo))));

		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
		if (flags & CHECKPOINT_IS_SHUTDOWN)
		{
			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
			ControlFile->time = (pg_time_t) time(NULL);
			UpdateControlFile();
			LWLockRelease(ControlFileLock);
		}
		return false;
	}

	/*
	 * Update the shared RedoRecPtr so that the startup process can calculate
	 * the number of segments replayed since last restartpoint, and request a
	 * restartpoint if it exceeds CheckPointSegments.
	 *
	 * Like in CreateCheckPoint(), hold off insertions to update it, although
	 * during recovery this is just pro forma, because no WAL insertions are
	 * happening.
	 */
	WALInsertLockAcquireExclusive();
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
	WALInsertLockRelease();

	/* Also update the info_lck-protected copy */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->RedoRecPtr = lastCheckPoint.redo;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * Prepare to accumulate statistics.
	 *
	 * Note: because it is possible for log_checkpoints to change while a
	 * checkpoint proceeds, we always accumulate stats, even if
	 * log_checkpoints is currently off.
	 */
	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

	if (log_checkpoints)
		LogCheckpointStart(flags, true);

	/* Update the process title */
	update_checkpoint_display(flags, true, false);

	/* Flush all dirty data; the bulk of the restartpoint work happens here */
	CheckPointGuts(lastCheckPoint.redo, flags);

	/*
	 * Remember the prior checkpoint's redo ptr for
	 * UpdateCheckPointDistanceEstimate()
	 */
	PriorRedoPtr = ControlFile->checkPointCopy.redo;

	/*
	 * Update pg_control, using current time.  Check that it still shows
	 * DB_IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
	 * this is a quick hack to make sure nothing really bad happens if somehow
	 * we get here after the end-of-recovery checkpoint.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
		ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
	{
		ControlFile->checkPoint = lastCheckPointRecPtr;
		ControlFile->checkPointCopy = lastCheckPoint;
		ControlFile->time = (pg_time_t) time(NULL);

		/*
		 * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
		 * this will have happened already while writing out dirty buffers,
		 * but not necessarily - e.g. because no buffers were dirtied.  We do
		 * this because a non-exclusive base backup uses minRecoveryPoint to
		 * determine which WAL files must be included in the backup, and the
		 * file (or files) containing the checkpoint record must be included,
		 * at a minimum. Note that for an ordinary restart of recovery there's
		 * no value in having the minimum recovery point any earlier than this
		 * anyway, because redo will begin just after the checkpoint record.
		 */
		if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
		{
			ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
			ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;

			/* update local copy */
			minRecoveryPoint = ControlFile->minRecoveryPoint;
			minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
		}
		if (flags & CHECKPOINT_IS_SHUTDOWN)
			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
		UpdateControlFile();
	}
	LWLockRelease(ControlFileLock);

	/*
	 * Update the average distance between checkpoints/restartpoints if the
	 * prior checkpoint exists.
	 */
	if (PriorRedoPtr != InvalidXLogRecPtr)
		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);

	/*
	 * Delete old log files, those no longer needed for last restartpoint to
	 * prevent the disk holding the xlog from growing full.
	 */
	XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);

	/*
	 * Retreat _logSegNo using the current end of xlog replayed or received,
	 * whichever is later.
	 */
	receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
	replayPtr = GetXLogReplayRecPtr(&replayTLI);
	endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
	KeepLogSeg(endptr, &_logSegNo);
	if (InvalidateObsoleteReplicationSlots(_logSegNo))
	{
		/*
		 * Some slots have been invalidated; recalculate the old-segment
		 * horizon, starting again from RedoRecPtr.
		 */
		XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
		KeepLogSeg(endptr, &_logSegNo);
	}
	/* back up to the last segment we can actually remove */
	_logSegNo--;

	/*
	 * Try to recycle segments on a useful timeline. If we've been promoted
	 * since the beginning of this restartpoint, use the new timeline chosen
	 * at end of recovery (RecoveryInProgress() sets ThisTimeLineID in that
	 * case). If we're still in recovery, use the timeline we're currently
	 * replaying.
	 *
	 * There is no guarantee that the WAL segments will be useful on the
	 * current timeline; if recovery proceeds to a new timeline right after
	 * this, the pre-allocated WAL segments on this timeline will not be used,
	 * and will go wasted until recycled on the next restartpoint. We'll live
	 * with that.
	 */
	if (RecoveryInProgress())
		ThisTimeLineID = replayTLI;

	RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr);

	/*
	 * Make more log segments if needed.  (Do this after recycling old log
	 * segments, since that may supply some of the needed files.)
	 */
	PreallocXlogFiles(endptr);

	/*
	 * ThisTimeLineID is normally not set when we're still in recovery.
	 * However, recycling/preallocating segments above needed ThisTimeLineID
	 * to determine which timeline to install the segments on. Reset it now,
	 * to restore the normal state of affairs for debugging purposes.
	 */
	if (RecoveryInProgress())
		ThisTimeLineID = 0;

	/*
	 * Truncate pg_subtrans if possible.  We can throw away all data before
	 * the oldest XMIN of any running transaction.  No future transaction will
	 * attempt to reference any pg_subtrans entry older than that (see Asserts
	 * in subtrans.c).  When hot standby is disabled, though, we mustn't do
	 * this because StartupSUBTRANS hasn't been called yet.
	 */
	if (EnableHotStandby)
		TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());

	/* Real work is done; log and update stats. */
	LogCheckpointEnd(true);

	/* Reset the process title */
	update_checkpoint_display(flags, true, true);

	xtime = GetLatestXTime();
	ereport((log_checkpoints ? LOG : DEBUG2),
			(errmsg("recovery restart point at %X/%X",
					LSN_FORMAT_ARGS(lastCheckPoint.redo)),
			 xtime ? errdetail("Last completed transaction was at log time %s.",
							   timestamptz_to_str(xtime)) : 0));

	/*
	 * Finally, execute archive_cleanup_command, if any.
	 */
	if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
		ExecuteRecoveryCommand(archiveCleanupCommand,
							   "archive_cleanup_command",
							   false);

	return true;
}
9821 
9822 /*
9823  * Report availability of WAL for the given target LSN
9824  *		(typically a slot's restart_lsn)
9825  *
9826  * Returns one of the following enum values:
9827  *
9828  * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
9829  *   max_wal_size.
9830  *
9831  * * WALAVAIL_EXTENDED means it is still available by preserving extra
9832  *   segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
9833  *   than max_wal_size, this state is not returned.
9834  *
9835  * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
9836  *   remove reserved segments. The walsender using this slot may return to the
9837  *   above.
9838  *
9839  * * WALAVAIL_REMOVED means it has been removed. A replication stream on
9840  *   a slot with this LSN cannot continue after a restart.
9841  *
9842  * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
9843  */
9844 WALAvailability
GetWALAvailability(XLogRecPtr targetLSN)9845 GetWALAvailability(XLogRecPtr targetLSN)
9846 {
9847 	XLogRecPtr	currpos;		/* current write LSN */
9848 	XLogSegNo	currSeg;		/* segid of currpos */
9849 	XLogSegNo	targetSeg;		/* segid of targetLSN */
9850 	XLogSegNo	oldestSeg;		/* actual oldest segid */
9851 	XLogSegNo	oldestSegMaxWalSize;	/* oldest segid kept by max_wal_size */
9852 	XLogSegNo	oldestSlotSeg;	/* oldest segid kept by slot */
9853 	uint64		keepSegs;
9854 
9855 	/*
9856 	 * slot does not reserve WAL. Either deactivated, or has never been active
9857 	 */
9858 	if (XLogRecPtrIsInvalid(targetLSN))
9859 		return WALAVAIL_INVALID_LSN;
9860 
9861 	/*
9862 	 * Calculate the oldest segment currently reserved by all slots,
9863 	 * considering wal_keep_size and max_slot_wal_keep_size.  Initialize
9864 	 * oldestSlotSeg to the current segment.
9865 	 */
9866 	currpos = GetXLogWriteRecPtr();
9867 	XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
9868 	KeepLogSeg(currpos, &oldestSlotSeg);
9869 
9870 	/*
9871 	 * Find the oldest extant segment file. We get 1 until checkpoint removes
9872 	 * the first WAL segment file since startup, which causes the status being
9873 	 * wrong under certain abnormal conditions but that doesn't actually harm.
9874 	 */
9875 	oldestSeg = XLogGetLastRemovedSegno() + 1;
9876 
9877 	/* calculate oldest segment by max_wal_size */
9878 	XLByteToSeg(currpos, currSeg, wal_segment_size);
9879 	keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
9880 
9881 	if (currSeg > keepSegs)
9882 		oldestSegMaxWalSize = currSeg - keepSegs;
9883 	else
9884 		oldestSegMaxWalSize = 1;
9885 
9886 	/* the segment we care about */
9887 	XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
9888 
9889 	/*
9890 	 * No point in returning reserved or extended status values if the
9891 	 * targetSeg is known to be lost.
9892 	 */
9893 	if (targetSeg >= oldestSlotSeg)
9894 	{
9895 		/* show "reserved" when targetSeg is within max_wal_size */
9896 		if (targetSeg >= oldestSegMaxWalSize)
9897 			return WALAVAIL_RESERVED;
9898 
9899 		/* being retained by slots exceeding max_wal_size */
9900 		return WALAVAIL_EXTENDED;
9901 	}
9902 
9903 	/* WAL segments are no longer retained but haven't been removed yet */
9904 	if (targetSeg >= oldestSeg)
9905 		return WALAVAIL_UNRESERVED;
9906 
9907 	/* Definitely lost */
9908 	return WALAVAIL_REMOVED;
9909 }
9910 
9911 
9912 /*
9913  * Retreat *logSegNo to the last segment that we need to retain because of
9914  * either wal_keep_size or replication slots.
9915  *
9916  * This is calculated by subtracting wal_keep_size from the given xlog
9917  * location, recptr and by making sure that that result is below the
9918  * requirement of replication slots.  For the latter criterion we do consider
9919  * the effects of max_slot_wal_keep_size: reserve at most that much space back
9920  * from recptr.
9921  *
9922  * Note about replication slots: if this function calculates a value
9923  * that's further ahead than what slots need reserved, then affected
9924  * slots need to be invalidated and this function invoked again.
9925  * XXX it might be a good idea to rewrite this function so that
9926  * invalidation is optionally done here, instead.
9927  */
9928 static void
KeepLogSeg(XLogRecPtr recptr,XLogSegNo * logSegNo)9929 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
9930 {
9931 	XLogSegNo	currSegNo;
9932 	XLogSegNo	segno;
9933 	XLogRecPtr	keep;
9934 
9935 	XLByteToSeg(recptr, currSegNo, wal_segment_size);
9936 	segno = currSegNo;
9937 
9938 	/*
9939 	 * Calculate how many segments are kept by slots first, adjusting for
9940 	 * max_slot_wal_keep_size.
9941 	 */
9942 	keep = XLogGetReplicationSlotMinimumLSN();
9943 	if (keep != InvalidXLogRecPtr)
9944 	{
9945 		XLByteToSeg(keep, segno, wal_segment_size);
9946 
9947 		/* Cap by max_slot_wal_keep_size ... */
9948 		if (max_slot_wal_keep_size_mb >= 0)
9949 		{
9950 			uint64		slot_keep_segs;
9951 
9952 			slot_keep_segs =
9953 				ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
9954 
9955 			if (currSegNo - segno > slot_keep_segs)
9956 				segno = currSegNo - slot_keep_segs;
9957 		}
9958 	}
9959 
9960 	/* but, keep at least wal_keep_size if that's set */
9961 	if (wal_keep_size_mb > 0)
9962 	{
9963 		uint64		keep_segs;
9964 
9965 		keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
9966 		if (currSegNo - segno < keep_segs)
9967 		{
9968 			/* avoid underflow, don't go below 1 */
9969 			if (currSegNo <= keep_segs)
9970 				segno = 1;
9971 			else
9972 				segno = currSegNo - keep_segs;
9973 		}
9974 	}
9975 
9976 	/* don't delete WAL segments newer than the calculated segment */
9977 	if (segno < *logSegNo)
9978 		*logSegNo = segno;
9979 }
9980 
9981 /*
9982  * Write a NEXTOID log record
9983  */
9984 void
XLogPutNextOid(Oid nextOid)9985 XLogPutNextOid(Oid nextOid)
9986 {
9987 	XLogBeginInsert();
9988 	XLogRegisterData((char *) (&nextOid), sizeof(Oid));
9989 	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
9990 
9991 	/*
9992 	 * We need not flush the NEXTOID record immediately, because any of the
9993 	 * just-allocated OIDs could only reach disk as part of a tuple insert or
9994 	 * update that would have its own XLOG record that must follow the NEXTOID
9995 	 * record.  Therefore, the standard buffer LSN interlock applied to those
9996 	 * records will ensure no such OID reaches disk before the NEXTOID record
9997 	 * does.
9998 	 *
9999 	 * Note, however, that the above statement only covers state "within" the
10000 	 * database.  When we use a generated OID as a file or directory name, we
10001 	 * are in a sense violating the basic WAL rule, because that filesystem
10002 	 * change may reach disk before the NEXTOID WAL record does.  The impact
10003 	 * of this is that if a database crash occurs immediately afterward, we
10004 	 * might after restart re-generate the same OID and find that it conflicts
10005 	 * with the leftover file or directory.  But since for safety's sake we
10006 	 * always loop until finding a nonconflicting filename, this poses no real
10007 	 * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
10008 	 */
10009 }
10010 
10011 /*
10012  * Write an XLOG SWITCH record.
10013  *
10014  * Here we just blindly issue an XLogInsert request for the record.
10015  * All the magic happens inside XLogInsert.
10016  *
10017  * The return value is either the end+1 address of the switch record,
10018  * or the end+1 address of the prior segment if we did not need to
10019  * write a switch record because we are already at segment start.
10020  */
10021 XLogRecPtr
RequestXLogSwitch(bool mark_unimportant)10022 RequestXLogSwitch(bool mark_unimportant)
10023 {
10024 	XLogRecPtr	RecPtr;
10025 
10026 	/* XLOG SWITCH has no data */
10027 	XLogBeginInsert();
10028 
10029 	if (mark_unimportant)
10030 		XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
10031 	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
10032 
10033 	return RecPtr;
10034 }
10035 
10036 /*
10037  * Write a RESTORE POINT record
10038  */
10039 XLogRecPtr
XLogRestorePoint(const char * rpName)10040 XLogRestorePoint(const char *rpName)
10041 {
10042 	XLogRecPtr	RecPtr;
10043 	xl_restore_point xlrec;
10044 
10045 	xlrec.rp_time = GetCurrentTimestamp();
10046 	strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
10047 
10048 	XLogBeginInsert();
10049 	XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
10050 
10051 	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
10052 
10053 	ereport(LOG,
10054 			(errmsg("restore point \"%s\" created at %X/%X",
10055 					rpName, LSN_FORMAT_ARGS(RecPtr))));
10056 
10057 	return RecPtr;
10058 }
10059 
10060 /*
10061  * Check if any of the GUC parameters that are critical for hot standby
10062  * have changed, and update the value in pg_control file if necessary.
10063  */
static void
XLogReportParameters(void)
{
	/* Compare each hot-standby-critical GUC against the pg_control copy */
	if (wal_level != ControlFile->wal_level ||
		wal_log_hints != ControlFile->wal_log_hints ||
		MaxConnections != ControlFile->MaxConnections ||
		max_worker_processes != ControlFile->max_worker_processes ||
		max_wal_senders != ControlFile->max_wal_senders ||
		max_prepared_xacts != ControlFile->max_prepared_xacts ||
		max_locks_per_xact != ControlFile->max_locks_per_xact ||
		track_commit_timestamp != ControlFile->track_commit_timestamp)
	{
		/*
		 * The change in number of backend slots doesn't need to be WAL-logged
		 * if archiving is not enabled, as you can't start archive recovery
		 * with wal_level=minimal anyway. We don't really care about the
		 * values in pg_control either if wal_level=minimal, but seems better
		 * to keep them up-to-date to avoid confusion.
		 */
		if (wal_level != ControlFile->wal_level || XLogIsNeeded())
		{
			xl_parameter_change xlrec;
			XLogRecPtr	recptr;

			xlrec.MaxConnections = MaxConnections;
			xlrec.max_worker_processes = max_worker_processes;
			xlrec.max_wal_senders = max_wal_senders;
			xlrec.max_prepared_xacts = max_prepared_xacts;
			xlrec.max_locks_per_xact = max_locks_per_xact;
			xlrec.wal_level = wal_level;
			xlrec.wal_log_hints = wal_log_hints;
			xlrec.track_commit_timestamp = track_commit_timestamp;

			XLogBeginInsert();
			XLogRegisterData((char *) &xlrec, sizeof(xlrec));

			/* Flush so the record is on disk before pg_control is updated */
			recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
			XLogFlush(recptr);
		}

		/* Now persist the new values in pg_control */
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

		ControlFile->MaxConnections = MaxConnections;
		ControlFile->max_worker_processes = max_worker_processes;
		ControlFile->max_wal_senders = max_wal_senders;
		ControlFile->max_prepared_xacts = max_prepared_xacts;
		ControlFile->max_locks_per_xact = max_locks_per_xact;
		ControlFile->wal_level = wal_level;
		ControlFile->wal_log_hints = wal_log_hints;
		ControlFile->track_commit_timestamp = track_commit_timestamp;
		UpdateControlFile();

		LWLockRelease(ControlFileLock);
	}
}
10119 
10120 /*
10121  * Update full_page_writes in shared memory, and write an
10122  * XLOG_FPW_CHANGE record if necessary.
10123  *
10124  * Note: this function assumes there is no other process running
10125  * concurrently that could update it.
10126  */
void
UpdateFullPageWrites(void)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	bool		recoveryInProgress;

	/*
	 * Do nothing if full_page_writes has not been changed.
	 *
	 * It's safe to check the shared full_page_writes without the lock,
	 * because we assume that there is no concurrently running process which
	 * can update it.
	 */
	if (fullPageWrites == Insert->fullPageWrites)
		return;

	/*
	 * Perform this outside critical section so that the WAL insert
	 * initialization done by RecoveryInProgress() doesn't trigger an
	 * assertion failure.
	 */
	recoveryInProgress = RecoveryInProgress();

	START_CRIT_SECTION();

	/*
	 * It's always safe to take full page images, even when not strictly
	 * required, but not the other round. So if we're setting full_page_writes
	 * to true, first set it true and then write the WAL record. If we're
	 * setting it to false, first write the WAL record and then set the global
	 * flag.
	 */
	if (fullPageWrites)
	{
		/* enabling: set the shared flag BEFORE logging the change */
		WALInsertLockAcquireExclusive();
		Insert->fullPageWrites = true;
		WALInsertLockRelease();
	}

	/*
	 * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
	 * full_page_writes during archive recovery, if required.
	 */
	if (XLogStandbyInfoActive() && !recoveryInProgress)
	{
		XLogBeginInsert();
		XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));

		XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
	}

	if (!fullPageWrites)
	{
		/* disabling: clear the shared flag only AFTER logging the change */
		WALInsertLockAcquireExclusive();
		Insert->fullPageWrites = false;
		WALInsertLockRelease();
	}
	END_CRIT_SECTION();
}
10186 
10187 /*
10188  * Check that it's OK to switch to new timeline during recovery.
10189  *
10190  * 'lsn' is the address of the shutdown checkpoint record we're about to
10191  * replay. (Currently, timeline can only change at a shutdown checkpoint).
10192  */
static void
checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
{
	/* Check that the record agrees on what the current (old) timeline is */
	if (prevTLI != ThisTimeLineID)
		ereport(PANIC,
				(errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
						prevTLI, ThisTimeLineID)));

	/*
	 * The new timeline better be in the list of timelines we expect to see,
	 * according to the timeline history. It should also not decrease.
	 */
	if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
		ereport(PANIC,
				(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
						newTLI, ThisTimeLineID)));

	/*
	 * If we have not yet reached min recovery point, and we're about to
	 * switch to a timeline greater than the timeline of the min recovery
	 * point: trouble. After switching to the new timeline, we could not
	 * possibly visit the min recovery point on the correct timeline anymore.
	 * This can happen if there is a newer timeline in the archive that
	 * branched before the timeline the min recovery point is on, and you
	 * attempt to do PITR to the new timeline.
	 */
	if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
		lsn < minRecoveryPoint &&
		newTLI > minRecoveryPointTLI)
		ereport(PANIC,
				(errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
						newTLI,
						LSN_FORMAT_ARGS(minRecoveryPoint),
						minRecoveryPointTLI)));

	/* Looks good */
}
10231 
10232 /*
10233  * XLOG resource manager's routines
10234  *
10235  * Definitions of info values are in include/catalog/pg_control.h, though
10236  * not all record types are related to control file updates.
10237  */
10238 void
xlog_redo(XLogReaderState * record)10239 xlog_redo(XLogReaderState *record)
10240 {
10241 	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
10242 	XLogRecPtr	lsn = record->EndRecPtr;
10243 
10244 	/* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
10245 	Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
10246 		   !XLogRecHasAnyBlockRefs(record));
10247 
10248 	if (info == XLOG_NEXTOID)
10249 	{
10250 		Oid			nextOid;
10251 
10252 		/*
10253 		 * We used to try to take the maximum of ShmemVariableCache->nextOid
10254 		 * and the recorded nextOid, but that fails if the OID counter wraps
10255 		 * around.  Since no OID allocation should be happening during replay
10256 		 * anyway, better to just believe the record exactly.  We still take
10257 		 * OidGenLock while setting the variable, just in case.
10258 		 */
10259 		memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
10260 		LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
10261 		ShmemVariableCache->nextOid = nextOid;
10262 		ShmemVariableCache->oidCount = 0;
10263 		LWLockRelease(OidGenLock);
10264 	}
10265 	else if (info == XLOG_CHECKPOINT_SHUTDOWN)
10266 	{
10267 		CheckPoint	checkPoint;
10268 
10269 		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
10270 		/* In a SHUTDOWN checkpoint, believe the counters exactly */
10271 		LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
10272 		ShmemVariableCache->nextXid = checkPoint.nextXid;
10273 		LWLockRelease(XidGenLock);
10274 		LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
10275 		ShmemVariableCache->nextOid = checkPoint.nextOid;
10276 		ShmemVariableCache->oidCount = 0;
10277 		LWLockRelease(OidGenLock);
10278 		MultiXactSetNextMXact(checkPoint.nextMulti,
10279 							  checkPoint.nextMultiOffset);
10280 
10281 		MultiXactAdvanceOldest(checkPoint.oldestMulti,
10282 							   checkPoint.oldestMultiDB);
10283 
10284 		/*
10285 		 * No need to set oldestClogXid here as well; it'll be set when we
10286 		 * redo an xl_clog_truncate if it changed since initialization.
10287 		 */
10288 		SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
10289 
10290 		/*
10291 		 * If we see a shutdown checkpoint while waiting for an end-of-backup
10292 		 * record, the backup was canceled and the end-of-backup record will
10293 		 * never arrive.
10294 		 */
10295 		if (ArchiveRecoveryRequested &&
10296 			!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
10297 			XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
10298 			ereport(PANIC,
10299 					(errmsg("online backup was canceled, recovery cannot continue")));
10300 
10301 		/*
10302 		 * If we see a shutdown checkpoint, we know that nothing was running
10303 		 * on the primary at this point. So fake-up an empty running-xacts
10304 		 * record and use that here and now. Recover additional standby state
10305 		 * for prepared transactions.
10306 		 */
10307 		if (standbyState >= STANDBY_INITIALIZED)
10308 		{
10309 			TransactionId *xids;
10310 			int			nxids;
10311 			TransactionId oldestActiveXID;
10312 			TransactionId latestCompletedXid;
10313 			RunningTransactionsData running;
10314 
10315 			oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
10316 
10317 			/*
10318 			 * Construct a RunningTransactions snapshot representing a shut
10319 			 * down server, with only prepared transactions still alive. We're
10320 			 * never overflowed at this point because all subxids are listed
10321 			 * with their parent prepared transactions.
10322 			 */
10323 			running.xcnt = nxids;
10324 			running.subxcnt = 0;
10325 			running.subxid_overflow = false;
10326 			running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
10327 			running.oldestRunningXid = oldestActiveXID;
10328 			latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
10329 			TransactionIdRetreat(latestCompletedXid);
10330 			Assert(TransactionIdIsNormal(latestCompletedXid));
10331 			running.latestCompletedXid = latestCompletedXid;
10332 			running.xids = xids;
10333 
10334 			ProcArrayApplyRecoveryInfo(&running);
10335 
10336 			StandbyRecoverPreparedTransactions();
10337 		}
10338 
10339 		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
10340 		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10341 		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
10342 		LWLockRelease(ControlFileLock);
10343 
10344 		/* Update shared-memory copy of checkpoint XID/epoch */
10345 		SpinLockAcquire(&XLogCtl->info_lck);
10346 		XLogCtl->ckptFullXid = checkPoint.nextXid;
10347 		SpinLockRelease(&XLogCtl->info_lck);
10348 
10349 		/*
10350 		 * We should've already switched to the new TLI before replaying this
10351 		 * record.
10352 		 */
10353 		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
10354 			ereport(PANIC,
10355 					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
10356 							checkPoint.ThisTimeLineID, ThisTimeLineID)));
10357 
10358 		RecoveryRestartPoint(&checkPoint);
10359 	}
10360 	else if (info == XLOG_CHECKPOINT_ONLINE)
10361 	{
10362 		CheckPoint	checkPoint;
10363 
10364 		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
10365 		/* In an ONLINE checkpoint, treat the XID counter as a minimum */
10366 		LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
10367 		if (FullTransactionIdPrecedes(ShmemVariableCache->nextXid,
10368 									  checkPoint.nextXid))
10369 			ShmemVariableCache->nextXid = checkPoint.nextXid;
10370 		LWLockRelease(XidGenLock);
10371 
10372 		/*
10373 		 * We ignore the nextOid counter in an ONLINE checkpoint, preferring
10374 		 * to track OID assignment through XLOG_NEXTOID records.  The nextOid
10375 		 * counter is from the start of the checkpoint and might well be stale
10376 		 * compared to later XLOG_NEXTOID records.  We could try to take the
10377 		 * maximum of the nextOid counter and our latest value, but since
10378 		 * there's no particular guarantee about the speed with which the OID
10379 		 * counter wraps around, that's a risky thing to do.  In any case,
10380 		 * users of the nextOid counter are required to avoid assignment of
10381 		 * duplicates, so that a somewhat out-of-date value should be safe.
10382 		 */
10383 
10384 		/* Handle multixact */
10385 		MultiXactAdvanceNextMXact(checkPoint.nextMulti,
10386 								  checkPoint.nextMultiOffset);
10387 
10388 		/*
10389 		 * NB: This may perform multixact truncation when replaying WAL
10390 		 * generated by an older primary.
10391 		 */
10392 		MultiXactAdvanceOldest(checkPoint.oldestMulti,
10393 							   checkPoint.oldestMultiDB);
10394 		if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
10395 								  checkPoint.oldestXid))
10396 			SetTransactionIdLimit(checkPoint.oldestXid,
10397 								  checkPoint.oldestXidDB);
10398 		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
10399 		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10400 		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
10401 		LWLockRelease(ControlFileLock);
10402 
10403 		/* Update shared-memory copy of checkpoint XID/epoch */
10404 		SpinLockAcquire(&XLogCtl->info_lck);
10405 		XLogCtl->ckptFullXid = checkPoint.nextXid;
10406 		SpinLockRelease(&XLogCtl->info_lck);
10407 
10408 		/* TLI should not change in an on-line checkpoint */
10409 		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
10410 			ereport(PANIC,
10411 					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
10412 							checkPoint.ThisTimeLineID, ThisTimeLineID)));
10413 
10414 		RecoveryRestartPoint(&checkPoint);
10415 	}
10416 	else if (info == XLOG_OVERWRITE_CONTRECORD)
10417 	{
10418 		xl_overwrite_contrecord xlrec;
10419 
10420 		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
10421 		VerifyOverwriteContrecord(&xlrec, record);
10422 	}
10423 	else if (info == XLOG_END_OF_RECOVERY)
10424 	{
10425 		xl_end_of_recovery xlrec;
10426 
10427 		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
10428 
10429 		/*
10430 		 * For Hot Standby, we could treat this like a Shutdown Checkpoint,
10431 		 * but this case is rarer and harder to test, so the benefit doesn't
10432 		 * outweigh the potential extra cost of maintenance.
10433 		 */
10434 
10435 		/*
10436 		 * We should've already switched to the new TLI before replaying this
10437 		 * record.
10438 		 */
10439 		if (xlrec.ThisTimeLineID != ThisTimeLineID)
10440 			ereport(PANIC,
10441 					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
10442 							xlrec.ThisTimeLineID, ThisTimeLineID)));
10443 	}
10444 	else if (info == XLOG_NOOP)
10445 	{
10446 		/* nothing to do here */
10447 	}
10448 	else if (info == XLOG_SWITCH)
10449 	{
10450 		/* nothing to do here */
10451 	}
10452 	else if (info == XLOG_RESTORE_POINT)
10453 	{
10454 		/* nothing to do here */
10455 	}
10456 	else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
10457 	{
10458 		/*
10459 		 * Full-page image (FPI) records contain nothing else but a backup
10460 		 * block (or multiple backup blocks). Every block reference must
10461 		 * include a full-page image - otherwise there would be no point in
10462 		 * this record.
10463 		 *
10464 		 * No recovery conflicts are generated by these generic records - if a
10465 		 * resource manager needs to generate conflicts, it has to define a
10466 		 * separate WAL record type and redo routine.
10467 		 *
10468 		 * XLOG_FPI_FOR_HINT records are generated when a page needs to be
10469 		 * WAL- logged because of a hint bit update. They are only generated
10470 		 * when checksums are enabled. There is no difference in handling
10471 		 * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
10472 		 * code just to distinguish them for statistics purposes.
10473 		 */
10474 		for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++)
10475 		{
10476 			Buffer		buffer;
10477 
10478 			if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
10479 				elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
10480 			UnlockReleaseBuffer(buffer);
10481 		}
10482 	}
10483 	else if (info == XLOG_BACKUP_END)
10484 	{
10485 		XLogRecPtr	startpoint;
10486 
10487 		memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
10488 
10489 		if (ControlFile->backupStartPoint == startpoint)
10490 		{
10491 			/*
10492 			 * We have reached the end of base backup, the point where
10493 			 * pg_stop_backup() was done. The data on disk is now consistent.
10494 			 * Reset backupStartPoint, and update minRecoveryPoint to make
10495 			 * sure we don't allow starting up at an earlier point even if
10496 			 * recovery is stopped and restarted soon after this.
10497 			 */
10498 			elog(DEBUG1, "end of backup reached");
10499 
10500 			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10501 
10502 			if (ControlFile->minRecoveryPoint < lsn)
10503 			{
10504 				ControlFile->minRecoveryPoint = lsn;
10505 				ControlFile->minRecoveryPointTLI = ThisTimeLineID;
10506 			}
10507 			ControlFile->backupStartPoint = InvalidXLogRecPtr;
10508 			ControlFile->backupEndRequired = false;
10509 			UpdateControlFile();
10510 
10511 			LWLockRelease(ControlFileLock);
10512 		}
10513 	}
10514 	else if (info == XLOG_PARAMETER_CHANGE)
10515 	{
10516 		xl_parameter_change xlrec;
10517 
10518 		/* Update our copy of the parameters in pg_control */
10519 		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
10520 
10521 		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10522 		ControlFile->MaxConnections = xlrec.MaxConnections;
10523 		ControlFile->max_worker_processes = xlrec.max_worker_processes;
10524 		ControlFile->max_wal_senders = xlrec.max_wal_senders;
10525 		ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
10526 		ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
10527 		ControlFile->wal_level = xlrec.wal_level;
10528 		ControlFile->wal_log_hints = xlrec.wal_log_hints;
10529 
10530 		/*
10531 		 * Update minRecoveryPoint to ensure that if recovery is aborted, we
10532 		 * recover back up to this point before allowing hot standby again.
10533 		 * This is important if the max_* settings are decreased, to ensure
10534 		 * you don't run queries against the WAL preceding the change. The
10535 		 * local copies cannot be updated as long as crash recovery is
10536 		 * happening and we expect all the WAL to be replayed.
10537 		 */
10538 		if (InArchiveRecovery)
10539 		{
10540 			minRecoveryPoint = ControlFile->minRecoveryPoint;
10541 			minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
10542 		}
10543 		if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn)
10544 		{
10545 			ControlFile->minRecoveryPoint = lsn;
10546 			ControlFile->minRecoveryPointTLI = ThisTimeLineID;
10547 		}
10548 
10549 		CommitTsParameterChange(xlrec.track_commit_timestamp,
10550 								ControlFile->track_commit_timestamp);
10551 		ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
10552 
10553 		UpdateControlFile();
10554 		LWLockRelease(ControlFileLock);
10555 
10556 		/* Check to see if any parameter change gives a problem on recovery */
10557 		CheckRequiredParameterValues();
10558 	}
10559 	else if (info == XLOG_FPW_CHANGE)
10560 	{
10561 		bool		fpw;
10562 
10563 		memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
10564 
10565 		/*
10566 		 * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
10567 		 * do_pg_start_backup() and do_pg_stop_backup() can check whether
10568 		 * full_page_writes has been disabled during online backup.
10569 		 */
10570 		if (!fpw)
10571 		{
10572 			SpinLockAcquire(&XLogCtl->info_lck);
10573 			if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
10574 				XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
10575 			SpinLockRelease(&XLogCtl->info_lck);
10576 		}
10577 
10578 		/* Keep track of full_page_writes */
10579 		lastFullPageWrites = fpw;
10580 	}
10581 }
10582 
10583 /*
10584  * Verify the payload of a XLOG_OVERWRITE_CONTRECORD record.
10585  */
10586 static void
VerifyOverwriteContrecord(xl_overwrite_contrecord * xlrec,XLogReaderState * state)10587 VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, XLogReaderState *state)
10588 {
10589 	if (xlrec->overwritten_lsn != state->overwrittenRecPtr)
10590 		elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
10591 			 LSN_FORMAT_ARGS(xlrec->overwritten_lsn),
10592 			 LSN_FORMAT_ARGS(state->overwrittenRecPtr));
10593 
10594 	ereport(LOG,
10595 			(errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
10596 					LSN_FORMAT_ARGS(xlrec->overwritten_lsn),
10597 					timestamptz_to_str(xlrec->overwrite_time))));
10598 
10599 	/* Verifying the record should only happen once */
10600 	state->overwrittenRecPtr = InvalidXLogRecPtr;
10601 }
10602 
10603 #ifdef WAL_DEBUG
10604 
10605 static void
xlog_outrec(StringInfo buf,XLogReaderState * record)10606 xlog_outrec(StringInfo buf, XLogReaderState *record)
10607 {
10608 	appendStringInfo(buf, "prev %X/%X; xid %u",
10609 					 LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
10610 					 XLogRecGetXid(record));
10611 
10612 	appendStringInfo(buf, "; len %u",
10613 					 XLogRecGetDataLen(record));
10614 
10615 	xlog_block_info(buf, record);
10616 }
10617 #endif							/* WAL_DEBUG */
10618 
10619 /*
10620  * Returns a string giving information about all the blocks in an
10621  * XLogRecord.
10622  */
10623 static void
xlog_block_info(StringInfo buf,XLogReaderState * record)10624 xlog_block_info(StringInfo buf, XLogReaderState *record)
10625 {
10626 	int			block_id;
10627 
10628 	/* decode block references */
10629 	for (block_id = 0; block_id <= record->max_block_id; block_id++)
10630 	{
10631 		RelFileNode rnode;
10632 		ForkNumber	forknum;
10633 		BlockNumber blk;
10634 
10635 		if (!XLogRecHasBlockRef(record, block_id))
10636 			continue;
10637 
10638 		XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
10639 		if (forknum != MAIN_FORKNUM)
10640 			appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
10641 							 block_id,
10642 							 rnode.spcNode, rnode.dbNode, rnode.relNode,
10643 							 forknum,
10644 							 blk);
10645 		else
10646 			appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
10647 							 block_id,
10648 							 rnode.spcNode, rnode.dbNode, rnode.relNode,
10649 							 blk);
10650 		if (XLogRecHasBlockImage(record, block_id))
10651 			appendStringInfoString(buf, " FPW");
10652 	}
10653 }
10654 
10655 /*
10656  * Returns a string describing an XLogRecord, consisting of its identity
10657  * optionally followed by a colon, a space, and a further description.
10658  */
10659 static void
xlog_outdesc(StringInfo buf,XLogReaderState * record)10660 xlog_outdesc(StringInfo buf, XLogReaderState *record)
10661 {
10662 	RmgrId		rmid = XLogRecGetRmid(record);
10663 	uint8		info = XLogRecGetInfo(record);
10664 	const char *id;
10665 
10666 	appendStringInfoString(buf, RmgrTable[rmid].rm_name);
10667 	appendStringInfoChar(buf, '/');
10668 
10669 	id = RmgrTable[rmid].rm_identify(info);
10670 	if (id == NULL)
10671 		appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
10672 	else
10673 		appendStringInfo(buf, "%s: ", id);
10674 
10675 	RmgrTable[rmid].rm_desc(buf, record);
10676 }
10677 
10678 
10679 /*
10680  * Return the (possible) sync flag used for opening a file, depending on the
10681  * value of the GUC wal_sync_method.
10682  */
10683 static int
get_sync_bit(int method)10684 get_sync_bit(int method)
10685 {
10686 	int			o_direct_flag = 0;
10687 
10688 	/* If fsync is disabled, never open in sync mode */
10689 	if (!enableFsync)
10690 		return 0;
10691 
10692 	/*
10693 	 * Optimize writes by bypassing kernel cache with O_DIRECT when using
10694 	 * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
10695 	 * disabled, otherwise the archive command or walsender process will read
10696 	 * the WAL soon after writing it, which is guaranteed to cause a physical
10697 	 * read if we bypassed the kernel cache. We also skip the
10698 	 * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
10699 	 * reason.
10700 	 *
10701 	 * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
10702 	 * written by walreceiver is normally read by the startup process soon
10703 	 * after it's written. Also, walreceiver performs unaligned writes, which
10704 	 * don't work with O_DIRECT, so it is required for correctness too.
10705 	 */
10706 	if (!XLogIsNeeded() && !AmWalReceiverProcess())
10707 		o_direct_flag = PG_O_DIRECT;
10708 
10709 	switch (method)
10710 	{
10711 			/*
10712 			 * enum values for all sync options are defined even if they are
10713 			 * not supported on the current platform.  But if not, they are
10714 			 * not included in the enum option array, and therefore will never
10715 			 * be seen here.
10716 			 */
10717 		case SYNC_METHOD_FSYNC:
10718 		case SYNC_METHOD_FSYNC_WRITETHROUGH:
10719 		case SYNC_METHOD_FDATASYNC:
10720 			return 0;
10721 #ifdef OPEN_SYNC_FLAG
10722 		case SYNC_METHOD_OPEN:
10723 			return OPEN_SYNC_FLAG | o_direct_flag;
10724 #endif
10725 #ifdef OPEN_DATASYNC_FLAG
10726 		case SYNC_METHOD_OPEN_DSYNC:
10727 			return OPEN_DATASYNC_FLAG | o_direct_flag;
10728 #endif
10729 		default:
10730 			/* can't happen (unless we are out of sync with option array) */
10731 			elog(ERROR, "unrecognized wal_sync_method: %d", method);
10732 			return 0;			/* silence warning */
10733 	}
10734 }
10735 
10736 /*
10737  * GUC support
10738  */
10739 void
assign_xlog_sync_method(int new_sync_method,void * extra)10740 assign_xlog_sync_method(int new_sync_method, void *extra)
10741 {
10742 	if (sync_method != new_sync_method)
10743 	{
10744 		/*
10745 		 * To ensure that no blocks escape unsynced, force an fsync on the
10746 		 * currently open log segment (if any).  Also, if the open flag is
10747 		 * changing, close the log file so it will be reopened (with new flag
10748 		 * bit) at next use.
10749 		 */
10750 		if (openLogFile >= 0)
10751 		{
10752 			pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
10753 			if (pg_fsync(openLogFile) != 0)
10754 			{
10755 				char		xlogfname[MAXFNAMELEN];
10756 				int			save_errno;
10757 
10758 				save_errno = errno;
10759 				XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo,
10760 							 wal_segment_size);
10761 				errno = save_errno;
10762 				ereport(PANIC,
10763 						(errcode_for_file_access(),
10764 						 errmsg("could not fsync file \"%s\": %m", xlogfname)));
10765 			}
10766 
10767 			pgstat_report_wait_end();
10768 			if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
10769 				XLogFileClose();
10770 		}
10771 	}
10772 }
10773 
10774 
10775 /*
10776  * Issue appropriate kind of fsync (if any) for an XLOG output file.
10777  *
10778  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
10779  * 'segno' is for error reporting purposes.
10780  */
10781 void
issue_xlog_fsync(int fd,XLogSegNo segno)10782 issue_xlog_fsync(int fd, XLogSegNo segno)
10783 {
10784 	char	   *msg = NULL;
10785 	instr_time	start;
10786 
10787 	/*
10788 	 * Quick exit if fsync is disabled or write() has already synced the WAL
10789 	 * file.
10790 	 */
10791 	if (!enableFsync ||
10792 		sync_method == SYNC_METHOD_OPEN ||
10793 		sync_method == SYNC_METHOD_OPEN_DSYNC)
10794 		return;
10795 
10796 	/* Measure I/O timing to sync the WAL file */
10797 	if (track_wal_io_timing)
10798 		INSTR_TIME_SET_CURRENT(start);
10799 
10800 	pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
10801 	switch (sync_method)
10802 	{
10803 		case SYNC_METHOD_FSYNC:
10804 			if (pg_fsync_no_writethrough(fd) != 0)
10805 				msg = _("could not fsync file \"%s\": %m");
10806 			break;
10807 #ifdef HAVE_FSYNC_WRITETHROUGH
10808 		case SYNC_METHOD_FSYNC_WRITETHROUGH:
10809 			if (pg_fsync_writethrough(fd) != 0)
10810 				msg = _("could not fsync write-through file \"%s\": %m");
10811 			break;
10812 #endif
10813 #ifdef HAVE_FDATASYNC
10814 		case SYNC_METHOD_FDATASYNC:
10815 			if (pg_fdatasync(fd) != 0)
10816 				msg = _("could not fdatasync file \"%s\": %m");
10817 			break;
10818 #endif
10819 		case SYNC_METHOD_OPEN:
10820 		case SYNC_METHOD_OPEN_DSYNC:
10821 			/* not reachable */
10822 			Assert(false);
10823 			break;
10824 		default:
10825 			elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
10826 			break;
10827 	}
10828 
10829 	/* PANIC if failed to fsync */
10830 	if (msg)
10831 	{
10832 		char		xlogfname[MAXFNAMELEN];
10833 		int			save_errno = errno;
10834 
10835 		XLogFileName(xlogfname, ThisTimeLineID, segno,
10836 					 wal_segment_size);
10837 		errno = save_errno;
10838 		ereport(PANIC,
10839 				(errcode_for_file_access(),
10840 				 errmsg(msg, xlogfname)));
10841 	}
10842 
10843 	pgstat_report_wait_end();
10844 
10845 	/*
10846 	 * Increment the I/O timing and the number of times WAL files were synced.
10847 	 */
10848 	if (track_wal_io_timing)
10849 	{
10850 		instr_time	duration;
10851 
10852 		INSTR_TIME_SET_CURRENT(duration);
10853 		INSTR_TIME_SUBTRACT(duration, start);
10854 		WalStats.m_wal_sync_time += INSTR_TIME_GET_MICROSEC(duration);
10855 	}
10856 
10857 	WalStats.m_wal_sync++;
10858 }
10859 
10860 /*
10861  * do_pg_start_backup
10862  *
10863  * Utility function called at the start of an online backup. It creates the
10864  * necessary starting checkpoint and constructs the backup label file.
10865  *
10866  * There are two kind of backups: exclusive and non-exclusive. An exclusive
10867  * backup is started with pg_start_backup(), and there can be only one active
10868  * at a time. The backup and tablespace map files of an exclusive backup are
10869  * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
10870  * removed by pg_stop_backup().
10871  *
10872  * A non-exclusive backup is used for the streaming base backups (see
10873  * src/backend/replication/basebackup.c). The difference to exclusive backups
10874  * is that the backup label and tablespace map files are not written to disk.
10875  * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
10876  * and the caller is responsible for including them in the backup archive as
10877  * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
10878  * active at the same time, and they don't conflict with an exclusive backup
10879  * either.
10880  *
10881  * labelfile and tblspcmapfile must be passed as NULL when starting an
10882  * exclusive backup, and as initially-empty StringInfos for a non-exclusive
10883  * backup.
10884  *
10885  * If "tablespaces" isn't NULL, it receives a list of tablespaceinfo structs
10886  * describing the cluster's tablespaces.
10887  *
10888  * tblspcmapfile is required mainly for tar format in windows as native windows
10889  * utilities are not able to create symlinks while extracting files from tar.
10890  * However for consistency, the same is used for all platforms.
10891  *
10892  * Returns the minimum WAL location that must be present to restore from this
10893  * backup, and the corresponding timeline ID in *starttli_p.
10894  *
10895  * Every successfully started non-exclusive backup must be stopped by calling
10896  * do_pg_stop_backup() or do_pg_abort_backup().
10897  *
10898  * It is the responsibility of the caller of this function to verify the
10899  * permissions of the calling user!
10900  */
10901 XLogRecPtr
do_pg_start_backup(const char * backupidstr,bool fast,TimeLineID * starttli_p,StringInfo labelfile,List ** tablespaces,StringInfo tblspcmapfile)10902 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
10903 				   StringInfo labelfile, List **tablespaces,
10904 				   StringInfo tblspcmapfile)
10905 {
10906 	bool		exclusive = (labelfile == NULL);
10907 	bool		backup_started_in_recovery = false;
10908 	XLogRecPtr	checkpointloc;
10909 	XLogRecPtr	startpoint;
10910 	TimeLineID	starttli;
10911 	pg_time_t	stamp_time;
10912 	char		strfbuf[128];
10913 	char		xlogfilename[MAXFNAMELEN];
10914 	XLogSegNo	_logSegNo;
10915 	struct stat stat_buf;
10916 	FILE	   *fp;
10917 
10918 	backup_started_in_recovery = RecoveryInProgress();
10919 
10920 	/*
10921 	 * Currently only non-exclusive backup can be taken during recovery.
10922 	 */
10923 	if (backup_started_in_recovery && exclusive)
10924 		ereport(ERROR,
10925 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10926 				 errmsg("recovery is in progress"),
10927 				 errhint("WAL control functions cannot be executed during recovery.")));
10928 
10929 	/*
10930 	 * During recovery, we don't need to check WAL level. Because, if WAL
10931 	 * level is not sufficient, it's impossible to get here during recovery.
10932 	 */
10933 	if (!backup_started_in_recovery && !XLogIsNeeded())
10934 		ereport(ERROR,
10935 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10936 				 errmsg("WAL level not sufficient for making an online backup"),
10937 				 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10938 
10939 	if (strlen(backupidstr) > MAXPGPATH)
10940 		ereport(ERROR,
10941 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
10942 				 errmsg("backup label too long (max %d bytes)",
10943 						MAXPGPATH)));
10944 
10945 	/*
10946 	 * Mark backup active in shared memory.  We must do full-page WAL writes
10947 	 * during an on-line backup even if not doing so at other times, because
10948 	 * it's quite possible for the backup dump to obtain a "torn" (partially
10949 	 * written) copy of a database page if it reads the page concurrently with
10950 	 * our write to the same page.  This can be fixed as long as the first
10951 	 * write to the page in the WAL sequence is a full-page write. Hence, we
10952 	 * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
10953 	 * are no dirty pages in shared memory that might get dumped while the
10954 	 * backup is in progress without having a corresponding WAL record.  (Once
10955 	 * the backup is complete, we need not force full-page writes anymore,
10956 	 * since we expect that any pages not modified during the backup interval
10957 	 * must have been correctly captured by the backup.)
10958 	 *
10959 	 * Note that forcePageWrites has no effect during an online backup from
10960 	 * the standby.
10961 	 *
10962 	 * We must hold all the insertion locks to change the value of
10963 	 * forcePageWrites, to ensure adequate interlocking against
10964 	 * XLogInsertRecord().
10965 	 */
10966 	WALInsertLockAcquireExclusive();
10967 	if (exclusive)
10968 	{
10969 		/*
10970 		 * At first, mark that we're now starting an exclusive backup, to
10971 		 * ensure that there are no other sessions currently running
10972 		 * pg_start_backup() or pg_stop_backup().
10973 		 */
10974 		if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
10975 		{
10976 			WALInsertLockRelease();
10977 			ereport(ERROR,
10978 					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10979 					 errmsg("a backup is already in progress"),
10980 					 errhint("Run pg_stop_backup() and try again.")));
10981 		}
10982 		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
10983 	}
10984 	else
10985 		XLogCtl->Insert.nonExclusiveBackups++;
10986 	XLogCtl->Insert.forcePageWrites = true;
10987 	WALInsertLockRelease();
10988 
10989 	/* Ensure we release forcePageWrites if fail below */
10990 	PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10991 	{
10992 		bool		gotUniqueStartpoint = false;
10993 		DIR		   *tblspcdir;
10994 		struct dirent *de;
10995 		tablespaceinfo *ti;
10996 		int			datadirpathlen;
10997 
10998 		/*
10999 		 * Force an XLOG file switch before the checkpoint, to ensure that the
11000 		 * WAL segment the checkpoint is written to doesn't contain pages with
11001 		 * old timeline IDs.  That would otherwise happen if you called
11002 		 * pg_start_backup() right after restoring from a PITR archive: the
11003 		 * first WAL segment containing the startup checkpoint has pages in
11004 		 * the beginning with the old timeline ID.  That can cause trouble at
11005 		 * recovery: we won't have a history file covering the old timeline if
11006 		 * pg_wal directory was not included in the base backup and the WAL
11007 		 * archive was cleared too before starting the backup.
11008 		 *
11009 		 * This also ensures that we have emitted a WAL page header that has
11010 		 * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
11011 		 * Therefore, if a WAL archiver (such as pglesslog) is trying to
11012 		 * compress out removable backup blocks, it won't remove any that
11013 		 * occur after this point.
11014 		 *
11015 		 * During recovery, we skip forcing XLOG file switch, which means that
11016 		 * the backup taken during recovery is not available for the special
11017 		 * recovery case described above.
11018 		 */
11019 		if (!backup_started_in_recovery)
11020 			RequestXLogSwitch(false);
11021 
11022 		do
11023 		{
11024 			bool		checkpointfpw;
11025 
11026 			/*
11027 			 * Force a CHECKPOINT.  Aside from being necessary to prevent torn
11028 			 * page problems, this guarantees that two successive backup runs
11029 			 * will have different checkpoint positions and hence different
11030 			 * history file names, even if nothing happened in between.
11031 			 *
11032 			 * During recovery, establish a restartpoint if possible. We use
11033 			 * the last restartpoint as the backup starting checkpoint. This
11034 			 * means that two successive backup runs can have same checkpoint
11035 			 * positions.
11036 			 *
11037 			 * Since the fact that we are executing do_pg_start_backup()
11038 			 * during recovery means that checkpointer is running, we can use
11039 			 * RequestCheckpoint() to establish a restartpoint.
11040 			 *
11041 			 * We use CHECKPOINT_IMMEDIATE only if requested by user (via
11042 			 * passing fast = true).  Otherwise this can take awhile.
11043 			 */
11044 			RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
11045 							  (fast ? CHECKPOINT_IMMEDIATE : 0));
11046 
11047 			/*
11048 			 * Now we need to fetch the checkpoint record location, and also
11049 			 * its REDO pointer.  The oldest point in WAL that would be needed
11050 			 * to restore starting from the checkpoint is precisely the REDO
11051 			 * pointer.
11052 			 */
11053 			LWLockAcquire(ControlFileLock, LW_SHARED);
11054 			checkpointloc = ControlFile->checkPoint;
11055 			startpoint = ControlFile->checkPointCopy.redo;
11056 			starttli = ControlFile->checkPointCopy.ThisTimeLineID;
11057 			checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
11058 			LWLockRelease(ControlFileLock);
11059 
11060 			if (backup_started_in_recovery)
11061 			{
11062 				XLogRecPtr	recptr;
11063 
11064 				/*
11065 				 * Check to see if all WAL replayed during online backup
11066 				 * (i.e., since last restartpoint used as backup starting
11067 				 * checkpoint) contain full-page writes.
11068 				 */
11069 				SpinLockAcquire(&XLogCtl->info_lck);
11070 				recptr = XLogCtl->lastFpwDisableRecPtr;
11071 				SpinLockRelease(&XLogCtl->info_lck);
11072 
11073 				if (!checkpointfpw || startpoint <= recptr)
11074 					ereport(ERROR,
11075 							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11076 							 errmsg("WAL generated with full_page_writes=off was replayed "
11077 									"since last restartpoint"),
11078 							 errhint("This means that the backup being taken on the standby "
11079 									 "is corrupt and should not be used. "
11080 									 "Enable full_page_writes and run CHECKPOINT on the primary, "
11081 									 "and then try an online backup again.")));
11082 
11083 				/*
11084 				 * During recovery, since we don't use the end-of-backup WAL
11085 				 * record and don't write the backup history file, the
11086 				 * starting WAL location doesn't need to be unique. This means
11087 				 * that two base backups started at the same time might use
11088 				 * the same checkpoint as starting locations.
11089 				 */
11090 				gotUniqueStartpoint = true;
11091 			}
11092 
11093 			/*
11094 			 * If two base backups are started at the same time (in WAL sender
11095 			 * processes), we need to make sure that they use different
11096 			 * checkpoints as starting locations, because we use the starting
11097 			 * WAL location as a unique identifier for the base backup in the
11098 			 * end-of-backup WAL record and when we write the backup history
11099 			 * file. Perhaps it would be better generate a separate unique ID
11100 			 * for each backup instead of forcing another checkpoint, but
11101 			 * taking a checkpoint right after another is not that expensive
11102 			 * either because only few buffers have been dirtied yet.
11103 			 */
11104 			WALInsertLockAcquireExclusive();
11105 			if (XLogCtl->Insert.lastBackupStart < startpoint)
11106 			{
11107 				XLogCtl->Insert.lastBackupStart = startpoint;
11108 				gotUniqueStartpoint = true;
11109 			}
11110 			WALInsertLockRelease();
11111 		} while (!gotUniqueStartpoint);
11112 
11113 		XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
11114 		XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size);
11115 
11116 		/*
11117 		 * Construct tablespace_map file.  If caller isn't interested in this,
11118 		 * we make a local StringInfo.
11119 		 */
11120 		if (tblspcmapfile == NULL)
11121 			tblspcmapfile = makeStringInfo();
11122 
11123 		datadirpathlen = strlen(DataDir);
11124 
11125 		/* Collect information about all tablespaces */
11126 		tblspcdir = AllocateDir("pg_tblspc");
11127 		while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
11128 		{
11129 			char		fullpath[MAXPGPATH + 10];
11130 			char		linkpath[MAXPGPATH];
11131 			char	   *relpath = NULL;
11132 			int			rllen;
11133 			StringInfoData escapedpath;
11134 			char	   *s;
11135 
11136 			/* Skip anything that doesn't look like a tablespace */
11137 			if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
11138 				continue;
11139 
11140 			snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
11141 
11142 #if defined(HAVE_READLINK) || defined(WIN32)
11143 			rllen = readlink(fullpath, linkpath, sizeof(linkpath));
11144 			if (rllen < 0)
11145 			{
11146 				ereport(WARNING,
11147 						(errmsg("could not read symbolic link \"%s\": %m",
11148 								fullpath)));
11149 				continue;
11150 			}
11151 			else if (rllen >= sizeof(linkpath))
11152 			{
11153 				ereport(WARNING,
11154 						(errmsg("symbolic link \"%s\" target is too long",
11155 								fullpath)));
11156 				continue;
11157 			}
11158 			linkpath[rllen] = '\0';
11159 
11160 			/*
11161 			 * Build a backslash-escaped version of the link path to include
11162 			 * in the tablespace map file.
11163 			 */
11164 			initStringInfo(&escapedpath);
11165 			for (s = linkpath; *s; s++)
11166 			{
11167 				if (*s == '\n' || *s == '\r' || *s == '\\')
11168 					appendStringInfoChar(&escapedpath, '\\');
11169 				appendStringInfoChar(&escapedpath, *s);
11170 			}
11171 
11172 			/*
11173 			 * Relpath holds the relative path of the tablespace directory
11174 			 * when it's located within PGDATA, or NULL if it's located
11175 			 * elsewhere.
11176 			 */
11177 			if (rllen > datadirpathlen &&
11178 				strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
11179 				IS_DIR_SEP(linkpath[datadirpathlen]))
11180 				relpath = linkpath + datadirpathlen + 1;
11181 
11182 			ti = palloc(sizeof(tablespaceinfo));
11183 			ti->oid = pstrdup(de->d_name);
11184 			ti->path = pstrdup(linkpath);
11185 			ti->rpath = relpath ? pstrdup(relpath) : NULL;
11186 			ti->size = -1;
11187 
11188 			if (tablespaces)
11189 				*tablespaces = lappend(*tablespaces, ti);
11190 
11191 			appendStringInfo(tblspcmapfile, "%s %s\n",
11192 							 ti->oid, escapedpath.data);
11193 
11194 			pfree(escapedpath.data);
11195 #else
11196 
11197 			/*
11198 			 * If the platform does not have symbolic links, it should not be
11199 			 * possible to have tablespaces - clearly somebody else created
11200 			 * them. Warn about it and ignore.
11201 			 */
11202 			ereport(WARNING,
11203 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
11204 					 errmsg("tablespaces are not supported on this platform")));
11205 #endif
11206 		}
11207 		FreeDir(tblspcdir);
11208 
11209 		/*
11210 		 * Construct backup label file.  If caller isn't interested in this,
11211 		 * we make a local StringInfo.
11212 		 */
11213 		if (labelfile == NULL)
11214 			labelfile = makeStringInfo();
11215 
11216 		/* Use the log timezone here, not the session timezone */
11217 		stamp_time = (pg_time_t) time(NULL);
11218 		pg_strftime(strfbuf, sizeof(strfbuf),
11219 					"%Y-%m-%d %H:%M:%S %Z",
11220 					pg_localtime(&stamp_time, log_timezone));
11221 		appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
11222 						 LSN_FORMAT_ARGS(startpoint), xlogfilename);
11223 		appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
11224 						 LSN_FORMAT_ARGS(checkpointloc));
11225 		appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
11226 						 exclusive ? "pg_start_backup" : "streamed");
11227 		appendStringInfo(labelfile, "BACKUP FROM: %s\n",
11228 						 backup_started_in_recovery ? "standby" : "primary");
11229 		appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
11230 		appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
11231 		appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli);
11232 
11233 		/*
11234 		 * Okay, write the file, or return its contents to caller.
11235 		 */
11236 		if (exclusive)
11237 		{
11238 			/*
11239 			 * Check for existing backup label --- implies a backup is already
11240 			 * running.  (XXX given that we checked exclusiveBackupState
11241 			 * above, maybe it would be OK to just unlink any such label
11242 			 * file?)
11243 			 */
11244 			if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
11245 			{
11246 				if (errno != ENOENT)
11247 					ereport(ERROR,
11248 							(errcode_for_file_access(),
11249 							 errmsg("could not stat file \"%s\": %m",
11250 									BACKUP_LABEL_FILE)));
11251 			}
11252 			else
11253 				ereport(ERROR,
11254 						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11255 						 errmsg("a backup is already in progress"),
11256 						 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
11257 								 BACKUP_LABEL_FILE)));
11258 
11259 			fp = AllocateFile(BACKUP_LABEL_FILE, "w");
11260 
11261 			if (!fp)
11262 				ereport(ERROR,
11263 						(errcode_for_file_access(),
11264 						 errmsg("could not create file \"%s\": %m",
11265 								BACKUP_LABEL_FILE)));
11266 			if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 ||
11267 				fflush(fp) != 0 ||
11268 				pg_fsync(fileno(fp)) != 0 ||
11269 				ferror(fp) ||
11270 				FreeFile(fp))
11271 				ereport(ERROR,
11272 						(errcode_for_file_access(),
11273 						 errmsg("could not write file \"%s\": %m",
11274 								BACKUP_LABEL_FILE)));
11275 			/* Allocated locally for exclusive backups, so free separately */
11276 			pfree(labelfile->data);
11277 			pfree(labelfile);
11278 
11279 			/* Write backup tablespace_map file. */
11280 			if (tblspcmapfile->len > 0)
11281 			{
11282 				if (stat(TABLESPACE_MAP, &stat_buf) != 0)
11283 				{
11284 					if (errno != ENOENT)
11285 						ereport(ERROR,
11286 								(errcode_for_file_access(),
11287 								 errmsg("could not stat file \"%s\": %m",
11288 										TABLESPACE_MAP)));
11289 				}
11290 				else
11291 					ereport(ERROR,
11292 							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11293 							 errmsg("a backup is already in progress"),
11294 							 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
11295 									 TABLESPACE_MAP)));
11296 
11297 				fp = AllocateFile(TABLESPACE_MAP, "w");
11298 
11299 				if (!fp)
11300 					ereport(ERROR,
11301 							(errcode_for_file_access(),
11302 							 errmsg("could not create file \"%s\": %m",
11303 									TABLESPACE_MAP)));
11304 				if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 ||
11305 					fflush(fp) != 0 ||
11306 					pg_fsync(fileno(fp)) != 0 ||
11307 					ferror(fp) ||
11308 					FreeFile(fp))
11309 					ereport(ERROR,
11310 							(errcode_for_file_access(),
11311 							 errmsg("could not write file \"%s\": %m",
11312 									TABLESPACE_MAP)));
11313 			}
11314 
11315 			/* Allocated locally for exclusive backups, so free separately */
11316 			pfree(tblspcmapfile->data);
11317 			pfree(tblspcmapfile);
11318 		}
11319 	}
11320 	PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
11321 
11322 	/*
11323 	 * Mark that start phase has correctly finished for an exclusive backup.
11324 	 * Session-level locks are updated as well to reflect that state.
11325 	 *
11326 	 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating backup
11327 	 * counters and session-level lock. Otherwise they can be updated
11328 	 * inconsistently, and which might cause do_pg_abort_backup() to fail.
11329 	 */
11330 	if (exclusive)
11331 	{
11332 		WALInsertLockAcquireExclusive();
11333 		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
11334 
11335 		/* Set session-level lock */
11336 		sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
11337 		WALInsertLockRelease();
11338 	}
11339 	else
11340 		sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;
11341 
11342 	/*
11343 	 * We're done.  As a convenience, return the starting WAL location.
11344 	 */
11345 	if (starttli_p)
11346 		*starttli_p = starttli;
11347 	return startpoint;
11348 }
11349 
11350 /* Error cleanup callback for pg_start_backup */
11351 static void
pg_start_backup_callback(int code,Datum arg)11352 pg_start_backup_callback(int code, Datum arg)
11353 {
11354 	bool		exclusive = DatumGetBool(arg);
11355 
11356 	/* Update backup counters and forcePageWrites on failure */
11357 	WALInsertLockAcquireExclusive();
11358 	if (exclusive)
11359 	{
11360 		Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
11361 		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
11362 	}
11363 	else
11364 	{
11365 		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11366 		XLogCtl->Insert.nonExclusiveBackups--;
11367 	}
11368 
11369 	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11370 		XLogCtl->Insert.nonExclusiveBackups == 0)
11371 	{
11372 		XLogCtl->Insert.forcePageWrites = false;
11373 	}
11374 	WALInsertLockRelease();
11375 }
11376 
11377 /*
11378  * Error cleanup callback for pg_stop_backup
11379  */
11380 static void
pg_stop_backup_callback(int code,Datum arg)11381 pg_stop_backup_callback(int code, Datum arg)
11382 {
11383 	bool		exclusive = DatumGetBool(arg);
11384 
11385 	/* Update backup status on failure */
11386 	WALInsertLockAcquireExclusive();
11387 	if (exclusive)
11388 	{
11389 		Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
11390 		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
11391 	}
11392 	WALInsertLockRelease();
11393 }
11394 
/*
 * Utility routine to fetch the session-level status of a backup running.
 *
 * Returns the backend-local state flag: SESSION_BACKUP_NONE when this
 * session has no backup in progress, otherwise the exclusive/non-exclusive
 * mode recorded by do_pg_start_backup().
 */
SessionBackupState
get_backup_status(void)
{
	return sessionBackupState;
}
11403 
/*
 * do_pg_stop_backup
 *
 * Utility function called at the end of an online backup. It cleans up the
 * backup state and can optionally wait for WAL segments to be archived.
 *
 * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
 * the non-exclusive backup specified by 'labelfile'.
 *
 * Returns the last WAL location that must be present to restore from this
 * backup, and the corresponding timeline ID in *stoptli_p.
 *
 * It is the responsibility of the caller of this function to verify the
 * permissions of the calling user!
 */
XLogRecPtr
do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
{
	bool		exclusive = (labelfile == NULL);
	bool		backup_started_in_recovery = false;
	XLogRecPtr	startpoint;
	XLogRecPtr	stoppoint;
	TimeLineID	stoptli;
	pg_time_t	stamp_time;
	char		strfbuf[128];
	char		histfilepath[MAXPGPATH];
	char		startxlogfilename[MAXFNAMELEN];
	char		stopxlogfilename[MAXFNAMELEN];
	char		lastxlogfilename[MAXFNAMELEN];
	char		histfilename[MAXFNAMELEN];
	char		backupfrom[20];
	XLogSegNo	_logSegNo;
	FILE	   *lfp;			/* backup_label file, exclusive case only */
	FILE	   *fp;				/* backup history file */
	char		ch;
	int			seconds_before_warning;
	int			waits = 0;
	bool		reported_waiting = false;
	char	   *remaining;		/* label-file text after the first line */
	char	   *ptr;
	uint32		hi,
				lo;

	backup_started_in_recovery = RecoveryInProgress();

	/*
	 * Currently only non-exclusive backup can be taken during recovery.
	 */
	if (backup_started_in_recovery && exclusive)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

	/*
	 * During recovery, we don't need to check WAL level. Because, if WAL
	 * level is not sufficient, it's impossible to get here during recovery.
	 */
	if (!backup_started_in_recovery && !XLogIsNeeded())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("WAL level not sufficient for making an online backup"),
				 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));

	if (exclusive)
	{
		/*
		 * At first, mark that we're now stopping an exclusive backup, to
		 * ensure that there are no other sessions currently running
		 * pg_start_backup() or pg_stop_backup().
		 */
		WALInsertLockAcquireExclusive();
		if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
		{
			WALInsertLockRelease();
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("exclusive backup not in progress")));
		}
		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
		WALInsertLockRelease();

		/*
		 * Remove backup_label. In case of failure, the state for an exclusive
		 * backup is switched back to in-progress.
		 */
		PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
		{
			/*
			 * Read the existing label file into memory.  This makes the
			 * exclusive case look like the non-exclusive one: from here on,
			 * 'labelfile' points at a palloc'd copy of the label contents,
			 * so the parsing code below works for both.
			 */
			struct stat statbuf;
			int			r;

			if (stat(BACKUP_LABEL_FILE, &statbuf))
			{
				/* should not happen per the upper checks */
				if (errno != ENOENT)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not stat file \"%s\": %m",
									BACKUP_LABEL_FILE)));
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("a backup is not in progress")));
			}

			lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
			if (!lfp)
			{
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			}
			labelfile = palloc(statbuf.st_size + 1);
			r = fread(labelfile, statbuf.st_size, 1, lfp);
			labelfile[statbuf.st_size] = '\0';

			/*
			 * Close and remove the backup label file
			 */
			if (r != 1 || ferror(lfp) || FreeFile(lfp))
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			durable_unlink(BACKUP_LABEL_FILE, ERROR);

			/*
			 * Remove tablespace_map file if present, it is created only if
			 * there are tablespaces.
			 */
			durable_unlink(TABLESPACE_MAP, DEBUG1);
		}
		PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
	}

	/*
	 * OK to update backup counters, forcePageWrites and session-level lock.
	 *
	 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
	 * Otherwise they can be updated inconsistently, and which might cause
	 * do_pg_abort_backup() to fail.
	 */
	WALInsertLockAcquireExclusive();
	if (exclusive)
	{
		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
	}
	else
	{
		/*
		 * The user-visible pg_start/stop_backup() functions that operate on
		 * exclusive backups can be called at any time, but for non-exclusive
		 * backups, it is expected that each do_pg_start_backup() call is
		 * matched by exactly one do_pg_stop_backup() call.
		 */
		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
		XLogCtl->Insert.nonExclusiveBackups--;
	}

	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
		XLogCtl->Insert.nonExclusiveBackups == 0)
	{
		XLogCtl->Insert.forcePageWrites = false;
	}

	/*
	 * Clean up session-level lock.
	 *
	 * You might think that WALInsertLockRelease() can be called before
	 * cleaning up session-level lock because session-level lock doesn't need
	 * to be protected with WAL insertion lock. But since
	 * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
	 * cleaned up before it.
	 */
	sessionBackupState = SESSION_BACKUP_NONE;

	WALInsertLockRelease();

	/*
	 * Read and parse the START WAL LOCATION line (this code is pretty crude,
	 * but we are not expecting any variability in the file format).
	 */
	if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
			   &hi, &lo, startxlogfilename,
			   &ch) != 4 || ch != '\n')
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	startpoint = ((uint64) hi) << 32 | lo;
	remaining = strchr(labelfile, '\n') + 1;	/* %n is not portable enough */

	/*
	 * Parse the BACKUP FROM line. If we are taking an online backup from the
	 * standby, we confirm that the standby has not been promoted during the
	 * backup.
	 */
	ptr = strstr(remaining, "BACKUP FROM:");
	if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("the standby was promoted during online backup"),
				 errhint("This means that the backup being taken is corrupt "
						 "and should not be used. "
						 "Try taking another online backup.")));

	/*
	 * During recovery, we don't write an end-of-backup record. We assume that
	 * pg_control was backed up last and its minimum recovery point can be
	 * available as the backup end location. Since we don't have an
	 * end-of-backup record, we use the pg_control value to check whether
	 * we've reached the end of backup when starting recovery from this
	 * backup. We have no way of checking if pg_control wasn't backed up last
	 * however.
	 *
	 * We don't force a switch to new WAL file but it is still possible to
	 * wait for all the required files to be archived if waitforarchive is
	 * true. This is okay if we use the backup to start a standby and fetch
	 * the missing WAL using streaming replication. But in the case of an
	 * archive recovery, a user should set waitforarchive to true and wait for
	 * them to be archived to ensure that all the required files are
	 * available.
	 *
	 * We return the current minimum recovery point as the backup end
	 * location. Note that it can be greater than the exact backup end
	 * location if the minimum recovery point is updated after the backup of
	 * pg_control. This is harmless for current uses.
	 *
	 * XXX currently a backup history file is for informational and debug
	 * purposes only. It's not essential for an online backup. Furthermore,
	 * even if it's created, it will not be archived during recovery because
	 * an archiver is not invoked. So it doesn't seem worthwhile to write a
	 * backup history file during recovery.
	 */
	if (backup_started_in_recovery)
	{
		XLogRecPtr	recptr;

		/*
		 * Check to see if all WAL replayed during online backup contain
		 * full-page writes.
		 */
		SpinLockAcquire(&XLogCtl->info_lck);
		recptr = XLogCtl->lastFpwDisableRecPtr;
		SpinLockRelease(&XLogCtl->info_lck);

		if (startpoint <= recptr)
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("WAL generated with full_page_writes=off was replayed "
							"during online backup"),
					 errhint("This means that the backup being taken on the standby "
							 "is corrupt and should not be used. "
							 "Enable full_page_writes and run CHECKPOINT on the primary, "
							 "and then try an online backup again.")));


		LWLockAcquire(ControlFileLock, LW_SHARED);
		stoppoint = ControlFile->minRecoveryPoint;
		stoptli = ControlFile->minRecoveryPointTLI;
		LWLockRelease(ControlFileLock);
	}
	else
	{
		/*
		 * Write the backup-end xlog record
		 */
		XLogBeginInsert();
		XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
		stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
		stoptli = ThisTimeLineID;

		/*
		 * Force a switch to a new xlog segment file, so that the backup is
		 * valid as soon as archiver moves out the current segment file.
		 */
		RequestXLogSwitch(false);

		XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
		XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size);

		/* Use the log timezone here, not the session timezone */
		stamp_time = (pg_time_t) time(NULL);
		pg_strftime(strfbuf, sizeof(strfbuf),
					"%Y-%m-%d %H:%M:%S %Z",
					pg_localtime(&stamp_time, log_timezone));

		/*
		 * Write the backup history file
		 */
		XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
		BackupHistoryFilePath(histfilepath, stoptli, _logSegNo,
							  startpoint, wal_segment_size);
		fp = AllocateFile(histfilepath, "w");
		if (!fp)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not create file \"%s\": %m",
							histfilepath)));
		fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
				LSN_FORMAT_ARGS(startpoint), startxlogfilename);
		fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
				LSN_FORMAT_ARGS(stoppoint), stopxlogfilename);

		/*
		 * Transfer remaining lines including label and start timeline to
		 * history file.
		 */
		fprintf(fp, "%s", remaining);
		fprintf(fp, "STOP TIME: %s\n", strfbuf);
		fprintf(fp, "STOP TIMELINE: %u\n", stoptli);
		if (fflush(fp) || ferror(fp) || FreeFile(fp))
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write file \"%s\": %m",
							histfilepath)));

		/*
		 * Clean out any no-longer-needed history files.  As a side effect,
		 * this will post a .ready file for the newly created history file,
		 * notifying the archiver that history file may be archived
		 * immediately.
		 */
		CleanupBackupHistory();
	}

	/*
	 * If archiving is enabled, wait for all the required WAL files to be
	 * archived before returning. If archiving isn't enabled, the required WAL
	 * needs to be transported via streaming replication (hopefully with
	 * wal_keep_size set high enough), or some more exotic mechanism like
	 * polling and copying files from pg_wal with script. We have no knowledge
	 * of those mechanisms, so it's up to the user to ensure that he gets all
	 * the required WAL.
	 *
	 * We wait until both the last WAL file filled during backup and the
	 * history file have been archived, and assume that the alphabetic sorting
	 * property of the WAL files ensures any earlier WAL files are safely
	 * archived as well.
	 *
	 * We wait forever, since archive_command is supposed to work and we
	 * assume the admin wanted his backup to work completely. If you don't
	 * wish to wait, then either waitforarchive should be passed in as false,
	 * or you can set statement_timeout.  Also, some notices are issued to
	 * clue in anyone who might be doing this interactively.
	 */

	if (waitforarchive &&
		((!backup_started_in_recovery && XLogArchivingActive()) ||
		 (backup_started_in_recovery && XLogArchivingAlways())))
	{
		XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
		XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size);

		XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
		BackupHistoryFileName(histfilename, stoptli, _logSegNo,
							  startpoint, wal_segment_size);

		seconds_before_warning = 60;
		waits = 0;

		/* Poll once per second until both files have been archived */
		while (XLogArchiveIsBusy(lastxlogfilename) ||
			   XLogArchiveIsBusy(histfilename))
		{
			CHECK_FOR_INTERRUPTS();

			if (!reported_waiting && waits > 5)
			{
				ereport(NOTICE,
						(errmsg("base backup done, waiting for required WAL segments to be archived")));
				reported_waiting = true;
			}

			pgstat_report_wait_start(WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
			pg_usleep(1000000L);
			pgstat_report_wait_end();

			if (++waits >= seconds_before_warning)
			{
				seconds_before_warning *= 2;	/* This wraps in >10 years... */
				ereport(WARNING,
						(errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
								waits),
						 errhint("Check that your archive_command is executing properly.  "
								 "You can safely cancel this backup, "
								 "but the database backup will not be usable without all the WAL segments.")));
			}
		}

		ereport(NOTICE,
				(errmsg("all required WAL segments have been archived")));
	}
	else if (waitforarchive)
		ereport(NOTICE,
				(errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));

	/*
	 * We're done.  As a convenience, return the ending WAL location.
	 */
	if (stoptli_p)
		*stoptli_p = stoptli;
	return stoppoint;
}
11813 
11814 
11815 /*
11816  * do_pg_abort_backup: abort a running backup
11817  *
11818  * This does just the most basic steps of do_pg_stop_backup(), by taking the
11819  * system out of backup mode, thus making it a lot more safe to call from
11820  * an error handler.
11821  *
11822  * The caller can pass 'arg' as 'true' or 'false' to control whether a warning
11823  * is emitted.
11824  *
11825  * NB: This is only for aborting a non-exclusive backup that doesn't write
11826  * backup_label. A backup started with pg_start_backup() needs to be finished
11827  * with pg_stop_backup().
11828  *
11829  * NB: This gets used as a before_shmem_exit handler, hence the odd-looking
11830  * signature.
11831  */
11832 void
do_pg_abort_backup(int code,Datum arg)11833 do_pg_abort_backup(int code, Datum arg)
11834 {
11835 	bool		emit_warning = DatumGetBool(arg);
11836 
11837 	/*
11838 	 * Quick exit if session is not keeping around a non-exclusive backup
11839 	 * already started.
11840 	 */
11841 	if (sessionBackupState != SESSION_BACKUP_NON_EXCLUSIVE)
11842 		return;
11843 
11844 	WALInsertLockAcquireExclusive();
11845 	Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11846 	XLogCtl->Insert.nonExclusiveBackups--;
11847 
11848 	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11849 		XLogCtl->Insert.nonExclusiveBackups == 0)
11850 	{
11851 		XLogCtl->Insert.forcePageWrites = false;
11852 	}
11853 	WALInsertLockRelease();
11854 
11855 	if (emit_warning)
11856 		ereport(WARNING,
11857 				(errmsg("aborting backup due to backend exiting before pg_stop_backup was called")));
11858 }
11859 
11860 /*
11861  * Register a handler that will warn about unterminated backups at end of
11862  * session, unless this has already been done.
11863  */
11864 void
register_persistent_abort_backup_handler(void)11865 register_persistent_abort_backup_handler(void)
11866 {
11867 	static bool already_done = false;
11868 
11869 	if (already_done)
11870 		return;
11871 	before_shmem_exit(do_pg_abort_backup, DatumGetBool(true));
11872 	already_done = true;
11873 }
11874 
11875 /*
11876  * Get latest redo apply position.
11877  *
11878  * Exported to allow WALReceiver to read the pointer directly.
11879  */
11880 XLogRecPtr
GetXLogReplayRecPtr(TimeLineID * replayTLI)11881 GetXLogReplayRecPtr(TimeLineID *replayTLI)
11882 {
11883 	XLogRecPtr	recptr;
11884 	TimeLineID	tli;
11885 
11886 	SpinLockAcquire(&XLogCtl->info_lck);
11887 	recptr = XLogCtl->lastReplayedEndRecPtr;
11888 	tli = XLogCtl->lastReplayedTLI;
11889 	SpinLockRelease(&XLogCtl->info_lck);
11890 
11891 	if (replayTLI)
11892 		*replayTLI = tli;
11893 	return recptr;
11894 }
11895 
11896 /*
11897  * Get latest WAL insert pointer
11898  */
11899 XLogRecPtr
GetXLogInsertRecPtr(void)11900 GetXLogInsertRecPtr(void)
11901 {
11902 	XLogCtlInsert *Insert = &XLogCtl->Insert;
11903 	uint64		current_bytepos;
11904 
11905 	SpinLockAcquire(&Insert->insertpos_lck);
11906 	current_bytepos = Insert->CurrBytePos;
11907 	SpinLockRelease(&Insert->insertpos_lck);
11908 
11909 	return XLogBytePosToRecPtr(current_bytepos);
11910 }
11911 
/*
 * Get latest WAL write pointer
 *
 * NOTE: as a side effect, this refreshes the process-local LogwrtResult
 * cache (both Write and Flush positions) from shared memory; callers in
 * this file rely on that.
 */
XLogRecPtr
GetXLogWriteRecPtr(void)
{
	/* The shared copy is protected by info_lck */
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	SpinLockRelease(&XLogCtl->info_lck);

	return LogwrtResult.Write;
}
11924 
11925 /*
11926  * Returns the redo pointer of the last checkpoint or restartpoint. This is
11927  * the oldest point in WAL that we still need, if we have to restart recovery.
11928  */
11929 void
GetOldestRestartPoint(XLogRecPtr * oldrecptr,TimeLineID * oldtli)11930 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
11931 {
11932 	LWLockAcquire(ControlFileLock, LW_SHARED);
11933 	*oldrecptr = ControlFile->checkPointCopy.redo;
11934 	*oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
11935 	LWLockRelease(ControlFileLock);
11936 }
11937 
11938 /*
11939  * read_backup_label: check to see if a backup_label file is present
11940  *
11941  * If we see a backup_label during recovery, we assume that we are recovering
11942  * from a backup dump file, and we therefore roll forward from the checkpoint
11943  * identified by the label file, NOT what pg_control says.  This avoids the
11944  * problem that pg_control might have been archived one or more checkpoints
11945  * later than the start of the dump, and so if we rely on it as the start
11946  * point, we will fail to restore a consistent database state.
11947  *
11948  * Returns true if a backup_label was found (and fills the checkpoint
11949  * location and its REDO location into *checkPointLoc and RedoStartLSN,
11950  * respectively); returns false if not. If this backup_label came from a
11951  * streamed backup, *backupEndRequired is set to true. If this backup_label
11952  * was created during recovery, *backupFromStandby is set to true.
11953  */
static bool
read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
				  bool *backupFromStandby)
{
	char		startxlogfilename[MAXFNAMELEN];
	TimeLineID	tli_from_walseg,
				tli_from_file;
	FILE	   *lfp;
	char		ch;
	char		backuptype[20];
	char		backupfrom[20];
	char		backuplabel[MAXPGPATH];
	char		backuptime[128];
	uint32		hi,
				lo;

	/* Default both optional outputs; the file may predate these fields. */
	*backupEndRequired = false;
	*backupFromStandby = false;

	/*
	 * See if label file is present
	 */
	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
	if (!lfp)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		return false;			/* it's not there, all is fine */
	}

	/*
	 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
	 * is pretty crude, but we are not expecting any variability in the file
	 * format).
	 *
	 * The WAL segment file name encodes the timeline in its first 8 hex
	 * digits: %08X peels that off into tli_from_walseg, and %16s collects
	 * the remainder of the file name.  The trailing %c must capture the
	 * newline, proving the line had exactly the expected shape.
	 */
	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
			   &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	/* Reassemble the 64-bit LSN from its high/low 32-bit halves. */
	RedoStartLSN = ((uint64) hi) << 32 | lo;
	if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
			   &hi, &lo, &ch) != 3 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	*checkPointLoc = ((uint64) hi) << 32 | lo;

	/*
	 * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
	 * from an older backup anyway, but since the information on it is not
	 * strictly required, don't error out if it's missing for some reason.
	 */
	if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
	{
		/* "streamed" means the backup-end record must be seen in WAL */
		if (strcmp(backuptype, "streamed") == 0)
			*backupEndRequired = true;
	}

	if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
	{
		if (strcmp(backupfrom, "standby") == 0)
			*backupFromStandby = true;
	}

	/*
	 * Parse START TIME and LABEL. Those are not mandatory fields for recovery
	 * but checking for their presence is useful for debugging and the next
	 * sanity checks. Cope also with the fact that the result buffers have a
	 * pre-allocated size, hence if the backup_label file has been generated
	 * with strings longer than the maximum assumed here an incorrect parsing
	 * happens. That's fine as only minor consistency checks are done
	 * afterwards.
	 */
	if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
		ereport(DEBUG1,
				(errmsg_internal("backup time %s in file \"%s\"",
								 backuptime, BACKUP_LABEL_FILE)));

	if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
		ereport(DEBUG1,
				(errmsg_internal("backup label %s in file \"%s\"",
								 backuplabel, BACKUP_LABEL_FILE)));

	/*
	 * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
	 * it as a sanity check if present: it must agree with the timeline that
	 * was embedded in the starting WAL segment's file name above.
	 */
	if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
	{
		if (tli_from_walseg != tli_from_file)
			ereport(FATAL,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
					 errdetail("Timeline ID parsed is %u, but expected %u.",
							   tli_from_file, tli_from_walseg)));

		ereport(DEBUG1,
				(errmsg_internal("backup timeline %u in file \"%s\"",
								 tli_from_file, BACKUP_LABEL_FILE)));
	}

	/* Any stdio error, or failure to close, is treated as corruption. */
	if (ferror(lfp) || FreeFile(lfp))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
						BACKUP_LABEL_FILE)));

	return true;
}
12067 
12068 /*
12069  * read_tablespace_map: check to see if a tablespace_map file is present
12070  *
12071  * If we see a tablespace_map file during recovery, we assume that we are
12072  * recovering from a backup dump file, and we therefore need to create symlinks
12073  * as per the information present in tablespace_map file.
12074  *
12075  * Returns true if a tablespace_map file was found (and fills *tablespaces
12076  * with a tablespaceinfo struct for each tablespace listed in the file);
12077  * returns false if not.
12078  */
static bool
read_tablespace_map(List **tablespaces)
{
	tablespaceinfo *ti;
	FILE	   *lfp;
	char		str[MAXPGPATH];		/* de-escaped line being accumulated */
	int			ch,
				i,					/* number of bytes accumulated in str */
				n;
	bool		was_backslash;		/* previous char was an escape? */

	/*
	 * See if tablespace_map file is present
	 */
	lfp = AllocateFile(TABLESPACE_MAP, "r");
	if (!lfp)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							TABLESPACE_MAP)));
		return false;			/* it's not there, all is fine */
	}

	/*
	 * Read and parse the link name and path lines from tablespace_map file
	 * (this code is pretty crude, but we are not expecting any variability in
	 * the file format).  De-escape any backslashes that were inserted.
	 *
	 * A backslash escapes the following character (including newlines that
	 * are part of a path), so only an *unescaped* \n or \r terminates a
	 * line.
	 */
	i = 0;
	was_backslash = false;
	while ((ch = fgetc(lfp)) != EOF)
	{
		if (!was_backslash && (ch == '\n' || ch == '\r'))
		{
			if (i == 0)
				continue;		/* \r immediately followed by \n */

			/*
			 * The de-escaped line should contain an OID followed by exactly
			 * one space followed by a path.  The path might start with
			 * spaces, so don't be too liberal about parsing.
			 */
			str[i] = '\0';
			n = 0;
			while (str[n] && str[n] != ' ')
				n++;
			/* need a nonempty OID, a separator, and a nonempty path */
			if (n < 1 || n >= i - 1)
				ereport(FATAL,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
			/* split the line in place: OID in str, path at str + n */
			str[n++] = '\0';

			ti = palloc0(sizeof(tablespaceinfo));
			ti->oid = pstrdup(str);
			ti->path = pstrdup(str + n);
			*tablespaces = lappend(*tablespaces, ti);

			i = 0;
			continue;
		}
		else if (!was_backslash && ch == '\\')
			was_backslash = true;
		else
		{
			/* silently truncate over-length lines at MAXPGPATH - 1 bytes */
			if (i < sizeof(str) - 1)
				str[i++] = ch;
			was_backslash = false;
		}
	}

	if (i != 0 || was_backslash)	/* last line not terminated? */
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));

	if (ferror(lfp) || FreeFile(lfp))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
						TABLESPACE_MAP)));

	return true;
}
12164 
12165 /*
12166  * Error context callback for errors occurring during rm_redo().
12167  */
12168 static void
rm_redo_error_callback(void * arg)12169 rm_redo_error_callback(void *arg)
12170 {
12171 	XLogReaderState *record = (XLogReaderState *) arg;
12172 	StringInfoData buf;
12173 
12174 	initStringInfo(&buf);
12175 	xlog_outdesc(&buf, record);
12176 	xlog_block_info(&buf, record);
12177 
12178 	/* translator: %s is a WAL record description */
12179 	errcontext("WAL redo at %X/%X for %s",
12180 			   LSN_FORMAT_ARGS(record->ReadRecPtr),
12181 			   buf.data);
12182 
12183 	pfree(buf.data);
12184 }
12185 
12186 /*
12187  * BackupInProgress: check if online backup mode is active
12188  *
12189  * This is done by checking for existence of the "backup_label" file.
12190  */
12191 bool
BackupInProgress(void)12192 BackupInProgress(void)
12193 {
12194 	struct stat stat_buf;
12195 
12196 	return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
12197 }
12198 
12199 /*
12200  * CancelBackup: rename the "backup_label" and "tablespace_map"
12201  *				 files to cancel backup mode
12202  *
12203  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
12204  * Similarly, if the "tablespace_map" file exists, it will be renamed to
12205  * "tablespace_map.old".
12206  *
12207  * Note that this will render an online backup in progress
12208  * useless. To correctly finish an online backup, pg_stop_backup must be
12209  * called.
12210  */
12211 void
CancelBackup(void)12212 CancelBackup(void)
12213 {
12214 	struct stat stat_buf;
12215 
12216 	/* if the backup_label file is not there, return */
12217 	if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
12218 		return;
12219 
12220 	/* remove leftover file from previously canceled backup if it exists */
12221 	unlink(BACKUP_LABEL_OLD);
12222 
12223 	if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0)
12224 	{
12225 		ereport(WARNING,
12226 				(errcode_for_file_access(),
12227 				 errmsg("online backup mode was not canceled"),
12228 				 errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
12229 						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
12230 		return;
12231 	}
12232 
12233 	/* if the tablespace_map file is not there, return */
12234 	if (stat(TABLESPACE_MAP, &stat_buf) < 0)
12235 	{
12236 		ereport(LOG,
12237 				(errmsg("online backup mode canceled"),
12238 				 errdetail("File \"%s\" was renamed to \"%s\".",
12239 						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
12240 		return;
12241 	}
12242 
12243 	/* remove leftover file from previously canceled backup if it exists */
12244 	unlink(TABLESPACE_MAP_OLD);
12245 
12246 	if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
12247 	{
12248 		ereport(LOG,
12249 				(errmsg("online backup mode canceled"),
12250 				 errdetail("Files \"%s\" and \"%s\" were renamed to "
12251 						   "\"%s\" and \"%s\", respectively.",
12252 						   BACKUP_LABEL_FILE, TABLESPACE_MAP,
12253 						   BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
12254 	}
12255 	else
12256 	{
12257 		ereport(WARNING,
12258 				(errcode_for_file_access(),
12259 				 errmsg("online backup mode canceled"),
12260 				 errdetail("File \"%s\" was renamed to \"%s\", but "
12261 						   "file \"%s\" could not be renamed to \"%s\": %m.",
12262 						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
12263 						   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
12264 	}
12265 }
12266 
12267 /*
12268  * Read the XLOG page containing RecPtr into readBuf (if not read already).
12269  * Returns number of bytes read, if the page is read successfully, or -1
12270  * in case of errors.  When errors occur, they are ereport'ed, but only
12271  * if they have not been previously reported.
12272  *
12273  * This is responsible for restoring files from archive as needed, as well
12274  * as for waiting for the requested WAL record to arrive in standby mode.
12275  *
12276  * 'emode' specifies the log level used for reporting "file not found" or
12277  * "end of WAL" situations in archive recovery, or in standby mode when a
12278  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
12279  * false in those situations, on higher log levels the ereport() won't
12280  * return.
12281  *
12282  * In standby mode, if after a successful return of XLogPageRead() the
12283  * caller finds the record it's interested in to be broken, it should
12284  * ereport the error with the level determined by
12285  * emode_for_corrupt_record(), and then set lastSourceFailed
12286  * and call XLogPageRead() again with the same arguments. This lets
12287  * XLogPageRead() to try fetching the record from another source, or to
12288  * sleep and retry.
12289  */
static int
XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
			 XLogRecPtr targetRecPtr, char *readBuf)
{
	/* Recovery-specific state stashed in the reader by our caller. */
	XLogPageReadPrivate *private =
	(XLogPageReadPrivate *) xlogreader->private_data;
	int			emode = private->emode;
	uint32		targetPageOff;
	XLogSegNo	targetSegNo PG_USED_FOR_ASSERTS_ONLY;
	int			r;

	XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
	targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);

	/*
	 * See if we need to switch to a new segment because the requested record
	 * is not in the currently open one.
	 */
	if (readFile >= 0 &&
		!XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
	{
		/*
		 * Request a restartpoint if we've replayed too much xlog since the
		 * last one.
		 */
		if (bgwriterLaunched)
		{
			if (XLogCheckpointNeeded(readSegNo))
			{
				/* refresh the cached redo pointer, then re-check */
				(void) GetRedoRecPtr();
				if (XLogCheckpointNeeded(readSegNo))
					RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
			}
		}

		close(readFile);
		readFile = -1;
		readSource = XLOG_FROM_ANY;
	}

	XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);

retry:
	/* See if we need to retrieve more data */
	if (readFile < 0 ||
		(readSource == XLOG_FROM_STREAM &&
		 flushedUpto < targetPagePtr + reqLen))
	{
		/*
		 * In standby mode this may block until the WAL arrives; a false
		 * return means recovery is ending (promotion) or the data isn't
		 * obtainable, so report failure to the reader.
		 */
		if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
										 private->randAccess,
										 private->fetching_ckpt,
										 targetRecPtr))
		{
			if (readFile >= 0)
				close(readFile);
			readFile = -1;
			readLen = 0;
			readSource = XLOG_FROM_ANY;

			return -1;
		}
	}

	/*
	 * At this point, we have the right segment open and if we're streaming we
	 * know the requested record is in it.
	 */
	Assert(readFile != -1);

	/*
	 * If the current segment is being streamed from the primary, calculate
	 * how much of the current page we have received already. We know the
	 * requested record has been received, but this is for the benefit of
	 * future calls, to allow quick exit at the top of this function.
	 */
	if (readSource == XLOG_FROM_STREAM)
	{
		if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
			readLen = XLOG_BLCKSZ;
		else
			readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
				targetPageOff;
	}
	else
		readLen = XLOG_BLCKSZ;

	/* Read the requested page */
	readOff = targetPageOff;

	pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
	r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
	if (r != XLOG_BLCKSZ)
	{
		char		fname[MAXFNAMELEN];
		int			save_errno = errno;

		/* save errno across calls that may clobber it */
		pgstat_report_wait_end();
		XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
		if (r < 0)
		{
			errno = save_errno;
			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
					(errcode_for_file_access(),
					 errmsg("could not read from log segment %s, offset %u: %m",
							fname, readOff)));
		}
		else
			/* short read: no errno to report, treat as data corruption */
			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("could not read from log segment %s, offset %u: read %d of %zu",
							fname, readOff, r, (Size) XLOG_BLCKSZ)));
		goto next_record_is_invalid;
	}
	pgstat_report_wait_end();

	Assert(targetSegNo == readSegNo);
	Assert(targetPageOff == readOff);
	Assert(reqLen <= readLen);

	xlogreader->seg.ws_tli = curFileTLI;

	/*
	 * Check the page header immediately, so that we can retry immediately if
	 * it's not valid. This may seem unnecessary, because XLogReadRecord()
	 * validates the page header anyway, and would propagate the failure up to
	 * ReadRecord(), which would retry. However, there's a corner case with
	 * continuation records, if a record is split across two pages such that
	 * we would need to read the two pages from different sources. For
	 * example, imagine a scenario where a streaming replica is started up,
	 * and replay reaches a record that's split across two WAL segments. The
	 * first page is only available locally, in pg_wal, because it's already
	 * been recycled on the primary. The second page, however, is not present
	 * in pg_wal, and we should stream it from the primary. There is a
	 * recycled WAL segment present in pg_wal, with garbage contents, however.
	 * We would read the first page from the local WAL segment, but when
	 * reading the second page, we would read the bogus, recycled, WAL
	 * segment. If we didn't catch that case here, we would never recover,
	 * because ReadRecord() would retry reading the whole record from the
	 * beginning.
	 *
	 * Of course, this only catches errors in the page header, which is what
	 * happens in the case of a recycled WAL segment. Other kinds of errors or
	 * corruption still has the same problem. But this at least fixes the
	 * common case, which can happen as part of normal operation.
	 *
	 * Validating the page header is cheap enough that doing it twice
	 * shouldn't be a big deal from a performance point of view.
	 */
	if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
	{
		/* reset any error XLogReaderValidatePageHeader() might have set */
		xlogreader->errormsg_buf[0] = '\0';
		goto next_record_is_invalid;
	}

	return readLen;

next_record_is_invalid:
	/*
	 * Common failure path: mark the current source as failed and discard
	 * the open segment, so the next attempt re-selects a WAL source.
	 */
	lastSourceFailed = true;

	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = XLOG_FROM_ANY;

	/* In standby-mode, keep trying */
	if (StandbyMode)
		goto retry;
	else
		return -1;
}
12462 
12463 /*
12464  * Open the WAL segment containing WAL location 'RecPtr'.
12465  *
12466  * The segment can be fetched via restore_command, or via walreceiver having
12467  * streamed the record, or it can already be present in pg_wal. Checking
12468  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
12469  * too, in case someone copies a new segment directly to pg_wal. That is not
12470  * documented or recommended, though.
12471  *
12472  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
12473  * prepare to read WAL starting from RedoStartLSN after this.
12474  *
12475  * 'RecPtr' might not point to the beginning of the record we're interested
12476  * in, it might also point to the page or segment header. In that case,
12477  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
12478  * used to decide which timeline to stream the requested WAL from.
12479  *
12480  * If the record is not immediately available, the function returns false
12481  * if we're not in standby mode. In standby mode, waits for it to become
12482  * available.
12483  *
12484  * When the requested record becomes available, the function opens the file
12485  * containing it (if not open already), and returns true. When end of standby
12486  * mode is triggered by the user, and there is no more WAL available, returns
12487  * false.
12488  */
12489 static bool
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,bool randAccess,bool fetching_ckpt,XLogRecPtr tliRecPtr)12490 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
12491 							bool fetching_ckpt, XLogRecPtr tliRecPtr)
12492 {
12493 	static TimestampTz last_fail_time = 0;
12494 	TimestampTz now;
12495 	bool		streaming_reply_sent = false;
12496 
12497 	/*-------
12498 	 * Standby mode is implemented by a state machine:
12499 	 *
12500 	 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
12501 	 *	  pg_wal (XLOG_FROM_PG_WAL)
12502 	 * 2. Check trigger file
12503 	 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
12504 	 * 4. Rescan timelines
12505 	 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
12506 	 *
12507 	 * Failure to read from the current source advances the state machine to
12508 	 * the next state.
12509 	 *
12510 	 * 'currentSource' indicates the current state. There are no currentSource
12511 	 * values for "check trigger", "rescan timelines", and "sleep" states,
12512 	 * those actions are taken when reading from the previous source fails, as
12513 	 * part of advancing to the next state.
12514 	 *
12515 	 * If standby mode is turned off while reading WAL from stream, we move
12516 	 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
12517 	 * the files (which would be required at end of recovery, e.g., timeline
12518 	 * history file) from archive or pg_wal. We don't need to kill WAL receiver
12519 	 * here because it's already stopped when standby mode is turned off at
12520 	 * the end of recovery.
12521 	 *-------
12522 	 */
12523 	if (!InArchiveRecovery)
12524 		currentSource = XLOG_FROM_PG_WAL;
12525 	else if (currentSource == XLOG_FROM_ANY ||
12526 			 (!StandbyMode && currentSource == XLOG_FROM_STREAM))
12527 	{
12528 		lastSourceFailed = false;
12529 		currentSource = XLOG_FROM_ARCHIVE;
12530 	}
12531 
12532 	for (;;)
12533 	{
12534 		XLogSource	oldSource = currentSource;
12535 		bool		startWalReceiver = false;
12536 
12537 		/*
12538 		 * First check if we failed to read from the current source, and
12539 		 * advance the state machine if so. The failure to read might've
12540 		 * happened outside this function, e.g when a CRC check fails on a
12541 		 * record, or within this loop.
12542 		 */
12543 		if (lastSourceFailed)
12544 		{
12545 			switch (currentSource)
12546 			{
12547 				case XLOG_FROM_ARCHIVE:
12548 				case XLOG_FROM_PG_WAL:
12549 
12550 					/*
12551 					 * Check to see if the trigger file exists. Note that we
12552 					 * do this only after failure, so when you create the
12553 					 * trigger file, we still finish replaying as much as we
12554 					 * can from archive and pg_wal before failover.
12555 					 */
12556 					if (StandbyMode && CheckForStandbyTrigger())
12557 					{
12558 						ShutdownWalRcv();
12559 						return false;
12560 					}
12561 
12562 					/*
12563 					 * Not in standby mode, and we've now tried the archive
12564 					 * and pg_wal.
12565 					 */
12566 					if (!StandbyMode)
12567 						return false;
12568 
12569 					/*
12570 					 * Move to XLOG_FROM_STREAM state, and set to start a
12571 					 * walreceiver if necessary.
12572 					 */
12573 					currentSource = XLOG_FROM_STREAM;
12574 					startWalReceiver = true;
12575 					break;
12576 
12577 				case XLOG_FROM_STREAM:
12578 
12579 					/*
12580 					 * Failure while streaming. Most likely, we got here
12581 					 * because streaming replication was terminated, or
12582 					 * promotion was triggered. But we also get here if we
12583 					 * find an invalid record in the WAL streamed from the
12584 					 * primary, in which case something is seriously wrong.
12585 					 * There's little chance that the problem will just go
12586 					 * away, but PANIC is not good for availability either,
12587 					 * especially in hot standby mode. So, we treat that the
12588 					 * same as disconnection, and retry from archive/pg_wal
12589 					 * again. The WAL in the archive should be identical to
12590 					 * what was streamed, so it's unlikely that it helps, but
12591 					 * one can hope...
12592 					 */
12593 
12594 					/*
12595 					 * We should be able to move to XLOG_FROM_STREAM only in
12596 					 * standby mode.
12597 					 */
12598 					Assert(StandbyMode);
12599 
12600 					/*
12601 					 * Before we leave XLOG_FROM_STREAM state, make sure that
12602 					 * walreceiver is not active, so that it won't overwrite
12603 					 * WAL that we restore from archive.
12604 					 */
12605 					if (WalRcvStreaming())
12606 						ShutdownWalRcv();
12607 
12608 					/*
12609 					 * Before we sleep, re-scan for possible new timelines if
12610 					 * we were requested to recover to the latest timeline.
12611 					 */
12612 					if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
12613 					{
12614 						if (rescanLatestTimeLine())
12615 						{
12616 							currentSource = XLOG_FROM_ARCHIVE;
12617 							break;
12618 						}
12619 					}
12620 
12621 					/*
12622 					 * XLOG_FROM_STREAM is the last state in our state
12623 					 * machine, so we've exhausted all the options for
12624 					 * obtaining the requested WAL. We're going to loop back
12625 					 * and retry from the archive, but if it hasn't been long
12626 					 * since last attempt, sleep wal_retrieve_retry_interval
12627 					 * milliseconds to avoid busy-waiting.
12628 					 */
12629 					now = GetCurrentTimestamp();
12630 					if (!TimestampDifferenceExceeds(last_fail_time, now,
12631 													wal_retrieve_retry_interval))
12632 					{
12633 						long		wait_time;
12634 
12635 						wait_time = wal_retrieve_retry_interval -
12636 							TimestampDifferenceMilliseconds(last_fail_time, now);
12637 
12638 						(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
12639 										 WL_LATCH_SET | WL_TIMEOUT |
12640 										 WL_EXIT_ON_PM_DEATH,
12641 										 wait_time,
12642 										 WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
12643 						ResetLatch(&XLogCtl->recoveryWakeupLatch);
12644 						now = GetCurrentTimestamp();
12645 
12646 						/* Handle interrupt signals of startup process */
12647 						HandleStartupProcInterrupts();
12648 					}
12649 					last_fail_time = now;
12650 					currentSource = XLOG_FROM_ARCHIVE;
12651 					break;
12652 
12653 				default:
12654 					elog(ERROR, "unexpected WAL source %d", currentSource);
12655 			}
12656 		}
12657 		else if (currentSource == XLOG_FROM_PG_WAL)
12658 		{
12659 			/*
12660 			 * We just successfully read a file in pg_wal. We prefer files in
12661 			 * the archive over ones in pg_wal, so try the next file again
12662 			 * from the archive first.
12663 			 */
12664 			if (InArchiveRecovery)
12665 				currentSource = XLOG_FROM_ARCHIVE;
12666 		}
12667 
12668 		if (currentSource != oldSource)
12669 			elog(DEBUG2, "switched WAL source from %s to %s after %s",
12670 				 xlogSourceNames[oldSource], xlogSourceNames[currentSource],
12671 				 lastSourceFailed ? "failure" : "success");
12672 
12673 		/*
12674 		 * We've now handled possible failure. Try to read from the chosen
12675 		 * source.
12676 		 */
12677 		lastSourceFailed = false;
12678 
12679 		switch (currentSource)
12680 		{
12681 			case XLOG_FROM_ARCHIVE:
12682 			case XLOG_FROM_PG_WAL:
12683 
12684 				/*
12685 				 * WAL receiver must not be running when reading WAL from
12686 				 * archive or pg_wal.
12687 				 */
12688 				Assert(!WalRcvStreaming());
12689 
12690 				/* Close any old file we might have open. */
12691 				if (readFile >= 0)
12692 				{
12693 					close(readFile);
12694 					readFile = -1;
12695 				}
12696 				/* Reset curFileTLI if random fetch. */
12697 				if (randAccess)
12698 					curFileTLI = 0;
12699 
12700 				/*
12701 				 * Try to restore the file from archive, or read an existing
12702 				 * file from pg_wal.
12703 				 */
12704 				readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
12705 											  currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
12706 											  currentSource);
12707 				if (readFile >= 0)
12708 					return true;	/* success! */
12709 
12710 				/*
12711 				 * Nope, not found in archive or pg_wal.
12712 				 */
12713 				lastSourceFailed = true;
12714 				break;
12715 
12716 			case XLOG_FROM_STREAM:
12717 				{
12718 					bool		havedata;
12719 
12720 					/*
12721 					 * We should be able to move to XLOG_FROM_STREAM only in
12722 					 * standby mode.
12723 					 */
12724 					Assert(StandbyMode);
12725 
12726 					/*
12727 					 * First, shutdown walreceiver if its restart has been
12728 					 * requested -- but no point if we're already slated for
12729 					 * starting it.
12730 					 */
12731 					if (pendingWalRcvRestart && !startWalReceiver)
12732 					{
12733 						ShutdownWalRcv();
12734 
12735 						/*
12736 						 * Re-scan for possible new timelines if we were
12737 						 * requested to recover to the latest timeline.
12738 						 */
12739 						if (recoveryTargetTimeLineGoal ==
12740 							RECOVERY_TARGET_TIMELINE_LATEST)
12741 							rescanLatestTimeLine();
12742 
12743 						startWalReceiver = true;
12744 					}
12745 					pendingWalRcvRestart = false;
12746 
12747 					/*
12748 					 * Launch walreceiver if needed.
12749 					 *
12750 					 * If fetching_ckpt is true, RecPtr points to the initial
12751 					 * checkpoint location. In that case, we use RedoStartLSN
12752 					 * as the streaming start position instead of RecPtr, so
12753 					 * that when we later jump backwards to start redo at
12754 					 * RedoStartLSN, we will have the logs streamed already.
12755 					 */
12756 					if (startWalReceiver &&
12757 						PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
12758 					{
12759 						XLogRecPtr	ptr;
12760 						TimeLineID	tli;
12761 
12762 						if (fetching_ckpt)
12763 						{
12764 							ptr = RedoStartLSN;
12765 							tli = ControlFile->checkPointCopy.ThisTimeLineID;
12766 						}
12767 						else
12768 						{
12769 							ptr = RecPtr;
12770 
12771 							/*
12772 							 * Use the record begin position to determine the
12773 							 * TLI, rather than the position we're reading.
12774 							 */
12775 							tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
12776 
12777 							if (curFileTLI > 0 && tli < curFileTLI)
12778 								elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
12779 									 LSN_FORMAT_ARGS(tliRecPtr),
12780 									 tli, curFileTLI);
12781 						}
12782 						curFileTLI = tli;
12783 						RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
12784 											 PrimarySlotName,
12785 											 wal_receiver_create_temp_slot);
12786 						flushedUpto = 0;
12787 					}
12788 
12789 					/*
12790 					 * Check if WAL receiver is active or wait to start up.
12791 					 */
12792 					if (!WalRcvStreaming())
12793 					{
12794 						lastSourceFailed = true;
12795 						break;
12796 					}
12797 
12798 					/*
12799 					 * Walreceiver is active, so see if new data has arrived.
12800 					 *
12801 					 * We only advance XLogReceiptTime when we obtain fresh
12802 					 * WAL from walreceiver and observe that we had already
12803 					 * processed everything before the most recent "chunk"
12804 					 * that it flushed to disk.  In steady state where we are
12805 					 * keeping up with the incoming data, XLogReceiptTime will
12806 					 * be updated on each cycle. When we are behind,
12807 					 * XLogReceiptTime will not advance, so the grace time
12808 					 * allotted to conflicting queries will decrease.
12809 					 */
12810 					if (RecPtr < flushedUpto)
12811 						havedata = true;
12812 					else
12813 					{
12814 						XLogRecPtr	latestChunkStart;
12815 
12816 						flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
12817 						if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
12818 						{
12819 							havedata = true;
12820 							if (latestChunkStart <= RecPtr)
12821 							{
12822 								XLogReceiptTime = GetCurrentTimestamp();
12823 								SetCurrentChunkStartTime(XLogReceiptTime);
12824 							}
12825 						}
12826 						else
12827 							havedata = false;
12828 					}
12829 					if (havedata)
12830 					{
12831 						/*
12832 						 * Great, streamed far enough.  Open the file if it's
12833 						 * not open already.  Also read the timeline history
12834 						 * file if we haven't initialized timeline history
12835 						 * yet; it should be streamed over and present in
12836 						 * pg_wal by now.  Use XLOG_FROM_STREAM so that source
12837 						 * info is set correctly and XLogReceiptTime isn't
12838 						 * changed.
12839 						 *
12840 						 * NB: We must set readTimeLineHistory based on
12841 						 * recoveryTargetTLI, not receiveTLI. Normally they'll
12842 						 * be the same, but if recovery_target_timeline is
12843 						 * 'latest' and archiving is configured, then it's
12844 						 * possible that we managed to retrieve one or more
12845 						 * new timeline history files from the archive,
12846 						 * updating recoveryTargetTLI.
12847 						 */
12848 						if (readFile < 0)
12849 						{
12850 							if (!expectedTLEs)
12851 								expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
12852 							readFile = XLogFileRead(readSegNo, PANIC,
12853 													receiveTLI,
12854 													XLOG_FROM_STREAM, false);
12855 							Assert(readFile >= 0);
12856 						}
12857 						else
12858 						{
12859 							/* just make sure source info is correct... */
12860 							readSource = XLOG_FROM_STREAM;
12861 							XLogReceiptSource = XLOG_FROM_STREAM;
12862 							return true;
12863 						}
12864 						break;
12865 					}
12866 
12867 					/*
12868 					 * Data not here yet. Check for trigger, then wait for
12869 					 * walreceiver to wake us up when new WAL arrives.
12870 					 */
12871 					if (CheckForStandbyTrigger())
12872 					{
12873 						/*
12874 						 * Note that we don't "return false" immediately here.
12875 						 * After being triggered, we still want to replay all
12876 						 * the WAL that was already streamed. It's in pg_wal
12877 						 * now, so we just treat this as a failure, and the
12878 						 * state machine will move on to replay the streamed
12879 						 * WAL from pg_wal, and then recheck the trigger and
12880 						 * exit replay.
12881 						 */
12882 						lastSourceFailed = true;
12883 						break;
12884 					}
12885 
12886 					/*
12887 					 * Since we have replayed everything we have received so
12888 					 * far and are about to start waiting for more WAL, let's
12889 					 * tell the upstream server our replay location now so
12890 					 * that pg_stat_replication doesn't show stale
12891 					 * information.
12892 					 */
12893 					if (!streaming_reply_sent)
12894 					{
12895 						WalRcvForceReply();
12896 						streaming_reply_sent = true;
12897 					}
12898 
12899 					/*
12900 					 * Wait for more WAL to arrive. Time out after 5 seconds
12901 					 * to react to a trigger file promptly and to check if the
12902 					 * WAL receiver is still active.
12903 					 */
12904 					(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
12905 									 WL_LATCH_SET | WL_TIMEOUT |
12906 									 WL_EXIT_ON_PM_DEATH,
12907 									 5000L, WAIT_EVENT_RECOVERY_WAL_STREAM);
12908 					ResetLatch(&XLogCtl->recoveryWakeupLatch);
12909 					break;
12910 				}
12911 
12912 			default:
12913 				elog(ERROR, "unexpected WAL source %d", currentSource);
12914 		}
12915 
12916 		/*
12917 		 * Check for recovery pause here so that we can confirm more quickly
12918 		 * that a requested pause has actually taken effect.
12919 		 */
12920 		if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState !=
12921 			RECOVERY_NOT_PAUSED)
12922 			recoveryPausesHere(false);
12923 
12924 		/*
12925 		 * This possibly-long loop needs to handle interrupts of startup
12926 		 * process.
12927 		 */
12928 		HandleStartupProcInterrupts();
12929 	}
12930 
12931 	return false;				/* not reached */
12932 }
12933 
12934 /*
12935  * Set flag to signal the walreceiver to restart.  (The startup process calls
12936  * this on noticing a relevant configuration change.)
12937  */
12938 void
StartupRequestWalReceiverRestart(void)12939 StartupRequestWalReceiverRestart(void)
12940 {
12941 	if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
12942 	{
12943 		ereport(LOG,
12944 				(errmsg("WAL receiver process shutdown requested")));
12945 
12946 		pendingWalRcvRestart = true;
12947 	}
12948 }
12949 
12950 /*
12951  * Determine what log level should be used to report a corrupt WAL record
12952  * in the current WAL page, previously read by XLogPageRead().
12953  *
12954  * 'emode' is the error mode that would be used to report a file-not-found
12955  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
12956  * we're retrying the exact same record that we've tried previously, only
12957  * complain the first time to keep the noise down.  However, we only do when
12958  * reading from pg_wal, because we don't expect any invalid records in archive
12959  * or in records streamed from the primary. Files in the archive should be complete,
12960  * and we should never hit the end of WAL because we stop and wait for more WAL
12961  * to arrive before replaying it.
12962  *
12963  * NOTE: This function remembers the RecPtr value it was last called with,
12964  * to suppress repeated messages about the same record. Only call this when
12965  * you are about to ereport(), or you might cause a later message to be
12966  * erroneously suppressed.
12967  */
12968 static int
emode_for_corrupt_record(int emode,XLogRecPtr RecPtr)12969 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
12970 {
12971 	static XLogRecPtr lastComplaint = 0;
12972 
12973 	if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
12974 	{
12975 		if (RecPtr == lastComplaint)
12976 			emode = DEBUG1;
12977 		else
12978 			lastComplaint = RecPtr;
12979 	}
12980 	return emode;
12981 }
12982 
12983 /*
12984  * Has a standby promotion already been triggered?
12985  *
12986  * Unlike CheckForStandbyTrigger(), this works in any process
12987  * that's connected to shared memory.
12988  */
12989 bool
PromoteIsTriggered(void)12990 PromoteIsTriggered(void)
12991 {
12992 	/*
12993 	 * We check shared state each time only until a standby promotion is
12994 	 * triggered. We can't trigger a promotion again, so there's no need to
12995 	 * keep checking after the shared variable has once been seen true.
12996 	 */
12997 	if (LocalPromoteIsTriggered)
12998 		return true;
12999 
13000 	SpinLockAcquire(&XLogCtl->info_lck);
13001 	LocalPromoteIsTriggered = XLogCtl->SharedPromoteIsTriggered;
13002 	SpinLockRelease(&XLogCtl->info_lck);
13003 
13004 	return LocalPromoteIsTriggered;
13005 }
13006 
13007 static void
SetPromoteIsTriggered(void)13008 SetPromoteIsTriggered(void)
13009 {
13010 	SpinLockAcquire(&XLogCtl->info_lck);
13011 	XLogCtl->SharedPromoteIsTriggered = true;
13012 	SpinLockRelease(&XLogCtl->info_lck);
13013 
13014 	/*
13015 	 * Mark the recovery pause state as 'not paused' because the paused state
13016 	 * ends and promotion continues if a promotion is triggered while recovery
13017 	 * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
13018 	 * return 'paused' while a promotion is ongoing.
13019 	 */
13020 	SetRecoveryPause(false);
13021 
13022 	LocalPromoteIsTriggered = true;
13023 }
13024 
13025 /*
13026  * Check to see whether the user-specified trigger file exists and whether a
13027  * promote request has arrived.  If either condition holds, return true.
13028  */
13029 static bool
CheckForStandbyTrigger(void)13030 CheckForStandbyTrigger(void)
13031 {
13032 	struct stat stat_buf;
13033 
13034 	if (LocalPromoteIsTriggered)
13035 		return true;
13036 
13037 	if (IsPromoteSignaled() && CheckPromoteSignal())
13038 	{
13039 		ereport(LOG, (errmsg("received promote request")));
13040 		RemovePromoteSignalFiles();
13041 		ResetPromoteSignaled();
13042 		SetPromoteIsTriggered();
13043 		return true;
13044 	}
13045 
13046 	if (PromoteTriggerFile == NULL || strcmp(PromoteTriggerFile, "") == 0)
13047 		return false;
13048 
13049 	if (stat(PromoteTriggerFile, &stat_buf) == 0)
13050 	{
13051 		ereport(LOG,
13052 				(errmsg("promote trigger file found: %s", PromoteTriggerFile)));
13053 		unlink(PromoteTriggerFile);
13054 		SetPromoteIsTriggered();
13055 		return true;
13056 	}
13057 	else if (errno != ENOENT)
13058 		ereport(ERROR,
13059 				(errcode_for_file_access(),
13060 				 errmsg("could not stat promote trigger file \"%s\": %m",
13061 						PromoteTriggerFile)));
13062 
13063 	return false;
13064 }
13065 
13066 /*
13067  * Remove the files signaling a standby promotion request.
13068  */
13069 void
RemovePromoteSignalFiles(void)13070 RemovePromoteSignalFiles(void)
13071 {
13072 	unlink(PROMOTE_SIGNAL_FILE);
13073 }
13074 
13075 /*
13076  * Check to see if a promote request has arrived.
13077  */
13078 bool
CheckPromoteSignal(void)13079 CheckPromoteSignal(void)
13080 {
13081 	struct stat stat_buf;
13082 
13083 	if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
13084 		return true;
13085 
13086 	return false;
13087 }
13088 
13089 /*
13090  * Wake up startup process to replay newly arrived WAL, or to notice that
13091  * failover has been requested.
13092  */
13093 void
WakeupRecovery(void)13094 WakeupRecovery(void)
13095 {
13096 	SetLatch(&XLogCtl->recoveryWakeupLatch);
13097 }
13098 
13099 /*
13100  * Update the WalWriterSleeping flag.
13101  */
13102 void
SetWalWriterSleeping(bool sleeping)13103 SetWalWriterSleeping(bool sleeping)
13104 {
13105 	SpinLockAcquire(&XLogCtl->info_lck);
13106 	XLogCtl->WalWriterSleeping = sleeping;
13107 	SpinLockRelease(&XLogCtl->info_lck);
13108 }
13109 
13110 /*
13111  * Schedule a walreceiver wakeup in the main recovery loop.
13112  */
13113 void
XLogRequestWalReceiverReply(void)13114 XLogRequestWalReceiverReply(void)
13115 {
13116 	doRequestWalReceiverReply = true;
13117 }
13118