1 /*-------------------------------------------------------------------------
2  *
3  * xlog.c
4  *		PostgreSQL write-ahead log manager
5  *
6  *
7  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * src/backend/access/transam/xlog.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <math.h>
19 #include <time.h>
20 #include <fcntl.h>
21 #include <sys/stat.h>
22 #include <sys/time.h>
23 #include <unistd.h>
24 
25 #include "access/clog.h"
26 #include "access/commit_ts.h"
27 #include "access/heaptoast.h"
28 #include "access/multixact.h"
29 #include "access/rewriteheap.h"
30 #include "access/subtrans.h"
31 #include "access/timeline.h"
32 #include "access/transam.h"
33 #include "access/twophase.h"
34 #include "access/xact.h"
35 #include "access/xlog_internal.h"
36 #include "access/xlogarchive.h"
37 #include "access/xloginsert.h"
38 #include "access/xlogreader.h"
39 #include "access/xlogutils.h"
40 #include "catalog/catversion.h"
41 #include "catalog/pg_control.h"
42 #include "catalog/pg_database.h"
43 #include "commands/progress.h"
44 #include "commands/tablespace.h"
45 #include "common/controldata_utils.h"
46 #include "executor/instrument.h"
47 #include "miscadmin.h"
48 #include "pg_trace.h"
49 #include "pgstat.h"
50 #include "port/atomics.h"
51 #include "postmaster/bgwriter.h"
52 #include "postmaster/startup.h"
53 #include "postmaster/walwriter.h"
54 #include "replication/basebackup.h"
55 #include "replication/logical.h"
56 #include "replication/origin.h"
57 #include "replication/slot.h"
58 #include "replication/snapbuild.h"
59 #include "replication/walreceiver.h"
60 #include "replication/walsender.h"
61 #include "storage/bufmgr.h"
62 #include "storage/fd.h"
63 #include "storage/ipc.h"
64 #include "storage/large_object.h"
65 #include "storage/latch.h"
66 #include "storage/pmsignal.h"
67 #include "storage/predicate.h"
68 #include "storage/proc.h"
69 #include "storage/procarray.h"
70 #include "storage/reinit.h"
71 #include "storage/smgr.h"
72 #include "storage/spin.h"
73 #include "storage/sync.h"
74 #include "utils/builtins.h"
75 #include "utils/guc.h"
76 #include "utils/memutils.h"
77 #include "utils/ps_status.h"
78 #include "utils/relmapper.h"
79 #include "utils/snapmgr.h"
80 #include "utils/timestamp.h"
81 
extern uint32 bootstrap_data_checksum_version;

/* Unsupported old recovery command file names (relative to $PGDATA) */
#define RECOVERY_COMMAND_FILE	"recovery.conf"
#define RECOVERY_COMMAND_DONE	"recovery.done"

/* User-settable parameters */
int			max_wal_size_mb = 1024; /* 1 GB */
int			min_wal_size_mb = 80;	/* 80 MB */
int			wal_keep_size_mb = 0;	/* 0 means "keep no extra WAL segments" */
int			XLOGbuffers = -1;	/* -1 means "choose automatically at startup" */
int			XLogArchiveTimeout = 0; /* seconds; 0 disables forced segment switches */
int			XLogArchiveMode = ARCHIVE_MODE_OFF;
char	   *XLogArchiveCommand = NULL;
bool		EnableHotStandby = false;
bool		fullPageWrites = true;
bool		wal_log_hints = false;
bool		wal_compression = false;
char	   *wal_consistency_checking_string = NULL;
bool	   *wal_consistency_checking = NULL; /* per-rmgr flags parsed from the string above */
bool		wal_init_zero = true;
bool		wal_recycle = true;
bool		log_checkpoints = false;
int			sync_method = DEFAULT_SYNC_METHOD;
int			wal_level = WAL_LEVEL_MINIMAL;
int			CommitDelay = 0;	/* precommit delay in microseconds */
int			CommitSiblings = 5; /* # concurrent xacts needed to sleep */
int			wal_retrieve_retry_interval = 5000; /* milliseconds */
int			max_slot_wal_keep_size_mb = -1; /* -1 means "no limit" */

#ifdef WAL_DEBUG
bool		XLOG_DEBUG = false;
#endif

int			wal_segment_size = DEFAULT_XLOG_SEG_SIZE;

/*
 * Number of WAL insertion locks to use. A higher value allows more insertions
 * to happen concurrently, but adds some CPU overhead to flushing the WAL,
 * which needs to iterate all the locks.
 */
#define NUM_XLOGINSERT_LOCKS  8

/*
 * Max distance from last checkpoint, before triggering a new xlog-based
 * checkpoint.
 */
int			CheckPointSegments;

/* Estimated distance between checkpoints, in bytes */
static double CheckPointDistanceEstimate = 0;
static double PrevCheckPointDistance = 0;
134 
/*
 * GUC support
 *
 * Valid values for the wal_sync_method GUC.  Each entry is included only
 * when the platform provides the corresponding sync primitive.
 */
const struct config_enum_entry sync_method_options[] = {
	{"fsync", SYNC_METHOD_FSYNC, false},
#ifdef HAVE_FSYNC_WRITETHROUGH
	{"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
#endif
#ifdef HAVE_FDATASYNC
	{"fdatasync", SYNC_METHOD_FDATASYNC, false},
#endif
#ifdef OPEN_SYNC_FLAG
	{"open_sync", SYNC_METHOD_OPEN, false},
#endif
#ifdef OPEN_DATASYNC_FLAG
	{"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
#endif
	{NULL, 0, false}			/* list terminator */
};
154 
155 
/*
 * Although only "on", "off", and "always" are documented,
 * we accept all the likely variants of "on" and "off".
 * (Entries whose third field is true are the undocumented aliases.)
 */
const struct config_enum_entry archive_mode_options[] = {
	{"always", ARCHIVE_MODE_ALWAYS, false},
	{"on", ARCHIVE_MODE_ON, false},
	{"off", ARCHIVE_MODE_OFF, false},
	{"true", ARCHIVE_MODE_ON, true},
	{"false", ARCHIVE_MODE_OFF, true},
	{"yes", ARCHIVE_MODE_ON, true},
	{"no", ARCHIVE_MODE_OFF, true},
	{"1", ARCHIVE_MODE_ON, true},
	{"0", ARCHIVE_MODE_OFF, true},
	{NULL, 0, false}			/* list terminator */
};
172 
/* Valid values for the recovery_target_action GUC */
const struct config_enum_entry recovery_target_action_options[] = {
	{"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
	{"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
	{"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
	{NULL, 0, false}			/* list terminator */
};
179 
/*
 * Statistics for current checkpoint are collected in this global struct.
 * Because only the checkpointer or a stand-alone backend can perform
 * checkpoints, this will be unused in normal backends.
 */
CheckpointStatsData CheckpointStats;

/*
 * ThisTimeLineID will be same in all backends --- it identifies current
 * WAL timeline for the database system.
 */
TimeLineID	ThisTimeLineID = 0;

/*
 * Are we doing recovery from XLOG?
 *
 * This is only ever true in the startup process; it should be read as meaning
 * "this process is replaying WAL records", rather than "the system is in
 * recovery mode".  It should be examined primarily by functions that need
 * to act differently when called from a WAL redo function (e.g., to skip WAL
 * logging).  To check whether the system is in recovery regardless of which
 * process you're running in, use RecoveryInProgress() but only after shared
 * memory startup and lock initialization.
 */
bool		InRecovery = false;

/* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
HotStandbyState standbyState = STANDBY_DISABLED;

/* Start LSN of the last record read (or being read) by the startup process */
static XLogRecPtr LastRec;

/* Local copy of WalRcv->flushedUpto */
static XLogRecPtr flushedUpto = 0;
static TimeLineID receiveTLI = 0;

/*
 * abortedRecPtr is the start pointer of a broken record at end of WAL when
 * recovery completes; missingContrecPtr is the location of the first
 * contrecord that went missing.  See CreateOverwriteContrecordRecord for
 * details.
 */
static XLogRecPtr abortedRecPtr;
static XLogRecPtr missingContrecPtr;

/*
 * During recovery, lastFullPageWrites keeps track of full_page_writes that
 * the replayed WAL records indicate. It's initialized with full_page_writes
 * that the recovery starting checkpoint record indicates, and then updated
 * each time XLOG_FPW_CHANGE record is replayed.
 */
static bool lastFullPageWrites;

/*
 * Local copy of the state tracked by SharedRecoveryState in shared memory.
 * It is false if SharedRecoveryState is RECOVERY_STATE_DONE.  True actually
 * means "not known, need to check the shared state".
 */
static bool LocalRecoveryInProgress = true;

/*
 * Local copy of SharedHotStandbyActive variable. False actually means "not
 * known, need to check the shared state".
 */
static bool LocalHotStandbyActive = false;

/*
 * Local copy of SharedPromoteIsTriggered variable. False actually means "not
 * known, need to check the shared state".
 */
static bool LocalPromoteIsTriggered = false;

/*
 * Local state for XLogInsertAllowed():
 *		1: unconditionally allowed to insert XLOG
 *		0: unconditionally not allowed to insert XLOG
 *		-1: must check RecoveryInProgress(); disallow until it is false
 * Most processes start with -1 and transition to 1 after seeing that recovery
 * is not in progress.  But we can also force the value for special cases.
 * The coding in XLogInsertAllowed() depends on the first two of these states
 * being numerically the same as bool true and false.
 */
static int	LocalXLogInsertAllowed = -1;
262 
/*
 * When ArchiveRecoveryRequested is set, archive recovery was requested,
 * ie. signal files were present. When InArchiveRecovery is set, we are
 * currently recovering using offline XLOG archives. These variables are only
 * valid in the startup process.
 *
 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
 * currently performing crash recovery using only XLOG files in pg_wal, but
 * will switch to using offline XLOG archives as soon as we reach the end of
 * WAL in pg_wal.
 */
bool		ArchiveRecoveryRequested = false;
bool		InArchiveRecovery = false;

/* Which signal file(s) were present at startup? */
static bool standby_signal_file_found = false;
static bool recovery_signal_file_found = false;

/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;

/* Buffers dedicated to consistency checks of size BLCKSZ */
static char *replay_image_masked = NULL;
static char *master_image_masked = NULL;

/* options formerly taken from recovery.conf for archive recovery */
char	   *recoveryRestoreCommand = NULL;
char	   *recoveryEndCommand = NULL;
char	   *archiveCleanupCommand = NULL;
RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
bool		recoveryTargetInclusive = true;
int			recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
TransactionId recoveryTargetXid;
char	   *recovery_target_time_string;
static TimestampTz recoveryTargetTime;
const char *recoveryTargetName;
XLogRecPtr	recoveryTargetLSN;
int			recovery_min_apply_delay = 0;

/* options formerly taken from recovery.conf for XLOG streaming */
bool		StandbyModeRequested = false;
char	   *PrimaryConnInfo = NULL;
char	   *PrimarySlotName = NULL;
char	   *PromoteTriggerFile = NULL;
bool		wal_receiver_create_temp_slot = false;

/* are we currently in standby mode? */
bool		StandbyMode = false;

/* whether request for fast promotion has been made yet */
static bool fast_promote = false;

/*
 * if recoveryStopsBefore/After returns true, it saves information of the stop
 * point here
 */
static TransactionId recoveryStopXid;
static TimestampTz recoveryStopTime;
static XLogRecPtr recoveryStopLSN;
static char recoveryStopName[MAXFNAMELEN];
static bool recoveryStopAfter;
323 
/*
 * During normal operation, the only timeline we care about is ThisTimeLineID.
 * During recovery, however, things are more complicated.  To simplify life
 * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 * scan through the WAL history (that is, it is the line that was active when
 * the currently-scanned WAL record was generated).  We also need these
 * timeline values:
 *
 * recoveryTargetTimeLineGoal: what the user requested, if any
 *
 * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
 *
 * recoveryTargetTLI: the currently understood target timeline; changes
 *
 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
 * its known parents, newest first (so recoveryTargetTLI is always the
 * first list member).  Only these TLIs are expected to be seen in the WAL
 * segments we read, and indeed only these TLIs will be considered as
 * candidate WAL files to open at all.
 *
 * curFileTLI: the TLI appearing in the name of the current input WAL file.
 * (This is not necessarily the same as ThisTimeLineID, because we could
 * be scanning data that was copied from an ancestor timeline when the current
 * file was created.)  During a sequential scan we do not allow this value
 * to decrease.
 */
RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
TimeLineID	recoveryTargetTLIRequested = 0;
TimeLineID	recoveryTargetTLI = 0;
static List *expectedTLEs;
static TimeLineID curFileTLI;
355 
/*
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 * end+1 of the last record, and is reset when we end a top-level transaction,
 * or start a new one; so it can be used to tell if the current transaction has
 * created any XLOG records.
 *
 * While in parallel mode, this may not be fully up to date.  When committing,
 * a transaction can assume this covers all xlog records written either by the
 * user backend or by any parallel worker which was present at any point during
 * the transaction.  But when aborting, or when still in parallel mode, other
 * parallel backends may have written WAL records at later LSNs than the value
 * stored here.  The parallel leader advances its own copy, when necessary,
 * in WaitForParallelWorkersToFinish.
 */
XLogRecPtr	ProcLastRecPtr = InvalidXLogRecPtr;
XLogRecPtr	XactLastRecEnd = InvalidXLogRecPtr;
XLogRecPtr	XactLastCommitEnd = InvalidXLogRecPtr;
374 
/*
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 * (which is almost but not quite the same as a pointer to the most recent
 * CHECKPOINT record).  We update this from the shared-memory copy,
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 * hold an insertion lock).  See XLogInsertRecord for details.  We are also
 * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
 * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 * InitXLOGAccess.
 */
static XLogRecPtr RedoRecPtr;

/*
 * doPageWrites is this backend's local copy of (forcePageWrites ||
 * fullPageWrites).  It is used together with RedoRecPtr to decide whether
 * a full-page image of a page needs to be taken.
 */
static bool doPageWrites;

/* Has the recovery code requested a walreceiver wakeup? */
static bool doRequestWalReceiverReply;

/*
 * RedoStartLSN points to the checkpoint's REDO location which is specified
 * in a backup label file, backup history file or control file. In standby
 * mode, XLOG streaming usually starts from the position where an invalid
 * record was found. But if we fail to read even the initial checkpoint
 * record, we use the REDO location instead of the checkpoint location as
 * the start position of XLOG streaming. Otherwise we would have to jump
 * backwards to the REDO location after reading the checkpoint record,
 * because the REDO record can precede the checkpoint record.
 */
static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
408 
409 /*----------
410  * Shared-memory data structures for XLOG control
411  *
412  * LogwrtRqst indicates a byte position that we need to write and/or fsync
413  * the log up to (all records before that point must be written or fsynced).
414  * LogwrtResult indicates the byte positions we have already written/fsynced.
415  * These structs are identical but are declared separately to indicate their
416  * slightly different functions.
417  *
418  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
419  * WALWriteLock.  To update it, you need to hold both locks.  The point of
420  * this arrangement is that the value can be examined by code that already
421  * holds WALWriteLock without needing to grab info_lck as well.  In addition
422  * to the shared variable, each backend has a private copy of LogwrtResult,
423  * which is updated when convenient.
424  *
425  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
426  * (protected by info_lck), but we don't need to cache any copies of it.
427  *
428  * info_lck is only held long enough to read/update the protected variables,
429  * so it's a plain spinlock.  The other locks are held longer (potentially
430  * over I/O operations), so we use LWLocks for them.  These locks are:
431  *
432  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
433  * It is only held while initializing and changing the mapping.  If the
434  * contents of the buffer being replaced haven't been written yet, the mapping
435  * lock is released while the write is done, and reacquired afterwards.
436  *
437  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
438  * XLogFlush).
439  *
440  * ControlFileLock: must be held to read/update control file or create
441  * new log file.
442  *
443  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
444  * only one checkpointer at a time; currently, with all checkpoints done by
445  * the checkpointer, this is just pro forma).
446  *
447  *----------
448  */
449 
/* Byte positions up to which we have been requested to write/flush WAL */
typedef struct XLogwrtRqst
{
	XLogRecPtr	Write;			/* last byte + 1 to write out */
	XLogRecPtr	Flush;			/* last byte + 1 to flush */
} XLogwrtRqst;

/* Byte positions up to which WAL has actually been written/flushed */
typedef struct XLogwrtResult
{
	XLogRecPtr	Write;			/* last byte + 1 written out */
	XLogRecPtr	Flush;			/* last byte + 1 flushed */
} XLogwrtResult;
461 
/*
 * Inserting to WAL is protected by a small fixed number of WAL insertion
 * locks. To insert to the WAL, you must hold one of the locks - it doesn't
 * matter which one. To lock out other concurrent insertions, you must hold
 * all of them. Each WAL insertion lock consists of a lightweight lock, plus
 * an indicator of how far the insertion has progressed (insertingAt).
 *
 * The insertingAt values are read when a process wants to flush WAL from
 * the in-memory buffers to disk, to check that all the insertions to the
 * region the process is about to write out have finished. You could simply
 * wait for all currently in-progress insertions to finish, but the
 * insertingAt indicator allows you to ignore insertions to later in the WAL,
 * so that you only wait for the insertions that are modifying the buffers
 * you're about to write out.
 *
 * This isn't just an optimization. If all the WAL buffers are dirty, an
 * inserter that's holding a WAL insert lock might need to evict an old WAL
 * buffer, which requires flushing the WAL. If it's possible for an inserter
 * to block on another inserter unnecessarily, deadlock can arise when two
 * inserters holding a WAL insert lock wait for each other to finish their
 * insertion.
 *
 * Small WAL records that don't cross a page boundary never update the value,
 * the WAL record is just copied to the page and the lock is released. But
 * to avoid the deadlock-scenario explained above, the indicator is always
 * updated before sleeping while holding an insertion lock.
 *
 * lastImportantAt contains the LSN of the last important WAL record inserted
 * using a given lock. This value is used to detect if there has been
 * important WAL activity since the last time some action, like a checkpoint,
 * was performed - allowing to not repeat the action if not. The LSN is
 * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
 * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
 * records.  Tracking the WAL activity directly in WALInsertLock has the
 * advantage of not needing any additional locks to update the value.
 */
typedef struct
{
	LWLock		lock;
	XLogRecPtr	insertingAt;
	XLogRecPtr	lastImportantAt;
} WALInsertLock;

/*
 * All the WAL insertion locks are allocated as an array in shared memory. We
 * force the array stride to be a power of 2, which saves a few cycles in
 * indexing, but more importantly also ensures that individual slots don't
 * cross cache line boundaries. (Of course, we have to also ensure that the
 * array start address is suitably aligned.)
 */
typedef union WALInsertLockPadded
{
	WALInsertLock l;
	char		pad[PG_CACHE_LINE_SIZE];
} WALInsertLockPadded;
517 
/*
 * State of an exclusive backup, necessary to control concurrent activities
 * across sessions when working on exclusive backups.
 *
 * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
 * running, to be more precise pg_start_backup() is not being executed for
 * an exclusive backup and there is no exclusive backup in progress.
 * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
 * exclusive backup.
 * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
 * running and an exclusive backup is in progress. pg_stop_backup() is
 * needed to finish it.
 * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
 * exclusive backup.
 */
typedef enum ExclusiveBackupState
{
	EXCLUSIVE_BACKUP_NONE = 0,
	EXCLUSIVE_BACKUP_STARTING,
	EXCLUSIVE_BACKUP_IN_PROGRESS,
	EXCLUSIVE_BACKUP_STOPPING
} ExclusiveBackupState;

/*
 * Session status of running backup, used for sanity checks in SQL-callable
 * functions to start and stop backups.
 */
static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
546 
/*
 * Shared state data for WAL insertion.
 */
typedef struct XLogCtlInsert
{
	slock_t		insertpos_lck;	/* protects CurrBytePos and PrevBytePos */

	/*
	 * CurrBytePos is the end of reserved WAL. The next record will be
	 * inserted at that position. PrevBytePos is the start position of the
	 * previously inserted (or rather, reserved) record - it is copied to the
	 * prev-link of the next record. These are stored as "usable byte
	 * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
	 */
	uint64		CurrBytePos;
	uint64		PrevBytePos;

	/*
	 * Make sure the above heavily-contended spinlock and byte positions are
	 * on their own cache line. In particular, the RedoRecPtr and full page
	 * write variables below should be on a different cache line. They are
	 * read on every WAL insertion, but updated rarely, and we don't want
	 * those reads to steal the cache line containing Curr/PrevBytePos.
	 */
	char		pad[PG_CACHE_LINE_SIZE];

	/*
	 * fullPageWrites is the master copy used by all backends to determine
	 * whether to write full-page to WAL, instead of using process-local one.
	 * This is required because, when full_page_writes is changed by SIGHUP,
	 * we must WAL-log it before it actually affects WAL-logging by backends.
	 * Checkpointer sets at startup or after SIGHUP.
	 *
	 * To read these fields, you must hold an insertion lock. To modify them,
	 * you must hold ALL the locks.
	 */
	XLogRecPtr	RedoRecPtr;		/* current redo point for insertions */
	bool		forcePageWrites;	/* forcing full-page writes for PITR? */
	bool		fullPageWrites;

	/*
	 * exclusiveBackupState indicates the state of an exclusive backup (see
	 * comments of ExclusiveBackupState for more details). nonExclusiveBackups
	 * is a counter indicating the number of streaming base backups currently
	 * in progress. forcePageWrites is set to true when either of these is
	 * non-zero. lastBackupStart is the latest checkpoint redo location used
	 * as a starting point for an online backup.
	 */
	ExclusiveBackupState exclusiveBackupState;
	int			nonExclusiveBackups;
	XLogRecPtr	lastBackupStart;

	/*
	 * WAL insertion locks.
	 */
	WALInsertLockPadded *WALInsertLocks;
} XLogCtlInsert;
604 
/*
 * Total shared-memory state for XLOG.
 *
 * Unless noted otherwise on individual fields, members are protected by
 * info_lck (see the comment above the lock declaration at the end).
 */
typedef struct XLogCtlData
{
	XLogCtlInsert Insert;

	/* Protected by info_lck: */
	XLogwrtRqst LogwrtRqst;
	XLogRecPtr	RedoRecPtr;		/* a recent copy of Insert->RedoRecPtr */
	FullTransactionId ckptFullXid;	/* nextFullXid of latest checkpoint */
	XLogRecPtr	asyncXactLSN;	/* LSN of newest async commit/abort */
	XLogRecPtr	replicationSlotMinLSN;	/* oldest LSN needed by any slot */

	XLogSegNo	lastRemovedSegNo;	/* latest removed/recycled XLOG segment */

	/* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
	XLogRecPtr	unloggedLSN;
	slock_t		ulsn_lck;

	/* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
	pg_time_t	lastSegSwitchTime;
	XLogRecPtr	lastSegSwitchLSN;

	/*
	 * Protected by info_lck and WALWriteLock (you must hold either lock to
	 * read it, but both to update)
	 */
	XLogwrtResult LogwrtResult;

	/*
	 * Latest initialized page in the cache (last byte position + 1).
	 *
	 * To change the identity of a buffer (and InitializedUpTo), you need to
	 * hold WALBufMappingLock.  To change the identity of a buffer that's
	 * still dirty, the old page needs to be written out first, and for that
	 * you need WALWriteLock, and you need to ensure that there are no
	 * in-progress insertions to the page by calling
	 * WaitXLogInsertionsToFinish().
	 */
	XLogRecPtr	InitializedUpTo;

	/*
	 * These values do not change after startup, although the pointed-to pages
	 * and xlblocks values certainly do.  xlblocks values are protected by
	 * WALBufMappingLock.
	 */
	char	   *pages;			/* buffers for unwritten XLOG pages */
	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + XLOG_BLCKSZ */
	int			XLogCacheBlck;	/* highest allocated xlog buffer index */

	/*
	 * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
	 * If we created a new timeline when the system was started up,
	 * PrevTimeLineID is the old timeline's ID that we forked off from.
	 * Otherwise it's equal to ThisTimeLineID.
	 */
	TimeLineID	ThisTimeLineID;
	TimeLineID	PrevTimeLineID;

	/*
	 * SharedRecoveryState indicates if we're still in crash or archive
	 * recovery.  Protected by info_lck.
	 */
	RecoveryState SharedRecoveryState;

	/*
	 * SharedHotStandbyActive indicates if we allow hot standby queries to be
	 * run.  Protected by info_lck.
	 */
	bool		SharedHotStandbyActive;

	/*
	 * SharedPromoteIsTriggered indicates if a standby promotion has been
	 * triggered.  Protected by info_lck.
	 */
	bool		SharedPromoteIsTriggered;

	/*
	 * WalWriterSleeping indicates whether the WAL writer is currently in
	 * low-power mode (and hence should be nudged if an async commit occurs).
	 * Protected by info_lck.
	 */
	bool		WalWriterSleeping;

	/*
	 * recoveryWakeupLatch is used to wake up the startup process to continue
	 * WAL replay, if it is waiting for WAL to arrive or failover trigger file
	 * to appear.
	 */
	Latch		recoveryWakeupLatch;

	/*
	 * During recovery, we keep a copy of the latest checkpoint record here.
	 * lastCheckPointRecPtr points to start of checkpoint record and
	 * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
	 * checkpointer when it wants to create a restartpoint.
	 *
	 * Protected by info_lck.
	 */
	XLogRecPtr	lastCheckPointRecPtr;
	XLogRecPtr	lastCheckPointEndPtr;
	CheckPoint	lastCheckPoint;

	/*
	 * lastReplayedEndRecPtr points to end+1 of the last record successfully
	 * replayed. When we're currently replaying a record, ie. in a redo
	 * function, replayEndRecPtr points to the end+1 of the record being
	 * replayed, otherwise it's equal to lastReplayedEndRecPtr.
	 */
	XLogRecPtr	lastReplayedEndRecPtr;
	TimeLineID	lastReplayedTLI;
	XLogRecPtr	replayEndRecPtr;
	TimeLineID	replayEndTLI;
	/* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
	TimestampTz recoveryLastXTime;

	/*
	 * timestamp of when we started replaying the current chunk of WAL data,
	 * only relevant for replication or archive recovery
	 */
	TimestampTz currentChunkStartTime;
	/* Are we requested to pause recovery? */
	bool		recoveryPause;

	/*
	 * lastFpwDisableRecPtr points to the start of the last replayed
	 * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
	 */
	XLogRecPtr	lastFpwDisableRecPtr;

	slock_t		info_lck;		/* locks shared variables shown above */
} XLogCtlData;
738 
/* Pointer into shared memory to the XLOG control structure above */
static XLogCtlData *XLogCtl = NULL;

/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
static WALInsertLockPadded *WALInsertLocks = NULL;

/*
 * We maintain an image of pg_control in shared memory.
 */
static ControlFileData *ControlFile = NULL;
748 
/*
 * Calculate the amount of space left on the page after 'endptr'. Beware
 * multiple evaluation!  (The argument appears twice in the expansion, so
 * do not pass an expression with side effects.)
 */
#define INSERT_FREESPACE(endptr)	\
	(((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))

/* Macro to advance to next buffer index, wrapping around to 0. */
#define NextBufIdx(idx)		\
		(((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))

/*
 * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
 * would hold if it was in cache, the page containing 'recptr'.
 */
#define XLogRecPtrToBufIdx(recptr)	\
	(((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))

/*
 * These are the number of bytes in a WAL page usable for WAL data.
 */
#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)

/*
 * Convert values of GUCs measured in megabytes to equiv. segment count.
 * Rounds down.
 */
#define ConvertToXSegs(x, segsize)	XLogMBVarToSegs((x), (segsize))

/* The number of bytes in a WAL segment usable for WAL data. */
static int	UsableBytesInSegment;

/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above.
 */
static XLogwrtResult LogwrtResult = {0, 0};
786 
/*
 * Codes indicating where we got a WAL file from during recovery, or where
 * to attempt to get one.
 */
typedef enum
{
	XLOG_FROM_ANY = 0,			/* request to read WAL from any source */
	XLOG_FROM_ARCHIVE,			/* restored using restore_command */
	XLOG_FROM_PG_WAL,			/* existing file in pg_wal */
	XLOG_FROM_STREAM			/* streamed from master */
} XLogSource;

/* human-readable names for XLogSources, for debugging output */
/* NB: must stay in sync with the XLogSource enum order above */
static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
801 
/*
 * openLogFile is -1 or a kernel FD for an open log file segment.
 * openLogSegNo identifies the segment.  These variables are only used to
 * write the XLOG, and so will normally refer to the active segment.
 * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
 */
static int	openLogFile = -1;
static XLogSegNo openLogSegNo = 0;

/*
 * These variables are used similarly to the ones above, but for reading
 * the XLOG.  readOff is the offset of the page just read, readLen
 * indicates how much of it has been read into readBuf, and readSource
 * indicates where we got the currently open file from.
 * Note: we could use Reserve/ReleaseExternalFD to track consumption of
 * this FD too; but it doesn't currently seem worthwhile, since the XLOG is
 * not read by general-purpose sessions.
 */
static int	readFile = -1;
static XLogSegNo readSegNo = 0;
static uint32 readOff = 0;
static uint32 readLen = 0;
static XLogSource readSource = XLOG_FROM_ANY;
825 
826 /*
827  * Keeps track of which source we're currently reading from. This is
828  * different from readSource in that this is always set, even when we don't
829  * currently have a WAL file open. If lastSourceFailed is set, our last
830  * attempt to read from currentSource failed, and we should try another source
831  * next.
832  *
833  * pendingWalRcvRestart is set when a config change occurs that requires a
834  * walreceiver restart.  This is only valid in XLOG_FROM_STREAM state.
835  */
836 static XLogSource currentSource = XLOG_FROM_ANY;
837 static bool lastSourceFailed = false;
838 static bool pendingWalRcvRestart = false;
839 
840 typedef struct XLogPageReadPrivate
841 {
842 	int			emode;
843 	bool		fetching_ckpt;	/* are we fetching a checkpoint record? */
844 	bool		randAccess;
845 } XLogPageReadPrivate;
846 
847 /*
848  * These variables track when we last obtained some WAL data to process,
849  * and where we got it from.  (XLogReceiptSource is initially the same as
850  * readSource, but readSource gets reset to zero when we don't have data
851  * to process right now.  It is also different from currentSource, which
852  * also changes when we try to read from a source and fail, while
853  * XLogReceiptSource tracks where we last successfully read some WAL.)
854  */
855 static TimestampTz XLogReceiptTime = 0;
856 static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
857 
858 /* State information for XLOG reading */
859 static XLogRecPtr ReadRecPtr;	/* start of last record read */
860 static XLogRecPtr EndRecPtr;	/* end+1 of last record read */
861 
862 /*
863  * Local copies of equivalent fields in the control file.  When running
864  * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we
865  * expect to replay all the WAL available, and updateMinRecoveryPoint is
866  * switched to false to prevent any updates while replaying records.
867  * Those values are kept consistent as long as crash recovery runs.
868  */
869 static XLogRecPtr minRecoveryPoint;
870 static TimeLineID minRecoveryPointTLI;
871 static bool updateMinRecoveryPoint = true;
872 
873 /*
874  * Have we reached a consistent database state? In crash recovery, we have
875  * to replay all the WAL, so reachedConsistency is never set. During archive
876  * recovery, the database is consistent once minRecoveryPoint is reached.
877  */
878 bool		reachedConsistency = false;
879 
880 static bool InRedo = false;
881 
882 /* Have we launched bgwriter during recovery? */
883 static bool bgwriterLaunched = false;
884 
885 /* For WALInsertLockAcquire/Release functions */
886 static int	MyLockNo = 0;
887 static bool holdingAllLocks = false;
888 
889 #ifdef WAL_DEBUG
890 static MemoryContext walDebugCxt = NULL;
891 #endif
892 
/* Prototypes for private functions: recovery control and WAL file handling */
static void readRecoverySignalFile(void);
static void validateRecoveryParameters(void);
static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
static bool recoveryStopsBefore(XLogReaderState *record);
static bool recoveryStopsAfter(XLogReaderState *record);
static void recoveryPausesHere(bool endOfRecovery);
static bool recoveryApplyDelay(XLogReaderState *record);
static void SetLatestXTime(TimestampTz xtime);
static void SetCurrentChunkStartTime(TimestampTz xtime);
static void CheckRequiredParameterValues(void);
static void XLogReportParameters(void);
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
								TimeLineID prevTLI);
static void VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec,
									  XLogReaderState *state);
static void LocalSetXLogInsertAllowed(void);
static void CreateEndOfRecoveryRecord(void);
static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);

/* Prototypes for private functions: WAL buffer and segment I/O */
static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
static bool XLogCheckpointNeeded(XLogSegNo new_segno);
static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
								   bool find_free, XLogSegNo max_segno,
								   bool use_lock);
static int	XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
						 XLogSource source, bool notfoundOk);
static int	XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
static int	XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
						 int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
										bool fetching_ckpt, XLogRecPtr tliRecPtr);
static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
static void XLogFileClose(void);
static void PreallocXlogFiles(XLogRecPtr endptr);
static void RemoveTempXlogFiles(void);
static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr);
static void RemoveXlogFile(const char *segname, XLogRecPtr lastredoptr, XLogRecPtr endptr);
static void UpdateLastRemovedPtr(char *filename);
static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
static XLogRecord *ReadRecord(XLogReaderState *xlogreader,
							  int emode, bool fetching_ckpt);
static void CheckRecoveryConsistency(void);
static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
										XLogRecPtr RecPtr, int whichChkpt, bool report);
static bool rescanLatestTimeLine(void);
static void InitControlFile(uint64 sysidentifier);
static void WriteControlFile(void);
static void ReadControlFile(void);
static char *str_time(pg_time_t tnow);
static void SetPromoteIsTriggered(void);
static bool CheckForStandbyTrigger(void);

/* Prototypes for private functions: debugging output and backup support */
#ifdef WAL_DEBUG
static void xlog_outrec(StringInfo buf, XLogReaderState *record);
#endif
static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
static void pg_start_backup_callback(int code, Datum arg);
static void pg_stop_backup_callback(int code, Datum arg);
static bool read_backup_label(XLogRecPtr *checkPointLoc,
							  bool *backupEndRequired, bool *backupFromStandby);
static bool read_tablespace_map(List **tablespaces);

static void rm_redo_error_callback(void *arg);
static int	get_sync_bit(int method);

/* Prototypes for private functions: WAL record insertion */
static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
								XLogRecData *rdata,
								XLogRecPtr StartPos, XLogRecPtr EndPos);
static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
									  XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
							  XLogRecPtr *PrevPtr);
static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
static char *GetXLogBuffer(XLogRecPtr ptr);
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
static void checkXLogConsistency(XLogReaderState *record);

/* Prototypes for private functions: WAL insertion lock management */
static void WALInsertLockAcquire(void);
static void WALInsertLockAcquireExclusive(void);
static void WALInsertLockRelease(void);
static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
982 
983 /*
984  * Insert an XLOG record represented by an already-constructed chain of data
985  * chunks.  This is a low-level routine; to construct the WAL record header
986  * and data, use the higher-level routines in xloginsert.c.
987  *
988  * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
989  * WAL record applies to, that were not included in the record as full page
990  * images.  If fpw_lsn <= RedoRecPtr, the function does not perform the
991  * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
992  * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
993  * record is always inserted.
994  *
995  * 'flags' gives more in-depth control on the record being inserted. See
996  * XLogSetRecordFlags() for details.
997  *
998  * The first XLogRecData in the chain must be for the record header, and its
999  * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
1000  * xl_crc fields in the header, the rest of the header must already be filled
1001  * by the caller.
1002  *
1003  * Returns XLOG pointer to end of record (beginning of next record).
1004  * This can be used as LSN for data pages affected by the logged action.
1005  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
1006  * before the data page can be written out.  This implements the basic
1007  * WAL rule "write the log before the data".)
1008  */
1009 XLogRecPtr
XLogInsertRecord(XLogRecData * rdata,XLogRecPtr fpw_lsn,uint8 flags,int num_fpi)1010 XLogInsertRecord(XLogRecData *rdata,
1011 				 XLogRecPtr fpw_lsn,
1012 				 uint8 flags,
1013 				 int num_fpi)
1014 {
1015 	XLogCtlInsert *Insert = &XLogCtl->Insert;
1016 	pg_crc32c	rdata_crc;
1017 	bool		inserted;
1018 	XLogRecord *rechdr = (XLogRecord *) rdata->data;
1019 	uint8		info = rechdr->xl_info & ~XLR_INFO_MASK;
1020 	bool		isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
1021 							   info == XLOG_SWITCH);
1022 	XLogRecPtr	StartPos;
1023 	XLogRecPtr	EndPos;
1024 	bool		prevDoPageWrites = doPageWrites;
1025 
1026 	/* we assume that all of the record header is in the first chunk */
1027 	Assert(rdata->len >= SizeOfXLogRecord);
1028 
1029 	/* cross-check on whether we should be here or not */
1030 	if (!XLogInsertAllowed())
1031 		elog(ERROR, "cannot make new WAL entries during recovery");
1032 
1033 	/*----------
1034 	 *
1035 	 * We have now done all the preparatory work we can without holding a
1036 	 * lock or modifying shared state. From here on, inserting the new WAL
1037 	 * record to the shared WAL buffer cache is a two-step process:
1038 	 *
1039 	 * 1. Reserve the right amount of space from the WAL. The current head of
1040 	 *	  reserved space is kept in Insert->CurrBytePos, and is protected by
1041 	 *	  insertpos_lck.
1042 	 *
1043 	 * 2. Copy the record to the reserved WAL space. This involves finding the
1044 	 *	  correct WAL buffer containing the reserved space, and copying the
1045 	 *	  record in place. This can be done concurrently in multiple processes.
1046 	 *
1047 	 * To keep track of which insertions are still in-progress, each concurrent
1048 	 * inserter acquires an insertion lock. In addition to just indicating that
1049 	 * an insertion is in progress, the lock tells others how far the inserter
1050 	 * has progressed. There is a small fixed number of insertion locks,
1051 	 * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
1052 	 * boundary, it updates the value stored in the lock to the how far it has
1053 	 * inserted, to allow the previous buffer to be flushed.
1054 	 *
1055 	 * Holding onto an insertion lock also protects RedoRecPtr and
1056 	 * fullPageWrites from changing until the insertion is finished.
1057 	 *
1058 	 * Step 2 can usually be done completely in parallel. If the required WAL
1059 	 * page is not initialized yet, you have to grab WALBufMappingLock to
1060 	 * initialize it, but the WAL writer tries to do that ahead of insertions
1061 	 * to avoid that from happening in the critical path.
1062 	 *
1063 	 *----------
1064 	 */
1065 	START_CRIT_SECTION();
1066 	if (isLogSwitch)
1067 		WALInsertLockAcquireExclusive();
1068 	else
1069 		WALInsertLockAcquire();
1070 
1071 	/*
1072 	 * Check to see if my copy of RedoRecPtr is out of date. If so, may have
1073 	 * to go back and have the caller recompute everything. This can only
1074 	 * happen just after a checkpoint, so it's better to be slow in this case
1075 	 * and fast otherwise.
1076 	 *
1077 	 * Also check to see if fullPageWrites or forcePageWrites was just turned
1078 	 * on; if we weren't already doing full-page writes then go back and
1079 	 * recompute.
1080 	 *
1081 	 * If we aren't doing full-page writes then RedoRecPtr doesn't actually
1082 	 * affect the contents of the XLOG record, so we'll update our local copy
1083 	 * but not force a recomputation.  (If doPageWrites was just turned off,
1084 	 * we could recompute the record without full pages, but we choose not to
1085 	 * bother.)
1086 	 */
1087 	if (RedoRecPtr != Insert->RedoRecPtr)
1088 	{
1089 		Assert(RedoRecPtr < Insert->RedoRecPtr);
1090 		RedoRecPtr = Insert->RedoRecPtr;
1091 	}
1092 	doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
1093 
1094 	if (doPageWrites &&
1095 		(!prevDoPageWrites ||
1096 		 (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
1097 	{
1098 		/*
1099 		 * Oops, some buffer now needs to be backed up that the caller didn't
1100 		 * back up.  Start over.
1101 		 */
1102 		WALInsertLockRelease();
1103 		END_CRIT_SECTION();
1104 		return InvalidXLogRecPtr;
1105 	}
1106 
1107 	/*
1108 	 * Reserve space for the record in the WAL. This also sets the xl_prev
1109 	 * pointer.
1110 	 */
1111 	if (isLogSwitch)
1112 		inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
1113 	else
1114 	{
1115 		ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
1116 								  &rechdr->xl_prev);
1117 		inserted = true;
1118 	}
1119 
1120 	if (inserted)
1121 	{
1122 		/*
1123 		 * Now that xl_prev has been filled in, calculate CRC of the record
1124 		 * header.
1125 		 */
1126 		rdata_crc = rechdr->xl_crc;
1127 		COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
1128 		FIN_CRC32C(rdata_crc);
1129 		rechdr->xl_crc = rdata_crc;
1130 
1131 		/*
1132 		 * All the record data, including the header, is now ready to be
1133 		 * inserted. Copy the record in the space reserved.
1134 		 */
1135 		CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
1136 							StartPos, EndPos);
1137 
1138 		/*
1139 		 * Unless record is flagged as not important, update LSN of last
1140 		 * important record in the current slot. When holding all locks, just
1141 		 * update the first one.
1142 		 */
1143 		if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
1144 		{
1145 			int			lockno = holdingAllLocks ? 0 : MyLockNo;
1146 
1147 			WALInsertLocks[lockno].l.lastImportantAt = StartPos;
1148 		}
1149 	}
1150 	else
1151 	{
1152 		/*
1153 		 * This was an xlog-switch record, but the current insert location was
1154 		 * already exactly at the beginning of a segment, so there was no need
1155 		 * to do anything.
1156 		 */
1157 	}
1158 
1159 	/*
1160 	 * Done! Let others know that we're finished.
1161 	 */
1162 	WALInsertLockRelease();
1163 
1164 	MarkCurrentTransactionIdLoggedIfAny();
1165 
1166 	END_CRIT_SECTION();
1167 
1168 	/*
1169 	 * Update shared LogwrtRqst.Write, if we crossed page boundary.
1170 	 */
1171 	if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1172 	{
1173 		SpinLockAcquire(&XLogCtl->info_lck);
1174 		/* advance global request to include new block(s) */
1175 		if (XLogCtl->LogwrtRqst.Write < EndPos)
1176 			XLogCtl->LogwrtRqst.Write = EndPos;
1177 		/* update local result copy while I have the chance */
1178 		LogwrtResult = XLogCtl->LogwrtResult;
1179 		SpinLockRelease(&XLogCtl->info_lck);
1180 	}
1181 
1182 	/*
1183 	 * If this was an XLOG_SWITCH record, flush the record and the empty
1184 	 * padding space that fills the rest of the segment, and perform
1185 	 * end-of-segment actions (eg, notifying archiver).
1186 	 */
1187 	if (isLogSwitch)
1188 	{
1189 		TRACE_POSTGRESQL_WAL_SWITCH();
1190 		XLogFlush(EndPos);
1191 
1192 		/*
1193 		 * Even though we reserved the rest of the segment for us, which is
1194 		 * reflected in EndPos, we return a pointer to just the end of the
1195 		 * xlog-switch record.
1196 		 */
1197 		if (inserted)
1198 		{
1199 			EndPos = StartPos + SizeOfXLogRecord;
1200 			if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1201 			{
1202 				uint64		offset = XLogSegmentOffset(EndPos, wal_segment_size);
1203 
1204 				if (offset == EndPos % XLOG_BLCKSZ)
1205 					EndPos += SizeOfXLogLongPHD;
1206 				else
1207 					EndPos += SizeOfXLogShortPHD;
1208 			}
1209 		}
1210 	}
1211 
1212 #ifdef WAL_DEBUG
1213 	if (XLOG_DEBUG)
1214 	{
1215 		static XLogReaderState *debug_reader = NULL;
1216 		StringInfoData buf;
1217 		StringInfoData recordBuf;
1218 		char	   *errormsg = NULL;
1219 		MemoryContext oldCxt;
1220 
1221 		oldCxt = MemoryContextSwitchTo(walDebugCxt);
1222 
1223 		initStringInfo(&buf);
1224 		appendStringInfo(&buf, "INSERT @ %X/%X: ",
1225 						 (uint32) (EndPos >> 32), (uint32) EndPos);
1226 
1227 		/*
1228 		 * We have to piece together the WAL record data from the XLogRecData
1229 		 * entries, so that we can pass it to the rm_desc function as one
1230 		 * contiguous chunk.
1231 		 */
1232 		initStringInfo(&recordBuf);
1233 		for (; rdata != NULL; rdata = rdata->next)
1234 			appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
1235 
1236 		if (!debug_reader)
1237 			debug_reader = XLogReaderAllocate(wal_segment_size, NULL,
1238 											  XL_ROUTINE(), NULL);
1239 
1240 		if (!debug_reader)
1241 		{
1242 			appendStringInfoString(&buf, "error decoding record: out of memory");
1243 		}
1244 		else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
1245 								   &errormsg))
1246 		{
1247 			appendStringInfo(&buf, "error decoding record: %s",
1248 							 errormsg ? errormsg : "no error message");
1249 		}
1250 		else
1251 		{
1252 			appendStringInfoString(&buf, " - ");
1253 			xlog_outdesc(&buf, debug_reader);
1254 		}
1255 		elog(LOG, "%s", buf.data);
1256 
1257 		pfree(buf.data);
1258 		pfree(recordBuf.data);
1259 		MemoryContextSwitchTo(oldCxt);
1260 	}
1261 #endif
1262 
1263 	/*
1264 	 * Update our global variables
1265 	 */
1266 	ProcLastRecPtr = StartPos;
1267 	XactLastRecEnd = EndPos;
1268 
1269 	/* Report WAL traffic to the instrumentation. */
1270 	if (inserted)
1271 	{
1272 		pgWalUsage.wal_bytes += rechdr->xl_tot_len;
1273 		pgWalUsage.wal_records++;
1274 		pgWalUsage.wal_fpi += num_fpi;
1275 	}
1276 
1277 	return EndPos;
1278 }
1279 
1280 /*
1281  * Reserves the right amount of space for a record of given size from the WAL.
1282  * *StartPos is set to the beginning of the reserved section, *EndPos to
1283  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1284  * used to set the xl_prev of this record.
1285  *
1286  * This is the performance critical part of XLogInsert that must be serialized
1287  * across backends. The rest can happen mostly in parallel. Try to keep this
1288  * section as short as possible, insertpos_lck can be heavily contended on a
1289  * busy system.
1290  *
1291  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1292  * where we actually copy the record to the reserved space.
1293  */
1294 static void
ReserveXLogInsertLocation(int size,XLogRecPtr * StartPos,XLogRecPtr * EndPos,XLogRecPtr * PrevPtr)1295 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1296 						  XLogRecPtr *PrevPtr)
1297 {
1298 	XLogCtlInsert *Insert = &XLogCtl->Insert;
1299 	uint64		startbytepos;
1300 	uint64		endbytepos;
1301 	uint64		prevbytepos;
1302 
1303 	size = MAXALIGN(size);
1304 
1305 	/* All (non xlog-switch) records should contain data. */
1306 	Assert(size > SizeOfXLogRecord);
1307 
1308 	/*
1309 	 * The duration the spinlock needs to be held is minimized by minimizing
1310 	 * the calculations that have to be done while holding the lock. The
1311 	 * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1312 	 * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1313 	 * page headers. The mapping between "usable" byte positions and physical
1314 	 * positions (XLogRecPtrs) can be done outside the locked region, and
1315 	 * because the usable byte position doesn't include any headers, reserving
1316 	 * X bytes from WAL is almost as simple as "CurrBytePos += X".
1317 	 */
1318 	SpinLockAcquire(&Insert->insertpos_lck);
1319 
1320 	startbytepos = Insert->CurrBytePos;
1321 	endbytepos = startbytepos + size;
1322 	prevbytepos = Insert->PrevBytePos;
1323 	Insert->CurrBytePos = endbytepos;
1324 	Insert->PrevBytePos = startbytepos;
1325 
1326 	SpinLockRelease(&Insert->insertpos_lck);
1327 
1328 	*StartPos = XLogBytePosToRecPtr(startbytepos);
1329 	*EndPos = XLogBytePosToEndRecPtr(endbytepos);
1330 	*PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1331 
1332 	/*
1333 	 * Check that the conversions between "usable byte positions" and
1334 	 * XLogRecPtrs work consistently in both directions.
1335 	 */
1336 	Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1337 	Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1338 	Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1339 }
1340 
1341 /*
1342  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1343  *
1344  * A log-switch record is handled slightly differently. The rest of the
1345  * segment will be reserved for this insertion, as indicated by the returned
1346  * *EndPos value. However, if we are already at the beginning of the current
1347  * segment, *StartPos and *EndPos are set to the current location without
1348  * reserving any space, and the function returns false.
1349 */
1350 static bool
ReserveXLogSwitch(XLogRecPtr * StartPos,XLogRecPtr * EndPos,XLogRecPtr * PrevPtr)1351 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1352 {
1353 	XLogCtlInsert *Insert = &XLogCtl->Insert;
1354 	uint64		startbytepos;
1355 	uint64		endbytepos;
1356 	uint64		prevbytepos;
1357 	uint32		size = MAXALIGN(SizeOfXLogRecord);
1358 	XLogRecPtr	ptr;
1359 	uint32		segleft;
1360 
1361 	/*
1362 	 * These calculations are a bit heavy-weight to be done while holding a
1363 	 * spinlock, but since we're holding all the WAL insertion locks, there
1364 	 * are no other inserters competing for it. GetXLogInsertRecPtr() does
1365 	 * compete for it, but that's not called very frequently.
1366 	 */
1367 	SpinLockAcquire(&Insert->insertpos_lck);
1368 
1369 	startbytepos = Insert->CurrBytePos;
1370 
1371 	ptr = XLogBytePosToEndRecPtr(startbytepos);
1372 	if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
1373 	{
1374 		SpinLockRelease(&Insert->insertpos_lck);
1375 		*EndPos = *StartPos = ptr;
1376 		return false;
1377 	}
1378 
1379 	endbytepos = startbytepos + size;
1380 	prevbytepos = Insert->PrevBytePos;
1381 
1382 	*StartPos = XLogBytePosToRecPtr(startbytepos);
1383 	*EndPos = XLogBytePosToEndRecPtr(endbytepos);
1384 
1385 	segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
1386 	if (segleft != wal_segment_size)
1387 	{
1388 		/* consume the rest of the segment */
1389 		*EndPos += segleft;
1390 		endbytepos = XLogRecPtrToBytePos(*EndPos);
1391 	}
1392 	Insert->CurrBytePos = endbytepos;
1393 	Insert->PrevBytePos = startbytepos;
1394 
1395 	SpinLockRelease(&Insert->insertpos_lck);
1396 
1397 	*PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1398 
1399 	Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
1400 	Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1401 	Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1402 	Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1403 
1404 	return true;
1405 }
1406 
1407 /*
1408  * Checks whether the current buffer page and backup page stored in the
1409  * WAL record are consistent or not. Before comparing the two pages, a
1410  * masking can be applied to the pages to ignore certain areas like hint bits,
1411  * unused space between pd_lower and pd_upper among other things. This
1412  * function should be called once WAL replay has been completed for a
1413  * given record.
1414  */
1415 static void
checkXLogConsistency(XLogReaderState * record)1416 checkXLogConsistency(XLogReaderState *record)
1417 {
1418 	RmgrId		rmid = XLogRecGetRmid(record);
1419 	RelFileNode rnode;
1420 	ForkNumber	forknum;
1421 	BlockNumber blkno;
1422 	int			block_id;
1423 
1424 	/* Records with no backup blocks have no need for consistency checks. */
1425 	if (!XLogRecHasAnyBlockRefs(record))
1426 		return;
1427 
1428 	Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
1429 
1430 	for (block_id = 0; block_id <= record->max_block_id; block_id++)
1431 	{
1432 		Buffer		buf;
1433 		Page		page;
1434 
1435 		if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
1436 		{
1437 			/*
1438 			 * WAL record doesn't contain a block reference with the given id.
1439 			 * Do nothing.
1440 			 */
1441 			continue;
1442 		}
1443 
1444 		Assert(XLogRecHasBlockImage(record, block_id));
1445 
1446 		if (XLogRecBlockImageApply(record, block_id))
1447 		{
1448 			/*
1449 			 * WAL record has already applied the page, so bypass the
1450 			 * consistency check as that would result in comparing the full
1451 			 * page stored in the record with itself.
1452 			 */
1453 			continue;
1454 		}
1455 
1456 		/*
1457 		 * Read the contents from the current buffer and store it in a
1458 		 * temporary page.
1459 		 */
1460 		buf = XLogReadBufferExtended(rnode, forknum, blkno,
1461 									 RBM_NORMAL_NO_LOG);
1462 		if (!BufferIsValid(buf))
1463 			continue;
1464 
1465 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1466 		page = BufferGetPage(buf);
1467 
1468 		/*
1469 		 * Take a copy of the local page where WAL has been applied to have a
1470 		 * comparison base before masking it...
1471 		 */
1472 		memcpy(replay_image_masked, page, BLCKSZ);
1473 
1474 		/* No need for this page anymore now that a copy is in. */
1475 		UnlockReleaseBuffer(buf);
1476 
1477 		/*
1478 		 * If the block LSN is already ahead of this WAL record, we can't
1479 		 * expect contents to match.  This can happen if recovery is
1480 		 * restarted.
1481 		 */
1482 		if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
1483 			continue;
1484 
1485 		/*
1486 		 * Read the contents from the backup copy, stored in WAL record and
1487 		 * store it in a temporary page. There is no need to allocate a new
1488 		 * page here, a local buffer is fine to hold its contents and a mask
1489 		 * can be directly applied on it.
1490 		 */
1491 		if (!RestoreBlockImage(record, block_id, master_image_masked))
1492 			elog(ERROR, "failed to restore block image");
1493 
1494 		/*
1495 		 * If masking function is defined, mask both the master and replay
1496 		 * images
1497 		 */
1498 		if (RmgrTable[rmid].rm_mask != NULL)
1499 		{
1500 			RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
1501 			RmgrTable[rmid].rm_mask(master_image_masked, blkno);
1502 		}
1503 
1504 		/* Time to compare the master and replay images. */
1505 		if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
1506 		{
1507 			elog(FATAL,
1508 				 "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
1509 				 rnode.spcNode, rnode.dbNode, rnode.relNode,
1510 				 forknum, blkno);
1511 		}
1512 	}
1513 }
1514 
1515 /*
1516  * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
1517  * area in the WAL.
1518  */
1519 static void
CopyXLogRecordToWAL(int write_len,bool isLogSwitch,XLogRecData * rdata,XLogRecPtr StartPos,XLogRecPtr EndPos)1520 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1521 					XLogRecPtr StartPos, XLogRecPtr EndPos)
1522 {
1523 	char	   *currpos;
1524 	int			freespace;
1525 	int			written;
1526 	XLogRecPtr	CurrPos;
1527 	XLogPageHeader pagehdr;
1528 
1529 	/*
1530 	 * Get a pointer to the right place in the right WAL buffer to start
1531 	 * inserting to.
1532 	 */
1533 	CurrPos = StartPos;
1534 	currpos = GetXLogBuffer(CurrPos);
1535 	freespace = INSERT_FREESPACE(CurrPos);
1536 
1537 	/*
1538 	 * there should be enough space for at least the first field (xl_tot_len)
1539 	 * on this page.
1540 	 */
1541 	Assert(freespace >= sizeof(uint32));
1542 
1543 	/* Copy record data */
1544 	written = 0;
1545 	while (rdata != NULL)
1546 	{
1547 		char	   *rdata_data = rdata->data;
1548 		int			rdata_len = rdata->len;
1549 
1550 		while (rdata_len > freespace)
1551 		{
1552 			/*
1553 			 * Write what fits on this page, and continue on the next page.
1554 			 */
1555 			Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1556 			memcpy(currpos, rdata_data, freespace);
1557 			rdata_data += freespace;
1558 			rdata_len -= freespace;
1559 			written += freespace;
1560 			CurrPos += freespace;
1561 
1562 			/*
1563 			 * Get pointer to beginning of next page, and set the xlp_rem_len
1564 			 * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1565 			 *
1566 			 * It's safe to set the contrecord flag and xlp_rem_len without a
1567 			 * lock on the page. All the other flags were already set when the
1568 			 * page was initialized, in AdvanceXLInsertBuffer, and we're the
1569 			 * only backend that needs to set the contrecord flag.
1570 			 */
1571 			currpos = GetXLogBuffer(CurrPos);
1572 			pagehdr = (XLogPageHeader) currpos;
1573 			pagehdr->xlp_rem_len = write_len - written;
1574 			pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1575 
1576 			/* skip over the page header */
1577 			if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
1578 			{
1579 				CurrPos += SizeOfXLogLongPHD;
1580 				currpos += SizeOfXLogLongPHD;
1581 			}
1582 			else
1583 			{
1584 				CurrPos += SizeOfXLogShortPHD;
1585 				currpos += SizeOfXLogShortPHD;
1586 			}
1587 			freespace = INSERT_FREESPACE(CurrPos);
1588 		}
1589 
1590 		Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1591 		memcpy(currpos, rdata_data, rdata_len);
1592 		currpos += rdata_len;
1593 		CurrPos += rdata_len;
1594 		freespace -= rdata_len;
1595 		written += rdata_len;
1596 
1597 		rdata = rdata->next;
1598 	}
1599 	Assert(written == write_len);
1600 
1601 	/*
1602 	 * If this was an xlog-switch, it's not enough to write the switch record,
1603 	 * we also have to consume all the remaining space in the WAL segment.  We
1604 	 * have already reserved that space, but we need to actually fill it.
1605 	 */
1606 	if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
1607 	{
1608 		/* An xlog-switch record doesn't contain any data besides the header */
1609 		Assert(write_len == SizeOfXLogRecord);
1610 
1611 		/* Assert that we did reserve the right amount of space */
1612 		Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
1613 
1614 		/* Use up all the remaining space on the current page */
1615 		CurrPos += freespace;
1616 
1617 		/*
1618 		 * Cause all remaining pages in the segment to be flushed, leaving the
1619 		 * XLog position where it should be, at the start of the next segment.
1620 		 * We do this one page at a time, to make sure we don't deadlock
1621 		 * against ourselves if wal_buffers < wal_segment_size.
1622 		 */
1623 		while (CurrPos < EndPos)
1624 		{
1625 			/*
1626 			 * The minimal action to flush the page would be to call
1627 			 * WALInsertLockUpdateInsertingAt(CurrPos) followed by
1628 			 * AdvanceXLInsertBuffer(...).  The page would be left initialized
1629 			 * mostly to zeros, except for the page header (always the short
1630 			 * variant, as this is never a segment's first page).
1631 			 *
1632 			 * The large vistas of zeros are good for compressibility, but the
1633 			 * headers interrupting them every XLOG_BLCKSZ (with values that
1634 			 * differ from page to page) are not.  The effect varies with
1635 			 * compression tool, but bzip2 for instance compresses about an
1636 			 * order of magnitude worse if those headers are left in place.
1637 			 *
1638 			 * Rather than complicating AdvanceXLInsertBuffer itself (which is
1639 			 * called in heavily-loaded circumstances as well as this lightly-
1640 			 * loaded one) with variant behavior, we just use GetXLogBuffer
1641 			 * (which itself calls the two methods we need) to get the pointer
1642 			 * and zero most of the page.  Then we just zero the page header.
1643 			 */
1644 			currpos = GetXLogBuffer(CurrPos);
1645 			MemSet(currpos, 0, SizeOfXLogShortPHD);
1646 
1647 			CurrPos += XLOG_BLCKSZ;
1648 		}
1649 	}
1650 	else
1651 	{
1652 		/* Align the end position, so that the next record starts aligned */
1653 		CurrPos = MAXALIGN64(CurrPos);
1654 	}
1655 
1656 	if (CurrPos != EndPos)
1657 		elog(PANIC, "space reserved for WAL record does not match what was written");
1658 }
1659 
1660 /*
1661  * Acquire a WAL insertion lock, for inserting to WAL.
1662  */
1663 static void
WALInsertLockAcquire(void)1664 WALInsertLockAcquire(void)
1665 {
1666 	bool		immed;
1667 
1668 	/*
1669 	 * It doesn't matter which of the WAL insertion locks we acquire, so try
1670 	 * the one we used last time.  If the system isn't particularly busy, it's
1671 	 * a good bet that it's still available, and it's good to have some
1672 	 * affinity to a particular lock so that you don't unnecessarily bounce
1673 	 * cache lines between processes when there's no contention.
1674 	 *
1675 	 * If this is the first time through in this backend, pick a lock
1676 	 * (semi-)randomly.  This allows the locks to be used evenly if you have a
1677 	 * lot of very short connections.
1678 	 */
1679 	static int	lockToTry = -1;
1680 
1681 	if (lockToTry == -1)
1682 		lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
1683 	MyLockNo = lockToTry;
1684 
1685 	/*
1686 	 * The insertingAt value is initially set to 0, as we don't know our
1687 	 * insert location yet.
1688 	 */
1689 	immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
1690 	if (!immed)
1691 	{
1692 		/*
1693 		 * If we couldn't get the lock immediately, try another lock next
1694 		 * time.  On a system with more insertion locks than concurrent
1695 		 * inserters, this causes all the inserters to eventually migrate to a
1696 		 * lock that no-one else is using.  On a system with more inserters
1697 		 * than locks, it still helps to distribute the inserters evenly
1698 		 * across the locks.
1699 		 */
1700 		lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
1701 	}
1702 }
1703 
1704 /*
1705  * Acquire all WAL insertion locks, to prevent other backends from inserting
1706  * to WAL.
1707  */
1708 static void
WALInsertLockAcquireExclusive(void)1709 WALInsertLockAcquireExclusive(void)
1710 {
1711 	int			i;
1712 
1713 	/*
1714 	 * When holding all the locks, all but the last lock's insertingAt
1715 	 * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
1716 	 * XLogRecPtr value, to make sure that no-one blocks waiting on those.
1717 	 */
1718 	for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
1719 	{
1720 		LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1721 		LWLockUpdateVar(&WALInsertLocks[i].l.lock,
1722 						&WALInsertLocks[i].l.insertingAt,
1723 						PG_UINT64_MAX);
1724 	}
1725 	/* Variable value reset to 0 at release */
1726 	LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1727 
1728 	holdingAllLocks = true;
1729 }
1730 
1731 /*
1732  * Release our insertion lock (or locks, if we're holding them all).
1733  *
1734  * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1735  * next time the lock is acquired.
1736  */
1737 static void
WALInsertLockRelease(void)1738 WALInsertLockRelease(void)
1739 {
1740 	if (holdingAllLocks)
1741 	{
1742 		int			i;
1743 
1744 		for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1745 			LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
1746 								  &WALInsertLocks[i].l.insertingAt,
1747 								  0);
1748 
1749 		holdingAllLocks = false;
1750 	}
1751 	else
1752 	{
1753 		LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
1754 							  &WALInsertLocks[MyLockNo].l.insertingAt,
1755 							  0);
1756 	}
1757 }
1758 
1759 /*
1760  * Update our insertingAt value, to let others know that we've finished
1761  * inserting up to that point.
1762  */
1763 static void
WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)1764 WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1765 {
1766 	if (holdingAllLocks)
1767 	{
1768 		/*
1769 		 * We use the last lock to mark our actual position, see comments in
1770 		 * WALInsertLockAcquireExclusive.
1771 		 */
1772 		LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
1773 						&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1774 						insertingAt);
1775 	}
1776 	else
1777 		LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1778 						&WALInsertLocks[MyLockNo].l.insertingAt,
1779 						insertingAt);
1780 }
1781 
1782 /*
1783  * Wait for any WAL insertions < upto to finish.
1784  *
1785  * Returns the location of the oldest insertion that is still in-progress.
1786  * Any WAL prior to that point has been fully copied into WAL buffers, and
1787  * can be flushed out to disk. Because this waits for any insertions older
1788  * than 'upto' to finish, the return value is always >= 'upto'.
1789  *
1790  * Note: When you are about to write out WAL, you must call this function
1791  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1792  * need to wait for an insertion to finish (or at least advance to next
1793  * uninitialized page), and the inserter might need to evict an old WAL buffer
1794  * to make room for a new one, which in turn requires WALWriteLock.
1795  */
static XLogRecPtr
WaitXLogInsertionsToFinish(XLogRecPtr upto)
{
	uint64		bytepos;
	XLogRecPtr	reservedUpto;
	XLogRecPtr	finishedUpto;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	int			i;

	/* LWLockWaitForVar below needs a PGPROC to sleep on */
	if (MyProc == NULL)
		elog(PANIC, "cannot wait without a PGPROC structure");

	/* Read the current insert position */
	SpinLockAcquire(&Insert->insertpos_lck);
	bytepos = Insert->CurrBytePos;
	SpinLockRelease(&Insert->insertpos_lck);
	reservedUpto = XLogBytePosToEndRecPtr(bytepos);

	/*
	 * No-one should request to flush a piece of WAL that hasn't even been
	 * reserved yet. However, it can happen if there is a block with a bogus
	 * LSN on disk, for example. XLogFlush checks for that situation and
	 * complains, but only after the flush. Here we just assume that to mean
	 * that all WAL that has been reserved needs to be finished. In this
	 * corner-case, the return value can be smaller than 'upto' argument.
	 */
	if (upto > reservedUpto)
	{
		elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
			 (uint32) (upto >> 32), (uint32) upto,
			 (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
		upto = reservedUpto;
	}

	/*
	 * Loop through all the locks, sleeping on any in-progress insert older
	 * than 'upto'.
	 *
	 * finishedUpto is our return value, indicating the point upto which all
	 * the WAL insertions have been finished. Initialize it to the head of
	 * reserved WAL, and as we iterate through the insertion locks, back it
	 * out for any insertion that's still in progress.
	 */
	finishedUpto = reservedUpto;
	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
	{
		XLogRecPtr	insertingat = InvalidXLogRecPtr;

		do
		{
			/*
			 * See if this insertion is in progress.  LWLockWaitForVar will
			 * wait for the lock to be released, or for the 'value' to be set
			 * by a LWLockUpdateVar call.  When a lock is initially acquired,
			 * its value is 0 (InvalidXLogRecPtr), which means that we don't
			 * know where it's inserting yet.  We will have to wait for it. If
			 * it's a small insertion, the record will most likely fit on the
			 * same page and the inserter will release the lock without ever
			 * calling LWLockUpdateVar.  But if it has to sleep, it will
			 * advertise the insertion point with LWLockUpdateVar before
			 * sleeping.
			 */
			if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
								 &WALInsertLocks[i].l.insertingAt,
								 insertingat, &insertingat))
			{
				/* the lock was free, so no insertion in progress */
				insertingat = InvalidXLogRecPtr;
				break;
			}

			/*
			 * This insertion is still in progress. Have to wait, unless the
			 * inserter has proceeded past 'upto'.
			 */
		} while (insertingat < upto);

		/* An in-progress insertion below finishedUpto caps our return value */
		if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
			finishedUpto = insertingat;
	}
	return finishedUpto;
}
1878 
1879 /*
1880  * Get a pointer to the right location in the WAL buffer containing the
1881  * given XLogRecPtr.
1882  *
1883  * If the page is not initialized yet, it is initialized. That might require
1884  * evicting an old dirty buffer from the buffer cache, which means I/O.
1885  *
1886  * The caller must ensure that the page containing the requested location
1887  * isn't evicted yet, and won't be evicted. The way to ensure that is to
1888  * hold onto a WAL insertion lock with the insertingAt position set to
1889  * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1890  * to evict an old page from the buffer. (This means that once you call
1891  * GetXLogBuffer() with a given 'ptr', you must not access anything before
1892  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1893  * later, because older buffers might be recycled already)
1894  */
static char *
GetXLogBuffer(XLogRecPtr ptr)
{
	int			idx;
	XLogRecPtr	endptr;

	/* One-entry per-backend cache: last page number and its buffer address */
	static uint64 cachedPage = 0;
	static char *cachedPos = NULL;
	XLogRecPtr	expectedEndPtr;

	/*
	 * Fast path for the common case that we need to access again the same
	 * page as last time.
	 */
	if (ptr / XLOG_BLCKSZ == cachedPage)
	{
		Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
		Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
		return cachedPos + ptr % XLOG_BLCKSZ;
	}

	/*
	 * The XLog buffer cache is organized so that a page is always loaded to a
	 * particular buffer.  That way we can easily calculate the buffer a given
	 * page must be loaded into, from the XLogRecPtr alone.
	 */
	idx = XLogRecPtrToBufIdx(ptr);

	/*
	 * See what page is loaded in the buffer at the moment. It could be the
	 * page we're looking for, or something older. It can't be anything newer
	 * - that would imply the page we're looking for has already been written
	 * out to disk and evicted, and the caller is responsible for making sure
	 * that doesn't happen.
	 *
	 * However, we don't hold a lock while we read the value. If someone has
	 * just initialized the page, it's possible that we get a "torn read" of
	 * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
	 * that case we will see a bogus value. That's ok, we'll grab the mapping
	 * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
	 * the page we're looking for. But it means that when we do this unlocked
	 * read, we might see a value that appears to be ahead of the page we're
	 * looking for. Don't PANIC on that, until we've verified the value while
	 * holding the lock.
	 */
	expectedEndPtr = ptr;
	expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;

	endptr = XLogCtl->xlblocks[idx];
	if (expectedEndPtr != endptr)
	{
		XLogRecPtr	initializedUpto;

		/*
		 * Before calling AdvanceXLInsertBuffer(), which can block, let others
		 * know how far we're finished with inserting the record.
		 *
		 * NB: If 'ptr' points to just after the page header, advertise a
		 * position at the beginning of the page rather than 'ptr' itself. If
		 * there are no other insertions running, someone might try to flush
		 * up to our advertised location. If we advertised a position after
		 * the page header, someone might try to flush the page header, even
		 * though page might actually not be initialized yet. As the first
		 * inserter on the page, we are effectively responsible for making
		 * sure that it's initialized, before we let insertingAt to move past
		 * the page header.
		 */
		if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
			XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
			initializedUpto = ptr - SizeOfXLogShortPHD;
		else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
				 XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
			initializedUpto = ptr - SizeOfXLogLongPHD;
		else
			initializedUpto = ptr;

		WALInsertLockUpdateInsertingAt(initializedUpto);

		AdvanceXLInsertBuffer(ptr, false);
		/* Re-read under the expectation that our page is now loaded */
		endptr = XLogCtl->xlblocks[idx];

		if (expectedEndPtr != endptr)
			elog(PANIC, "could not find WAL buffer for %X/%X",
				 (uint32) (ptr >> 32), (uint32) ptr);
	}
	else
	{
		/*
		 * Make sure the initialization of the page is visible to us, and
		 * won't arrive later to overwrite the WAL data we write on the page.
		 */
		pg_memory_barrier();
	}

	/*
	 * Found the buffer holding this page. Return a pointer to the right
	 * offset within the page.
	 */
	cachedPage = ptr / XLOG_BLCKSZ;
	cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;

	Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
	Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));

	return cachedPos + ptr % XLOG_BLCKSZ;
}
2000 
2001 /*
2002  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
2003  * is the position starting from the beginning of WAL, excluding all WAL
2004  * page headers.
2005  */
2006 static XLogRecPtr
XLogBytePosToRecPtr(uint64 bytepos)2007 XLogBytePosToRecPtr(uint64 bytepos)
2008 {
2009 	uint64		fullsegs;
2010 	uint64		fullpages;
2011 	uint64		bytesleft;
2012 	uint32		seg_offset;
2013 	XLogRecPtr	result;
2014 
2015 	fullsegs = bytepos / UsableBytesInSegment;
2016 	bytesleft = bytepos % UsableBytesInSegment;
2017 
2018 	if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2019 	{
2020 		/* fits on first page of segment */
2021 		seg_offset = bytesleft + SizeOfXLogLongPHD;
2022 	}
2023 	else
2024 	{
2025 		/* account for the first page on segment with long header */
2026 		seg_offset = XLOG_BLCKSZ;
2027 		bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2028 
2029 		fullpages = bytesleft / UsableBytesInPage;
2030 		bytesleft = bytesleft % UsableBytesInPage;
2031 
2032 		seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2033 	}
2034 
2035 	XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
2036 
2037 	return result;
2038 }
2039 
2040 /*
2041  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
2042  * returns a pointer to the beginning of the page (ie. before page header),
2043  * not to where the first xlog record on that page would go to. This is used
2044  * when converting a pointer to the end of a record.
2045  */
2046 static XLogRecPtr
XLogBytePosToEndRecPtr(uint64 bytepos)2047 XLogBytePosToEndRecPtr(uint64 bytepos)
2048 {
2049 	uint64		fullsegs;
2050 	uint64		fullpages;
2051 	uint64		bytesleft;
2052 	uint32		seg_offset;
2053 	XLogRecPtr	result;
2054 
2055 	fullsegs = bytepos / UsableBytesInSegment;
2056 	bytesleft = bytepos % UsableBytesInSegment;
2057 
2058 	if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2059 	{
2060 		/* fits on first page of segment */
2061 		if (bytesleft == 0)
2062 			seg_offset = 0;
2063 		else
2064 			seg_offset = bytesleft + SizeOfXLogLongPHD;
2065 	}
2066 	else
2067 	{
2068 		/* account for the first page on segment with long header */
2069 		seg_offset = XLOG_BLCKSZ;
2070 		bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2071 
2072 		fullpages = bytesleft / UsableBytesInPage;
2073 		bytesleft = bytesleft % UsableBytesInPage;
2074 
2075 		if (bytesleft == 0)
2076 			seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
2077 		else
2078 			seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2079 	}
2080 
2081 	XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
2082 
2083 	return result;
2084 }
2085 
2086 /*
2087  * Convert an XLogRecPtr to a "usable byte position".
2088  */
2089 static uint64
XLogRecPtrToBytePos(XLogRecPtr ptr)2090 XLogRecPtrToBytePos(XLogRecPtr ptr)
2091 {
2092 	uint64		fullsegs;
2093 	uint32		fullpages;
2094 	uint32		offset;
2095 	uint64		result;
2096 
2097 	XLByteToSeg(ptr, fullsegs, wal_segment_size);
2098 
2099 	fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
2100 	offset = ptr % XLOG_BLCKSZ;
2101 
2102 	if (fullpages == 0)
2103 	{
2104 		result = fullsegs * UsableBytesInSegment;
2105 		if (offset > 0)
2106 		{
2107 			Assert(offset >= SizeOfXLogLongPHD);
2108 			result += offset - SizeOfXLogLongPHD;
2109 		}
2110 	}
2111 	else
2112 	{
2113 		result = fullsegs * UsableBytesInSegment +
2114 			(XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
2115 			(fullpages - 1) * UsableBytesInPage;	/* full pages */
2116 		if (offset > 0)
2117 		{
2118 			Assert(offset >= SizeOfXLogShortPHD);
2119 			result += offset - SizeOfXLogShortPHD;
2120 		}
2121 	}
2122 
2123 	return result;
2124 }
2125 
2126 /*
2127  * Initialize XLOG buffers, writing out old buffers if they still contain
2128  * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
2129  * true, initialize as many pages as we can without having to write out
2130  * unwritten data. Any new pages are initialized to zeros, with pages headers
2131  * initialized properly.
2132  */
static void
AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	int			nextidx;
	XLogRecPtr	OldPageRqstPtr;
	XLogwrtRqst WriteRqst;
	XLogRecPtr	NewPageEndPtr = InvalidXLogRecPtr;
	XLogRecPtr	NewPageBeginPtr;
	XLogPageHeader NewPage;
	int			npages = 0;		/* pages initialized in this call */

	LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);

	/*
	 * Now that we have the lock, check if someone initialized the page
	 * already.
	 */
	while (upto >= XLogCtl->InitializedUpTo || opportunistic)
	{
		/* buffer slot that must hold the next uninitialized page */
		nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);

		/*
		 * Get ending-offset of the buffer page we need to replace (this may
		 * be zero if the buffer hasn't been used yet).  Fall through if it's
		 * already written out.
		 */
		OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
		if (LogwrtResult.Write < OldPageRqstPtr)
		{
			/*
			 * Nope, got work to do. If we just want to pre-initialize as much
			 * as we can without flushing, give up now.
			 */
			if (opportunistic)
				break;

			/* Before waiting, get info_lck and update LogwrtResult */
			SpinLockAcquire(&XLogCtl->info_lck);
			if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
				XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
			LogwrtResult = XLogCtl->LogwrtResult;
			SpinLockRelease(&XLogCtl->info_lck);

			/*
			 * Now that we have an up-to-date LogwrtResult value, see if we
			 * still need to write it or if someone else already did.
			 */
			if (LogwrtResult.Write < OldPageRqstPtr)
			{
				/*
				 * Must acquire write lock. Release WALBufMappingLock first,
				 * to make sure that all insertions that we need to wait for
				 * can finish (up to this same position). Otherwise we risk
				 * deadlock.
				 */
				LWLockRelease(WALBufMappingLock);

				WaitXLogInsertionsToFinish(OldPageRqstPtr);

				LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);

				LogwrtResult = XLogCtl->LogwrtResult;
				if (LogwrtResult.Write >= OldPageRqstPtr)
				{
					/* OK, someone wrote it already */
					LWLockRelease(WALWriteLock);
				}
				else
				{
					/* Have to write it ourselves */
					TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
					WriteRqst.Write = OldPageRqstPtr;
					WriteRqst.Flush = 0;
					XLogWrite(WriteRqst, false);
					LWLockRelease(WALWriteLock);
					TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
				}
				/* Re-acquire WALBufMappingLock and retry */
				LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
				continue;
			}
		}

		/*
		 * Now the next buffer slot is free and we can set it up to be the
		 * next output page.
		 */
		NewPageBeginPtr = XLogCtl->InitializedUpTo;
		NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;

		Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);

		NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);

		/*
		 * Be sure to re-zero the buffer so that bytes beyond what we've
		 * written will look like zeroes and not valid XLOG records...
		 */
		MemSet((char *) NewPage, 0, XLOG_BLCKSZ);

		/*
		 * Fill the new page's header
		 */
		NewPage->xlp_magic = XLOG_PAGE_MAGIC;

		/* NewPage->xlp_info = 0; */	/* done by memset */
		NewPage->xlp_tli = ThisTimeLineID;
		NewPage->xlp_pageaddr = NewPageBeginPtr;

		/* NewPage->xlp_rem_len = 0; */	/* done by memset */

		/*
		 * If online backup is not in progress, mark the header to indicate
		 * that WAL records beginning in this page have removable backup
		 * blocks.  This allows the WAL archiver to know whether it is safe to
		 * compress archived WAL data by transforming full-block records into
		 * the non-full-block format.  It is sufficient to record this at the
		 * page level because we force a page switch (in fact a segment
		 * switch) when starting a backup, so the flag will be off before any
		 * records can be written during the backup.  At the end of a backup,
		 * the last page will be marked as all unsafe when perhaps only part
		 * is unsafe, but at worst the archiver would miss the opportunity to
		 * compress a few records.
		 */
		if (!Insert->forcePageWrites)
			NewPage->xlp_info |= XLP_BKP_REMOVABLE;

		/*
		 * If a record was found to be broken at the end of recovery, and
		 * we're going to write on the page where its first contrecord was
		 * lost, set the XLP_FIRST_IS_OVERWRITE_CONTRECORD flag on the page
		 * header.  See CreateOverwriteContrecordRecord().
		 */
		if (missingContrecPtr == NewPageBeginPtr)
		{
			NewPage->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
			missingContrecPtr = InvalidXLogRecPtr;
		}

		/*
		 * If first page of an XLOG segment file, make it a long header.
		 */
		if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
		{
			XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;

			NewLongPage->xlp_sysid = ControlFile->system_identifier;
			NewLongPage->xlp_seg_size = wal_segment_size;
			NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
			NewPage->xlp_info |= XLP_LONG_HEADER;
		}

		/*
		 * Make sure the initialization of the page becomes visible to others
		 * before the xlblocks update. GetXLogBuffer() reads xlblocks without
		 * holding a lock.
		 */
		pg_write_barrier();

		*((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;

		XLogCtl->InitializedUpTo = NewPageEndPtr;

		npages++;
	}
	LWLockRelease(WALBufMappingLock);

#ifdef WAL_DEBUG
	if (XLOG_DEBUG && npages > 0)
	{
		elog(DEBUG1, "initialized %d pages, up to %X/%X",
			 npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
	}
#endif
}
2309 
2310 /*
2311  * Calculate CheckPointSegments based on max_wal_size_mb and
2312  * checkpoint_completion_target.
2313  */
2314 static void
CalculateCheckpointSegments(void)2315 CalculateCheckpointSegments(void)
2316 {
2317 	double		target;
2318 
2319 	/*-------
2320 	 * Calculate the distance at which to trigger a checkpoint, to avoid
2321 	 * exceeding max_wal_size_mb. This is based on two assumptions:
2322 	 *
2323 	 * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
2324 	 *    WAL for two checkpoint cycles to allow us to recover from the
2325 	 *    secondary checkpoint if the first checkpoint failed, though we
2326 	 *    only did this on the master anyway, not on standby. Keeping just
2327 	 *    one checkpoint simplifies processing and reduces disk space in
2328 	 *    many smaller databases.)
2329 	 * b) during checkpoint, we consume checkpoint_completion_target *
2330 	 *	  number of segments consumed between checkpoints.
2331 	 *-------
2332 	 */
2333 	target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
2334 		(1.0 + CheckPointCompletionTarget);
2335 
2336 	/* round down */
2337 	CheckPointSegments = (int) target;
2338 
2339 	if (CheckPointSegments < 1)
2340 		CheckPointSegments = 1;
2341 }
2342 
/* GUC assign hook for max_wal_size: store it and re-derive CheckPointSegments */
void
assign_max_wal_size(int newval, void *extra)
{
	max_wal_size_mb = newval;
	CalculateCheckpointSegments();
}
2349 
/*
 * GUC assign hook for checkpoint_completion_target: store it and re-derive
 * CheckPointSegments, which depends on it.
 */
void
assign_checkpoint_completion_target(double newval, void *extra)
{
	CheckPointCompletionTarget = newval;
	CalculateCheckpointSegments();
}
2356 
2357 /*
2358  * At a checkpoint, how many WAL segments to recycle as preallocated future
2359  * XLOG segments? Returns the highest segment that should be preallocated.
2360  */
2361 static XLogSegNo
XLOGfileslop(XLogRecPtr lastredoptr)2362 XLOGfileslop(XLogRecPtr lastredoptr)
2363 {
2364 	XLogSegNo	minSegNo;
2365 	XLogSegNo	maxSegNo;
2366 	double		distance;
2367 	XLogSegNo	recycleSegNo;
2368 
2369 	/*
2370 	 * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
2371 	 * correspond to. Always recycle enough segments to meet the minimum, and
2372 	 * remove enough segments to stay below the maximum.
2373 	 */
2374 	minSegNo = lastredoptr / wal_segment_size +
2375 		ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
2376 	maxSegNo = lastredoptr / wal_segment_size +
2377 		ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
2378 
2379 	/*
2380 	 * Between those limits, recycle enough segments to get us through to the
2381 	 * estimated end of next checkpoint.
2382 	 *
2383 	 * To estimate where the next checkpoint will finish, assume that the
2384 	 * system runs steadily consuming CheckPointDistanceEstimate bytes between
2385 	 * every checkpoint.
2386 	 */
2387 	distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2388 	/* add 10% for good measure. */
2389 	distance *= 1.10;
2390 
2391 	recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
2392 									wal_segment_size);
2393 
2394 	if (recycleSegNo < minSegNo)
2395 		recycleSegNo = minSegNo;
2396 	if (recycleSegNo > maxSegNo)
2397 		recycleSegNo = maxSegNo;
2398 
2399 	return recycleSegNo;
2400 }
2401 
2402 /*
2403  * Check whether we've consumed enough xlog space that a checkpoint is needed.
2404  *
2405  * new_segno indicates a log file that has just been filled up (or read
2406  * during recovery). We measure the distance from RedoRecPtr to new_segno
2407  * and see if that exceeds CheckPointSegments.
2408  *
2409  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2410  */
2411 static bool
XLogCheckpointNeeded(XLogSegNo new_segno)2412 XLogCheckpointNeeded(XLogSegNo new_segno)
2413 {
2414 	XLogSegNo	old_segno;
2415 
2416 	XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
2417 
2418 	if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2419 		return true;
2420 	return false;
2421 }
2422 
2423 /*
2424  * Write and/or fsync the log at least as far as WriteRqst indicates.
2425  *
2426  * If flexible == true, we don't have to write as far as WriteRqst, but
2427  * may stop at any convenient boundary (such as a cache or logfile boundary).
2428  * This option allows us to avoid uselessly issuing multiple writes when a
2429  * single one would do.
2430  *
2431  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2432  * must be called before grabbing the lock, to make sure the data is ready to
2433  * write.
2434  */
static void
XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
{
	bool		ispartialpage;
	bool		last_iteration;
	bool		finishing_seg;
	bool		use_existent;	/* OK to reuse a pre-created segment file? */
	int			curridx;
	int			npages;
	int			startidx;
	uint32		startoffset;

	/* We should always be inside a critical section here */
	Assert(CritSectionCount > 0);

	/*
	 * Update local LogwrtResult (caller probably did this already, but...)
	 */
	LogwrtResult = XLogCtl->LogwrtResult;

	/*
	 * Since successive pages in the xlog cache are consecutively allocated,
	 * we can usually gather multiple pages together and issue just one
	 * write() call.  npages is the number of pages we have determined can be
	 * written together; startidx is the cache block index of the first one,
	 * and startoffset is the file offset at which it should go. The latter
	 * two variables are only valid when npages > 0, but we must initialize
	 * all of them to keep the compiler quiet.
	 */
	npages = 0;
	startidx = 0;
	startoffset = 0;

	/*
	 * Within the loop, curridx is the cache block index of the page to
	 * consider writing.  Begin at the buffer containing the next unwritten
	 * page, or last partially written page.
	 */
	curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);

	while (LogwrtResult.Write < WriteRqst.Write)
	{
		/*
		 * Make sure we're not ahead of the insert process.  This could happen
		 * if we're passed a bogus WriteRqst.Write that is past the end of the
		 * last page that's been initialized by AdvanceXLInsertBuffer.
		 */
		XLogRecPtr	EndPtr = XLogCtl->xlblocks[curridx];

		if (LogwrtResult.Write >= EndPtr)
			elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
				 (uint32) (LogwrtResult.Write >> 32),
				 (uint32) LogwrtResult.Write,
				 (uint32) (EndPtr >> 32), (uint32) EndPtr);

		/* Advance LogwrtResult.Write to end of current buffer page */
		LogwrtResult.Write = EndPtr;
		ispartialpage = WriteRqst.Write < LogwrtResult.Write;

		if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
							 wal_segment_size))
		{
			/*
			 * Switch to new logfile segment.  We cannot have any pending
			 * pages here (since we dump what we have at segment end).
			 */
			Assert(npages == 0);
			if (openLogFile >= 0)
				XLogFileClose();
			XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
							wal_segment_size);

			/* create/use new log file */
			use_existent = true;
			openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
			ReserveExternalFD();
		}

		/* Make sure we have the current logfile open */
		if (openLogFile < 0)
		{
			XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
							wal_segment_size);
			openLogFile = XLogFileOpen(openLogSegNo);
			ReserveExternalFD();
		}

		/* Add current page to the set of pending pages-to-dump */
		if (npages == 0)
		{
			/* first of group */
			startidx = curridx;
			startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
											wal_segment_size);
		}
		npages++;

		/*
		 * Dump the set if this will be the last loop iteration, or if we are
		 * at the last page of the cache area (since the next page won't be
		 * contiguous in memory), or if we are at the end of the logfile
		 * segment.
		 */
		last_iteration = WriteRqst.Write <= LogwrtResult.Write;

		finishing_seg = !ispartialpage &&
			(startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;

		if (last_iteration ||
			curridx == XLogCtl->XLogCacheBlck ||
			finishing_seg)
		{
			char	   *from;
			Size		nbytes;
			Size		nleft;
			int			written;

			/* OK to write the page(s) */
			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
			nbytes = npages * (Size) XLOG_BLCKSZ;
			nleft = nbytes;
			do
			{
				errno = 0;
				pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
				written = pg_pwrite(openLogFile, from, nleft, startoffset);
				pgstat_report_wait_end();
				if (written <= 0)
				{
					char		xlogfname[MAXFNAMELEN];
					int			save_errno;

					/* a signal-interrupted write is simply retried */
					if (errno == EINTR)
						continue;

					/* XLogFileName could clobber errno, so save it first */
					save_errno = errno;
					XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo,
								 wal_segment_size);
					errno = save_errno;
					ereport(PANIC,
							(errcode_for_file_access(),
							 errmsg("could not write to log file %s "
									"at offset %u, length %zu: %m",
									xlogfname, startoffset, nleft)));
				}
				/* on a short write, advance position and retry the rest */
				nleft -= written;
				from += written;
				startoffset += written;
			} while (nleft > 0);

			npages = 0;

			/*
			 * If we just wrote the whole last page of a logfile segment,
			 * fsync the segment immediately.  This avoids having to go back
			 * and re-open prior segments when an fsync request comes along
			 * later. Doing it here ensures that one and only one backend will
			 * perform this fsync.
			 *
			 * This is also the right place to notify the Archiver that the
			 * segment is ready to copy to archival storage, and to update the
			 * timer for archive_timeout, and to signal for a checkpoint if
			 * too many logfile segments have been used since the last
			 * checkpoint.
			 */
			if (finishing_seg)
			{
				issue_xlog_fsync(openLogFile, openLogSegNo);

				/* signal that we need to wakeup walsenders later */
				WalSndWakeupRequest();

				LogwrtResult.Flush = LogwrtResult.Write;	/* end of page */

				if (XLogArchivingActive())
					XLogArchiveNotifySeg(openLogSegNo);

				XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
				XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;

				/*
				 * Request a checkpoint if we've consumed too much xlog since
				 * the last one.  For speed, we first check using the local
				 * copy of RedoRecPtr, which might be out of date; if it looks
				 * like a checkpoint is needed, forcibly update RedoRecPtr and
				 * recheck.
				 */
				if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
				{
					(void) GetRedoRecPtr();
					if (XLogCheckpointNeeded(openLogSegNo))
						RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
				}
			}
		}

		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
		curridx = NextBufIdx(curridx);

		/* If flexible, break out of loop as soon as we wrote something */
		if (flexible && npages == 0)
			break;
	}

	/* every batched page must have been flushed to disk by now */
	Assert(npages == 0);

	/*
	 * If asked to flush, do so
	 */
	if (LogwrtResult.Flush < WriteRqst.Flush &&
		LogwrtResult.Flush < LogwrtResult.Write)

	{
		/*
		 * Could get here without iterating above loop, in which case we might
		 * have no open file or the wrong one.  However, we do not need to
		 * fsync more than one file.
		 */
		if (sync_method != SYNC_METHOD_OPEN &&
			sync_method != SYNC_METHOD_OPEN_DSYNC)
		{
			if (openLogFile >= 0 &&
				!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
								 wal_segment_size))
				XLogFileClose();
			if (openLogFile < 0)
			{
				XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
								wal_segment_size);
				openLogFile = XLogFileOpen(openLogSegNo);
				ReserveExternalFD();
			}

			issue_xlog_fsync(openLogFile, openLogSegNo);
		}

		/* signal that we need to wakeup walsenders later */
		WalSndWakeupRequest();

		LogwrtResult.Flush = LogwrtResult.Write;
	}

	/*
	 * Update shared-memory status
	 *
	 * We make sure that the shared 'request' values do not fall behind the
	 * 'result' values.  This is not absolutely essential, but it saves some
	 * code in a couple of places.
	 */
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		XLogCtl->LogwrtResult = LogwrtResult;
		if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
			XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
		if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
			XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
		SpinLockRelease(&XLogCtl->info_lck);
	}
}
2699 
2700 /*
2701  * Record the LSN for an asynchronous transaction commit/abort
2702  * and nudge the WALWriter if there is work for it to do.
2703  * (This should not be called for synchronous commits.)
2704  */
2705 void
XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)2706 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2707 {
2708 	XLogRecPtr	WriteRqstPtr = asyncXactLSN;
2709 	bool		sleeping;
2710 
2711 	SpinLockAcquire(&XLogCtl->info_lck);
2712 	LogwrtResult = XLogCtl->LogwrtResult;
2713 	sleeping = XLogCtl->WalWriterSleeping;
2714 	if (XLogCtl->asyncXactLSN < asyncXactLSN)
2715 		XLogCtl->asyncXactLSN = asyncXactLSN;
2716 	SpinLockRelease(&XLogCtl->info_lck);
2717 
2718 	/*
2719 	 * If the WALWriter is sleeping, we should kick it to make it come out of
2720 	 * low-power mode.  Otherwise, determine whether there's a full page of
2721 	 * WAL available to write.
2722 	 */
2723 	if (!sleeping)
2724 	{
2725 		/* back off to last completed page boundary */
2726 		WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2727 
2728 		/* if we have already flushed that far, we're done */
2729 		if (WriteRqstPtr <= LogwrtResult.Flush)
2730 			return;
2731 	}
2732 
2733 	/*
2734 	 * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2735 	 * to come out of low-power mode so that this async commit will reach disk
2736 	 * within the expected amount of time.
2737 	 */
2738 	if (ProcGlobal->walwriterLatch)
2739 		SetLatch(ProcGlobal->walwriterLatch);
2740 }
2741 
/*
 * Record the LSN up to which we can remove WAL because it's not required by
 * any replication slot.
 */
void
XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
{
	/* info_lck guards replicationSlotMinLSN against concurrent readers */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->replicationSlotMinLSN = lsn;
	SpinLockRelease(&XLogCtl->info_lck);
}
2753 
2754 
/*
 * Return the oldest LSN we must retain to satisfy the needs of some
 * replication slot.
 */
static XLogRecPtr
XLogGetReplicationSlotMinimumLSN(void)
{
	XLogRecPtr	retval;

	/* read under info_lck for a consistent view of the shared value */
	SpinLockAcquire(&XLogCtl->info_lck);
	retval = XLogCtl->replicationSlotMinLSN;
	SpinLockRelease(&XLogCtl->info_lck);

	return retval;
}
2770 
/*
 * Advance minRecoveryPoint in control file.
 *
 * If we crash during recovery, we must reach this point again before the
 * database is consistent.
 *
 * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
 * is only updated if it's not already greater than or equal to 'lsn'.
 */
static void
UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
{
	/* Quick check using our local copy of the variable */
	if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
		return;

	/*
	 * An invalid minRecoveryPoint means that we need to recover all the WAL,
	 * i.e., we're doing crash recovery.  We never modify the control file's
	 * value in that case, so we can short-circuit future checks here too. The
	 * local values of minRecoveryPoint and minRecoveryPointTLI should not be
	 * updated until crash recovery finishes.  We only do this for the startup
	 * process as it should not update its own reference of minRecoveryPoint
	 * until it has finished crash recovery to make sure that all WAL
	 * available is replayed in this case.  This also saves from extra locks
	 * taken on the control file from the startup process.
	 */
	if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
	{
		updateMinRecoveryPoint = false;
		return;
	}

	/* Exclusive lock: we may rewrite the control file below. */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	/* update local copy */
	minRecoveryPoint = ControlFile->minRecoveryPoint;
	minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;

	if (XLogRecPtrIsInvalid(minRecoveryPoint))
		updateMinRecoveryPoint = false;
	else if (force || minRecoveryPoint < lsn)
	{
		XLogRecPtr	newMinRecoveryPoint;
		TimeLineID	newMinRecoveryPointTLI;

		/*
		 * To avoid having to update the control file too often, we update it
		 * all the way to the last record being replayed, even though 'lsn'
		 * would suffice for correctness.  This also allows the 'force' case
		 * to not need a valid 'lsn' value.
		 *
		 * Another important reason for doing it this way is that the passed
		 * 'lsn' value could be bogus, i.e., past the end of available WAL, if
		 * the caller got it from a corrupted heap page.  Accepting such a
		 * value as the min recovery point would prevent us from coming up at
		 * all.  Instead, we just log a warning and continue with recovery.
		 * (See also the comments about corrupt LSNs in XLogFlush.)
		 */
		SpinLockAcquire(&XLogCtl->info_lck);
		newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
		newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
		SpinLockRelease(&XLogCtl->info_lck);

		if (!force && newMinRecoveryPoint < lsn)
			elog(WARNING,
				 "xlog min recovery request %X/%X is past current point %X/%X",
				 (uint32) (lsn >> 32), (uint32) lsn,
				 (uint32) (newMinRecoveryPoint >> 32),
				 (uint32) newMinRecoveryPoint);

		/* update control file */
		if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
		{
			ControlFile->minRecoveryPoint = newMinRecoveryPoint;
			ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
			UpdateControlFile();
			minRecoveryPoint = newMinRecoveryPoint;
			minRecoveryPointTLI = newMinRecoveryPointTLI;

			ereport(DEBUG2,
					(errmsg("updated min recovery point to %X/%X on timeline %u",
							(uint32) (minRecoveryPoint >> 32),
							(uint32) minRecoveryPoint,
							newMinRecoveryPointTLI)));
		}
	}
	LWLockRelease(ControlFileLock);
}
2860 
/*
 * Ensure that all XLOG data through the given position is flushed to disk.
 *
 * 'record' is the LSN that must be durably on disk before we return
 * (except during recovery; see below).
 *
 * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
 * already held, and we try to avoid acquiring it if possible.
 */
void
XLogFlush(XLogRecPtr record)
{
	XLogRecPtr	WriteRqstPtr;
	XLogwrtRqst WriteRqst;

	/*
	 * During REDO, we are reading not writing WAL.  Therefore, instead of
	 * trying to flush the WAL, we should update minRecoveryPoint instead. We
	 * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
	 * to act this way too, and because when it tries to write the
	 * end-of-recovery checkpoint, it should indeed flush.
	 */
	if (!XLogInsertAllowed())
	{
		UpdateMinRecoveryPoint(record, false);
		return;
	}

	/* Quick exit if already known flushed */
	if (record <= LogwrtResult.Flush)
		return;

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
			 (uint32) (record >> 32), (uint32) record,
			 (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
			 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif

	START_CRIT_SECTION();

	/*
	 * Since fsync is usually a horribly expensive operation, we try to
	 * piggyback as much data as we can on each fsync: if we see any more data
	 * entered into the xlog buffer, we'll write and fsync that too, so that
	 * the final value of LogwrtResult.Flush is as large as possible. This
	 * gives us some chance of avoiding another fsync immediately after.
	 */

	/* initialize to given target; may increase below */
	WriteRqstPtr = record;

	/*
	 * Now wait until we get the write lock, or someone else does the flush
	 * for us.
	 */
	for (;;)
	{
		XLogRecPtr	insertpos;

		/* read LogwrtResult and update local state */
		SpinLockAcquire(&XLogCtl->info_lck);
		if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
			WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
		LogwrtResult = XLogCtl->LogwrtResult;
		SpinLockRelease(&XLogCtl->info_lck);

		/* done already? */
		if (record <= LogwrtResult.Flush)
			break;

		/*
		 * Before actually performing the write, wait for all in-flight
		 * insertions to the pages we're about to write to finish.
		 */
		insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);

		/*
		 * Try to get the write lock. If we can't get it immediately, wait
		 * until it's released, and recheck if we still need to do the flush
		 * or if the backend that held the lock did it for us already. This
		 * helps to maintain a good rate of group committing when the system
		 * is bottlenecked by the speed of fsyncing.
		 */
		if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
		{
			/*
			 * The lock is now free, but we didn't acquire it yet. Before we
			 * do, loop back to check if someone else flushed the record for
			 * us already.
			 */
			continue;
		}

		/* Got the lock; recheck whether request is satisfied */
		LogwrtResult = XLogCtl->LogwrtResult;
		if (record <= LogwrtResult.Flush)
		{
			LWLockRelease(WALWriteLock);
			break;
		}

		/*
		 * Sleep before flush! By adding a delay here, we may give further
		 * backends the opportunity to join the backlog of group commit
		 * followers; this can significantly improve transaction throughput,
		 * at the risk of increasing transaction latency.
		 *
		 * We do not sleep if enableFsync is not turned on, nor if there are
		 * fewer than CommitSiblings other backends with active transactions.
		 */
		if (CommitDelay > 0 && enableFsync &&
			MinimumActiveBackends(CommitSiblings))
		{
			pg_usleep(CommitDelay);

			/*
			 * Re-check how far we can now flush the WAL. It's generally not
			 * safe to call WaitXLogInsertionsToFinish while holding
			 * WALWriteLock, because an in-progress insertion might need to
			 * also grab WALWriteLock to make progress. But we know that all
			 * the insertions up to insertpos have already finished, because
			 * that's what the earlier WaitXLogInsertionsToFinish() returned.
			 * We're only calling it again to allow insertpos to be moved
			 * further forward, not to actually wait for anyone.
			 */
			insertpos = WaitXLogInsertionsToFinish(insertpos);
		}

		/* try to write/flush later additions to XLOG as well */
		WriteRqst.Write = insertpos;
		WriteRqst.Flush = insertpos;

		XLogWrite(WriteRqst, false);

		LWLockRelease(WALWriteLock);
		/* done */
		break;
	}

	END_CRIT_SECTION();

	/* wake up walsenders now that we've released heavily contended locks */
	WalSndWakeupProcessRequests();

	/*
	 * If we still haven't flushed to the request point then we have a
	 * problem; most likely, the requested flush point is past end of XLOG.
	 * This has been seen to occur when a disk page has a corrupted LSN.
	 *
	 * Formerly we treated this as a PANIC condition, but that hurts the
	 * system's robustness rather than helping it: we do not want to take down
	 * the whole system due to corruption on one data page.  In particular, if
	 * the bad page is encountered again during recovery then we would be
	 * unable to restart the database at all!  (This scenario actually
	 * happened in the field several times with 7.1 releases.)	As of 8.4, bad
	 * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
	 * the only time we can reach here during recovery is while flushing the
	 * end-of-recovery checkpoint record, and we don't expect that to have a
	 * bad LSN.
	 *
	 * Note that for calls from xact.c, the ERROR will be promoted to PANIC
	 * since xact.c calls this routine inside a critical section.  However,
	 * calls from bufmgr.c are not within critical sections and so we will not
	 * force a restart for a bad LSN on a data page.
	 */
	if (LogwrtResult.Flush < record)
		elog(ERROR,
			 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
			 (uint32) (record >> 32), (uint32) record,
			 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
}
3031 
/*
 * Write & flush xlog, but without specifying exactly where to.
 *
 * We normally write only completed blocks; but if there is nothing to do on
 * that basis, we check for unwritten async commits in the current incomplete
 * block, and write through the latest one of those.  Thus, if async commits
 * are not being used, we will write complete blocks only.
 *
 * If, based on the above, there's anything to write we do so immediately. But
 * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
 * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
 * more than wal_writer_flush_after unflushed blocks.
 *
 * We can guarantee that async commits reach disk after at most three
 * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
 * to write "flexibly", meaning it can stop at the end of the buffer ring;
 * this makes a difference only with very high load or long wal_writer_delay,
 * but imposes one extra cycle for the worst case for async commits.)
 *
 * This routine is invoked periodically by the background walwriter process.
 *
 * Returns true if there was any work to do, even if we skipped flushing due
 * to wal_writer_delay/wal_writer_flush_after.
 */
bool
XLogBackgroundFlush(void)
{
	XLogwrtRqst WriteRqst;
	bool		flexible = true;
	static TimestampTz lastflush;	/* retained across calls in this process */
	TimestampTz now;
	int			flushbytes;

	/* XLOG doesn't need flushing during recovery */
	if (RecoveryInProgress())
		return false;

	/* read LogwrtResult and update local state */
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	WriteRqst = XLogCtl->LogwrtRqst;
	SpinLockRelease(&XLogCtl->info_lck);

	/* back off to last completed page boundary */
	WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;

	/* if we have already flushed that far, consider async commit records */
	if (WriteRqst.Write <= LogwrtResult.Flush)
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		WriteRqst.Write = XLogCtl->asyncXactLSN;
		SpinLockRelease(&XLogCtl->info_lck);
		flexible = false;		/* ensure it all gets written */
	}

	/*
	 * If already known flushed, we're done. Just need to check if we are
	 * holding an open file handle to a logfile that's no longer in use,
	 * preventing the file from being deleted.
	 */
	if (WriteRqst.Write <= LogwrtResult.Flush)
	{
		if (openLogFile >= 0)
		{
			if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
								 wal_segment_size))
			{
				XLogFileClose();
			}
		}
		return false;
	}

	/*
	 * Determine how far to flush WAL, based on the wal_writer_delay and
	 * wal_writer_flush_after GUCs.
	 */
	now = GetCurrentTimestamp();
	/* note: despite the name, this counts whole XLOG blocks, not bytes */
	flushbytes =
		WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;

	if (WalWriterFlushAfter == 0 || lastflush == 0)
	{
		/* first call, or block based limits disabled */
		WriteRqst.Flush = WriteRqst.Write;
		lastflush = now;
	}
	else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
	{
		/*
		 * Flush the writes at least every WalWriterDelay ms. This is
		 * important to bound the amount of time it takes for an asynchronous
		 * commit to hit disk.
		 */
		WriteRqst.Flush = WriteRqst.Write;
		lastflush = now;
	}
	else if (flushbytes >= WalWriterFlushAfter)
	{
		/* exceeded wal_writer_flush_after blocks, flush */
		WriteRqst.Flush = WriteRqst.Write;
		lastflush = now;
	}
	else
	{
		/* no flushing, this time round */
		WriteRqst.Flush = 0;
	}

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
			 (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
			 (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
			 (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
			 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif

	START_CRIT_SECTION();

	/* now wait for any in-progress insertions to finish and get write lock */
	WaitXLogInsertionsToFinish(WriteRqst.Write);
	LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
	LogwrtResult = XLogCtl->LogwrtResult;
	if (WriteRqst.Write > LogwrtResult.Write ||
		WriteRqst.Flush > LogwrtResult.Flush)
	{
		XLogWrite(WriteRqst, flexible);
	}
	LWLockRelease(WALWriteLock);

	END_CRIT_SECTION();

	/* wake up walsenders now that we've released heavily contended locks */
	WalSndWakeupProcessRequests();

	/*
	 * Great, done. To take some work off the critical path, try to initialize
	 * as many of the no-longer-needed WAL buffers for future use as we can.
	 */
	AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);

	/*
	 * If we determined that we need to write data, but somebody else
	 * wrote/flushed already, it should be considered as being active, to
	 * avoid hibernating too early.
	 */
	return true;
}
3181 
/*
 * Test whether XLOG data has been flushed up to (at least) the given position.
 *
 * Returns true if a flush is still needed.  (It may be that someone else
 * is already in process of flushing that far, however.)
 */
bool
XLogNeedsFlush(XLogRecPtr record)
{
	/*
	 * During recovery, we don't flush WAL but update minRecoveryPoint
	 * instead. So "needs flush" is taken to mean whether minRecoveryPoint
	 * would need to be updated.
	 */
	if (RecoveryInProgress())
	{
		/*
		 * An invalid minRecoveryPoint means that we need to recover all the
		 * WAL, i.e., we're doing crash recovery.  We never modify the control
		 * file's value in that case, so we can short-circuit future checks
		 * here too.  This triggers a quick exit path for the startup process,
		 * which cannot update its local copy of minRecoveryPoint as long as
		 * it has not replayed all WAL available when doing crash recovery.
		 */
		if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
			updateMinRecoveryPoint = false;

		/* Quick exit if already known to be updated or cannot be updated */
		if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
			return false;

		/*
		 * Update local copy of minRecoveryPoint. But if the lock is busy,
		 * just return a conservative guess.
		 */
		if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
			return true;
		minRecoveryPoint = ControlFile->minRecoveryPoint;
		minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
		LWLockRelease(ControlFileLock);

		/*
		 * Check minRecoveryPoint for any other process than the startup
		 * process doing crash recovery, which should not update the control
		 * file value if crash recovery is still running.
		 */
		if (XLogRecPtrIsInvalid(minRecoveryPoint))
			updateMinRecoveryPoint = false;

		/* check again */
		if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
			return false;
		else
			return true;
	}

	/* Quick exit if already known flushed (per our local copy) */
	if (record <= LogwrtResult.Flush)
		return false;

	/* read LogwrtResult and update local state */
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	SpinLockRelease(&XLogCtl->info_lck);

	/* check again */
	if (record <= LogwrtResult.Flush)
		return false;

	return true;
}
3253 
3254 /*
3255  * Create a new XLOG file segment, or open a pre-existing one.
3256  *
3257  * logsegno: identify segment to be created/opened.
3258  *
3259  * *use_existent: if true, OK to use a pre-existing file (else, any
3260  * pre-existing file will be deleted).  On return, true if a pre-existing
3261  * file was used.
3262  *
3263  * use_lock: if true, acquire ControlFileLock while moving file into
3264  * place.  This should be true except during bootstrap log creation.  The
3265  * caller must *not* hold the lock at call.
3266  *
3267  * Returns FD of opened file.
3268  *
3269  * Note: errors here are ERROR not PANIC because we might or might not be
3270  * inside a critical section (eg, during checkpoint there is no reason to
3271  * take down the system on failure).  They will promote to PANIC if we are
3272  * in a critical section.
3273  */
3274 int
XLogFileInit(XLogSegNo logsegno,bool * use_existent,bool use_lock)3275 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3276 {
3277 	char		path[MAXPGPATH];
3278 	char		tmppath[MAXPGPATH];
3279 	PGAlignedXLogBlock zbuffer;
3280 	XLogSegNo	installed_segno;
3281 	XLogSegNo	max_segno;
3282 	int			fd;
3283 	int			nbytes;
3284 	int			save_errno;
3285 
3286 	XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size);
3287 
3288 	/*
3289 	 * Try to use existent file (checkpoint maker may have created it already)
3290 	 */
3291 	if (*use_existent)
3292 	{
3293 		fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3294 		if (fd < 0)
3295 		{
3296 			if (errno != ENOENT)
3297 				ereport(ERROR,
3298 						(errcode_for_file_access(),
3299 						 errmsg("could not open file \"%s\": %m", path)));
3300 		}
3301 		else
3302 			return fd;
3303 	}
3304 
3305 	/*
3306 	 * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
3307 	 * another process is doing the same thing.  If so, we will end up
3308 	 * pre-creating an extra log segment.  That seems OK, and better than
3309 	 * holding the lock throughout this lengthy process.
3310 	 */
3311 	elog(DEBUG2, "creating and filling new WAL file");
3312 
3313 	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3314 
3315 	unlink(tmppath);
3316 
3317 	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
3318 	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
3319 	if (fd < 0)
3320 		ereport(ERROR,
3321 				(errcode_for_file_access(),
3322 				 errmsg("could not create file \"%s\": %m", tmppath)));
3323 
3324 	memset(zbuffer.data, 0, XLOG_BLCKSZ);
3325 
3326 	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
3327 	save_errno = 0;
3328 	if (wal_init_zero)
3329 	{
3330 		/*
3331 		 * Zero-fill the file.  With this setting, we do this the hard way to
3332 		 * ensure that all the file space has really been allocated.  On
3333 		 * platforms that allow "holes" in files, just seeking to the end
3334 		 * doesn't allocate intermediate space.  This way, we know that we
3335 		 * have all the space and (after the fsync below) that all the
3336 		 * indirect blocks are down on disk.  Therefore, fdatasync(2) or
3337 		 * O_DSYNC will be sufficient to sync future writes to the log file.
3338 		 */
3339 		for (nbytes = 0; nbytes < wal_segment_size; nbytes += XLOG_BLCKSZ)
3340 		{
3341 			errno = 0;
3342 			if (write(fd, zbuffer.data, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3343 			{
3344 				/* if write didn't set errno, assume no disk space */
3345 				save_errno = errno ? errno : ENOSPC;
3346 				break;
3347 			}
3348 		}
3349 	}
3350 	else
3351 	{
3352 		/*
3353 		 * Otherwise, seeking to the end and writing a solitary byte is
3354 		 * enough.
3355 		 */
3356 		errno = 0;
3357 		if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
3358 		{
3359 			/* if write didn't set errno, assume no disk space */
3360 			save_errno = errno ? errno : ENOSPC;
3361 		}
3362 	}
3363 	pgstat_report_wait_end();
3364 
3365 	if (save_errno)
3366 	{
3367 		/*
3368 		 * If we fail to make the file, delete it to release disk space
3369 		 */
3370 		unlink(tmppath);
3371 
3372 		close(fd);
3373 
3374 		errno = save_errno;
3375 
3376 		ereport(ERROR,
3377 				(errcode_for_file_access(),
3378 				 errmsg("could not write to file \"%s\": %m", tmppath)));
3379 	}
3380 
3381 	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
3382 	if (pg_fsync(fd) != 0)
3383 	{
3384 		int			save_errno = errno;
3385 
3386 		close(fd);
3387 		errno = save_errno;
3388 		ereport(ERROR,
3389 				(errcode_for_file_access(),
3390 				 errmsg("could not fsync file \"%s\": %m", tmppath)));
3391 	}
3392 	pgstat_report_wait_end();
3393 
3394 	if (close(fd) != 0)
3395 		ereport(ERROR,
3396 				(errcode_for_file_access(),
3397 				 errmsg("could not close file \"%s\": %m", tmppath)));
3398 
3399 	/*
3400 	 * Now move the segment into place with its final name.
3401 	 *
3402 	 * If caller didn't want to use a pre-existing file, get rid of any
3403 	 * pre-existing file.  Otherwise, cope with possibility that someone else
3404 	 * has created the file while we were filling ours: if so, use ours to
3405 	 * pre-create a future log segment.
3406 	 */
3407 	installed_segno = logsegno;
3408 
3409 	/*
3410 	 * XXX: What should we use as max_segno? We used to use XLOGfileslop when
3411 	 * that was a constant, but that was always a bit dubious: normally, at a
3412 	 * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
3413 	 * here, it was the offset from the insert location. We can't do the
3414 	 * normal XLOGfileslop calculation here because we don't have access to
3415 	 * the prior checkpoint's redo location. So somewhat arbitrarily, just use
3416 	 * CheckPointSegments.
3417 	 */
3418 	max_segno = logsegno + CheckPointSegments;
3419 	if (!InstallXLogFileSegment(&installed_segno, tmppath,
3420 								*use_existent, max_segno,
3421 								use_lock))
3422 	{
3423 		/*
3424 		 * No need for any more future segments, or InstallXLogFileSegment()
3425 		 * failed to rename the file into place. If the rename failed, opening
3426 		 * the file below will fail.
3427 		 */
3428 		unlink(tmppath);
3429 	}
3430 
3431 	/* Set flag to tell caller there was no existent file */
3432 	*use_existent = false;
3433 
3434 	/* Now open original target segment (might not be file I just made) */
3435 	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3436 	if (fd < 0)
3437 		ereport(ERROR,
3438 				(errcode_for_file_access(),
3439 				 errmsg("could not open file \"%s\": %m", path)));
3440 
3441 	elog(DEBUG2, "done creating and filling new WAL file");
3442 
3443 	return fd;
3444 }
3445 
3446 /*
3447  * Create a new XLOG file segment by copying a pre-existing one.
3448  *
3449  * destsegno: identify segment to be created.
3450  *
3451  * srcTLI, srcsegno: identify segment to be copied (could be from
3452  *		a different timeline)
3453  *
3454  * upto: how much of the source file to copy (the rest is filled with
3455  *		zeros)
3456  *
3457  * Currently this is only used during recovery, and so there are no locking
3458  * considerations.  But we should be just as tense as XLogFileInit to avoid
3459  * emplacing a bogus file.
3460  */
3461 static void
XLogFileCopy(XLogSegNo destsegno,TimeLineID srcTLI,XLogSegNo srcsegno,int upto)3462 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
3463 			 int upto)
3464 {
3465 	char		path[MAXPGPATH];
3466 	char		tmppath[MAXPGPATH];
3467 	PGAlignedXLogBlock buffer;
3468 	int			srcfd;
3469 	int			fd;
3470 	int			nbytes;
3471 
3472 	/*
3473 	 * Open the source file
3474 	 */
3475 	XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
3476 	srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
3477 	if (srcfd < 0)
3478 		ereport(ERROR,
3479 				(errcode_for_file_access(),
3480 				 errmsg("could not open file \"%s\": %m", path)));
3481 
3482 	/*
3483 	 * Copy into a temp file name.
3484 	 */
3485 	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3486 
3487 	unlink(tmppath);
3488 
3489 	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
3490 	fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
3491 	if (fd < 0)
3492 		ereport(ERROR,
3493 				(errcode_for_file_access(),
3494 				 errmsg("could not create file \"%s\": %m", tmppath)));
3495 
3496 	/*
3497 	 * Do the data copying.
3498 	 */
3499 	for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
3500 	{
3501 		int			nread;
3502 
3503 		nread = upto - nbytes;
3504 
3505 		/*
3506 		 * The part that is not read from the source file is filled with
3507 		 * zeros.
3508 		 */
3509 		if (nread < sizeof(buffer))
3510 			memset(buffer.data, 0, sizeof(buffer));
3511 
3512 		if (nread > 0)
3513 		{
3514 			int			r;
3515 
3516 			if (nread > sizeof(buffer))
3517 				nread = sizeof(buffer);
3518 			pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
3519 			r = read(srcfd, buffer.data, nread);
3520 			if (r != nread)
3521 			{
3522 				if (r < 0)
3523 					ereport(ERROR,
3524 							(errcode_for_file_access(),
3525 							 errmsg("could not read file \"%s\": %m",
3526 									path)));
3527 				else
3528 					ereport(ERROR,
3529 							(errcode(ERRCODE_DATA_CORRUPTED),
3530 							 errmsg("could not read file \"%s\": read %d of %zu",
3531 									path, r, (Size) nread)));
3532 			}
3533 			pgstat_report_wait_end();
3534 		}
3535 		errno = 0;
3536 		pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
3537 		if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
3538 		{
3539 			int			save_errno = errno;
3540 
3541 			/*
3542 			 * If we fail to make the file, delete it to release disk space
3543 			 */
3544 			unlink(tmppath);
3545 			/* if write didn't set errno, assume problem is no disk space */
3546 			errno = save_errno ? save_errno : ENOSPC;
3547 
3548 			ereport(ERROR,
3549 					(errcode_for_file_access(),
3550 					 errmsg("could not write to file \"%s\": %m", tmppath)));
3551 		}
3552 		pgstat_report_wait_end();
3553 	}
3554 
3555 	pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
3556 	if (pg_fsync(fd) != 0)
3557 		ereport(data_sync_elevel(ERROR),
3558 				(errcode_for_file_access(),
3559 				 errmsg("could not fsync file \"%s\": %m", tmppath)));
3560 	pgstat_report_wait_end();
3561 
3562 	if (CloseTransientFile(fd) != 0)
3563 		ereport(ERROR,
3564 				(errcode_for_file_access(),
3565 				 errmsg("could not close file \"%s\": %m", tmppath)));
3566 
3567 	if (CloseTransientFile(srcfd) != 0)
3568 		ereport(ERROR,
3569 				(errcode_for_file_access(),
3570 				 errmsg("could not close file \"%s\": %m", path)));
3571 
3572 	/*
3573 	 * Now move the segment into place with its final name.
3574 	 */
3575 	if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
3576 		elog(ERROR, "InstallXLogFileSegment should not have failed");
3577 }
3578 
3579 /*
3580  * Install a new XLOG segment file as a current or future log segment.
3581  *
3582  * This is used both to install a newly-created segment (which has a temp
3583  * filename while it's being created) and to recycle an old segment.
3584  *
3585  * *segno: identify segment to install as (or first possible target).
3586  * When find_free is true, this is modified on return to indicate the
3587  * actual installation location or last segment searched.
3588  *
3589  * tmppath: initial name of file to install.  It will be renamed into place.
3590  *
3591  * find_free: if true, install the new segment at the first empty segno
3592  * number at or after the passed numbers.  If false, install the new segment
3593  * exactly where specified, deleting any existing segment file there.
3594  *
3595  * max_segno: maximum segment number to install the new file as.  Fail if no
3596  * free slot is found between *segno and max_segno. (Ignored when find_free
3597  * is false.)
3598  *
3599  * use_lock: if true, acquire ControlFileLock while moving file into
3600  * place.  This should be true except during bootstrap log creation.  The
3601  * caller must *not* hold the lock at call.
3602  *
3603  * Returns true if the file was installed successfully.  false indicates that
3604  * max_segno limit was exceeded, or an error occurred while renaming the
3605  * file into place.
3606  */
3607 static bool
InstallXLogFileSegment(XLogSegNo * segno,char * tmppath,bool find_free,XLogSegNo max_segno,bool use_lock)3608 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3609 					   bool find_free, XLogSegNo max_segno,
3610 					   bool use_lock)
3611 {
3612 	char		path[MAXPGPATH];
3613 	struct stat stat_buf;
3614 
3615 	XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
3616 
3617 	/*
3618 	 * We want to be sure that only one process does this at a time.
3619 	 */
3620 	if (use_lock)
3621 		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3622 
3623 	if (!find_free)
3624 	{
3625 		/* Force installation: get rid of any pre-existing segment file */
3626 		durable_unlink(path, DEBUG1);
3627 	}
3628 	else
3629 	{
3630 		/* Find a free slot to put it in */
3631 		while (stat(path, &stat_buf) == 0)
3632 		{
3633 			if ((*segno) >= max_segno)
3634 			{
3635 				/* Failed to find a free slot within specified range */
3636 				if (use_lock)
3637 					LWLockRelease(ControlFileLock);
3638 				return false;
3639 			}
3640 			(*segno)++;
3641 			XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
3642 		}
3643 	}
3644 
3645 	/*
3646 	 * Perform the rename using link if available, paranoidly trying to avoid
3647 	 * overwriting an existing file (there shouldn't be one).
3648 	 */
3649 	if (durable_rename_excl(tmppath, path, LOG) != 0)
3650 	{
3651 		if (use_lock)
3652 			LWLockRelease(ControlFileLock);
3653 		/* durable_rename_excl already emitted log message */
3654 		return false;
3655 	}
3656 
3657 	if (use_lock)
3658 		LWLockRelease(ControlFileLock);
3659 
3660 	return true;
3661 }
3662 
3663 /*
3664  * Open a pre-existing logfile segment for writing.
3665  */
3666 int
XLogFileOpen(XLogSegNo segno)3667 XLogFileOpen(XLogSegNo segno)
3668 {
3669 	char		path[MAXPGPATH];
3670 	int			fd;
3671 
3672 	XLogFilePath(path, ThisTimeLineID, segno, wal_segment_size);
3673 
3674 	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3675 	if (fd < 0)
3676 		ereport(PANIC,
3677 				(errcode_for_file_access(),
3678 				 errmsg("could not open file \"%s\": %m", path)));
3679 
3680 	return fd;
3681 }
3682 
3683 /*
3684  * Open a logfile segment for reading (during recovery).
3685  *
3686  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3687  * Otherwise, it's assumed to be already available in pg_wal.
3688  */
3689 static int
XLogFileRead(XLogSegNo segno,int emode,TimeLineID tli,XLogSource source,bool notfoundOk)3690 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3691 			 XLogSource source, bool notfoundOk)
3692 {
3693 	char		xlogfname[MAXFNAMELEN];
3694 	char		activitymsg[MAXFNAMELEN + 16];
3695 	char		path[MAXPGPATH];
3696 	int			fd;
3697 
3698 	XLogFileName(xlogfname, tli, segno, wal_segment_size);
3699 
3700 	switch (source)
3701 	{
3702 		case XLOG_FROM_ARCHIVE:
3703 			/* Report recovery progress in PS display */
3704 			snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3705 					 xlogfname);
3706 			set_ps_display(activitymsg);
3707 
3708 			restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3709 													  "RECOVERYXLOG",
3710 													  wal_segment_size,
3711 													  InRedo);
3712 			if (!restoredFromArchive)
3713 				return -1;
3714 			break;
3715 
3716 		case XLOG_FROM_PG_WAL:
3717 		case XLOG_FROM_STREAM:
3718 			XLogFilePath(path, tli, segno, wal_segment_size);
3719 			restoredFromArchive = false;
3720 			break;
3721 
3722 		default:
3723 			elog(ERROR, "invalid XLogFileRead source %d", source);
3724 	}
3725 
3726 	/*
3727 	 * If the segment was fetched from archival storage, replace the existing
3728 	 * xlog segment (if any) with the archival version.
3729 	 */
3730 	if (source == XLOG_FROM_ARCHIVE)
3731 	{
3732 		KeepFileRestoredFromArchive(path, xlogfname);
3733 
3734 		/*
3735 		 * Set path to point at the new file in pg_wal.
3736 		 */
3737 		snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3738 	}
3739 
3740 	fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
3741 	if (fd >= 0)
3742 	{
3743 		/* Success! */
3744 		curFileTLI = tli;
3745 
3746 		/* Report recovery progress in PS display */
3747 		snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3748 				 xlogfname);
3749 		set_ps_display(activitymsg);
3750 
3751 		/* Track source of data in assorted state variables */
3752 		readSource = source;
3753 		XLogReceiptSource = source;
3754 		/* In FROM_STREAM case, caller tracks receipt time, not me */
3755 		if (source != XLOG_FROM_STREAM)
3756 			XLogReceiptTime = GetCurrentTimestamp();
3757 
3758 		return fd;
3759 	}
3760 	if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3761 		ereport(PANIC,
3762 				(errcode_for_file_access(),
3763 				 errmsg("could not open file \"%s\": %m", path)));
3764 	return -1;
3765 }
3766 
3767 /*
3768  * Open a logfile segment for reading (during recovery).
3769  *
3770  * This version searches for the segment with any TLI listed in expectedTLEs.
3771  */
3772 static int
XLogFileReadAnyTLI(XLogSegNo segno,int emode,XLogSource source)3773 XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
3774 {
3775 	char		path[MAXPGPATH];
3776 	ListCell   *cell;
3777 	int			fd;
3778 	List	   *tles;
3779 
3780 	/*
3781 	 * Loop looking for a suitable timeline ID: we might need to read any of
3782 	 * the timelines listed in expectedTLEs.
3783 	 *
3784 	 * We expect curFileTLI on entry to be the TLI of the preceding file in
3785 	 * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
3786 	 * to go backwards; this prevents us from picking up the wrong file when a
3787 	 * parent timeline extends to higher segment numbers than the child we
3788 	 * want to read.
3789 	 *
3790 	 * If we haven't read the timeline history file yet, read it now, so that
3791 	 * we know which TLIs to scan.  We don't save the list in expectedTLEs,
3792 	 * however, unless we actually find a valid segment.  That way if there is
3793 	 * neither a timeline history file nor a WAL segment in the archive, and
3794 	 * streaming replication is set up, we'll read the timeline history file
3795 	 * streamed from the master when we start streaming, instead of recovering
3796 	 * with a dummy history generated here.
3797 	 */
3798 	if (expectedTLEs)
3799 		tles = expectedTLEs;
3800 	else
3801 		tles = readTimeLineHistory(recoveryTargetTLI);
3802 
3803 	foreach(cell, tles)
3804 	{
3805 		TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
3806 		TimeLineID	tli = hent->tli;
3807 
3808 		if (tli < curFileTLI)
3809 			break;				/* don't bother looking at too-old TLIs */
3810 
3811 		/*
3812 		 * Skip scanning the timeline ID that the logfile segment to read
3813 		 * doesn't belong to
3814 		 */
3815 		if (hent->begin != InvalidXLogRecPtr)
3816 		{
3817 			XLogSegNo	beginseg = 0;
3818 
3819 			XLByteToSeg(hent->begin, beginseg, wal_segment_size);
3820 
3821 			/*
3822 			 * The logfile segment that doesn't belong to the timeline is
3823 			 * older or newer than the segment that the timeline started or
3824 			 * ended at, respectively. It's sufficient to check only the
3825 			 * starting segment of the timeline here. Since the timelines are
3826 			 * scanned in descending order in this loop, any segments newer
3827 			 * than the ending segment should belong to newer timeline and
3828 			 * have already been read before. So it's not necessary to check
3829 			 * the ending segment of the timeline here.
3830 			 */
3831 			if (segno < beginseg)
3832 				continue;
3833 		}
3834 
3835 		if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3836 		{
3837 			fd = XLogFileRead(segno, emode, tli,
3838 							  XLOG_FROM_ARCHIVE, true);
3839 			if (fd != -1)
3840 			{
3841 				elog(DEBUG1, "got WAL segment from archive");
3842 				if (!expectedTLEs)
3843 					expectedTLEs = tles;
3844 				return fd;
3845 			}
3846 		}
3847 
3848 		if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
3849 		{
3850 			fd = XLogFileRead(segno, emode, tli,
3851 							  XLOG_FROM_PG_WAL, true);
3852 			if (fd != -1)
3853 			{
3854 				if (!expectedTLEs)
3855 					expectedTLEs = tles;
3856 				return fd;
3857 			}
3858 		}
3859 	}
3860 
3861 	/* Couldn't find it.  For simplicity, complain about front timeline */
3862 	XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
3863 	errno = ENOENT;
3864 	ereport(emode,
3865 			(errcode_for_file_access(),
3866 			 errmsg("could not open file \"%s\": %m", path)));
3867 	return -1;
3868 }
3869 
3870 /*
3871  * Close the current logfile segment for writing.
3872  */
3873 static void
XLogFileClose(void)3874 XLogFileClose(void)
3875 {
3876 	Assert(openLogFile >= 0);
3877 
3878 	/*
3879 	 * WAL segment files will not be re-read in normal operation, so we advise
3880 	 * the OS to release any cached pages.  But do not do so if WAL archiving
3881 	 * or streaming is active, because archiver and walsender process could
3882 	 * use the cache to read the WAL segment.
3883 	 */
3884 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3885 	if (!XLogIsNeeded())
3886 		(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3887 #endif
3888 
3889 	if (close(openLogFile) != 0)
3890 	{
3891 		char		xlogfname[MAXFNAMELEN];
3892 		int			save_errno = errno;
3893 
3894 		XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, wal_segment_size);
3895 		errno = save_errno;
3896 		ereport(PANIC,
3897 				(errcode_for_file_access(),
3898 				 errmsg("could not close file \"%s\": %m", xlogfname)));
3899 	}
3900 
3901 	openLogFile = -1;
3902 	ReleaseExternalFD();
3903 }
3904 
3905 /*
3906  * Preallocate log files beyond the specified log endpoint.
3907  *
3908  * XXX this is currently extremely conservative, since it forces only one
3909  * future log segment to exist, and even that only if we are 75% done with
3910  * the current one.  This is only appropriate for very low-WAL-volume systems.
3911  * High-volume systems will be OK once they've built up a sufficient set of
3912  * recycled log segments, but the startup transient is likely to include
3913  * a lot of segment creations by foreground processes, which is not so good.
3914  */
3915 static void
PreallocXlogFiles(XLogRecPtr endptr)3916 PreallocXlogFiles(XLogRecPtr endptr)
3917 {
3918 	XLogSegNo	_logSegNo;
3919 	int			lf;
3920 	bool		use_existent;
3921 	uint64		offset;
3922 
3923 	XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
3924 	offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
3925 	if (offset >= (uint32) (0.75 * wal_segment_size))
3926 	{
3927 		_logSegNo++;
3928 		use_existent = true;
3929 		lf = XLogFileInit(_logSegNo, &use_existent, true);
3930 		close(lf);
3931 		if (!use_existent)
3932 			CheckpointStats.ckpt_segs_added++;
3933 	}
3934 }
3935 
3936 /*
3937  * Throws an error if the given log segment has already been removed or
3938  * recycled. The caller should only pass a segment that it knows to have
3939  * existed while the server has been running, as this function always
3940  * succeeds if no WAL segments have been removed since startup.
3941  * 'tli' is only used in the error message.
3942  *
3943  * Note: this function guarantees to keep errno unchanged on return.
3944  * This supports callers that use this to possibly deliver a better
3945  * error message about a missing file, while still being able to throw
3946  * a normal file-access error afterwards, if this does return.
3947  */
3948 void
CheckXLogRemoved(XLogSegNo segno,TimeLineID tli)3949 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3950 {
3951 	int			save_errno = errno;
3952 	XLogSegNo	lastRemovedSegNo;
3953 
3954 	SpinLockAcquire(&XLogCtl->info_lck);
3955 	lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3956 	SpinLockRelease(&XLogCtl->info_lck);
3957 
3958 	if (segno <= lastRemovedSegNo)
3959 	{
3960 		char		filename[MAXFNAMELEN];
3961 
3962 		XLogFileName(filename, tli, segno, wal_segment_size);
3963 		errno = save_errno;
3964 		ereport(ERROR,
3965 				(errcode_for_file_access(),
3966 				 errmsg("requested WAL segment %s has already been removed",
3967 						filename)));
3968 	}
3969 	errno = save_errno;
3970 }
3971 
3972 /*
3973  * Return the last WAL segment removed, or 0 if no segment has been removed
3974  * since startup.
3975  *
3976  * NB: the result can be out of date arbitrarily fast, the caller has to deal
3977  * with that.
3978  */
3979 XLogSegNo
XLogGetLastRemovedSegno(void)3980 XLogGetLastRemovedSegno(void)
3981 {
3982 	XLogSegNo	lastRemovedSegNo;
3983 
3984 	SpinLockAcquire(&XLogCtl->info_lck);
3985 	lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3986 	SpinLockRelease(&XLogCtl->info_lck);
3987 
3988 	return lastRemovedSegNo;
3989 }
3990 
3991 
3992 /*
3993  * Update the last removed segno pointer in shared memory, to reflect that the
3994  * given XLOG file has been removed.
3995  */
3996 static void
UpdateLastRemovedPtr(char * filename)3997 UpdateLastRemovedPtr(char *filename)
3998 {
3999 	uint32		tli;
4000 	XLogSegNo	segno;
4001 
4002 	XLogFromFileName(filename, &tli, &segno, wal_segment_size);
4003 
4004 	SpinLockAcquire(&XLogCtl->info_lck);
4005 	if (segno > XLogCtl->lastRemovedSegNo)
4006 		XLogCtl->lastRemovedSegNo = segno;
4007 	SpinLockRelease(&XLogCtl->info_lck);
4008 }
4009 
4010 /*
4011  * Remove all temporary log files in pg_wal
4012  *
4013  * This is called at the beginning of recovery after a previous crash,
4014  * at a point where no other processes write fresh WAL data.
4015  */
4016 static void
RemoveTempXlogFiles(void)4017 RemoveTempXlogFiles(void)
4018 {
4019 	DIR		   *xldir;
4020 	struct dirent *xlde;
4021 
4022 	elog(DEBUG2, "removing all temporary WAL segments");
4023 
4024 	xldir = AllocateDir(XLOGDIR);
4025 	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4026 	{
4027 		char		path[MAXPGPATH];
4028 
4029 		if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
4030 			continue;
4031 
4032 		snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4033 		unlink(path);
4034 		elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
4035 	}
4036 	FreeDir(xldir);
4037 }
4038 
4039 /*
4040  * Recycle or remove all log files older or equal to passed segno.
4041  *
4042  * endptr is current (or recent) end of xlog, and lastredoptr is the
4043  * redo pointer of the last checkpoint. These are used to determine
4044  * whether we want to recycle rather than delete no-longer-wanted log files.
4045  */
4046 static void
RemoveOldXlogFiles(XLogSegNo segno,XLogRecPtr lastredoptr,XLogRecPtr endptr)4047 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr)
4048 {
4049 	DIR		   *xldir;
4050 	struct dirent *xlde;
4051 	char		lastoff[MAXFNAMELEN];
4052 
4053 	/*
4054 	 * Construct a filename of the last segment to be kept. The timeline ID
4055 	 * doesn't matter, we ignore that in the comparison. (During recovery,
4056 	 * ThisTimeLineID isn't set, so we can't use that.)
4057 	 */
4058 	XLogFileName(lastoff, 0, segno, wal_segment_size);
4059 
4060 	elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
4061 		 lastoff);
4062 
4063 	xldir = AllocateDir(XLOGDIR);
4064 
4065 	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4066 	{
4067 		/* Ignore files that are not XLOG segments */
4068 		if (!IsXLogFileName(xlde->d_name) &&
4069 			!IsPartialXLogFileName(xlde->d_name))
4070 			continue;
4071 
4072 		/*
4073 		 * We ignore the timeline part of the XLOG segment identifiers in
4074 		 * deciding whether a segment is still needed.  This ensures that we
4075 		 * won't prematurely remove a segment from a parent timeline. We could
4076 		 * probably be a little more proactive about removing segments of
4077 		 * non-parent timelines, but that would be a whole lot more
4078 		 * complicated.
4079 		 *
4080 		 * We use the alphanumeric sorting property of the filenames to decide
4081 		 * which ones are earlier than the lastoff segment.
4082 		 */
4083 		if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
4084 		{
4085 			if (XLogArchiveCheckDone(xlde->d_name))
4086 			{
4087 				/* Update the last removed location in shared memory first */
4088 				UpdateLastRemovedPtr(xlde->d_name);
4089 
4090 				RemoveXlogFile(xlde->d_name, lastredoptr, endptr);
4091 			}
4092 		}
4093 	}
4094 
4095 	FreeDir(xldir);
4096 }
4097 
4098 /*
4099  * Remove WAL files that are not part of the given timeline's history.
4100  *
4101  * This is called during recovery, whenever we switch to follow a new
4102  * timeline, and at the end of recovery when we create a new timeline. We
4103  * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
4104  * might be leftover pre-allocated or recycled WAL segments on the old timeline
4105  * that we haven't used yet, and contain garbage. If we just leave them in
4106  * pg_wal, they will eventually be archived, and we can't let that happen.
4107  * Files that belong to our timeline history are valid, because we have
4108  * successfully replayed them, but from others we can't be sure.
4109  *
4110  * 'switchpoint' is the current point in WAL where we switch to new timeline,
4111  * and 'newTLI' is the new timeline we switch to.
4112  */
4113 static void
RemoveNonParentXlogFiles(XLogRecPtr switchpoint,TimeLineID newTLI)4114 RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
4115 {
4116 	DIR		   *xldir;
4117 	struct dirent *xlde;
4118 	char		switchseg[MAXFNAMELEN];
4119 	XLogSegNo	endLogSegNo;
4120 
4121 	XLByteToPrevSeg(switchpoint, endLogSegNo, wal_segment_size);
4122 
4123 	/*
4124 	 * Construct a filename of the last segment to be kept.
4125 	 */
4126 	XLogFileName(switchseg, newTLI, endLogSegNo, wal_segment_size);
4127 
4128 	elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
4129 		 switchseg);
4130 
4131 	xldir = AllocateDir(XLOGDIR);
4132 
4133 	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4134 	{
4135 		/* Ignore files that are not XLOG segments */
4136 		if (!IsXLogFileName(xlde->d_name))
4137 			continue;
4138 
4139 		/*
4140 		 * Remove files that are on a timeline older than the new one we're
4141 		 * switching to, but with a segment number >= the first segment on the
4142 		 * new timeline.
4143 		 */
4144 		if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
4145 			strcmp(xlde->d_name + 8, switchseg + 8) > 0)
4146 		{
4147 			/*
4148 			 * If the file has already been marked as .ready, however, don't
4149 			 * remove it yet. It should be OK to remove it - files that are
4150 			 * not part of our timeline history are not required for recovery
4151 			 * - but seems safer to let them be archived and removed later.
4152 			 */
4153 			if (!XLogArchiveIsReady(xlde->d_name))
4154 				RemoveXlogFile(xlde->d_name, InvalidXLogRecPtr, switchpoint);
4155 		}
4156 	}
4157 
4158 	FreeDir(xldir);
4159 }
4160 
4161 /*
4162  * Recycle or remove a log file that's no longer needed.
4163  *
4164  * endptr is current (or recent) end of xlog, and lastredoptr is the
4165  * redo pointer of the last checkpoint. These are used to determine
4166  * whether we want to recycle rather than delete no-longer-wanted log files.
4167  * If lastredoptr is not known, pass invalid, and the function will recycle,
4168  * somewhat arbitrarily, 10 future segments.
4169  */
4170 static void
RemoveXlogFile(const char * segname,XLogRecPtr lastredoptr,XLogRecPtr endptr)4171 RemoveXlogFile(const char *segname, XLogRecPtr lastredoptr, XLogRecPtr endptr)
4172 {
4173 	char		path[MAXPGPATH];
4174 #ifdef WIN32
4175 	char		newpath[MAXPGPATH];
4176 #endif
4177 	struct stat statbuf;
4178 	XLogSegNo	endlogSegNo;
4179 	XLogSegNo	recycleSegNo;
4180 
4181 	if (wal_recycle)
4182 	{
4183 		/*
4184 		 * Initialize info about where to try to recycle to.
4185 		 */
4186 		XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
4187 		if (lastredoptr == InvalidXLogRecPtr)
4188 			recycleSegNo = endlogSegNo + 10;
4189 		else
4190 			recycleSegNo = XLOGfileslop(lastredoptr);
4191 	}
4192 	else
4193 		recycleSegNo = 0;		/* keep compiler quiet */
4194 
4195 	snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
4196 
4197 	/*
4198 	 * Before deleting the file, see if it can be recycled as a future log
4199 	 * segment. Only recycle normal files, pg_standby for example can create
4200 	 * symbolic links pointing to a separate archive directory.
4201 	 */
4202 	if (wal_recycle &&
4203 		endlogSegNo <= recycleSegNo &&
4204 		lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
4205 		InstallXLogFileSegment(&endlogSegNo, path,
4206 							   true, recycleSegNo, true))
4207 	{
4208 		ereport(DEBUG2,
4209 				(errmsg("recycled write-ahead log file \"%s\"",
4210 						segname)));
4211 		CheckpointStats.ckpt_segs_recycled++;
4212 		/* Needn't recheck that slot on future iterations */
4213 		endlogSegNo++;
4214 	}
4215 	else
4216 	{
4217 		/* No need for any more future segments... */
4218 		int			rc;
4219 
4220 		ereport(DEBUG2,
4221 				(errmsg("removing write-ahead log file \"%s\"",
4222 						segname)));
4223 
4224 #ifdef WIN32
4225 
4226 		/*
4227 		 * On Windows, if another process (e.g another backend) holds the file
4228 		 * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
4229 		 * will still show up in directory listing until the last handle is
4230 		 * closed. To avoid confusing the lingering deleted file for a live
4231 		 * WAL file that needs to be archived, rename it before deleting it.
4232 		 *
4233 		 * If another process holds the file open without FILE_SHARE_DELETE
4234 		 * flag, rename will fail. We'll try again at the next checkpoint.
4235 		 */
4236 		snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4237 		if (rename(path, newpath) != 0)
4238 		{
4239 			ereport(LOG,
4240 					(errcode_for_file_access(),
4241 					 errmsg("could not rename file \"%s\": %m",
4242 							path)));
4243 			return;
4244 		}
4245 		rc = durable_unlink(newpath, LOG);
4246 #else
4247 		rc = durable_unlink(path, LOG);
4248 #endif
4249 		if (rc != 0)
4250 		{
4251 			/* Message already logged by durable_unlink() */
4252 			return;
4253 		}
4254 		CheckpointStats.ckpt_segs_removed++;
4255 	}
4256 
4257 	XLogArchiveCleanup(segname);
4258 }
4259 
4260 /*
4261  * Verify whether pg_wal and pg_wal/archive_status exist.
4262  * If the latter does not exist, recreate it.
4263  *
4264  * It is not the goal of this function to verify the contents of these
4265  * directories, but to help in cases where someone has performed a cluster
4266  * copy for PITR purposes but omitted pg_wal from the copy.
4267  *
4268  * We could also recreate pg_wal if it doesn't exist, but a deliberate
4269  * policy decision was made not to.  It is fairly common for pg_wal to be
4270  * a symlink, and if that was the DBA's intent then automatically making a
4271  * plain directory would result in degraded performance with no notice.
4272  */
4273 static void
ValidateXLOGDirectoryStructure(void)4274 ValidateXLOGDirectoryStructure(void)
4275 {
4276 	char		path[MAXPGPATH];
4277 	struct stat stat_buf;
4278 
4279 	/* Check for pg_wal; if it doesn't exist, error out */
4280 	if (stat(XLOGDIR, &stat_buf) != 0 ||
4281 		!S_ISDIR(stat_buf.st_mode))
4282 		ereport(FATAL,
4283 				(errmsg("required WAL directory \"%s\" does not exist",
4284 						XLOGDIR)));
4285 
4286 	/* Check for archive_status */
4287 	snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4288 	if (stat(path, &stat_buf) == 0)
4289 	{
4290 		/* Check for weird cases where it exists but isn't a directory */
4291 		if (!S_ISDIR(stat_buf.st_mode))
4292 			ereport(FATAL,
4293 					(errmsg("required WAL directory \"%s\" does not exist",
4294 							path)));
4295 	}
4296 	else
4297 	{
4298 		ereport(LOG,
4299 				(errmsg("creating missing WAL directory \"%s\"", path)));
4300 		if (MakePGDirectory(path) < 0)
4301 			ereport(FATAL,
4302 					(errmsg("could not create missing directory \"%s\": %m",
4303 							path)));
4304 	}
4305 }
4306 
4307 /*
4308  * Remove previous backup history files.  This also retries creation of
4309  * .ready files for any backup history files for which XLogArchiveNotify
4310  * failed earlier.
4311  */
4312 static void
CleanupBackupHistory(void)4313 CleanupBackupHistory(void)
4314 {
4315 	DIR		   *xldir;
4316 	struct dirent *xlde;
4317 	char		path[MAXPGPATH + sizeof(XLOGDIR)];
4318 
4319 	xldir = AllocateDir(XLOGDIR);
4320 
4321 	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4322 	{
4323 		if (IsBackupHistoryFileName(xlde->d_name))
4324 		{
4325 			if (XLogArchiveCheckDone(xlde->d_name))
4326 			{
4327 				elog(DEBUG2, "removing WAL backup history file \"%s\"",
4328 					 xlde->d_name);
4329 				snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
4330 				unlink(path);
4331 				XLogArchiveCleanup(xlde->d_name);
4332 			}
4333 		}
4334 	}
4335 
4336 	FreeDir(xldir);
4337 }
4338 
4339 /*
4340  * Attempt to read the next XLOG record.
4341  *
4342  * Before first call, the reader needs to be positioned to the first record
4343  * by calling XLogBeginRead().
4344  *
4345  * If no valid record is available, returns NULL, or fails if emode is PANIC.
4346  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
4347  * record is available.
4348  */
4349 static XLogRecord *
ReadRecord(XLogReaderState * xlogreader,int emode,bool fetching_ckpt)4350 ReadRecord(XLogReaderState *xlogreader, int emode,
4351 		   bool fetching_ckpt)
4352 {
4353 	XLogRecord *record;
4354 	XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
4355 
4356 	/* Pass through parameters to XLogPageRead */
4357 	private->fetching_ckpt = fetching_ckpt;
4358 	private->emode = emode;
4359 	private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
4360 
4361 	/* This is the first attempt to read this page. */
4362 	lastSourceFailed = false;
4363 
4364 	for (;;)
4365 	{
4366 		char	   *errormsg;
4367 
4368 		record = XLogReadRecord(xlogreader, &errormsg);
4369 		ReadRecPtr = xlogreader->ReadRecPtr;
4370 		EndRecPtr = xlogreader->EndRecPtr;
4371 		if (record == NULL)
4372 		{
4373 			/*
4374 			 * When not in standby mode we find that WAL ends in an incomplete
4375 			 * record, keep track of that record.  After recovery is done,
4376 			 * we'll write a record to indicate downstream WAL readers that
4377 			 * that portion is to be ignored.
4378 			 */
4379 			if (!StandbyMode &&
4380 				!XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
4381 			{
4382 				abortedRecPtr = xlogreader->abortedRecPtr;
4383 				missingContrecPtr = xlogreader->missingContrecPtr;
4384 			}
4385 
4386 			if (readFile >= 0)
4387 			{
4388 				close(readFile);
4389 				readFile = -1;
4390 			}
4391 
4392 			/*
4393 			 * We only end up here without a message when XLogPageRead()
4394 			 * failed - in that case we already logged something. In
4395 			 * StandbyMode that only happens if we have been triggered, so we
4396 			 * shouldn't loop anymore in that case.
4397 			 */
4398 			if (errormsg)
4399 				ereport(emode_for_corrupt_record(emode, EndRecPtr),
4400 						(errmsg_internal("%s", errormsg) /* already translated */ ));
4401 		}
4402 
4403 		/*
4404 		 * Check page TLI is one of the expected values.
4405 		 */
4406 		else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
4407 		{
4408 			char		fname[MAXFNAMELEN];
4409 			XLogSegNo	segno;
4410 			int32		offset;
4411 
4412 			XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
4413 			offset = XLogSegmentOffset(xlogreader->latestPagePtr,
4414 									   wal_segment_size);
4415 			XLogFileName(fname, xlogreader->seg.ws_tli, segno,
4416 						 wal_segment_size);
4417 			ereport(emode_for_corrupt_record(emode, EndRecPtr),
4418 					(errmsg("unexpected timeline ID %u in log segment %s, offset %u",
4419 							xlogreader->latestPageTLI,
4420 							fname,
4421 							offset)));
4422 			record = NULL;
4423 		}
4424 
4425 		if (record)
4426 		{
4427 			/* Great, got a record */
4428 			return record;
4429 		}
4430 		else
4431 		{
4432 			/* No valid record available from this source */
4433 			lastSourceFailed = true;
4434 
4435 			/*
4436 			 * If archive recovery was requested, but we were still doing
4437 			 * crash recovery, switch to archive recovery and retry using the
4438 			 * offline archive. We have now replayed all the valid WAL in
4439 			 * pg_wal, so we are presumably now consistent.
4440 			 *
4441 			 * We require that there's at least some valid WAL present in
4442 			 * pg_wal, however (!fetching_ckpt).  We could recover using the
4443 			 * WAL from the archive, even if pg_wal is completely empty, but
4444 			 * we'd have no idea how far we'd have to replay to reach
4445 			 * consistency.  So err on the safe side and give up.
4446 			 */
4447 			if (!InArchiveRecovery && ArchiveRecoveryRequested &&
4448 				!fetching_ckpt)
4449 			{
4450 				ereport(DEBUG1,
4451 						(errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
4452 				InArchiveRecovery = true;
4453 				if (StandbyModeRequested)
4454 					StandbyMode = true;
4455 
4456 				/* initialize minRecoveryPoint to this record */
4457 				LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4458 				ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
4459 				if (ControlFile->minRecoveryPoint < EndRecPtr)
4460 				{
4461 					ControlFile->minRecoveryPoint = EndRecPtr;
4462 					ControlFile->minRecoveryPointTLI = ThisTimeLineID;
4463 				}
4464 				/* update local copy */
4465 				minRecoveryPoint = ControlFile->minRecoveryPoint;
4466 				minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4467 
4468 				/*
4469 				 * The startup process can update its local copy of
4470 				 * minRecoveryPoint from this point.
4471 				 */
4472 				updateMinRecoveryPoint = true;
4473 
4474 				UpdateControlFile();
4475 
4476 				/*
4477 				 * We update SharedRecoveryState while holding the lock on
4478 				 * ControlFileLock so both states are consistent in shared
4479 				 * memory.
4480 				 */
4481 				SpinLockAcquire(&XLogCtl->info_lck);
4482 				XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
4483 				SpinLockRelease(&XLogCtl->info_lck);
4484 
4485 				LWLockRelease(ControlFileLock);
4486 
4487 				CheckRecoveryConsistency();
4488 
4489 				/*
4490 				 * Before we retry, reset lastSourceFailed and currentSource
4491 				 * so that we will check the archive next.
4492 				 */
4493 				lastSourceFailed = false;
4494 				currentSource = XLOG_FROM_ANY;
4495 
4496 				continue;
4497 			}
4498 
4499 			/* In standby mode, loop back to retry. Otherwise, give up. */
4500 			if (StandbyMode && !CheckForStandbyTrigger())
4501 				continue;
4502 			else
4503 				return NULL;
4504 		}
4505 	}
4506 }
4507 
4508 /*
4509  * Scan for new timelines that might have appeared in the archive since we
4510  * started recovery.
4511  *
4512  * If there are any, the function changes recovery target TLI to the latest
4513  * one and returns 'true'.
4514  */
4515 static bool
rescanLatestTimeLine(void)4516 rescanLatestTimeLine(void)
4517 {
4518 	List	   *newExpectedTLEs;
4519 	bool		found;
4520 	ListCell   *cell;
4521 	TimeLineID	newtarget;
4522 	TimeLineID	oldtarget = recoveryTargetTLI;
4523 	TimeLineHistoryEntry *currentTle = NULL;
4524 
4525 	newtarget = findNewestTimeLine(recoveryTargetTLI);
4526 	if (newtarget == recoveryTargetTLI)
4527 	{
4528 		/* No new timelines found */
4529 		return false;
4530 	}
4531 
4532 	/*
4533 	 * Determine the list of expected TLIs for the new TLI
4534 	 */
4535 
4536 	newExpectedTLEs = readTimeLineHistory(newtarget);
4537 
4538 	/*
4539 	 * If the current timeline is not part of the history of the new timeline,
4540 	 * we cannot proceed to it.
4541 	 */
4542 	found = false;
4543 	foreach(cell, newExpectedTLEs)
4544 	{
4545 		currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4546 
4547 		if (currentTle->tli == recoveryTargetTLI)
4548 		{
4549 			found = true;
4550 			break;
4551 		}
4552 	}
4553 	if (!found)
4554 	{
4555 		ereport(LOG,
4556 				(errmsg("new timeline %u is not a child of database system timeline %u",
4557 						newtarget,
4558 						ThisTimeLineID)));
4559 		return false;
4560 	}
4561 
4562 	/*
4563 	 * The current timeline was found in the history file, but check that the
4564 	 * next timeline was forked off from it *after* the current recovery
4565 	 * location.
4566 	 */
4567 	if (currentTle->end < EndRecPtr)
4568 	{
4569 		ereport(LOG,
4570 				(errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4571 						newtarget,
4572 						ThisTimeLineID,
4573 						(uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
4574 		return false;
4575 	}
4576 
4577 	/* The new timeline history seems valid. Switch target */
4578 	recoveryTargetTLI = newtarget;
4579 	list_free_deep(expectedTLEs);
4580 	expectedTLEs = newExpectedTLEs;
4581 
4582 	/*
4583 	 * As in StartupXLOG(), try to ensure we have all the history files
4584 	 * between the old target and new target in pg_wal.
4585 	 */
4586 	restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4587 
4588 	ereport(LOG,
4589 			(errmsg("new target timeline is %u",
4590 					recoveryTargetTLI)));
4591 
4592 	return true;
4593 }
4594 
4595 /*
4596  * I/O routines for pg_control
4597  *
4598  * *ControlFile is a buffer in shared memory that holds an image of the
4599  * contents of pg_control.  WriteControlFile() initializes pg_control
4600  * given a preloaded buffer, ReadControlFile() loads the buffer from
4601  * the pg_control file (during postmaster or standalone-backend startup),
4602  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4603  * InitControlFile() fills the buffer with initial values.
4604  *
4605  * For simplicity, WriteControlFile() initializes the fields of pg_control
4606  * that are related to checking backend/database compatibility, and
4607  * ReadControlFile() verifies they are correct.  We could split out the
4608  * I/O and compatibility-check functions, but there seems no need currently.
4609  */
4610 
4611 static void
InitControlFile(uint64 sysidentifier)4612 InitControlFile(uint64 sysidentifier)
4613 {
4614 	char		mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
4615 
4616 	/*
4617 	 * Generate a random nonce. This is used for authentication requests that
4618 	 * will fail because the user does not exist. The nonce is used to create
4619 	 * a genuine-looking password challenge for the non-existent user, in lieu
4620 	 * of an actual stored password.
4621 	 */
4622 	if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
4623 		ereport(PANIC,
4624 				(errcode(ERRCODE_INTERNAL_ERROR),
4625 				 errmsg("could not generate secret authorization token")));
4626 
4627 	memset(ControlFile, 0, sizeof(ControlFileData));
4628 	/* Initialize pg_control status fields */
4629 	ControlFile->system_identifier = sysidentifier;
4630 	memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
4631 	ControlFile->state = DB_SHUTDOWNED;
4632 	ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
4633 
4634 	/* Set important parameter values for use when replaying WAL */
4635 	ControlFile->MaxConnections = MaxConnections;
4636 	ControlFile->max_worker_processes = max_worker_processes;
4637 	ControlFile->max_wal_senders = max_wal_senders;
4638 	ControlFile->max_prepared_xacts = max_prepared_xacts;
4639 	ControlFile->max_locks_per_xact = max_locks_per_xact;
4640 	ControlFile->wal_level = wal_level;
4641 	ControlFile->wal_log_hints = wal_log_hints;
4642 	ControlFile->track_commit_timestamp = track_commit_timestamp;
4643 	ControlFile->data_checksum_version = bootstrap_data_checksum_version;
4644 }
4645 
/*
 * Create and write out the initial pg_control file.
 *
 * The caller must already have filled ControlFile's state fields (see
 * InitControlFile()).  This routine fills in the version/compatibility
 * fields, computes the CRC, pads the image to PG_CONTROL_FILE_SIZE, and
 * creates the file from scratch (O_EXCL: the file must not already exist).
 * Any failure is PANIC, since a cluster without pg_control is unusable.
 */
static void
WriteControlFile(void)
{
	int			fd;
	char		buffer[PG_CONTROL_FILE_SIZE];	/* need not be aligned */

	/*
	 * Ensure that the size of the pg_control data structure is sane.  See the
	 * comments for these symbols in pg_control.h.
	 */
	StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
					 "pg_control is too large for atomic disk writes");
	StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
					 "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");

	/*
	 * Initialize version and compatibility-check fields
	 */
	ControlFile->pg_control_version = PG_CONTROL_VERSION;
	ControlFile->catalog_version_no = CATALOG_VERSION_NO;

	ControlFile->maxAlign = MAXIMUM_ALIGNOF;
	ControlFile->floatFormat = FLOATFORMAT_VALUE;

	ControlFile->blcksz = BLCKSZ;
	ControlFile->relseg_size = RELSEG_SIZE;
	ControlFile->xlog_blcksz = XLOG_BLCKSZ;
	ControlFile->xlog_seg_size = wal_segment_size;

	ControlFile->nameDataLen = NAMEDATALEN;
	ControlFile->indexMaxKeys = INDEX_MAX_KEYS;

	ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
	ControlFile->loblksize = LOBLKSIZE;

	ControlFile->float8ByVal = FLOAT8PASSBYVAL;

	/* Contents are protected with a CRC; covers everything up to the crc field */
	INIT_CRC32C(ControlFile->crc);
	COMP_CRC32C(ControlFile->crc,
				(char *) ControlFile,
				offsetof(ControlFileData, crc));
	FIN_CRC32C(ControlFile->crc);

	/*
	 * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
	 * the excess over sizeof(ControlFileData).  This reduces the odds of
	 * premature-EOF errors when reading pg_control.  We'll still fail when we
	 * check the contents of the file, but hopefully with a more specific
	 * error than "couldn't read pg_control".
	 */
	memset(buffer, 0, PG_CONTROL_FILE_SIZE);
	memcpy(buffer, ControlFile, sizeof(ControlFileData));

	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
	if (fd < 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m",
						XLOG_CONTROL_FILE)));

	/* Clear errno so a short write without errno set is detectable below. */
	errno = 0;
	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
	if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m",
						XLOG_CONTROL_FILE)));
	}
	pgstat_report_wait_end();

	/* Force the new file to stable storage before declaring success. */
	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
	if (pg_fsync(fd) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m",
						XLOG_CONTROL_FILE)));
	pgstat_report_wait_end();

	if (close(fd) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m",
						XLOG_CONTROL_FILE)));
}
4736 
/*
 * Read pg_control into the buffer pointed to by ControlFile, verify its
 * CRC and every compatibility field, and derive settings that depend on
 * its contents (notably wal_segment_size and the GUCs reflecting initdb
 * choices).  Any incompatibility with this server build is reported at
 * FATAL; invalid derived settings are reported at ERROR.
 */
static void
ReadControlFile(void)
{
	pg_crc32c	crc;
	int			fd;
	static char wal_segsz_str[20];
	int			r;

	/*
	 * Read data...
	 */
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | PG_BINARY);
	if (fd < 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m",
						XLOG_CONTROL_FILE)));

	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
	r = read(fd, ControlFile, sizeof(ControlFileData));
	if (r != sizeof(ControlFileData))
	{
		/* Distinguish a hard read error from a short (truncated) read. */
		if (r < 0)
			ereport(PANIC,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							XLOG_CONTROL_FILE)));
		else
			ereport(PANIC,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("could not read file \"%s\": read %d of %zu",
							XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
	}
	pgstat_report_wait_end();

	close(fd);

	/*
	 * Check for expected pg_control format version.  If this is wrong, the
	 * CRC check will likely fail because we'll be checking the wrong number
	 * of bytes.  Complaining about wrong version will probably be more
	 * enlightening than complaining about wrong CRC.
	 */

	/*
	 * A version whose low 16 bits are zero but high 16 bits are not looks
	 * like a byte-swapped version number, so give a hint about mismatched
	 * byte ordering in that case.
	 */
	if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
						   " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
						   ControlFile->pg_control_version, ControlFile->pg_control_version,
						   PG_CONTROL_VERSION, PG_CONTROL_VERSION),
				 errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));

	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
						   " but the server was compiled with PG_CONTROL_VERSION %d.",
						   ControlFile->pg_control_version, PG_CONTROL_VERSION),
				 errhint("It looks like you need to initdb.")));

	/* Now check the CRC. */
	INIT_CRC32C(crc);
	COMP_CRC32C(crc,
				(char *) ControlFile,
				offsetof(ControlFileData, crc));
	FIN_CRC32C(crc);

	if (!EQ_CRC32C(crc, ControlFile->crc))
		ereport(FATAL,
				(errmsg("incorrect checksum in control file")));

	/*
	 * Do compatibility checking immediately.  If the database isn't
	 * compatible with the backend executable, we want to abort before we can
	 * possibly do any damage.
	 */
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
						   " but the server was compiled with CATALOG_VERSION_NO %d.",
						   ControlFile->catalog_version_no, CATALOG_VERSION_NO),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with MAXALIGN %d,"
						   " but the server was compiled with MAXALIGN %d.",
						   ControlFile->maxAlign, MAXIMUM_ALIGNOF),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->blcksz != BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with BLCKSZ %d,"
						   " but the server was compiled with BLCKSZ %d.",
						   ControlFile->blcksz, BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->relseg_size != RELSEG_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
						   " but the server was compiled with RELSEG_SIZE %d.",
						   ControlFile->relseg_size, RELSEG_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
						   " but the server was compiled with XLOG_BLCKSZ %d.",
						   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->nameDataLen != NAMEDATALEN)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
						   " but the server was compiled with NAMEDATALEN %d.",
						   ControlFile->nameDataLen, NAMEDATALEN),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
						   " but the server was compiled with INDEX_MAX_KEYS %d.",
						   ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
						   " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
						   ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->loblksize != LOBLKSIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with LOBLKSIZE %d,"
						   " but the server was compiled with LOBLKSIZE %d.",
						   ControlFile->loblksize, (int) LOBLKSIZE),
				 errhint("It looks like you need to recompile or initdb.")));

#ifdef USE_FLOAT8_BYVAL
	if (ControlFile->float8ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
						   " but the server was compiled with USE_FLOAT8_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float8ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
						   " but the server was compiled without USE_FLOAT8_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#endif

	/* The WAL segment size in use comes from the control file. */
	wal_segment_size = ControlFile->xlog_seg_size;

	if (!IsValidWalSegSize(wal_segment_size))
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
									  "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
									  wal_segment_size,
									  wal_segment_size)));

	/* Expose the value as the (internal, read-only) wal_segment_size GUC. */
	snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
	SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
					PGC_S_OVERRIDE);

	/* check and update variables dependent on wal_segment_size */
	if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\"")));

	if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\"")));

	UsableBytesInSegment =
		(wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
		(SizeOfXLogLongPHD - SizeOfXLogShortPHD);

	CalculateCheckpointSegments();

	/* Make the initdb settings visible as GUC variables, too */
	SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
					PGC_INTERNAL, PGC_S_OVERRIDE);
}
4932 
4933 /*
4934  * Utility wrapper to update the control file.  Note that the control
4935  * file gets flushed.
4936  */
4937 void
UpdateControlFile(void)4938 UpdateControlFile(void)
4939 {
4940 	update_controlfile(DataDir, ControlFile, true);
4941 }
4942 
4943 /*
4944  * Returns the unique system identifier from control file.
4945  */
4946 uint64
GetSystemIdentifier(void)4947 GetSystemIdentifier(void)
4948 {
4949 	Assert(ControlFile != NULL);
4950 	return ControlFile->system_identifier;
4951 }
4952 
4953 /*
4954  * Returns the random nonce from control file.
4955  */
4956 char *
GetMockAuthenticationNonce(void)4957 GetMockAuthenticationNonce(void)
4958 {
4959 	Assert(ControlFile != NULL);
4960 	return ControlFile->mock_authentication_nonce;
4961 }
4962 
4963 /*
4964  * Are checksums enabled for data pages?
4965  */
4966 bool
DataChecksumsEnabled(void)4967 DataChecksumsEnabled(void)
4968 {
4969 	Assert(ControlFile != NULL);
4970 	return (ControlFile->data_checksum_version > 0);
4971 }
4972 
4973 /*
4974  * Returns a fake LSN for unlogged relations.
4975  *
4976  * Each call generates an LSN that is greater than any previous value
4977  * returned. The current counter value is saved and restored across clean
4978  * shutdowns, but like unlogged relations, does not survive a crash. This can
4979  * be used in lieu of real LSN values returned by XLogInsert, if you need an
4980  * LSN-like increasing sequence of numbers without writing any WAL.
4981  */
4982 XLogRecPtr
GetFakeLSNForUnloggedRel(void)4983 GetFakeLSNForUnloggedRel(void)
4984 {
4985 	XLogRecPtr	nextUnloggedLSN;
4986 
4987 	/* increment the unloggedLSN counter, need SpinLock */
4988 	SpinLockAcquire(&XLogCtl->ulsn_lck);
4989 	nextUnloggedLSN = XLogCtl->unloggedLSN++;
4990 	SpinLockRelease(&XLogCtl->ulsn_lck);
4991 
4992 	return nextUnloggedLSN;
4993 }
4994 
4995 /*
4996  * Auto-tune the number of XLOG buffers.
4997  *
4998  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4999  * a maximum of one XLOG segment (there is little reason to think that more
5000  * is helpful, at least so long as we force an fsync when switching log files)
5001  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
5002  * 9.1, when auto-tuning was added).
5003  *
5004  * This should not be called until NBuffers has received its final value.
5005  */
5006 static int
XLOGChooseNumBuffers(void)5007 XLOGChooseNumBuffers(void)
5008 {
5009 	int			xbuffers;
5010 
5011 	xbuffers = NBuffers / 32;
5012 	if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
5013 		xbuffers = (wal_segment_size / XLOG_BLCKSZ);
5014 	if (xbuffers < 8)
5015 		xbuffers = 8;
5016 	return xbuffers;
5017 }
5018 
5019 /*
5020  * GUC check_hook for wal_buffers
5021  */
5022 bool
check_wal_buffers(int * newval,void ** extra,GucSource source)5023 check_wal_buffers(int *newval, void **extra, GucSource source)
5024 {
5025 	/*
5026 	 * -1 indicates a request for auto-tune.
5027 	 */
5028 	if (*newval == -1)
5029 	{
5030 		/*
5031 		 * If we haven't yet changed the boot_val default of -1, just let it
5032 		 * be.  We'll fix it when XLOGShmemSize is called.
5033 		 */
5034 		if (XLOGbuffers == -1)
5035 			return true;
5036 
5037 		/* Otherwise, substitute the auto-tune value */
5038 		*newval = XLOGChooseNumBuffers();
5039 	}
5040 
5041 	/*
5042 	 * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
5043 	 * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
5044 	 * the case, we just silently treat such values as a request for the
5045 	 * minimum.  (We could throw an error instead, but that doesn't seem very
5046 	 * helpful.)
5047 	 */
5048 	if (*newval < 4)
5049 		*newval = 4;
5050 
5051 	return true;
5052 }
5053 
5054 /*
5055  * Read the control file, set respective GUCs.
5056  *
5057  * This is to be called during startup, including a crash recovery cycle,
5058  * unless in bootstrap mode, where no control file yet exists.  As there's no
5059  * usable shared memory yet (its sizing can depend on the contents of the
5060  * control file!), first store the contents in local memory. XLOGShmemInit()
5061  * will then copy it to shared memory later.
5062  *
5063  * reset just controls whether previous contents are to be expected (in the
5064  * reset case, there's a dangling pointer into old shared memory), or not.
5065  */
5066 void
LocalProcessControlFile(bool reset)5067 LocalProcessControlFile(bool reset)
5068 {
5069 	Assert(reset || ControlFile == NULL);
5070 	ControlFile = palloc(sizeof(ControlFileData));
5071 	ReadControlFile();
5072 }
5073 
5074 /*
5075  * Initialization of shared memory for XLOG
5076  */
5077 Size
XLOGShmemSize(void)5078 XLOGShmemSize(void)
5079 {
5080 	Size		size;
5081 
5082 	/*
5083 	 * If the value of wal_buffers is -1, use the preferred auto-tune value.
5084 	 * This isn't an amazingly clean place to do this, but we must wait till
5085 	 * NBuffers has received its final value, and must do it before using the
5086 	 * value of XLOGbuffers to do anything important.
5087 	 */
5088 	if (XLOGbuffers == -1)
5089 	{
5090 		char		buf[32];
5091 
5092 		snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
5093 		SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
5094 	}
5095 	Assert(XLOGbuffers > 0);
5096 
5097 	/* XLogCtl */
5098 	size = sizeof(XLogCtlData);
5099 
5100 	/* WAL insertion locks, plus alignment */
5101 	size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
5102 	/* xlblocks array */
5103 	size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
5104 	/* extra alignment padding for XLOG I/O buffers */
5105 	size = add_size(size, XLOG_BLCKSZ);
5106 	/* and the buffers themselves */
5107 	size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
5108 
5109 	/*
5110 	 * Note: we don't count ControlFileData, it comes out of the "slop factor"
5111 	 * added by CreateSharedMemoryAndSemaphores.  This lets us use this
5112 	 * routine again below to compute the actual allocation size.
5113 	 */
5114 
5115 	return size;
5116 }
5117 
/*
 * Initialize, or attach to, the XLOG-related shared memory structures:
 * XLogCtl itself, the shared copy of pg_control, the WAL insertion locks,
 * the xlblocks array, and the block-aligned WAL page buffers.  The memory
 * layout carved out here must match the size computed by XLOGShmemSize().
 */
void
XLOGShmemInit(void)
{
	bool		foundCFile,
				foundXLog;
	char	   *allocptr;
	int			i;
	ControlFileData *localControlFile;

#ifdef WAL_DEBUG

	/*
	 * Create a memory context for WAL debugging that's exempt from the normal
	 * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
	 * an allocation fails, but wal_debug is not for production use anyway.
	 */
	if (walDebugCxt == NULL)
	{
		walDebugCxt = AllocSetContextCreate(TopMemoryContext,
											"WAL Debug",
											ALLOCSET_DEFAULT_SIZES);
		MemoryContextAllowInCriticalSection(walDebugCxt, true);
	}
#endif


	XLogCtl = (XLogCtlData *)
		ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);

	/*
	 * Save any backend-local copy of the control file before repointing
	 * ControlFile at the shared-memory struct; its contents are copied into
	 * shared memory below, then the local copy is freed.
	 */
	localControlFile = ControlFile;
	ControlFile = (ControlFileData *)
		ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);

	if (foundCFile || foundXLog)
	{
		/* both should be present or neither */
		Assert(foundCFile && foundXLog);

		/* Initialize local copy of WALInsertLocks */
		WALInsertLocks = XLogCtl->Insert.WALInsertLocks;

		if (localControlFile)
			pfree(localControlFile);
		return;
	}
	memset(XLogCtl, 0, sizeof(XLogCtlData));

	/*
	 * Already have read control file locally, unless in bootstrap mode. Move
	 * contents into shared memory.
	 */
	if (localControlFile)
	{
		memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
		pfree(localControlFile);
	}

	/*
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
	 * multiple of the alignment for same, so no extra alignment padding is
	 * needed here.
	 */
	allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
	XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
	allocptr += sizeof(XLogRecPtr) * XLOGbuffers;


	/* WAL insertion locks. Ensure they're aligned to the full padded size */
	allocptr += sizeof(WALInsertLockPadded) -
		((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
	WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
		(WALInsertLockPadded *) allocptr;
	allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;

	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
	{
		LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
		WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
		WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
	}

	/*
	 * Align the start of the page buffers to a full xlog block size boundary.
	 * This simplifies some calculations in XLOG insertion. It is also
	 * required for O_DIRECT.
	 */
	allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
	XLogCtl->pages = allocptr;
	memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);

	/*
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
	 * in additional info.)
	 */
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
	XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
	XLogCtl->SharedHotStandbyActive = false;
	XLogCtl->SharedPromoteIsTriggered = false;
	XLogCtl->WalWriterSleeping = false;

	SpinLockInit(&XLogCtl->Insert.insertpos_lck);
	SpinLockInit(&XLogCtl->info_lck);
	SpinLockInit(&XLogCtl->ulsn_lck);
	InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
}
5224 
5225 /*
5226  * This func must be called ONCE on system install.  It creates pg_control
5227  * and the initial XLOG segment.
5228  */
void
BootStrapXLOG(void)
{
	CheckPoint	checkPoint;
	char	   *buffer;
	XLogPageHeader page;
	XLogLongPageHeader longpage;
	XLogRecord *record;
	char	   *recptr;
	bool		use_existent;
	uint64		sysidentifier;
	struct timeval tv;
	pg_crc32c	crc;

	/*
	 * Select a hopefully-unique system identifier code for this installation.
	 * We use the result of gettimeofday(), including the fractional seconds
	 * field, as being about as unique as we can easily get.  (Think not to
	 * use random(), since it hasn't been seeded and there's no portable way
	 * to seed it other than the system clock value...)  The upper half of the
	 * uint64 value is just the tv_sec part, while the lower half contains the
	 * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
	 * PID for a little extra uniqueness.  A person knowing this encoding can
	 * determine the initialization time of the installation, which could
	 * perhaps be useful sometimes.
	 */
	gettimeofday(&tv, NULL);
	sysidentifier = ((uint64) tv.tv_sec) << 32;
	sysidentifier |= ((uint64) tv.tv_usec) << 12;
	sysidentifier |= getpid() & 0xFFF;

	/* First timeline ID is always 1 */
	ThisTimeLineID = 1;

	/* page buffer must be aligned suitably for O_DIRECT */
	buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
	page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
	memset(page, 0, XLOG_BLCKSZ);

	/*
	 * Set up information for the initial checkpoint record
	 *
	 * The initial checkpoint record is written to the beginning of the WAL
	 * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
	 * used, so that we can use 0/0 to mean "before any valid WAL segment".
	 */
	checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
	checkPoint.ThisTimeLineID = ThisTimeLineID;
	checkPoint.PrevTimeLineID = ThisTimeLineID;
	checkPoint.fullPageWrites = fullPageWrites;
	checkPoint.nextFullXid =
		FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
	checkPoint.nextOid = FirstBootstrapObjectId;
	checkPoint.nextMulti = FirstMultiXactId;
	checkPoint.nextMultiOffset = 0;
	checkPoint.oldestXid = FirstNormalTransactionId;
	checkPoint.oldestXidDB = TemplateDbOid;
	checkPoint.oldestMulti = FirstMultiXactId;
	checkPoint.oldestMultiDB = TemplateDbOid;
	checkPoint.oldestCommitTsXid = InvalidTransactionId;
	checkPoint.newestCommitTsXid = InvalidTransactionId;
	checkPoint.time = (pg_time_t) time(NULL);
	checkPoint.oldestActiveXid = InvalidTransactionId;

	/* Seed the shared transaction/OID/multixact counters from the checkpoint */
	ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
	ShmemVariableCache->oidCount = 0;
	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
	AdvanceOldestClogXid(checkPoint.oldestXid);
	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
	SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
	SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);

	/* Set up the XLOG page header */
	page->xlp_magic = XLOG_PAGE_MAGIC;
	page->xlp_info = XLP_LONG_HEADER;
	page->xlp_tli = ThisTimeLineID;
	page->xlp_pageaddr = wal_segment_size;
	longpage = (XLogLongPageHeader) page;
	longpage->xlp_sysid = sysidentifier;
	longpage->xlp_seg_size = wal_segment_size;
	longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;

	/* Insert the initial checkpoint record */
	recptr = ((char *) page + SizeOfXLogLongPHD);
	record = (XLogRecord *) recptr;
	record->xl_prev = 0;		/* no previous record before the very first */
	record->xl_xid = InvalidTransactionId;
	record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
	record->xl_rmid = RM_XLOG_ID;
	recptr += SizeOfXLogRecord;
	/* fill the XLogRecordDataHeaderShort struct */
	*(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
	*(recptr++) = sizeof(checkPoint);
	memcpy(recptr, &checkPoint, sizeof(checkPoint));
	recptr += sizeof(checkPoint);
	Assert(recptr - (char *) record == record->xl_tot_len);

	/*
	 * Compute the record CRC: payload first, then the header fields up to
	 * (but excluding) xl_crc itself.
	 */
	INIT_CRC32C(crc);
	COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
	COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
	FIN_CRC32C(crc);
	record->xl_crc = crc;

	/* Create first XLOG segment file; use_existent=false forces a fresh file */
	use_existent = false;
	openLogFile = XLogFileInit(1, &use_existent, false);

	/*
	 * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
	 * close the file again in a moment.
	 */

	/* Write the first page with the initial record */
	errno = 0;
	pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
	if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not write bootstrap write-ahead log file: %m")));
	}
	pgstat_report_wait_end();

	pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
	if (pg_fsync(openLogFile) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not fsync bootstrap write-ahead log file: %m")));
	pgstat_report_wait_end();

	if (close(openLogFile) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close bootstrap write-ahead log file: %m")));

	openLogFile = -1;

	/* Now create pg_control */
	InitControlFile(sysidentifier);
	ControlFile->time = checkPoint.time;
	ControlFile->checkPoint = checkPoint.redo;
	ControlFile->checkPointCopy = checkPoint;

	/* some additional ControlFile fields are set in WriteControlFile() */
	WriteControlFile();

	/* Bootstrap the commit log, too */
	BootStrapCLOG();
	BootStrapCommitTs();
	BootStrapSUBTRANS();
	BootStrapMultiXact();

	pfree(buffer);

	/*
	 * Force control file to be read - in contrast to normal processing we'd
	 * otherwise never run the checks and GUC related initializations therein.
	 */
	ReadControlFile();
}
5394 
5395 static char *
str_time(pg_time_t tnow)5396 str_time(pg_time_t tnow)
5397 {
5398 	static char buf[128];
5399 
5400 	pg_strftime(buf, sizeof(buf),
5401 				"%Y-%m-%d %H:%M:%S %Z",
5402 				pg_localtime(&tnow, log_timezone));
5403 
5404 	return buf;
5405 }
5406 
5407 /*
5408  * See if there are any recovery signal files and if so, set state for
5409  * recovery.
5410  *
5411  * See if there is a recovery command file (recovery.conf), and if so
5412  * throw an ERROR since as of PG12 we no longer recognize that.
5413  */
5414 static void
readRecoverySignalFile(void)5415 readRecoverySignalFile(void)
5416 {
5417 	struct stat stat_buf;
5418 
5419 	if (IsBootstrapProcessingMode())
5420 		return;
5421 
5422 	/*
5423 	 * Check for old recovery API file: recovery.conf
5424 	 */
5425 	if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
5426 		ereport(FATAL,
5427 				(errcode_for_file_access(),
5428 				 errmsg("using recovery command file \"%s\" is not supported",
5429 						RECOVERY_COMMAND_FILE)));
5430 
5431 	/*
5432 	 * Remove unused .done file, if present. Ignore if absent.
5433 	 */
5434 	unlink(RECOVERY_COMMAND_DONE);
5435 
5436 	/*
5437 	 * Check for recovery signal files and if found, fsync them since they
5438 	 * represent server state information.  We don't sweat too much about the
5439 	 * possibility of fsync failure, however.
5440 	 *
5441 	 * If present, standby signal file takes precedence. If neither is present
5442 	 * then we won't enter archive recovery.
5443 	 */
5444 	if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
5445 	{
5446 		int			fd;
5447 
5448 		fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
5449 							   S_IRUSR | S_IWUSR);
5450 		if (fd >= 0)
5451 		{
5452 			(void) pg_fsync(fd);
5453 			close(fd);
5454 		}
5455 		standby_signal_file_found = true;
5456 	}
5457 	else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
5458 	{
5459 		int			fd;
5460 
5461 		fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
5462 							   S_IRUSR | S_IWUSR);
5463 		if (fd >= 0)
5464 		{
5465 			(void) pg_fsync(fd);
5466 			close(fd);
5467 		}
5468 		recovery_signal_file_found = true;
5469 	}
5470 
5471 	StandbyModeRequested = false;
5472 	ArchiveRecoveryRequested = false;
5473 	if (standby_signal_file_found)
5474 	{
5475 		StandbyModeRequested = true;
5476 		ArchiveRecoveryRequested = true;
5477 	}
5478 	else if (recovery_signal_file_found)
5479 	{
5480 		StandbyModeRequested = false;
5481 		ArchiveRecoveryRequested = true;
5482 	}
5483 	else
5484 		return;
5485 
5486 	/*
5487 	 * We don't support standby mode in standalone backends; that requires
5488 	 * other processes such as the WAL receiver to be alive.
5489 	 */
5490 	if (StandbyModeRequested && !IsUnderPostmaster)
5491 		ereport(FATAL,
5492 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
5493 				 errmsg("standby mode is not supported by single-user servers")));
5494 }
5495 
/*
 * Cross-check the archive-recovery settings after the signal files have
 * been read: warn/error on missing mandatory parameters, reconcile
 * inconsistent requests, and resolve the recovery target time and
 * timeline.  No-op unless archive recovery was requested.
 */
static void
validateRecoveryParameters(void)
{
	if (!ArchiveRecoveryRequested)
		return;

	/*
	 * Check for compulsory parameters
	 */
	if (StandbyModeRequested)
	{
		/*
		 * A standby with neither a connection string nor a restore command
		 * can still make progress via files dropped into pg_wal, so this is
		 * only a WARNING.
		 */
		if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
			(recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
			ereport(WARNING,
					(errmsg("specified neither primary_conninfo nor restore_command"),
					 errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
	}
	else
	{
		/* Non-standby archive recovery has no other WAL source: hard error. */
		if (recoveryRestoreCommand == NULL ||
			strcmp(recoveryRestoreCommand, "") == 0)
			ereport(FATAL,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("must specify restore_command when standby mode is not enabled")));
	}

	/*
	 * Override any inconsistent requests. Note that this is a change of
	 * behaviour in 9.5; prior to this we simply ignored a request to pause if
	 * hot_standby = off, which was surprising behaviour.
	 */
	if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
		!EnableHotStandby)
		recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;

	/*
	 * Final parsing of recovery_target_time string; see also
	 * check_recovery_target_time().
	 */
	if (recoveryTarget == RECOVERY_TARGET_TIME)
	{
		recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
																	 CStringGetDatum(recovery_target_time_string),
																	 ObjectIdGetDatum(InvalidOid),
																	 Int32GetDatum(-1)));
	}

	/*
	 * If user specified recovery_target_timeline, validate it or compute the
	 * "latest" value.  We can't do this until after we've gotten the restore
	 * command and set InArchiveRecovery, because we need to fetch timeline
	 * history files from the archive.
	 */
	if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
	{
		TimeLineID	rtli = recoveryTargetTLIRequested;

		/* Timeline 1 does not have a history file, all else should */
		if (rtli != 1 && !existsTimeLineHistory(rtli))
			ereport(FATAL,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("recovery target timeline %u does not exist",
							rtli)));
		recoveryTargetTLI = rtli;
	}
	else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
	{
		/* We start the "latest" search from pg_control's timeline */
		recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
	}
	else
	{
		/*
		 * else we just use the recoveryTargetTLI as already read from
		 * ControlFile
		 */
		Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
	}
}
5575 
5576 /*
5577  * Exit archive-recovery state
5578  */
static void
exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
{
	char		xlogfname[MAXFNAMELEN];
	XLogSegNo	endLogSegNo;
	XLogSegNo	startLogSegNo;

	/* we always switch to a new timeline after archive recovery */
	Assert(endTLI != ThisTimeLineID);

	/*
	 * We are no longer in archive recovery state.
	 */
	InArchiveRecovery = false;

	/*
	 * Update min recovery point one last time.
	 */
	UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);

	/*
	 * If the ending log segment is still open, close it (to avoid problems on
	 * Windows with trying to rename or delete an open file).
	 */
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}

	/*
	 * Calculate the last segment on the old timeline, and the first segment
	 * on the new timeline. If the switch happens in the middle of a segment,
	 * they are the same, but if the switch happens exactly at a segment
	 * boundary, startLogSegNo will be endLogSegNo + 1.
	 */
	XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
	XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);

	/*
	 * Initialize the starting WAL segment for the new timeline. If the switch
	 * happens in the middle of a segment, copy data from the last WAL segment
	 * of the old timeline up to the switch point, to the starting WAL segment
	 * on the new timeline.
	 */
	if (endLogSegNo == startLogSegNo)
	{
		/*
		 * Make a copy of the file on the new timeline.
		 *
		 * Writing WAL isn't allowed yet, so there are no locking
		 * considerations. But we should be just as tense as XLogFileInit to
		 * avoid emplacing a bogus file.
		 */
		XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
					 XLogSegmentOffset(endOfLog, wal_segment_size));
	}
	else
	{
		/*
		 * The switch happened at a segment boundary, so just create the next
		 * segment on the new timeline.
		 */
		bool		use_existent = true;
		int			fd;

		fd = XLogFileInit(startLogSegNo, &use_existent, true);

		if (close(fd) != 0)
		{
			/* NOTE: this inner xlogfname intentionally shadows the outer one */
			char		xlogfname[MAXFNAMELEN];
			int			save_errno = errno;

			/* build the file name for the error message, preserving errno */
			XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo,
						 wal_segment_size);
			errno = save_errno;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not close file \"%s\": %m", xlogfname)));
		}
	}

	/*
	 * Let's just make real sure there are not .ready or .done flags posted
	 * for the new segment.
	 */
	XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, wal_segment_size);
	XLogArchiveCleanup(xlogfname);

	/*
	 * Remove the signal files out of the way, so that we don't accidentally
	 * re-enter archive recovery mode in a subsequent crash.
	 */
	if (standby_signal_file_found)
		durable_unlink(STANDBY_SIGNAL_FILE, FATAL);

	if (recovery_signal_file_found)
		durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);

	ereport(LOG,
			(errmsg("archive recovery complete")));
}
5681 
5682 /*
5683  * Extract timestamp from WAL record.
5684  *
5685  * If the record contains a timestamp, returns true, and saves the timestamp
5686  * in *recordXtime. If the record type has no timestamp, returns false.
5687  * Currently, only transaction commit/abort records and restore points contain
5688  * timestamps.
5689  */
5690 static bool
getRecordTimestamp(XLogReaderState * record,TimestampTz * recordXtime)5691 getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
5692 {
5693 	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5694 	uint8		xact_info = info & XLOG_XACT_OPMASK;
5695 	uint8		rmid = XLogRecGetRmid(record);
5696 
5697 	if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5698 	{
5699 		*recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5700 		return true;
5701 	}
5702 	if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
5703 							   xact_info == XLOG_XACT_COMMIT_PREPARED))
5704 	{
5705 		*recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5706 		return true;
5707 	}
5708 	if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
5709 							   xact_info == XLOG_XACT_ABORT_PREPARED))
5710 	{
5711 		*recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5712 		return true;
5713 	}
5714 	return false;
5715 }
5716 
5717 /*
5718  * For point-in-time recovery, this function decides whether we want to
5719  * stop applying the XLOG before the current record.
5720  *
5721  * Returns true if we are stopping, false otherwise. If stopping, some
5722  * information is saved in recoveryStopXid et al for use in annotating the
5723  * new timeline's history file.
5724  */
static bool
recoveryStopsBefore(XLogReaderState *record)
{
	bool		stopsHere = false;
	uint8		xact_info;
	bool		isCommit;
	TimestampTz recordXtime = 0;	/* stays 0 unless a timestamp is found */
	TransactionId recordXid;

	/*
	 * Ignore recovery target settings when not in archive recovery (meaning
	 * we are in crash recovery).
	 */
	if (!ArchiveRecoveryRequested)
		return false;

	/* Check if we should stop as soon as reaching consistency */
	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
	{
		ereport(LOG,
				(errmsg("recovery stopping after reaching consistency")));

		recoveryStopAfter = false;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopTime = 0;
		recoveryStopName[0] = '\0';
		return true;
	}

	/*
	 * Check if target LSN has been reached.  The exclusive
	 * (!recoveryTargetInclusive) case stops *before* applying this record;
	 * the inclusive case is handled in recoveryStopsAfter().
	 */
	if (recoveryTarget == RECOVERY_TARGET_LSN &&
		!recoveryTargetInclusive &&
		record->ReadRecPtr >= recoveryTargetLSN)
	{
		recoveryStopAfter = false;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = record->ReadRecPtr;
		recoveryStopTime = 0;
		recoveryStopName[0] = '\0';
		ereport(LOG,
				(errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
						(uint32) (recoveryStopLSN >> 32),
						(uint32) recoveryStopLSN)));
		return true;
	}

	/* Otherwise we only consider stopping before COMMIT or ABORT records. */
	if (XLogRecGetRmid(record) != RM_XACT_ID)
		return false;

	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;

	/*
	 * Classify the record and extract the ending transaction's XID.  For
	 * the *_PREPARED variants the XID of interest is the prepared
	 * transaction's, which must be parsed out of the record body.
	 */
	if (xact_info == XLOG_XACT_COMMIT)
	{
		isCommit = true;
		recordXid = XLogRecGetXid(record);
	}
	else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
	{
		xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
		xl_xact_parsed_commit parsed;

		isCommit = true;
		ParseCommitRecord(XLogRecGetInfo(record),
						  xlrec,
						  &parsed);
		recordXid = parsed.twophase_xid;
	}
	else if (xact_info == XLOG_XACT_ABORT)
	{
		isCommit = false;
		recordXid = XLogRecGetXid(record);
	}
	else if (xact_info == XLOG_XACT_ABORT_PREPARED)
	{
		xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
		xl_xact_parsed_abort parsed;

		isCommit = false;
		ParseAbortRecord(XLogRecGetInfo(record),
						 xlrec,
						 &parsed);
		recordXid = parsed.twophase_xid;
	}
	else
		return false;

	if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
	{
		/*
		 * There can be only one transaction end record with this exact
		 * transactionid
		 *
		 * when testing for an xid, we MUST test for equality only, since
		 * transactions are numbered in the order they start, not the order
		 * they complete. A higher numbered xid will complete before you about
		 * 50% of the time...
		 */
		stopsHere = (recordXid == recoveryTargetXid);
	}

	if (recoveryTarget == RECOVERY_TARGET_TIME &&
		getRecordTimestamp(record, &recordXtime))
	{
		/*
		 * There can be many transactions that share the same commit time, so
		 * we stop after the last one, if we are inclusive, or stop at the
		 * first one if we are exclusive
		 */
		if (recoveryTargetInclusive)
			stopsHere = (recordXtime > recoveryTargetTime);
		else
			stopsHere = (recordXtime >= recoveryTargetTime);
	}

	if (stopsHere)
	{
		recoveryStopAfter = false;
		recoveryStopXid = recordXid;
		recoveryStopTime = recordXtime;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopName[0] = '\0';

		if (isCommit)
		{
			ereport(LOG,
					(errmsg("recovery stopping before commit of transaction %u, time %s",
							recoveryStopXid,
							timestamptz_to_str(recoveryStopTime))));
		}
		else
		{
			ereport(LOG,
					(errmsg("recovery stopping before abort of transaction %u, time %s",
							recoveryStopXid,
							timestamptz_to_str(recoveryStopTime))));
		}
	}

	return stopsHere;
}
5867 
5868 /*
5869  * Same as recoveryStopsBefore, but called after applying the record.
5870  *
5871  * We also track the timestamp of the latest applied COMMIT/ABORT
5872  * record in XLogCtl->recoveryLastXTime.
5873  */
static bool
recoveryStopsAfter(XLogReaderState *record)
{
	uint8		info;
	uint8		xact_info;
	uint8		rmid;
	TimestampTz recordXtime;

	/*
	 * Ignore recovery target settings when not in archive recovery (meaning
	 * we are in crash recovery).
	 */
	if (!ArchiveRecoveryRequested)
		return false;

	info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
	rmid = XLogRecGetRmid(record);

	/*
	 * There can be many restore points that share the same name; we stop at
	 * the first one.
	 */
	if (recoveryTarget == RECOVERY_TARGET_NAME &&
		rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
	{
		xl_restore_point *recordRestorePointData;

		recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);

		if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
		{
			recoveryStopAfter = true;
			recoveryStopXid = InvalidTransactionId;
			recoveryStopLSN = InvalidXLogRecPtr;
			/* restore-point records always carry a timestamp */
			(void) getRecordTimestamp(record, &recoveryStopTime);
			strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);

			ereport(LOG,
					(errmsg("recovery stopping at restore point \"%s\", time %s",
							recoveryStopName,
							timestamptz_to_str(recoveryStopTime))));
			return true;
		}
	}

	/*
	 * Check if the target LSN has been reached; the inclusive case stops
	 * *after* replaying this record (the exclusive case is handled in
	 * recoveryStopsBefore()).
	 */
	if (recoveryTarget == RECOVERY_TARGET_LSN &&
		recoveryTargetInclusive &&
		record->ReadRecPtr >= recoveryTargetLSN)
	{
		recoveryStopAfter = true;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = record->ReadRecPtr;
		recoveryStopTime = 0;
		recoveryStopName[0] = '\0';
		ereport(LOG,
				(errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
						(uint32) (recoveryStopLSN >> 32),
						(uint32) recoveryStopLSN)));
		return true;
	}

	if (rmid != RM_XACT_ID)
		return false;

	xact_info = info & XLOG_XACT_OPMASK;

	if (xact_info == XLOG_XACT_COMMIT ||
		xact_info == XLOG_XACT_COMMIT_PREPARED ||
		xact_info == XLOG_XACT_ABORT ||
		xact_info == XLOG_XACT_ABORT_PREPARED)
	{
		TransactionId recordXid;

		/*
		 * Update the last applied transaction timestamp.  For these four
		 * record types getRecordTimestamp() always succeeds, so recordXtime
		 * is set before any use below.
		 */
		if (getRecordTimestamp(record, &recordXtime))
			SetLatestXTime(recordXtime);

		/* Extract the XID of the committed/aborted transaction */
		if (xact_info == XLOG_XACT_COMMIT_PREPARED)
		{
			xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
			xl_xact_parsed_commit parsed;

			ParseCommitRecord(XLogRecGetInfo(record),
							  xlrec,
							  &parsed);
			recordXid = parsed.twophase_xid;
		}
		else if (xact_info == XLOG_XACT_ABORT_PREPARED)
		{
			xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
			xl_xact_parsed_abort parsed;

			ParseAbortRecord(XLogRecGetInfo(record),
							 xlrec,
							 &parsed);
			recordXid = parsed.twophase_xid;
		}
		else
			recordXid = XLogRecGetXid(record);

		/*
		 * There can be only one transaction end record with this exact
		 * transactionid
		 *
		 * when testing for an xid, we MUST test for equality only, since
		 * transactions are numbered in the order they start, not the order
		 * they complete. A higher numbered xid will complete before you about
		 * 50% of the time...
		 */
		if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
			recordXid == recoveryTargetXid)
		{
			recoveryStopAfter = true;
			recoveryStopXid = recordXid;
			recoveryStopTime = recordXtime;
			recoveryStopLSN = InvalidXLogRecPtr;
			recoveryStopName[0] = '\0';

			if (xact_info == XLOG_XACT_COMMIT ||
				xact_info == XLOG_XACT_COMMIT_PREPARED)
			{
				ereport(LOG,
						(errmsg("recovery stopping after commit of transaction %u, time %s",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
			}
			else if (xact_info == XLOG_XACT_ABORT ||
					 xact_info == XLOG_XACT_ABORT_PREPARED)
			{
				ereport(LOG,
						(errmsg("recovery stopping after abort of transaction %u, time %s",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
			}
			return true;
		}
	}

	/* Check if we should stop as soon as reaching consistency */
	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
	{
		ereport(LOG,
				(errmsg("recovery stopping after reaching consistency")));

		recoveryStopAfter = true;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopTime = 0;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopName[0] = '\0';
		return true;
	}

	return false;
}
6030 
6031 /*
6032  * Wait until shared recoveryPause flag is cleared.
6033  *
6034  * endOfRecovery is true if the recovery target is reached and
6035  * the paused state starts at the end of recovery because of
6036  * recovery_target_action=pause, and false otherwise.
6037  *
6038  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
6039  * Probably not worth the trouble though.  This state shouldn't be one that
6040  * anyone cares about server power consumption in.
6041  */
6042 static void
recoveryPausesHere(bool endOfRecovery)6043 recoveryPausesHere(bool endOfRecovery)
6044 {
6045 	/* Don't pause unless users can connect! */
6046 	if (!LocalHotStandbyActive)
6047 		return;
6048 
6049 	/* Don't pause after standby promotion has been triggered */
6050 	if (LocalPromoteIsTriggered)
6051 		return;
6052 
6053 	if (endOfRecovery)
6054 		ereport(LOG,
6055 				(errmsg("pausing at the end of recovery"),
6056 				 errhint("Execute pg_wal_replay_resume() to promote.")));
6057 	else
6058 		ereport(LOG,
6059 				(errmsg("recovery has paused"),
6060 				 errhint("Execute pg_wal_replay_resume() to continue.")));
6061 
6062 	while (RecoveryIsPaused())
6063 	{
6064 		HandleStartupProcInterrupts();
6065 		if (CheckForStandbyTrigger())
6066 			return;
6067 		pgstat_report_wait_start(WAIT_EVENT_RECOVERY_PAUSE);
6068 		pg_usleep(1000000L);	/* 1000 ms */
6069 		pgstat_report_wait_end();
6070 	}
6071 }
6072 
6073 bool
RecoveryIsPaused(void)6074 RecoveryIsPaused(void)
6075 {
6076 	bool		recoveryPause;
6077 
6078 	SpinLockAcquire(&XLogCtl->info_lck);
6079 	recoveryPause = XLogCtl->recoveryPause;
6080 	SpinLockRelease(&XLogCtl->info_lck);
6081 
6082 	return recoveryPause;
6083 }
6084 
/*
 * Set or clear the shared recoveryPause flag.
 *
 * The flag is polled by the startup process (see recoveryPausesHere), so
 * it must be updated under XLogCtl->info_lck.
 */
void
SetRecoveryPause(bool recoveryPause)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->recoveryPause = recoveryPause;
	SpinLockRelease(&XLogCtl->info_lck);
}
6092 
/*
 * When recovery_min_apply_delay is set, we wait long enough to make sure
 * certain record types are applied at least that interval behind the master.
 *
 * Returns true if we waited.
 *
 * Note that the delay is calculated between the WAL record log time and
 * the current time on standby. We would prefer to keep track of when this
 * standby received each WAL record, which would allow a more consistent
 * approach and one not affected by time synchronisation issues, but that
 * is significantly more effort and complexity for little actual gain in
 * usability.
 */
static bool
recoveryApplyDelay(XLogReaderState *record)
{
	uint8		xact_info;
	TimestampTz xtime;			/* record's own timestamp */
	TimestampTz delayUntil;		/* xtime + recovery_min_apply_delay */
	long		msecs;

	/* nothing to do if no delay configured */
	if (recovery_min_apply_delay <= 0)
		return false;

	/* no delay is applied on a database not yet consistent */
	if (!reachedConsistency)
		return false;

	/* nothing to do if crash recovery is requested */
	if (!ArchiveRecoveryRequested)
		return false;

	/*
	 * Is it a COMMIT record?
	 *
	 * We deliberately choose not to delay aborts since they have no effect on
	 * MVCC. We already allow replay of records that don't have a timestamp,
	 * so there is already opportunity for issues caused by early conflicts on
	 * standbys.
	 */
	if (XLogRecGetRmid(record) != RM_XACT_ID)
		return false;

	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;

	if (xact_info != XLOG_XACT_COMMIT &&
		xact_info != XLOG_XACT_COMMIT_PREPARED)
		return false;

	/* records without an extractable timestamp are replayed immediately */
	if (!getRecordTimestamp(record, &xtime))
		return false;

	delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);

	/*
	 * Exit without arming the latch if it's already past time to apply this
	 * record
	 */
	msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
	if (msecs <= 0)
		return false;

	while (true)
	{
		/* Reset first, so a wakeup arriving below isn't lost */
		ResetLatch(&XLogCtl->recoveryWakeupLatch);

		/*
		 * This might change recovery_min_apply_delay or the trigger file's
		 * location.
		 */
		HandleStartupProcInterrupts();

		/* promotion request cuts the delay short */
		if (CheckForStandbyTrigger())
			break;

		/*
		 * Recalculate delayUntil as recovery_min_apply_delay could have
		 * changed while waiting in this loop.
		 */
		delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);

		/*
		 * Wait for difference between GetCurrentTimestamp() and delayUntil.
		 */
		msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
												delayUntil);

		if (msecs <= 0)
			break;

		elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);

		/* sleep until the deadline, a latch wakeup, or postmaster death */
		(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
						 msecs,
						 WAIT_EVENT_RECOVERY_APPLY_DELAY);
	}
	return true;
}
6193 
/*
 * Save timestamp of latest processed commit/abort record.
 *
 * We keep this in XLogCtl, not a simple static variable, so that it can be
 * seen by processes other than the startup process.  Note in particular
 * that CreateRestartPoint is executed in the checkpointer.
 */
static void
SetLatestXTime(TimestampTz xtime)
{
	/* Shared field, so update under the info_lck spinlock. */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->recoveryLastXTime = xtime;
	SpinLockRelease(&XLogCtl->info_lck);
}
6208 
6209 /*
6210  * Fetch timestamp of latest processed commit/abort record.
6211  */
6212 TimestampTz
GetLatestXTime(void)6213 GetLatestXTime(void)
6214 {
6215 	TimestampTz xtime;
6216 
6217 	SpinLockAcquire(&XLogCtl->info_lck);
6218 	xtime = XLogCtl->recoveryLastXTime;
6219 	SpinLockRelease(&XLogCtl->info_lck);
6220 
6221 	return xtime;
6222 }
6223 
/*
 * Save timestamp of the next chunk of WAL records to apply.
 *
 * We keep this in XLogCtl, not a simple static variable, so that it can be
 * seen by all backends.
 */
static void
SetCurrentChunkStartTime(TimestampTz xtime)
{
	/* Shared field, so update under the info_lck spinlock. */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->currentChunkStartTime = xtime;
	SpinLockRelease(&XLogCtl->info_lck);
}
6237 
/*
 * Fetch timestamp of the current chunk of WAL records being applied, as
 * stored by SetCurrentChunkStartTime().
 *
 * (The previous header comment here described the latest commit/abort
 * timestamp; that belongs to GetLatestXTime, not this function.)
 */
TimestampTz
GetCurrentChunkReplayStartTime(void)
{
	TimestampTz xtime;

	SpinLockAcquire(&XLogCtl->info_lck);
	xtime = XLogCtl->currentChunkStartTime;
	SpinLockRelease(&XLogCtl->info_lck);

	return xtime;
}
6253 
6254 /*
6255  * Returns time of receipt of current chunk of XLOG data, as well as
6256  * whether it was received from streaming replication or from archives.
6257  */
6258 void
GetXLogReceiptTime(TimestampTz * rtime,bool * fromStream)6259 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
6260 {
6261 	/*
6262 	 * This must be executed in the startup process, since we don't export the
6263 	 * relevant state to shared memory.
6264 	 */
6265 	Assert(InRecovery);
6266 
6267 	*rtime = XLogReceiptTime;
6268 	*fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
6269 }
6270 
/*
 * Error out if currValue is below the minimum the primary was run with.
 *
 * Note that text field supplied is a parameter name and does not require
 * translation
 *
 * Beware: currValue and minValue are evaluated more than once, so callers
 * should pass only simple variables or fields (all current callers do).
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((currValue) < (minValue)) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because %s = %d is a lower setting than on the master server (its value was %d)", \
						param_name, \
						currValue, \
						minValue))); \
} while(0)
6285 
6286 /*
6287  * Check to see if required parameters are set high enough on this server
6288  * for various aspects of recovery operation.
6289  *
6290  * Note that all the parameters which this function tests need to be
6291  * listed in Administrator's Overview section in high-availability.sgml.
6292  * If you change them, don't forget to update the list.
6293  */
6294 static void
CheckRequiredParameterValues(void)6295 CheckRequiredParameterValues(void)
6296 {
6297 	/*
6298 	 * For archive recovery, the WAL must be generated with at least 'replica'
6299 	 * wal_level.
6300 	 */
6301 	if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
6302 	{
6303 		ereport(WARNING,
6304 				(errmsg("WAL was generated with wal_level=minimal, data may be missing"),
6305 				 errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
6306 	}
6307 
6308 	/*
6309 	 * For Hot Standby, the WAL must be generated with 'replica' mode, and we
6310 	 * must have at least as many backend slots as the primary.
6311 	 */
6312 	if (ArchiveRecoveryRequested && EnableHotStandby)
6313 	{
6314 		if (ControlFile->wal_level < WAL_LEVEL_REPLICA)
6315 			ereport(ERROR,
6316 					(errmsg("hot standby is not possible because wal_level was not set to \"replica\" or higher on the master server"),
6317 					 errhint("Either set wal_level to \"replica\" on the master, or turn off hot_standby here.")));
6318 
6319 		/* We ignore autovacuum_max_workers when we make this test. */
6320 		RecoveryRequiresIntParameter("max_connections",
6321 									 MaxConnections,
6322 									 ControlFile->MaxConnections);
6323 		RecoveryRequiresIntParameter("max_worker_processes",
6324 									 max_worker_processes,
6325 									 ControlFile->max_worker_processes);
6326 		RecoveryRequiresIntParameter("max_wal_senders",
6327 									 max_wal_senders,
6328 									 ControlFile->max_wal_senders);
6329 		RecoveryRequiresIntParameter("max_prepared_transactions",
6330 									 max_prepared_xacts,
6331 									 ControlFile->max_prepared_xacts);
6332 		RecoveryRequiresIntParameter("max_locks_per_transaction",
6333 									 max_locks_per_xact,
6334 									 ControlFile->max_locks_per_xact);
6335 	}
6336 }
6337 
6338 /*
6339  * This must be called ONCE during postmaster or standalone-backend startup
6340  */
6341 void
StartupXLOG(void)6342 StartupXLOG(void)
6343 {
6344 	XLogCtlInsert *Insert;
6345 	CheckPoint	checkPoint;
6346 	bool		wasShutdown;
6347 	bool		reachedRecoveryTarget = false;
6348 	bool		haveBackupLabel = false;
6349 	bool		haveTblspcMap = false;
6350 	XLogRecPtr	RecPtr,
6351 				checkPointLoc,
6352 				EndOfLog;
6353 	TimeLineID	EndOfLogTLI;
6354 	TimeLineID	PrevTimeLineID;
6355 	XLogRecord *record;
6356 	TransactionId oldestActiveXID;
6357 	bool		backupEndRequired = false;
6358 	bool		backupFromStandby = false;
6359 	DBState		dbstate_at_startup;
6360 	XLogReaderState *xlogreader;
6361 	XLogPageReadPrivate private;
6362 	bool		fast_promoted = false;
6363 	struct stat st;
6364 
6365 	/*
6366 	 * We should have an aux process resource owner to use, and we should not
6367 	 * be in a transaction that's installed some other resowner.
6368 	 */
6369 	Assert(AuxProcessResourceOwner != NULL);
6370 	Assert(CurrentResourceOwner == NULL ||
6371 		   CurrentResourceOwner == AuxProcessResourceOwner);
6372 	CurrentResourceOwner = AuxProcessResourceOwner;
6373 
6374 	/*
6375 	 * Check that contents look valid.
6376 	 */
6377 	if (!XRecOffIsValid(ControlFile->checkPoint))
6378 		ereport(FATAL,
6379 				(errmsg("control file contains invalid checkpoint location")));
6380 
6381 	switch (ControlFile->state)
6382 	{
6383 		case DB_SHUTDOWNED:
6384 
6385 			/*
6386 			 * This is the expected case, so don't be chatty in standalone
6387 			 * mode
6388 			 */
6389 			ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6390 					(errmsg("database system was shut down at %s",
6391 							str_time(ControlFile->time))));
6392 			break;
6393 
6394 		case DB_SHUTDOWNED_IN_RECOVERY:
6395 			ereport(LOG,
6396 					(errmsg("database system was shut down in recovery at %s",
6397 							str_time(ControlFile->time))));
6398 			break;
6399 
6400 		case DB_SHUTDOWNING:
6401 			ereport(LOG,
6402 					(errmsg("database system shutdown was interrupted; last known up at %s",
6403 							str_time(ControlFile->time))));
6404 			break;
6405 
6406 		case DB_IN_CRASH_RECOVERY:
6407 			ereport(LOG,
6408 					(errmsg("database system was interrupted while in recovery at %s",
6409 							str_time(ControlFile->time)),
6410 					 errhint("This probably means that some data is corrupted and"
6411 							 " you will have to use the last backup for recovery.")));
6412 			break;
6413 
6414 		case DB_IN_ARCHIVE_RECOVERY:
6415 			ereport(LOG,
6416 					(errmsg("database system was interrupted while in recovery at log time %s",
6417 							str_time(ControlFile->checkPointCopy.time)),
6418 					 errhint("If this has occurred more than once some data might be corrupted"
6419 							 " and you might need to choose an earlier recovery target.")));
6420 			break;
6421 
6422 		case DB_IN_PRODUCTION:
6423 			ereport(LOG,
6424 					(errmsg("database system was interrupted; last known up at %s",
6425 							str_time(ControlFile->time))));
6426 			break;
6427 
6428 		default:
6429 			ereport(FATAL,
6430 					(errmsg("control file contains invalid database cluster state")));
6431 	}
6432 
6433 	/* This is just to allow attaching to startup process with a debugger */
6434 #ifdef XLOG_REPLAY_DELAY
6435 	if (ControlFile->state != DB_SHUTDOWNED)
6436 		pg_usleep(60000000L);
6437 #endif
6438 
6439 	/*
6440 	 * Verify that pg_wal and pg_wal/archive_status exist.  In cases where
6441 	 * someone has performed a copy for PITR, these directories may have been
6442 	 * excluded and need to be re-created.
6443 	 */
6444 	ValidateXLOGDirectoryStructure();
6445 
6446 	/*----------
6447 	 * If we previously crashed, perform a couple of actions:
6448 	 *
6449 	 * - The pg_wal directory may still include some temporary WAL segments
6450 	 *   used when creating a new segment, so perform some clean up to not
6451 	 *   bloat this path.  This is done first as there is no point to sync
6452 	 *   this temporary data.
6453 	 *
6454 	 * - There might be data which we had written, intending to fsync it, but
6455 	 *   which we had not actually fsync'd yet.  Therefore, a power failure in
6456 	 *   the near future might cause earlier unflushed writes to be lost, even
6457 	 *   though more recent data written to disk from here on would be
6458 	 *   persisted.  To avoid that, fsync the entire data directory.
6459 	 */
6460 	if (ControlFile->state != DB_SHUTDOWNED &&
6461 		ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
6462 	{
6463 		RemoveTempXlogFiles();
6464 		SyncDataDirectory();
6465 	}
6466 
6467 	/*
6468 	 * Initialize on the assumption we want to recover to the latest timeline
6469 	 * that's active according to pg_control.
6470 	 */
6471 	if (ControlFile->minRecoveryPointTLI >
6472 		ControlFile->checkPointCopy.ThisTimeLineID)
6473 		recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6474 	else
6475 		recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6476 
6477 	/*
6478 	 * Check for signal files, and if so set up state for offline recovery
6479 	 */
6480 	readRecoverySignalFile();
6481 	validateRecoveryParameters();
6482 
6483 	if (ArchiveRecoveryRequested)
6484 	{
6485 		if (StandbyModeRequested)
6486 			ereport(LOG,
6487 					(errmsg("entering standby mode")));
6488 		else if (recoveryTarget == RECOVERY_TARGET_XID)
6489 			ereport(LOG,
6490 					(errmsg("starting point-in-time recovery to XID %u",
6491 							recoveryTargetXid)));
6492 		else if (recoveryTarget == RECOVERY_TARGET_TIME)
6493 			ereport(LOG,
6494 					(errmsg("starting point-in-time recovery to %s",
6495 							timestamptz_to_str(recoveryTargetTime))));
6496 		else if (recoveryTarget == RECOVERY_TARGET_NAME)
6497 			ereport(LOG,
6498 					(errmsg("starting point-in-time recovery to \"%s\"",
6499 							recoveryTargetName)));
6500 		else if (recoveryTarget == RECOVERY_TARGET_LSN)
6501 			ereport(LOG,
6502 					(errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
6503 							(uint32) (recoveryTargetLSN >> 32),
6504 							(uint32) recoveryTargetLSN)));
6505 		else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6506 			ereport(LOG,
6507 					(errmsg("starting point-in-time recovery to earliest consistent point")));
6508 		else
6509 			ereport(LOG,
6510 					(errmsg("starting archive recovery")));
6511 	}
6512 
6513 	/*
6514 	 * Take ownership of the wakeup latch if we're going to sleep during
6515 	 * recovery.
6516 	 */
6517 	if (ArchiveRecoveryRequested)
6518 		OwnLatch(&XLogCtl->recoveryWakeupLatch);
6519 
6520 	/* Set up XLOG reader facility */
6521 	MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6522 	xlogreader =
6523 		XLogReaderAllocate(wal_segment_size, NULL,
6524 						   XL_ROUTINE(.page_read = &XLogPageRead,
6525 									  .segment_open = NULL,
6526 									  .segment_close = wal_segment_close),
6527 						   &private);
6528 	if (!xlogreader)
6529 		ereport(ERROR,
6530 				(errcode(ERRCODE_OUT_OF_MEMORY),
6531 				 errmsg("out of memory"),
6532 				 errdetail("Failed while allocating a WAL reading processor.")));
6533 	xlogreader->system_identifier = ControlFile->system_identifier;
6534 
6535 	/*
6536 	 * Allocate two page buffers dedicated to WAL consistency checks.  We do
6537 	 * it this way, rather than just making static arrays, for two reasons:
6538 	 * (1) no need to waste the storage in most instantiations of the backend;
6539 	 * (2) a static char array isn't guaranteed to have any particular
6540 	 * alignment, whereas palloc() will provide MAXALIGN'd storage.
6541 	 */
6542 	replay_image_masked = (char *) palloc(BLCKSZ);
6543 	master_image_masked = (char *) palloc(BLCKSZ);
6544 
6545 	if (read_backup_label(&checkPointLoc, &backupEndRequired,
6546 						  &backupFromStandby))
6547 	{
6548 		List	   *tablespaces = NIL;
6549 
6550 		/*
6551 		 * Archive recovery was requested, and thanks to the backup label
6552 		 * file, we know how far we need to replay to reach consistency. Enter
6553 		 * archive recovery directly.
6554 		 */
6555 		InArchiveRecovery = true;
6556 		if (StandbyModeRequested)
6557 			StandbyMode = true;
6558 
6559 		/*
6560 		 * When a backup_label file is present, we want to roll forward from
6561 		 * the checkpoint it identifies, rather than using pg_control.
6562 		 */
6563 		record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6564 		if (record != NULL)
6565 		{
6566 			memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6567 			wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6568 			ereport(DEBUG1,
6569 					(errmsg("checkpoint record is at %X/%X",
6570 							(uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6571 			InRecovery = true;	/* force recovery even if SHUTDOWNED */
6572 
6573 			/*
6574 			 * Make sure that REDO location exists. This may not be the case
6575 			 * if there was a crash during an online backup, which left a
6576 			 * backup_label around that references a WAL segment that's
6577 			 * already been archived.
6578 			 */
6579 			if (checkPoint.redo < checkPointLoc)
6580 			{
6581 				XLogBeginRead(xlogreader, checkPoint.redo);
6582 				if (!ReadRecord(xlogreader, LOG, false))
6583 					ereport(FATAL,
6584 							(errmsg("could not find redo location referenced by checkpoint record"),
6585 							 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
6586 									 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
6587 									 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
6588 									 DataDir, DataDir, DataDir)));
6589 			}
6590 		}
6591 		else
6592 		{
6593 			ereport(FATAL,
6594 					(errmsg("could not locate required checkpoint record"),
6595 					 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
6596 							 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
6597 							 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
6598 							 DataDir, DataDir, DataDir)));
6599 			wasShutdown = false;	/* keep compiler quiet */
6600 		}
6601 
6602 		/* read the tablespace_map file if present and create symlinks. */
6603 		if (read_tablespace_map(&tablespaces))
6604 		{
6605 			ListCell   *lc;
6606 
6607 			foreach(lc, tablespaces)
6608 			{
6609 				tablespaceinfo *ti = lfirst(lc);
6610 				char	   *linkloc;
6611 
6612 				linkloc = psprintf("pg_tblspc/%s", ti->oid);
6613 
6614 				/*
6615 				 * Remove the existing symlink if any and Create the symlink
6616 				 * under PGDATA.
6617 				 */
6618 				remove_tablespace_symlink(linkloc);
6619 
6620 				if (symlink(ti->path, linkloc) < 0)
6621 					ereport(ERROR,
6622 							(errcode_for_file_access(),
6623 							 errmsg("could not create symbolic link \"%s\": %m",
6624 									linkloc)));
6625 
6626 				pfree(ti->oid);
6627 				pfree(ti->path);
6628 				pfree(ti);
6629 			}
6630 
6631 			/* set flag to delete it later */
6632 			haveTblspcMap = true;
6633 		}
6634 
6635 		/* set flag to delete it later */
6636 		haveBackupLabel = true;
6637 	}
6638 	else
6639 	{
6640 		/*
6641 		 * If tablespace_map file is present without backup_label file, there
6642 		 * is no use of such file.  There is no harm in retaining it, but it
6643 		 * is better to get rid of the map file so that we don't have any
6644 		 * redundant file in data directory and it will avoid any sort of
6645 		 * confusion.  It seems prudent though to just rename the file out of
6646 		 * the way rather than delete it completely, also we ignore any error
6647 		 * that occurs in rename operation as even if map file is present
6648 		 * without backup_label file, it is harmless.
6649 		 */
6650 		if (stat(TABLESPACE_MAP, &st) == 0)
6651 		{
6652 			unlink(TABLESPACE_MAP_OLD);
6653 			if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
6654 				ereport(LOG,
6655 						(errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6656 								TABLESPACE_MAP, BACKUP_LABEL_FILE),
6657 						 errdetail("File \"%s\" was renamed to \"%s\".",
6658 								   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6659 			else
6660 				ereport(LOG,
6661 						(errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6662 								TABLESPACE_MAP, BACKUP_LABEL_FILE),
6663 						 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
6664 								   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6665 		}
6666 
6667 		/*
6668 		 * It's possible that archive recovery was requested, but we don't
6669 		 * know how far we need to replay the WAL before we reach consistency.
6670 		 * This can happen for example if a base backup is taken from a
6671 		 * running server using an atomic filesystem snapshot, without calling
6672 		 * pg_start/stop_backup. Or if you just kill a running master server
6673 		 * and put it into archive recovery by creating a recovery signal
6674 		 * file.
6675 		 *
6676 		 * Our strategy in that case is to perform crash recovery first,
6677 		 * replaying all the WAL present in pg_wal, and only enter archive
6678 		 * recovery after that.
6679 		 *
6680 		 * But usually we already know how far we need to replay the WAL (up
6681 		 * to minRecoveryPoint, up to backupEndPoint, or until we see an
6682 		 * end-of-backup record), and we can enter archive recovery directly.
6683 		 */
6684 		if (ArchiveRecoveryRequested &&
6685 			(ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6686 			 ControlFile->backupEndRequired ||
6687 			 ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6688 			 ControlFile->state == DB_SHUTDOWNED))
6689 		{
6690 			InArchiveRecovery = true;
6691 			if (StandbyModeRequested)
6692 				StandbyMode = true;
6693 		}
6694 
6695 		/* Get the last valid checkpoint record. */
6696 		checkPointLoc = ControlFile->checkPoint;
6697 		RedoStartLSN = ControlFile->checkPointCopy.redo;
6698 		record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6699 		if (record != NULL)
6700 		{
6701 			ereport(DEBUG1,
6702 					(errmsg("checkpoint record is at %X/%X",
6703 							(uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6704 		}
6705 		else
6706 		{
6707 			/*
6708 			 * We used to attempt to go back to a secondary checkpoint record
6709 			 * here, but only when not in standby mode. We now just fail if we
6710 			 * can't read the last checkpoint because this allows us to
6711 			 * simplify processing around checkpoints.
6712 			 */
6713 			ereport(PANIC,
6714 					(errmsg("could not locate a valid checkpoint record")));
6715 		}
6716 		memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6717 		wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6718 	}
6719 
6720 	/*
6721 	 * Clear out any old relcache cache files.  This is *necessary* if we do
6722 	 * any WAL replay, since that would probably result in the cache files
6723 	 * being out of sync with database reality.  In theory we could leave them
6724 	 * in place if the database had been cleanly shut down, but it seems
6725 	 * safest to just remove them always and let them be rebuilt during the
6726 	 * first backend startup.  These files needs to be removed from all
6727 	 * directories including pg_tblspc, however the symlinks are created only
6728 	 * after reading tablespace_map file in case of archive recovery from
6729 	 * backup, so needs to clear old relcache files here after creating
6730 	 * symlinks.
6731 	 */
6732 	RelationCacheInitFileRemove();
6733 
6734 	/*
6735 	 * If the location of the checkpoint record is not on the expected
6736 	 * timeline in the history of the requested timeline, we cannot proceed:
6737 	 * the backup is not part of the history of the requested timeline.
6738 	 */
6739 	Assert(expectedTLEs);		/* was initialized by reading checkpoint
6740 								 * record */
6741 	if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6742 		checkPoint.ThisTimeLineID)
6743 	{
6744 		XLogRecPtr	switchpoint;
6745 
6746 		/*
6747 		 * tliSwitchPoint will throw an error if the checkpoint's timeline is
6748 		 * not in expectedTLEs at all.
6749 		 */
6750 		switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6751 		ereport(FATAL,
6752 				(errmsg("requested timeline %u is not a child of this server's history",
6753 						recoveryTargetTLI),
6754 				 errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6755 						   (uint32) (ControlFile->checkPoint >> 32),
6756 						   (uint32) ControlFile->checkPoint,
6757 						   ControlFile->checkPointCopy.ThisTimeLineID,
6758 						   (uint32) (switchpoint >> 32),
6759 						   (uint32) switchpoint)));
6760 	}
6761 
6762 	/*
6763 	 * The min recovery point should be part of the requested timeline's
6764 	 * history, too.
6765 	 */
6766 	if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6767 		tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6768 		ControlFile->minRecoveryPointTLI)
6769 		ereport(FATAL,
6770 				(errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6771 						recoveryTargetTLI,
6772 						(uint32) (ControlFile->minRecoveryPoint >> 32),
6773 						(uint32) ControlFile->minRecoveryPoint,
6774 						ControlFile->minRecoveryPointTLI)));
6775 
6776 	LastRec = RecPtr = checkPointLoc;
6777 
6778 	ereport(DEBUG1,
6779 			(errmsg_internal("redo record is at %X/%X; shutdown %s",
6780 							 (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6781 							 wasShutdown ? "true" : "false")));
6782 	ereport(DEBUG1,
6783 			(errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
6784 							 U64FromFullTransactionId(checkPoint.nextFullXid),
6785 							 checkPoint.nextOid)));
6786 	ereport(DEBUG1,
6787 			(errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
6788 							 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6789 	ereport(DEBUG1,
6790 			(errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
6791 							 checkPoint.oldestXid, checkPoint.oldestXidDB)));
6792 	ereport(DEBUG1,
6793 			(errmsg_internal("oldest MultiXactId: %u, in database %u",
6794 							 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6795 	ereport(DEBUG1,
6796 			(errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
6797 							 checkPoint.oldestCommitTsXid,
6798 							 checkPoint.newestCommitTsXid)));
6799 	if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextFullXid)))
6800 		ereport(PANIC,
6801 				(errmsg("invalid next transaction ID")));
6802 
6803 	/* initialize shared memory variables from the checkpoint record */
6804 	ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
6805 	ShmemVariableCache->nextOid = checkPoint.nextOid;
6806 	ShmemVariableCache->oidCount = 0;
6807 	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6808 	AdvanceOldestClogXid(checkPoint.oldestXid);
6809 	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6810 	SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
6811 	SetCommitTsLimit(checkPoint.oldestCommitTsXid,
6812 					 checkPoint.newestCommitTsXid);
6813 	XLogCtl->ckptFullXid = checkPoint.nextFullXid;
6814 
6815 	/*
6816 	 * Initialize replication slots, before there's a chance to remove
6817 	 * required resources.
6818 	 */
6819 	StartupReplicationSlots();
6820 
6821 	/*
6822 	 * Startup logical state, needs to be setup now so we have proper data
6823 	 * during crash recovery.
6824 	 */
6825 	StartupReorderBuffer();
6826 
6827 	/*
6828 	 * Startup MultiXact. We need to do this early to be able to replay
6829 	 * truncations.
6830 	 */
6831 	StartupMultiXact();
6832 
6833 	/*
6834 	 * Ditto for commit timestamps.  Activate the facility if the setting is
6835 	 * enabled in the control file, as there should be no tracking of commit
6836 	 * timestamps done when the setting was disabled.  This facility can be
6837 	 * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
6838 	 */
6839 	if (ControlFile->track_commit_timestamp)
6840 		StartupCommitTs();
6841 
6842 	/*
6843 	 * Recover knowledge about replay progress of known replication partners.
6844 	 */
6845 	StartupReplicationOrigin();
6846 
6847 	/*
6848 	 * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6849 	 * control file. On recovery, all unlogged relations are blown away, so
6850 	 * the unlogged LSN counter can be reset too.
6851 	 */
6852 	if (ControlFile->state == DB_SHUTDOWNED)
6853 		XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6854 	else
6855 		XLogCtl->unloggedLSN = FirstNormalUnloggedLSN;
6856 
6857 	/*
6858 	 * We must replay WAL entries using the same TimeLineID they were created
6859 	 * under, so temporarily adopt the TLI indicated by the checkpoint (see
6860 	 * also xlog_redo()).
6861 	 */
6862 	ThisTimeLineID = checkPoint.ThisTimeLineID;
6863 
6864 	/*
6865 	 * Copy any missing timeline history files between 'now' and the recovery
6866 	 * target timeline from archive to pg_wal. While we don't need those files
6867 	 * ourselves - the history file of the recovery target timeline covers all
6868 	 * the previous timelines in the history too - a cascading standby server
6869 	 * might be interested in them. Or, if you archive the WAL from this
6870 	 * server to a different archive than the master, it'd be good for all the
6871 	 * history files to get archived there after failover, so that you can use
6872 	 * one of the old timelines as a PITR target. Timeline history files are
6873 	 * small, so it's better to copy them unnecessarily than not copy them and
6874 	 * regret later.
6875 	 */
6876 	restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6877 
6878 	/*
6879 	 * Before running in recovery, scan pg_twophase and fill in its status to
6880 	 * be able to work on entries generated by redo.  Doing a scan before
6881 	 * taking any recovery action has the merit to discard any 2PC files that
6882 	 * are newer than the first record to replay, saving from any conflicts at
6883 	 * replay.  This avoids as well any subsequent scans when doing recovery
6884 	 * of the on-disk two-phase data.
6885 	 */
6886 	restoreTwoPhaseData();
6887 
6888 	lastFullPageWrites = checkPoint.fullPageWrites;
6889 
6890 	RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6891 	doPageWrites = lastFullPageWrites;
6892 
6893 	if (RecPtr < checkPoint.redo)
6894 		ereport(PANIC,
6895 				(errmsg("invalid redo in checkpoint record")));
6896 
6897 	/*
6898 	 * Check whether we need to force recovery from WAL.  If it appears to
6899 	 * have been a clean shutdown and we did not have a recovery signal file,
6900 	 * then assume no recovery needed.
6901 	 */
6902 	if (checkPoint.redo < RecPtr)
6903 	{
6904 		if (wasShutdown)
6905 			ereport(PANIC,
6906 					(errmsg("invalid redo record in shutdown checkpoint")));
6907 		InRecovery = true;
6908 	}
6909 	else if (ControlFile->state != DB_SHUTDOWNED)
6910 		InRecovery = true;
6911 	else if (ArchiveRecoveryRequested)
6912 	{
6913 		/* force recovery due to presence of recovery signal file */
6914 		InRecovery = true;
6915 	}
6916 
6917 	/*
6918 	 * Start recovery assuming that the final record isn't lost.
6919 	 */
6920 	abortedRecPtr = InvalidXLogRecPtr;
6921 	missingContrecPtr = InvalidXLogRecPtr;
6922 
6923 	/* REDO */
6924 	if (InRecovery)
6925 	{
6926 		int			rmid;
6927 
6928 		/*
6929 		 * Update pg_control to show that we are recovering and to show the
6930 		 * selected checkpoint as the place we are starting from. We also mark
6931 		 * pg_control with any minimum recovery stop point obtained from a
6932 		 * backup history file.
6933 		 */
6934 		dbstate_at_startup = ControlFile->state;
6935 		if (InArchiveRecovery)
6936 		{
6937 			ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6938 
6939 			SpinLockAcquire(&XLogCtl->info_lck);
6940 			XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
6941 			SpinLockRelease(&XLogCtl->info_lck);
6942 		}
6943 		else
6944 		{
6945 			ereport(LOG,
6946 					(errmsg("database system was not properly shut down; "
6947 							"automatic recovery in progress")));
6948 			if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6949 				ereport(LOG,
6950 						(errmsg("crash recovery starts in timeline %u "
6951 								"and has target timeline %u",
6952 								ControlFile->checkPointCopy.ThisTimeLineID,
6953 								recoveryTargetTLI)));
6954 			ControlFile->state = DB_IN_CRASH_RECOVERY;
6955 
6956 			SpinLockAcquire(&XLogCtl->info_lck);
6957 			XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
6958 			SpinLockRelease(&XLogCtl->info_lck);
6959 		}
6960 		ControlFile->checkPoint = checkPointLoc;
6961 		ControlFile->checkPointCopy = checkPoint;
6962 		if (InArchiveRecovery)
6963 		{
6964 			/* initialize minRecoveryPoint if not set yet */
6965 			if (ControlFile->minRecoveryPoint < checkPoint.redo)
6966 			{
6967 				ControlFile->minRecoveryPoint = checkPoint.redo;
6968 				ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6969 			}
6970 		}
6971 
6972 		/*
6973 		 * Set backupStartPoint if we're starting recovery from a base backup.
6974 		 *
6975 		 * Also set backupEndPoint and use minRecoveryPoint as the backup end
6976 		 * location if we're starting recovery from a base backup which was
6977 		 * taken from a standby. In this case, the database system status in
6978 		 * pg_control must indicate that the database was already in recovery.
6979 		 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
6980 		 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
6981 		 * before reaching this point; e.g. because restore_command or
6982 		 * primary_conninfo were faulty.
6983 		 *
6984 		 * Any other state indicates that the backup somehow became corrupted
6985 		 * and we can't sensibly continue with recovery.
6986 		 */
6987 		if (haveBackupLabel)
6988 		{
6989 			ControlFile->backupStartPoint = checkPoint.redo;
6990 			ControlFile->backupEndRequired = backupEndRequired;
6991 
6992 			if (backupFromStandby)
6993 			{
6994 				if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
6995 					dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
6996 					ereport(FATAL,
6997 							(errmsg("backup_label contains data inconsistent with control file"),
6998 							 errhint("This means that the backup is corrupted and you will "
6999 									 "have to use another backup for recovery.")));
7000 				ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
7001 			}
7002 		}
7003 		ControlFile->time = (pg_time_t) time(NULL);
7004 		/* No need to hold ControlFileLock yet, we aren't up far enough */
7005 		UpdateControlFile();
7006 
7007 		/*
7008 		 * Initialize our local copy of minRecoveryPoint.  When doing crash
7009 		 * recovery we want to replay up to the end of WAL.  Particularly, in
7010 		 * the case of a promoted standby minRecoveryPoint value in the
7011 		 * control file is only updated after the first checkpoint.  However,
7012 		 * if the instance crashes before the first post-recovery checkpoint
7013 		 * is completed then recovery will use a stale location causing the
7014 		 * startup process to think that there are still invalid page
7015 		 * references when checking for data consistency.
7016 		 */
7017 		if (InArchiveRecovery)
7018 		{
7019 			minRecoveryPoint = ControlFile->minRecoveryPoint;
7020 			minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
7021 		}
7022 		else
7023 		{
7024 			minRecoveryPoint = InvalidXLogRecPtr;
7025 			minRecoveryPointTLI = 0;
7026 		}
7027 
7028 		/*
7029 		 * Reset pgstat data, because it may be invalid after recovery.
7030 		 */
7031 		pgstat_reset_all();
7032 
7033 		/*
7034 		 * If there was a backup label file, it's done its job and the info
7035 		 * has now been propagated into pg_control.  We must get rid of the
7036 		 * label file so that if we crash during recovery, we'll pick up at
7037 		 * the latest recovery restartpoint instead of going all the way back
7038 		 * to the backup start point.  It seems prudent though to just rename
7039 		 * the file out of the way rather than delete it completely.
7040 		 */
7041 		if (haveBackupLabel)
7042 		{
7043 			unlink(BACKUP_LABEL_OLD);
7044 			durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
7045 		}
7046 
7047 		/*
7048 		 * If there was a tablespace_map file, it's done its job and the
7049 		 * symlinks have been created.  We must get rid of the map file so
7050 		 * that if we crash during recovery, we don't create symlinks again.
7051 		 * It seems prudent though to just rename the file out of the way
7052 		 * rather than delete it completely.
7053 		 */
7054 		if (haveTblspcMap)
7055 		{
7056 			unlink(TABLESPACE_MAP_OLD);
7057 			durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
7058 		}
7059 
7060 		/* Check that the GUCs used to generate the WAL allow recovery */
7061 		CheckRequiredParameterValues();
7062 
7063 		/*
7064 		 * We're in recovery, so unlogged relations may be trashed and must be
7065 		 * reset.  This should be done BEFORE allowing Hot Standby
7066 		 * connections, so that read-only backends don't try to read whatever
7067 		 * garbage is left over from before.
7068 		 */
7069 		ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
7070 
7071 		/*
7072 		 * Likewise, delete any saved transaction snapshot files that got left
7073 		 * behind by crashed backends.
7074 		 */
7075 		DeleteAllExportedSnapshotFiles();
7076 
7077 		/*
7078 		 * Initialize for Hot Standby, if enabled. We won't let backends in
7079 		 * yet, not until we've reached the min recovery point specified in
7080 		 * control file and we've established a recovery snapshot from a
7081 		 * running-xacts WAL record.
7082 		 */
7083 		if (ArchiveRecoveryRequested && EnableHotStandby)
7084 		{
7085 			TransactionId *xids;
7086 			int			nxids;
7087 
7088 			ereport(DEBUG1,
7089 					(errmsg("initializing for hot standby")));
7090 
7091 			InitRecoveryTransactionEnvironment();
7092 
7093 			if (wasShutdown)
7094 				oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
7095 			else
7096 				oldestActiveXID = checkPoint.oldestActiveXid;
7097 			Assert(TransactionIdIsValid(oldestActiveXID));
7098 
7099 			/* Tell procarray about the range of xids it has to deal with */
7100 			ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextFullXid));
7101 
7102 			/*
7103 			 * Startup commit log and subtrans only.  MultiXact and commit
7104 			 * timestamp have already been started up and other SLRUs are not
7105 			 * maintained during recovery and need not be started yet.
7106 			 */
7107 			StartupCLOG();
7108 			StartupSUBTRANS(oldestActiveXID);
7109 
7110 			/*
7111 			 * If we're beginning at a shutdown checkpoint, we know that
7112 			 * nothing was running on the master at this point. So fake-up an
7113 			 * empty running-xacts record and use that here and now. Recover
7114 			 * additional standby state for prepared transactions.
7115 			 */
7116 			if (wasShutdown)
7117 			{
7118 				RunningTransactionsData running;
7119 				TransactionId latestCompletedXid;
7120 
7121 				/*
7122 				 * Construct a RunningTransactions snapshot representing a
7123 				 * shut down server, with only prepared transactions still
7124 				 * alive. We're never overflowed at this point because all
7125 				 * subxids are listed with their parent prepared transactions.
7126 				 */
7127 				running.xcnt = nxids;
7128 				running.subxcnt = 0;
7129 				running.subxid_overflow = false;
7130 				running.nextXid = XidFromFullTransactionId(checkPoint.nextFullXid);
7131 				running.oldestRunningXid = oldestActiveXID;
7132 				latestCompletedXid = XidFromFullTransactionId(checkPoint.nextFullXid);
7133 				TransactionIdRetreat(latestCompletedXid);
7134 				Assert(TransactionIdIsNormal(latestCompletedXid));
7135 				running.latestCompletedXid = latestCompletedXid;
7136 				running.xids = xids;
7137 
7138 				ProcArrayApplyRecoveryInfo(&running);
7139 
7140 				StandbyRecoverPreparedTransactions();
7141 			}
7142 		}
7143 
7144 		/* Initialize resource managers */
7145 		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7146 		{
7147 			if (RmgrTable[rmid].rm_startup != NULL)
7148 				RmgrTable[rmid].rm_startup();
7149 		}
7150 
7151 		/*
7152 		 * Initialize shared variables for tracking progress of WAL replay, as
7153 		 * if we had just replayed the record before the REDO location (or the
7154 		 * checkpoint record itself, if it's a shutdown checkpoint).
7155 		 */
7156 		SpinLockAcquire(&XLogCtl->info_lck);
7157 		if (checkPoint.redo < RecPtr)
7158 			XLogCtl->replayEndRecPtr = checkPoint.redo;
7159 		else
7160 			XLogCtl->replayEndRecPtr = EndRecPtr;
7161 		XLogCtl->replayEndTLI = ThisTimeLineID;
7162 		XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
7163 		XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
7164 		XLogCtl->recoveryLastXTime = 0;
7165 		XLogCtl->currentChunkStartTime = 0;
7166 		XLogCtl->recoveryPause = false;
7167 		SpinLockRelease(&XLogCtl->info_lck);
7168 
7169 		/* Also ensure XLogReceiptTime has a sane value */
7170 		XLogReceiptTime = GetCurrentTimestamp();
7171 
7172 		/*
7173 		 * Let postmaster know we've started redo now, so that it can launch
7174 		 * checkpointer to perform restartpoints.  We don't bother during
7175 		 * crash recovery as restartpoints can only be performed during
7176 		 * archive recovery.  And we'd like to keep crash recovery simple, to
7177 		 * avoid introducing bugs that could affect you when recovering after
7178 		 * crash.
7179 		 *
7180 		 * After this point, we can no longer assume that we're the only
7181 		 * process in addition to postmaster!  Also, fsync requests are
7182 		 * subsequently to be handled by the checkpointer, not locally.
7183 		 */
7184 		if (ArchiveRecoveryRequested && IsUnderPostmaster)
7185 		{
7186 			PublishStartupProcessInformation();
7187 			EnableSyncRequestForwarding();
7188 			SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
7189 			bgwriterLaunched = true;
7190 		}
7191 
7192 		/*
7193 		 * Allow read-only connections immediately if we're consistent
7194 		 * already.
7195 		 */
7196 		CheckRecoveryConsistency();
7197 
7198 		/*
7199 		 * Find the first record that logically follows the checkpoint --- it
7200 		 * might physically precede it, though.
7201 		 */
7202 		if (checkPoint.redo < RecPtr)
7203 		{
7204 			/* back up to find the record */
7205 			XLogBeginRead(xlogreader, checkPoint.redo);
7206 			record = ReadRecord(xlogreader, PANIC, false);
7207 		}
7208 		else
7209 		{
7210 			/* just have to read next record after CheckPoint */
7211 			record = ReadRecord(xlogreader, LOG, false);
7212 		}
7213 
7214 		if (record != NULL)
7215 		{
7216 			ErrorContextCallback errcallback;
7217 			TimestampTz xtime;
7218 
7219 			InRedo = true;
7220 
7221 			ereport(LOG,
7222 					(errmsg("redo starts at %X/%X",
7223 							(uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7224 
7225 			/*
7226 			 * main redo apply loop
7227 			 */
7228 			do
7229 			{
7230 				bool		switchedTLI = false;
7231 
7232 #ifdef WAL_DEBUG
7233 				if (XLOG_DEBUG ||
7234 					(rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
7235 					(rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
7236 				{
7237 					StringInfoData buf;
7238 
7239 					initStringInfo(&buf);
7240 					appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
7241 									 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
7242 									 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
7243 					xlog_outrec(&buf, xlogreader);
7244 					appendStringInfoString(&buf, " - ");
7245 					xlog_outdesc(&buf, xlogreader);
7246 					elog(LOG, "%s", buf.data);
7247 					pfree(buf.data);
7248 				}
7249 #endif
7250 
7251 				/* Handle interrupt signals of startup process */
7252 				HandleStartupProcInterrupts();
7253 
7254 				/*
7255 				 * Pause WAL replay, if requested by a hot-standby session via
7256 				 * SetRecoveryPause().
7257 				 *
7258 				 * Note that we intentionally don't take the info_lck spinlock
7259 				 * here.  We might therefore read a slightly stale value of
7260 				 * the recoveryPause flag, but it can't be very stale (no
7261 				 * worse than the last spinlock we did acquire).  Since a
7262 				 * pause request is a pretty asynchronous thing anyway,
7263 				 * possibly responding to it one WAL record later than we
7264 				 * otherwise would is a minor issue, so it doesn't seem worth
7265 				 * adding another spinlock cycle to prevent that.
7266 				 */
7267 				if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7268 					recoveryPausesHere(false);
7269 
7270 				/*
7271 				 * Have we reached our recovery target?
7272 				 */
7273 				if (recoveryStopsBefore(xlogreader))
7274 				{
7275 					reachedRecoveryTarget = true;
7276 					break;
7277 				}
7278 
7279 				/*
7280 				 * If we've been asked to lag the master, wait on latch until
7281 				 * enough time has passed.
7282 				 */
7283 				if (recoveryApplyDelay(xlogreader))
7284 				{
7285 					/*
7286 					 * We test for paused recovery again here. If user sets
7287 					 * delayed apply, it may be because they expect to pause
7288 					 * recovery in case of problems, so we must test again
7289 					 * here otherwise pausing during the delay-wait wouldn't
7290 					 * work.
7291 					 */
7292 					if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7293 						recoveryPausesHere(false);
7294 				}
7295 
7296 				/* Setup error traceback support for ereport() */
7297 				errcallback.callback = rm_redo_error_callback;
7298 				errcallback.arg = (void *) xlogreader;
7299 				errcallback.previous = error_context_stack;
7300 				error_context_stack = &errcallback;
7301 
7302 				/*
7303 				 * ShmemVariableCache->nextFullXid must be beyond record's
7304 				 * xid.
7305 				 */
7306 				AdvanceNextFullTransactionIdPastXid(record->xl_xid);
7307 
7308 				/*
7309 				 * Before replaying this record, check if this record causes
7310 				 * the current timeline to change. The record is already
7311 				 * considered to be part of the new timeline, so we update
7312 				 * ThisTimeLineID before replaying it. That's important so
7313 				 * that replayEndTLI, which is recorded as the minimum
7314 				 * recovery point's TLI if recovery stops after this record,
7315 				 * is set correctly.
7316 				 */
7317 				if (record->xl_rmid == RM_XLOG_ID)
7318 				{
7319 					TimeLineID	newTLI = ThisTimeLineID;
7320 					TimeLineID	prevTLI = ThisTimeLineID;
7321 					uint8		info = record->xl_info & ~XLR_INFO_MASK;
7322 
7323 					if (info == XLOG_CHECKPOINT_SHUTDOWN)
7324 					{
7325 						CheckPoint	checkPoint;
7326 
7327 						memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
7328 						newTLI = checkPoint.ThisTimeLineID;
7329 						prevTLI = checkPoint.PrevTimeLineID;
7330 					}
7331 					else if (info == XLOG_END_OF_RECOVERY)
7332 					{
7333 						xl_end_of_recovery xlrec;
7334 
7335 						memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
7336 						newTLI = xlrec.ThisTimeLineID;
7337 						prevTLI = xlrec.PrevTimeLineID;
7338 					}
7339 
7340 					if (newTLI != ThisTimeLineID)
7341 					{
7342 						/* Check that it's OK to switch to this TLI */
7343 						checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
7344 
7345 						/* Following WAL records should be run with new TLI */
7346 						ThisTimeLineID = newTLI;
7347 						switchedTLI = true;
7348 					}
7349 				}
7350 
7351 				/*
7352 				 * Update shared replayEndRecPtr before replaying this record,
7353 				 * so that XLogFlush will update minRecoveryPoint correctly.
7354 				 */
7355 				SpinLockAcquire(&XLogCtl->info_lck);
7356 				XLogCtl->replayEndRecPtr = EndRecPtr;
7357 				XLogCtl->replayEndTLI = ThisTimeLineID;
7358 				SpinLockRelease(&XLogCtl->info_lck);
7359 
7360 				/*
7361 				 * If we are attempting to enter Hot Standby mode, process
7362 				 * XIDs we see
7363 				 */
7364 				if (standbyState >= STANDBY_INITIALIZED &&
7365 					TransactionIdIsValid(record->xl_xid))
7366 					RecordKnownAssignedTransactionIds(record->xl_xid);
7367 
7368 				/* Now apply the WAL record itself */
7369 				RmgrTable[record->xl_rmid].rm_redo(xlogreader);
7370 
7371 				/*
7372 				 * After redo, check whether the backup pages associated with
7373 				 * the WAL record are consistent with the existing pages. This
7374 				 * check is done only if consistency check is enabled for this
7375 				 * record.
7376 				 */
7377 				if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
7378 					checkXLogConsistency(xlogreader);
7379 
7380 				/* Pop the error context stack */
7381 				error_context_stack = errcallback.previous;
7382 
7383 				/*
7384 				 * Update lastReplayedEndRecPtr after this record has been
7385 				 * successfully replayed.
7386 				 */
7387 				SpinLockAcquire(&XLogCtl->info_lck);
7388 				XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
7389 				XLogCtl->lastReplayedTLI = ThisTimeLineID;
7390 				SpinLockRelease(&XLogCtl->info_lck);
7391 
7392 				/*
7393 				 * If rm_redo called XLogRequestWalReceiverReply, then we wake
7394 				 * up the receiver so that it notices the updated
7395 				 * lastReplayedEndRecPtr and sends a reply to the master.
7396 				 */
7397 				if (doRequestWalReceiverReply)
7398 				{
7399 					doRequestWalReceiverReply = false;
7400 					WalRcvForceReply();
7401 				}
7402 
7403 				/* Remember this record as the last-applied one */
7404 				LastRec = ReadRecPtr;
7405 
7406 				/* Allow read-only connections if we're consistent now */
7407 				CheckRecoveryConsistency();
7408 
7409 				/* Is this a timeline switch? */
7410 				if (switchedTLI)
7411 				{
7412 					/*
7413 					 * Before we continue on the new timeline, clean up any
7414 					 * (possibly bogus) future WAL segments on the old
7415 					 * timeline.
7416 					 */
7417 					RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
7418 
7419 					/*
7420 					 * Wake up any walsenders to notice that we are on a new
7421 					 * timeline.
7422 					 */
7423 					if (switchedTLI && AllowCascadeReplication())
7424 						WalSndWakeup();
7425 				}
7426 
7427 				/* Exit loop if we reached inclusive recovery target */
7428 				if (recoveryStopsAfter(xlogreader))
7429 				{
7430 					reachedRecoveryTarget = true;
7431 					break;
7432 				}
7433 
7434 				/* Else, try to fetch the next WAL record */
7435 				record = ReadRecord(xlogreader, LOG, false);
7436 			} while (record != NULL);
7437 
7438 			/*
7439 			 * end of main redo apply loop
7440 			 */
7441 
7442 			if (reachedRecoveryTarget)
7443 			{
7444 				if (!reachedConsistency)
7445 					ereport(FATAL,
7446 							(errmsg("requested recovery stop point is before consistent recovery point")));
7447 
7448 				/*
7449 				 * This is the last point where we can restart recovery with a
7450 				 * new recovery target, if we shutdown and begin again. After
7451 				 * this, Resource Managers may choose to do permanent
7452 				 * corrective actions at end of recovery.
7453 				 */
7454 				switch (recoveryTargetAction)
7455 				{
7456 					case RECOVERY_TARGET_ACTION_SHUTDOWN:
7457 
7458 						/*
7459 						 * exit with special return code to request shutdown
7460 						 * of postmaster.  Log messages issued from
7461 						 * postmaster.
7462 						 */
7463 						proc_exit(3);
7464 
7465 					case RECOVERY_TARGET_ACTION_PAUSE:
7466 						SetRecoveryPause(true);
7467 						recoveryPausesHere(true);
7468 
7469 						/* drop into promote */
7470 
7471 					case RECOVERY_TARGET_ACTION_PROMOTE:
7472 						break;
7473 				}
7474 			}
7475 
7476 			/* Allow resource managers to do any required cleanup. */
7477 			for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7478 			{
7479 				if (RmgrTable[rmid].rm_cleanup != NULL)
7480 					RmgrTable[rmid].rm_cleanup();
7481 			}
7482 
7483 			ereport(LOG,
7484 					(errmsg("redo done at %X/%X",
7485 							(uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7486 			xtime = GetLatestXTime();
7487 			if (xtime)
7488 				ereport(LOG,
7489 						(errmsg("last completed transaction was at log time %s",
7490 								timestamptz_to_str(xtime))));
7491 
7492 			InRedo = false;
7493 		}
7494 		else
7495 		{
7496 			/* there are no WAL records following the checkpoint */
7497 			ereport(LOG,
7498 					(errmsg("redo is not required")));
7499 
7500 		}
7501 
7502 		/*
7503 		 * This check is intentionally after the above log messages that
7504 		 * indicate how far recovery went.
7505 		 */
7506 		if (ArchiveRecoveryRequested &&
7507 			recoveryTarget != RECOVERY_TARGET_UNSET &&
7508 			!reachedRecoveryTarget)
7509 			ereport(FATAL,
7510 					(errmsg("recovery ended before configured recovery target was reached")));
7511 	}
7512 
7513 	/*
7514 	 * Kill WAL receiver, if it's still running, before we continue to write
7515 	 * the startup checkpoint and aborted-contrecord records. It will trump
7516 	 * over these records and subsequent ones if it's still alive when we
7517 	 * start writing WAL.
7518 	 */
7519 	ShutdownWalRcv();
7520 
7521 	/*
7522 	 * Reset unlogged relations to the contents of their INIT fork. This is
7523 	 * done AFTER recovery is complete so as to include any unlogged relations
7524 	 * created during recovery, but BEFORE recovery is marked as having
7525 	 * completed successfully. Otherwise we'd not retry if any of the post
7526 	 * end-of-recovery steps fail.
7527 	 */
7528 	if (InRecovery)
7529 		ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7530 
7531 	/*
7532 	 * We don't need the latch anymore. It's not strictly necessary to disown
7533 	 * it, but let's do it for the sake of tidiness.
7534 	 */
7535 	if (ArchiveRecoveryRequested)
7536 		DisownLatch(&XLogCtl->recoveryWakeupLatch);
7537 
7538 	/*
7539 	 * We are now done reading the xlog from stream. Turn off streaming
7540 	 * recovery to force fetching the files (which would be required at end of
7541 	 * recovery, e.g., timeline history file) from archive or pg_wal.
7542 	 *
7543 	 * Note that standby mode must be turned off after killing WAL receiver,
7544 	 * i.e., calling ShutdownWalRcv().
7545 	 */
7546 	Assert(!WalRcvStreaming());
7547 	StandbyMode = false;
7548 
7549 	/*
7550 	 * Determine where to start writing WAL next.
7551 	 *
7552 	 * When recovery ended in an incomplete record, write a WAL record about
7553 	 * that and continue after it.  In all other cases, re-fetch the last
7554 	 * valid or last applied record, so we can identify the exact endpoint of
7555 	 * what we consider the valid portion of WAL.
7556 	 */
7557 	XLogBeginRead(xlogreader, LastRec);
7558 	record = ReadRecord(xlogreader, PANIC, false);
7559 	EndOfLog = EndRecPtr;
7560 
7561 	/*
7562 	 * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
7563 	 * the end-of-log. It could be different from the timeline that EndOfLog
7564 	 * nominally belongs to, if there was a timeline switch in that segment,
7565 	 * and we were reading the old WAL from a segment belonging to a higher
7566 	 * timeline.
7567 	 */
7568 	EndOfLogTLI = xlogreader->seg.ws_tli;
7569 
7570 	/*
7571 	 * Complain if we did not roll forward far enough to render the backup
7572 	 * dump consistent.  Note: it is indeed okay to look at the local variable
7573 	 * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7574 	 * be further ahead --- ControlFile->minRecoveryPoint cannot have been
7575 	 * advanced beyond the WAL we processed.
7576 	 */
7577 	if (InRecovery &&
7578 		(EndOfLog < minRecoveryPoint ||
7579 		 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7580 	{
7581 		/*
7582 		 * Ran off end of WAL before reaching end-of-backup WAL record, or
7583 		 * minRecoveryPoint. That's usually a bad sign, indicating that you
7584 		 * tried to recover from an online backup but never called
7585 		 * pg_stop_backup(), or you didn't archive all the WAL up to that
7586 		 * point. However, this also happens in crash recovery, if the system
7587 		 * crashes while an online backup is in progress. We must not treat
7588 		 * that as an error, or the database will refuse to start up.
7589 		 */
7590 		if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
7591 		{
7592 			if (ControlFile->backupEndRequired)
7593 				ereport(FATAL,
7594 						(errmsg("WAL ends before end of online backup"),
7595 						 errhint("All WAL generated while online backup was taken must be available at recovery.")));
7596 			else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7597 				ereport(FATAL,
7598 						(errmsg("WAL ends before end of online backup"),
7599 						 errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7600 			else
7601 				ereport(FATAL,
7602 						(errmsg("WAL ends before consistent recovery point")));
7603 		}
7604 	}
7605 
7606 	/*
7607 	 * Pre-scan prepared transactions to find out the range of XIDs present.
7608 	 * This information is not quite needed yet, but it is positioned here so
7609 	 * as potential problems are detected before any on-disk change is done.
7610 	 */
7611 	oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7612 
7613 	/*
7614 	 * Consider whether we need to assign a new timeline ID.
7615 	 *
7616 	 * If we are doing an archive recovery, we always assign a new ID.  This
7617 	 * handles a couple of issues.  If we stopped short of the end of WAL
7618 	 * during recovery, then we are clearly generating a new timeline and must
7619 	 * assign it a unique new ID.  Even if we ran to the end, modifying the
7620 	 * current last segment is problematic because it may result in trying to
7621 	 * overwrite an already-archived copy of that segment, and we encourage
7622 	 * DBAs to make their archive_commands reject that.  We can dodge the
7623 	 * problem by making the new active segment have a new timeline ID.
7624 	 *
7625 	 * In a normal crash recovery, we can just extend the timeline we were in.
7626 	 */
7627 	PrevTimeLineID = ThisTimeLineID;
7628 	if (ArchiveRecoveryRequested)
7629 	{
7630 		char		reason[200];
7631 		char		recoveryPath[MAXPGPATH];
7632 
7633 		Assert(InArchiveRecovery);
7634 
7635 		ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
7636 		ereport(LOG,
7637 				(errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7638 
7639 		/*
7640 		 * Create a comment for the history file to explain why and where
7641 		 * timeline changed.
7642 		 */
7643 		if (recoveryTarget == RECOVERY_TARGET_XID)
7644 			snprintf(reason, sizeof(reason),
7645 					 "%s transaction %u",
7646 					 recoveryStopAfter ? "after" : "before",
7647 					 recoveryStopXid);
7648 		else if (recoveryTarget == RECOVERY_TARGET_TIME)
7649 			snprintf(reason, sizeof(reason),
7650 					 "%s %s\n",
7651 					 recoveryStopAfter ? "after" : "before",
7652 					 timestamptz_to_str(recoveryStopTime));
7653 		else if (recoveryTarget == RECOVERY_TARGET_LSN)
7654 			snprintf(reason, sizeof(reason),
7655 					 "%s LSN %X/%X\n",
7656 					 recoveryStopAfter ? "after" : "before",
7657 					 (uint32) (recoveryStopLSN >> 32),
7658 					 (uint32) recoveryStopLSN);
7659 		else if (recoveryTarget == RECOVERY_TARGET_NAME)
7660 			snprintf(reason, sizeof(reason),
7661 					 "at restore point \"%s\"",
7662 					 recoveryStopName);
7663 		else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
7664 			snprintf(reason, sizeof(reason), "reached consistency");
7665 		else
7666 			snprintf(reason, sizeof(reason), "no recovery target specified");
7667 
7668 		/*
7669 		 * We are now done reading the old WAL.  Turn off archive fetching if
7670 		 * it was active, and make a writable copy of the last WAL segment.
7671 		 * (Note that we also have a copy of the last block of the old WAL in
7672 		 * readBuf; we will use that below.)
7673 		 */
7674 		exitArchiveRecovery(EndOfLogTLI, EndOfLog);
7675 
7676 		/*
7677 		 * Write the timeline history file, and have it archived. After this
7678 		 * point (or rather, as soon as the file is archived), the timeline
7679 		 * will appear as "taken" in the WAL archive and to any standby
7680 		 * servers.  If we crash before actually switching to the new
7681 		 * timeline, standby servers will nevertheless think that we switched
7682 		 * to the new timeline, and will try to connect to the new timeline.
7683 		 * To minimize the window for that, try to do as little as possible
7684 		 * between here and writing the end-of-recovery record.
7685 		 */
7686 		writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7687 							 EndRecPtr, reason);
7688 
7689 		/*
7690 		 * Since there might be a partial WAL segment named RECOVERYXLOG, get
7691 		 * rid of it.
7692 		 */
7693 		snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
7694 		unlink(recoveryPath);	/* ignore any error */
7695 
7696 		/* Get rid of any remaining recovered timeline-history file, too */
7697 		snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
7698 		unlink(recoveryPath);	/* ignore any error */
7699 	}
7700 
7701 	/* Save the selected TimeLineID in shared memory, too */
7702 	XLogCtl->ThisTimeLineID = ThisTimeLineID;
7703 	XLogCtl->PrevTimeLineID = PrevTimeLineID;
7704 
7705 	/*
7706 	 * Actually, if WAL ended in an incomplete record, skip the parts that
7707 	 * made it through and start writing after the portion that persisted.
7708 	 * (It's critical to first write an OVERWRITE_CONTRECORD message, which
7709 	 * we'll do as soon as we're open for writing new WAL.)
7710 	 */
7711 	if (!XLogRecPtrIsInvalid(missingContrecPtr))
7712 	{
7713 		Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
7714 		EndOfLog = missingContrecPtr;
7715 	}
7716 
7717 	/*
7718 	 * Prepare to write WAL starting at EndOfLog location, and init xlog
7719 	 * buffer cache using the block containing the last record from the
7720 	 * previous incarnation.
7721 	 */
7722 	Insert = &XLogCtl->Insert;
7723 	Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7724 	Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7725 
7726 	/*
7727 	 * Tricky point here: readBuf contains the *last* block that the LastRec
7728 	 * record spans, not the one it starts in.  The last block is indeed the
7729 	 * one we want to use.
7730 	 */
7731 	if (EndOfLog % XLOG_BLCKSZ != 0)
7732 	{
7733 		char	   *page;
7734 		int			len;
7735 		int			firstIdx;
7736 		XLogRecPtr	pageBeginPtr;
7737 
7738 		pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7739 		Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
7740 
7741 		firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7742 
7743 		/* Copy the valid part of the last block, and zero the rest */
7744 		page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7745 		len = EndOfLog % XLOG_BLCKSZ;
7746 		memcpy(page, xlogreader->readBuf, len);
7747 		memset(page + len, 0, XLOG_BLCKSZ - len);
7748 
7749 		XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7750 		XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7751 	}
7752 	else
7753 	{
7754 		/*
7755 		 * There is no partial block to copy. Just set InitializedUpTo, and
7756 		 * let the first attempt to insert a log record to initialize the next
7757 		 * buffer.
7758 		 */
7759 		XLogCtl->InitializedUpTo = EndOfLog;
7760 	}
7761 
7762 	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7763 
7764 	XLogCtl->LogwrtResult = LogwrtResult;
7765 
7766 	XLogCtl->LogwrtRqst.Write = EndOfLog;
7767 	XLogCtl->LogwrtRqst.Flush = EndOfLog;
7768 
7769 	LocalSetXLogInsertAllowed();
7770 
7771 	/* If necessary, write overwrite-contrecord before doing anything else */
7772 	if (!XLogRecPtrIsInvalid(abortedRecPtr))
7773 	{
7774 		Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
7775 		CreateOverwriteContrecordRecord(abortedRecPtr);
7776 		abortedRecPtr = InvalidXLogRecPtr;
7777 		missingContrecPtr = InvalidXLogRecPtr;
7778 	}
7779 
7780 	/*
7781 	 * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7782 	 * record before resource manager writes cleanup WAL records or checkpoint
7783 	 * record is written.
7784 	 */
7785 	Insert->fullPageWrites = lastFullPageWrites;
7786 	UpdateFullPageWrites();
7787 	LocalXLogInsertAllowed = -1;
7788 
7789 	if (InRecovery)
7790 	{
7791 		/*
7792 		 * Perform a checkpoint to update all our recovery activity to disk.
7793 		 *
7794 		 * Note that we write a shutdown checkpoint rather than an on-line
7795 		 * one. This is not particularly critical, but since we may be
7796 		 * assigning a new TLI, using a shutdown checkpoint allows us to have
7797 		 * the rule that TLI only changes in shutdown checkpoints, which
7798 		 * allows some extra error checking in xlog_redo.
7799 		 *
7800 		 * In fast promotion, only create a lightweight end-of-recovery record
7801 		 * instead of a full checkpoint. A checkpoint is requested later,
7802 		 * after we're fully out of recovery mode and already accepting
7803 		 * queries.
7804 		 */
7805 		if (bgwriterLaunched)
7806 		{
7807 			if (fast_promote)
7808 			{
7809 				checkPointLoc = ControlFile->checkPoint;
7810 
7811 				/*
7812 				 * Confirm the last checkpoint is available for us to recover
7813 				 * from if we fail.
7814 				 */
7815 				record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7816 				if (record != NULL)
7817 				{
7818 					fast_promoted = true;
7819 
7820 					/*
7821 					 * Insert a special WAL record to mark the end of
7822 					 * recovery, since we aren't doing a checkpoint. That
7823 					 * means that the checkpointer process may likely be in
7824 					 * the middle of a time-smoothed restartpoint and could
7825 					 * continue to be for minutes after this. That sounds
7826 					 * strange, but the effect is roughly the same and it
7827 					 * would be stranger to try to come out of the
7828 					 * restartpoint and then checkpoint. We request a
7829 					 * checkpoint later anyway, just for safety.
7830 					 */
7831 					CreateEndOfRecoveryRecord();
7832 				}
7833 			}
7834 
7835 			if (!fast_promoted)
7836 				RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7837 								  CHECKPOINT_IMMEDIATE |
7838 								  CHECKPOINT_WAIT);
7839 		}
7840 		else
7841 			CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7842 	}
7843 
7844 	if (ArchiveRecoveryRequested)
7845 	{
7846 		/*
7847 		 * And finally, execute the recovery_end_command, if any.
7848 		 */
7849 		if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
7850 			ExecuteRecoveryCommand(recoveryEndCommand,
7851 								   "recovery_end_command",
7852 								   true);
7853 
7854 		/*
7855 		 * We switched to a new timeline. Clean up segments on the old
7856 		 * timeline.
7857 		 *
7858 		 * If there are any higher-numbered segments on the old timeline,
7859 		 * remove them. They might contain valid WAL, but they might also be
7860 		 * pre-allocated files containing garbage. In any case, they are not
7861 		 * part of the new timeline's history so we don't need them.
7862 		 */
7863 		RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
7864 
7865 		/*
7866 		 * If the switch happened in the middle of a segment, what to do with
7867 		 * the last, partial segment on the old timeline? If we don't archive
7868 		 * it, and the server that created the WAL never archives it either
7869 		 * (e.g. because it was hit by a meteor), it will never make it to the
7870 		 * archive. That's OK from our point of view, because the new segment
7871 		 * that we created with the new TLI contains all the WAL from the old
7872 		 * timeline up to the switch point. But if you later try to do PITR to
7873 		 * the "missing" WAL on the old timeline, recovery won't find it in
7874 		 * the archive. It's physically present in the new file with new TLI,
7875 		 * but recovery won't look there when it's recovering to the older
7876 		 * timeline. On the other hand, if we archive the partial segment, and
7877 		 * the original server on that timeline is still running and archives
7878 		 * the completed version of the same segment later, it will fail. (We
7879 		 * used to do that in 9.4 and below, and it caused such problems).
7880 		 *
7881 		 * As a compromise, we rename the last segment with the .partial
7882 		 * suffix, and archive it. Archive recovery will never try to read
7883 		 * .partial segments, so they will normally go unused. But in the odd
7884 		 * PITR case, the administrator can copy them manually to the pg_wal
7885 		 * directory (removing the suffix). They can be useful in debugging,
7886 		 * too.
7887 		 *
7888 		 * If a .done or .ready file already exists for the old timeline,
7889 		 * however, we had already determined that the segment is complete, so
7890 		 * we can let it be archived normally. (In particular, if it was
7891 		 * restored from the archive to begin with, it's expected to have a
7892 		 * .done file).
7893 		 */
7894 		if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
7895 			XLogArchivingActive())
7896 		{
7897 			char		origfname[MAXFNAMELEN];
7898 			XLogSegNo	endLogSegNo;
7899 
7900 			XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
7901 			XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
7902 
7903 			if (!XLogArchiveIsReadyOrDone(origfname))
7904 			{
7905 				char		origpath[MAXPGPATH];
7906 				char		partialfname[MAXFNAMELEN];
7907 				char		partialpath[MAXPGPATH];
7908 
7909 				XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
7910 				snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
7911 				snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
7912 
7913 				/*
7914 				 * Make sure there's no .done or .ready file for the .partial
7915 				 * file.
7916 				 */
7917 				XLogArchiveCleanup(partialfname);
7918 
7919 				durable_rename(origpath, partialpath, ERROR);
7920 				XLogArchiveNotify(partialfname);
7921 			}
7922 		}
7923 	}
7924 
7925 	/*
7926 	 * Preallocate additional log files, if wanted.
7927 	 */
7928 	PreallocXlogFiles(EndOfLog);
7929 
7930 	/*
7931 	 * Okay, we're officially UP.
7932 	 */
7933 	InRecovery = false;
7934 
7935 	/* start the archive_timeout timer and LSN running */
7936 	XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7937 	XLogCtl->lastSegSwitchLSN = EndOfLog;
7938 
7939 	/* also initialize latestCompletedXid, to nextXid - 1 */
7940 	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7941 	ShmemVariableCache->latestCompletedXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid);
7942 	TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7943 	LWLockRelease(ProcArrayLock);
7944 
7945 	/*
7946 	 * Start up the commit log and subtrans, if not already done for hot
7947 	 * standby.  (commit timestamps are started below, if necessary.)
7948 	 */
7949 	if (standbyState == STANDBY_DISABLED)
7950 	{
7951 		StartupCLOG();
7952 		StartupSUBTRANS(oldestActiveXID);
7953 	}
7954 
7955 	/*
7956 	 * Perform end of recovery actions for any SLRUs that need it.
7957 	 */
7958 	TrimCLOG();
7959 	TrimMultiXact();
7960 
7961 	/* Reload shared-memory state for prepared transactions */
7962 	RecoverPreparedTransactions();
7963 
7964 	/* Shut down xlogreader */
7965 	if (readFile >= 0)
7966 	{
7967 		close(readFile);
7968 		readFile = -1;
7969 	}
7970 	XLogReaderFree(xlogreader);
7971 
7972 	/*
7973 	 * If any of the critical GUCs have changed, log them before we allow
7974 	 * backends to write WAL.
7975 	 */
7976 	LocalSetXLogInsertAllowed();
7977 	XLogReportParameters();
7978 
7979 	/*
7980 	 * Local WAL inserts enabled, so it's time to finish initialization of
7981 	 * commit timestamp.
7982 	 */
7983 	CompleteCommitTsInitialization();
7984 
7985 	/*
7986 	 * All done with end-of-recovery actions.
7987 	 *
7988 	 * Now allow backends to write WAL and update the control file status in
7989 	 * consequence.  The boolean flag allowing backends to write WAL is
7990 	 * updated while holding ControlFileLock to prevent other backends to look
7991 	 * at an inconsistent state of the control file in shared memory.  There
7992 	 * is still a small window during which backends can write WAL and the
7993 	 * control file is still referring to a system not in DB_IN_PRODUCTION
7994 	 * state while looking at the on-disk control file.
7995 	 *
7996 	 * Also, although the boolean flag to allow WAL is probably atomic in
7997 	 * itself, we use the info_lck here to ensure that there are no race
7998 	 * conditions concerning visibility of other recent updates to shared
7999 	 * memory.
8000 	 */
8001 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8002 	ControlFile->state = DB_IN_PRODUCTION;
8003 	ControlFile->time = (pg_time_t) time(NULL);
8004 
8005 	SpinLockAcquire(&XLogCtl->info_lck);
8006 	XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
8007 	SpinLockRelease(&XLogCtl->info_lck);
8008 
8009 	UpdateControlFile();
8010 	LWLockRelease(ControlFileLock);
8011 
8012 	/*
8013 	 * Shutdown the recovery environment.  This must occur after
8014 	 * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
8015 	 * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as
8016 	 * any session building a snapshot will not rely on KnownAssignedXids as
8017 	 * RecoveryInProgress() would return false at this stage.  This is
8018 	 * particularly critical for prepared 2PC transactions, that would still
8019 	 * need to be included in snapshots once recovery has ended.
8020 	 */
8021 	if (standbyState != STANDBY_DISABLED)
8022 		ShutdownRecoveryTransactionEnvironment();
8023 
8024 	/*
8025 	 * If there were cascading standby servers connected to us, nudge any wal
8026 	 * sender processes to notice that we've been promoted.
8027 	 */
8028 	WalSndWakeup();
8029 
8030 	/*
8031 	 * If this was a fast promotion, request an (online) checkpoint now. This
8032 	 * isn't required for consistency, but the last restartpoint might be far
8033 	 * back, and in case of a crash, recovering from it might take a longer
8034 	 * than is appropriate now that we're not in standby mode anymore.
8035 	 */
8036 	if (fast_promoted)
8037 		RequestCheckpoint(CHECKPOINT_FORCE);
8038 }
8039 
/*
 * Checks if recovery has reached a consistent state. When consistency is
 * reached and we have a valid starting standby snapshot, tell postmaster
 * that it can start accepting read-only connections.
 *
 * This is called repeatedly by the startup process during WAL replay.
 */
static void
CheckRecoveryConsistency(void)
{
	XLogRecPtr	lastReplayedEndRecPtr;

	/*
	 * During crash recovery, we don't reach a consistent state until we've
	 * replayed all the WAL.  (minRecoveryPoint is only set during archive
	 * recovery, so an invalid value here means plain crash recovery.)
	 */
	if (XLogRecPtrIsInvalid(minRecoveryPoint))
		return;

	Assert(InArchiveRecovery);

	/*
	 * assume that we are called in the startup process, and hence don't need
	 * a lock to read lastReplayedEndRecPtr
	 */
	lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;

	/*
	 * Have we reached the point where our base backup was completed?
	 */
	if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
		ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
	{
		/*
		 * We have reached the end of base backup, as indicated by pg_control.
		 * The data on disk is now consistent. Reset backupStartPoint and
		 * backupEndPoint, and update minRecoveryPoint to make sure we don't
		 * allow starting up at an earlier point even if recovery is stopped
		 * and restarted soon after this.
		 */
		elog(DEBUG1, "end of backup reached");

		/* All pg_control updates below happen under ControlFileLock */
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

		if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
			ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;

		ControlFile->backupStartPoint = InvalidXLogRecPtr;
		ControlFile->backupEndPoint = InvalidXLogRecPtr;
		ControlFile->backupEndRequired = false;
		UpdateControlFile();

		LWLockRelease(ControlFileLock);
	}

	/*
	 * Have we passed our safe starting point? Note that minRecoveryPoint is
	 * known to be incorrectly set if ControlFile->backupEndRequired, until
	 * the XLOG_BACKUP_END arrives to advise us of the correct
	 * minRecoveryPoint. All we know prior to that is that we're not
	 * consistent yet.
	 */
	if (!reachedConsistency && !ControlFile->backupEndRequired &&
		minRecoveryPoint <= lastReplayedEndRecPtr &&
		XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
	{
		/*
		 * Check to see if the XLOG sequence contained any unresolved
		 * references to uninitialized pages.
		 */
		XLogCheckInvalidPages();

		reachedConsistency = true;
		ereport(LOG,
				(errmsg("consistent recovery state reached at %X/%X",
						(uint32) (lastReplayedEndRecPtr >> 32),
						(uint32) lastReplayedEndRecPtr)));
	}

	/*
	 * Have we got a valid starting snapshot that will allow queries to be
	 * run? If so, we can tell postmaster that the database is consistent now,
	 * enabling connections.
	 */
	if (standbyState == STANDBY_SNAPSHOT_READY &&
		!LocalHotStandbyActive &&
		reachedConsistency &&
		IsUnderPostmaster)
	{
		/* Publish to shared memory first, then flip our local cached copy */
		SpinLockAcquire(&XLogCtl->info_lck);
		XLogCtl->SharedHotStandbyActive = true;
		SpinLockRelease(&XLogCtl->info_lck);

		LocalHotStandbyActive = true;

		SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
	}
}
8136 
/*
 * Is the system still in recovery?
 *
 * Unlike testing InRecovery, this works in any process that's connected to
 * shared memory.
 *
 * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
 * variables the first time we see that recovery is finished.
 *
 * After the first "false" result this is a cheap test of a local boolean;
 * shared memory is only consulted while recovery is still in progress.
 */
bool
RecoveryInProgress(void)
{
	/*
	 * We check shared state each time only until we leave recovery mode. We
	 * can't re-enter recovery, so there's no need to keep checking after the
	 * shared variable has once been seen false.
	 */
	if (!LocalRecoveryInProgress)
		return false;
	else
	{
		/*
		 * use volatile pointer to make sure we make a fresh read of the
		 * shared variable.
		 */
		volatile XLogCtlData *xlogctl = XLogCtl;

		LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);

		/*
		 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
		 * is finished. InitPostgres() relies upon this behaviour to ensure
		 * that InitXLOGAccess() is called at backend startup.  (If you change
		 * this, see also LocalSetXLogInsertAllowed.)
		 */
		if (!LocalRecoveryInProgress)
		{
			/*
			 * If we just exited recovery, make sure we read TimeLineID and
			 * RedoRecPtr after SharedRecoveryState (for machines with weak
			 * memory ordering).
			 */
			pg_memory_barrier();
			InitXLOGAccess();
		}

		/*
		 * Note: We don't need a memory barrier when we're still in recovery.
		 * We might exit recovery immediately after return, so the caller
		 * can't rely on 'true' meaning that we're still in recovery anyway.
		 */

		return LocalRecoveryInProgress;
	}
}
8192 
8193 /*
8194  * Returns current recovery state from shared memory.
8195  *
8196  * This returned state is kept consistent with the contents of the control
8197  * file.  See details about the possible values of RecoveryState in xlog.h.
8198  */
8199 RecoveryState
GetRecoveryState(void)8200 GetRecoveryState(void)
8201 {
8202 	RecoveryState retval;
8203 
8204 	SpinLockAcquire(&XLogCtl->info_lck);
8205 	retval = XLogCtl->SharedRecoveryState;
8206 	SpinLockRelease(&XLogCtl->info_lck);
8207 
8208 	return retval;
8209 }
8210 
8211 /*
8212  * Is HotStandby active yet? This is only important in special backends
8213  * since normal backends won't ever be able to connect until this returns
8214  * true. Postmaster knows this by way of signal, not via shared memory.
8215  *
8216  * Unlike testing standbyState, this works in any process that's connected to
8217  * shared memory.  (And note that standbyState alone doesn't tell the truth
8218  * anyway.)
8219  */
8220 bool
HotStandbyActive(void)8221 HotStandbyActive(void)
8222 {
8223 	/*
8224 	 * We check shared state each time only until Hot Standby is active. We
8225 	 * can't de-activate Hot Standby, so there's no need to keep checking
8226 	 * after the shared variable has once been seen true.
8227 	 */
8228 	if (LocalHotStandbyActive)
8229 		return true;
8230 	else
8231 	{
8232 		/* spinlock is essential on machines with weak memory ordering! */
8233 		SpinLockAcquire(&XLogCtl->info_lck);
8234 		LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
8235 		SpinLockRelease(&XLogCtl->info_lck);
8236 
8237 		return LocalHotStandbyActive;
8238 	}
8239 }
8240 
8241 /*
8242  * Like HotStandbyActive(), but to be used only in WAL replay code,
8243  * where we don't need to ask any other process what the state is.
8244  */
8245 bool
HotStandbyActiveInReplay(void)8246 HotStandbyActiveInReplay(void)
8247 {
8248 	Assert(AmStartupProcess() || !IsPostmasterEnvironment);
8249 	return LocalHotStandbyActive;
8250 }
8251 
8252 /*
8253  * Is this process allowed to insert new WAL records?
8254  *
8255  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
8256  * But we also have provisions for forcing the result "true" or "false"
8257  * within specific processes regardless of the global state.
8258  */
8259 bool
XLogInsertAllowed(void)8260 XLogInsertAllowed(void)
8261 {
8262 	/*
8263 	 * If value is "unconditionally true" or "unconditionally false", just
8264 	 * return it.  This provides the normal fast path once recovery is known
8265 	 * done.
8266 	 */
8267 	if (LocalXLogInsertAllowed >= 0)
8268 		return (bool) LocalXLogInsertAllowed;
8269 
8270 	/*
8271 	 * Else, must check to see if we're still in recovery.
8272 	 */
8273 	if (RecoveryInProgress())
8274 		return false;
8275 
8276 	/*
8277 	 * On exit from recovery, reset to "unconditionally true", since there is
8278 	 * no need to keep checking.
8279 	 */
8280 	LocalXLogInsertAllowed = 1;
8281 	return true;
8282 }
8283 
8284 /*
8285  * Make XLogInsertAllowed() return true in the current process only.
8286  *
8287  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
8288  * and even call LocalSetXLogInsertAllowed() again after that.
8289  */
8290 static void
LocalSetXLogInsertAllowed(void)8291 LocalSetXLogInsertAllowed(void)
8292 {
8293 	Assert(LocalXLogInsertAllowed == -1);
8294 	LocalXLogInsertAllowed = 1;
8295 
8296 	/* Initialize as RecoveryInProgress() would do when switching state */
8297 	InitXLOGAccess();
8298 }
8299 
8300 /*
8301  * Subroutine to try to fetch and validate a prior checkpoint record.
8302  *
8303  * whichChkpt identifies the checkpoint (merely for reporting purposes).
8304  * 1 for "primary", 0 for "other" (backup_label)
8305  */
8306 static XLogRecord *
ReadCheckpointRecord(XLogReaderState * xlogreader,XLogRecPtr RecPtr,int whichChkpt,bool report)8307 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
8308 					 int whichChkpt, bool report)
8309 {
8310 	XLogRecord *record;
8311 	uint8		info;
8312 
8313 	if (!XRecOffIsValid(RecPtr))
8314 	{
8315 		if (!report)
8316 			return NULL;
8317 
8318 		switch (whichChkpt)
8319 		{
8320 			case 1:
8321 				ereport(LOG,
8322 						(errmsg("invalid primary checkpoint link in control file")));
8323 				break;
8324 			default:
8325 				ereport(LOG,
8326 						(errmsg("invalid checkpoint link in backup_label file")));
8327 				break;
8328 		}
8329 		return NULL;
8330 	}
8331 
8332 	XLogBeginRead(xlogreader, RecPtr);
8333 	record = ReadRecord(xlogreader, LOG, true);
8334 
8335 	if (record == NULL)
8336 	{
8337 		if (!report)
8338 			return NULL;
8339 
8340 		switch (whichChkpt)
8341 		{
8342 			case 1:
8343 				ereport(LOG,
8344 						(errmsg("invalid primary checkpoint record")));
8345 				break;
8346 			default:
8347 				ereport(LOG,
8348 						(errmsg("invalid checkpoint record")));
8349 				break;
8350 		}
8351 		return NULL;
8352 	}
8353 	if (record->xl_rmid != RM_XLOG_ID)
8354 	{
8355 		switch (whichChkpt)
8356 		{
8357 			case 1:
8358 				ereport(LOG,
8359 						(errmsg("invalid resource manager ID in primary checkpoint record")));
8360 				break;
8361 			default:
8362 				ereport(LOG,
8363 						(errmsg("invalid resource manager ID in checkpoint record")));
8364 				break;
8365 		}
8366 		return NULL;
8367 	}
8368 	info = record->xl_info & ~XLR_INFO_MASK;
8369 	if (info != XLOG_CHECKPOINT_SHUTDOWN &&
8370 		info != XLOG_CHECKPOINT_ONLINE)
8371 	{
8372 		switch (whichChkpt)
8373 		{
8374 			case 1:
8375 				ereport(LOG,
8376 						(errmsg("invalid xl_info in primary checkpoint record")));
8377 				break;
8378 			default:
8379 				ereport(LOG,
8380 						(errmsg("invalid xl_info in checkpoint record")));
8381 				break;
8382 		}
8383 		return NULL;
8384 	}
8385 	if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
8386 	{
8387 		switch (whichChkpt)
8388 		{
8389 			case 1:
8390 				ereport(LOG,
8391 						(errmsg("invalid length of primary checkpoint record")));
8392 				break;
8393 			default:
8394 				ereport(LOG,
8395 						(errmsg("invalid length of checkpoint record")));
8396 				break;
8397 		}
8398 		return NULL;
8399 	}
8400 	return record;
8401 }
8402 
8403 /*
8404  * This must be called in a backend process before creating WAL records
8405  * (except in a standalone backend, which does StartupXLOG instead).  We need
8406  * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
8407  *
8408  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
8409  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
8410  * unnecessary however, since the postmaster itself never touches XLOG anyway.
8411  */
8412 void
InitXLOGAccess(void)8413 InitXLOGAccess(void)
8414 {
8415 	XLogCtlInsert *Insert = &XLogCtl->Insert;
8416 
8417 	/* ThisTimeLineID doesn't change so we need no lock to copy it */
8418 	ThisTimeLineID = XLogCtl->ThisTimeLineID;
8419 	Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
8420 
8421 	/* set wal_segment_size */
8422 	wal_segment_size = ControlFile->xlog_seg_size;
8423 
8424 	/* Use GetRedoRecPtr to copy the RedoRecPtr safely */
8425 	(void) GetRedoRecPtr();
8426 	/* Also update our copy of doPageWrites. */
8427 	doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
8428 
8429 	/* Also initialize the working areas for constructing WAL records */
8430 	InitXLogInsert();
8431 }
8432 
8433 /*
8434  * Return the current Redo pointer from shared memory.
8435  *
8436  * As a side-effect, the local RedoRecPtr copy is updated.
8437  */
8438 XLogRecPtr
GetRedoRecPtr(void)8439 GetRedoRecPtr(void)
8440 {
8441 	XLogRecPtr	ptr;
8442 
8443 	/*
8444 	 * The possibly not up-to-date copy in XlogCtl is enough. Even if we
8445 	 * grabbed a WAL insertion lock to read the master copy, someone might
8446 	 * update it just after we've released the lock.
8447 	 */
8448 	SpinLockAcquire(&XLogCtl->info_lck);
8449 	ptr = XLogCtl->RedoRecPtr;
8450 	SpinLockRelease(&XLogCtl->info_lck);
8451 
8452 	if (RedoRecPtr < ptr)
8453 		RedoRecPtr = ptr;
8454 
8455 	return RedoRecPtr;
8456 }
8457 
8458 /*
8459  * Return information needed to decide whether a modified block needs a
8460  * full-page image to be included in the WAL record.
8461  *
8462  * The returned values are cached copies from backend-private memory, and
8463  * possibly out-of-date.  XLogInsertRecord will re-check them against
8464  * up-to-date values, while holding the WAL insert lock.
8465  */
8466 void
GetFullPageWriteInfo(XLogRecPtr * RedoRecPtr_p,bool * doPageWrites_p)8467 GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
8468 {
8469 	*RedoRecPtr_p = RedoRecPtr;
8470 	*doPageWrites_p = doPageWrites;
8471 }
8472 
8473 /*
8474  * GetInsertRecPtr -- Returns the current insert position.
8475  *
8476  * NOTE: The value *actually* returned is the position of the last full
8477  * xlog page. It lags behind the real insert position by at most 1 page.
8478  * For that, we don't need to scan through WAL insertion locks, and an
8479  * approximation is enough for the current usage of this function.
8480  */
8481 XLogRecPtr
GetInsertRecPtr(void)8482 GetInsertRecPtr(void)
8483 {
8484 	XLogRecPtr	recptr;
8485 
8486 	SpinLockAcquire(&XLogCtl->info_lck);
8487 	recptr = XLogCtl->LogwrtRqst.Write;
8488 	SpinLockRelease(&XLogCtl->info_lck);
8489 
8490 	return recptr;
8491 }
8492 
8493 /*
8494  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
8495  * position known to be fsync'd to disk.
8496  */
8497 XLogRecPtr
GetFlushRecPtr(void)8498 GetFlushRecPtr(void)
8499 {
8500 	SpinLockAcquire(&XLogCtl->info_lck);
8501 	LogwrtResult = XLogCtl->LogwrtResult;
8502 	SpinLockRelease(&XLogCtl->info_lck);
8503 
8504 	return LogwrtResult.Flush;
8505 }
8506 
8507 /*
8508  * GetLastImportantRecPtr -- Returns the LSN of the last important record
8509  * inserted. All records not explicitly marked as unimportant are considered
8510  * important.
8511  *
8512  * The LSN is determined by computing the maximum of
8513  * WALInsertLocks[i].lastImportantAt.
8514  */
8515 XLogRecPtr
GetLastImportantRecPtr(void)8516 GetLastImportantRecPtr(void)
8517 {
8518 	XLogRecPtr	res = InvalidXLogRecPtr;
8519 	int			i;
8520 
8521 	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
8522 	{
8523 		XLogRecPtr	last_important;
8524 
8525 		/*
8526 		 * Need to take a lock to prevent torn reads of the LSN, which are
8527 		 * possible on some of the supported platforms. WAL insert locks only
8528 		 * support exclusive mode, so we have to use that.
8529 		 */
8530 		LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
8531 		last_important = WALInsertLocks[i].l.lastImportantAt;
8532 		LWLockRelease(&WALInsertLocks[i].l.lock);
8533 
8534 		if (res < last_important)
8535 			res = last_important;
8536 	}
8537 
8538 	return res;
8539 }
8540 
8541 /*
8542  * Get the time and LSN of the last xlog segment switch
8543  */
8544 pg_time_t
GetLastSegSwitchData(XLogRecPtr * lastSwitchLSN)8545 GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
8546 {
8547 	pg_time_t	result;
8548 
8549 	/* Need WALWriteLock, but shared lock is sufficient */
8550 	LWLockAcquire(WALWriteLock, LW_SHARED);
8551 	result = XLogCtl->lastSegSwitchTime;
8552 	*lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
8553 	LWLockRelease(WALWriteLock);
8554 
8555 	return result;
8556 }
8557 
/*
 * This must be called ONCE during postmaster or standalone-backend shutdown.
 *
 * Stops WAL senders, then takes a shutdown restartpoint (if still in
 * recovery) or a shutdown checkpoint, and finally shuts down the SLRU
 * subsystems.  The ordering of these steps is deliberate; see the comments
 * below.
 */
void
ShutdownXLOG(int code, Datum arg)
{
	/*
	 * We should have an aux process resource owner to use, and we should not
	 * be in a transaction that's installed some other resowner.
	 */
	Assert(AuxProcessResourceOwner != NULL);
	Assert(CurrentResourceOwner == NULL ||
		   CurrentResourceOwner == AuxProcessResourceOwner);
	CurrentResourceOwner = AuxProcessResourceOwner;

	/* Don't be chatty in standalone mode */
	ereport(IsPostmasterEnvironment ? LOG : NOTICE,
			(errmsg("shutting down")));

	/*
	 * Signal walsenders to move to stopping state.
	 */
	WalSndInitStopping();

	/*
	 * Wait for WAL senders to be in stopping state.  This prevents commands
	 * from writing new WAL.  It must happen before the shutdown checkpoint
	 * below, so that no WAL appears after the checkpoint record.
	 */
	WalSndWaitStopping();

	if (RecoveryInProgress())
		CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	else
	{
		/*
		 * If archiving is enabled, rotate the last XLOG file so that all the
		 * remaining records are archived (postmaster wakes up the archiver
		 * process one more time at the end of shutdown). The checkpoint
		 * record will go to the next XLOG file and won't be archived (yet).
		 */
		if (XLogArchivingActive() && XLogArchiveCommandSet())
			RequestXLogSwitch(false);

		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	}

	/* SLRU shutdowns come after the checkpoint has flushed their state */
	ShutdownCLOG();
	ShutdownCommitTs();
	ShutdownSUBTRANS();
	ShutdownMultiXact();
}
8608 
/*
 * Log start of a checkpoint.
 *
 * 'flags' is the CHECKPOINT_* bitmask passed to the checkpoint machinery;
 * each set bit contributes one word to the log line.  'restartpoint'
 * selects the message prefix ("restartpoint" vs "checkpoint").
 */
static void
LogCheckpointStart(int flags, bool restartpoint)
{
	/* One %s per possible flag; unset flags contribute an empty string */
	elog(LOG, "%s starting:%s%s%s%s%s%s%s%s",
		 restartpoint ? "restartpoint" : "checkpoint",
		 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
		 (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
		 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
		 (flags & CHECKPOINT_FORCE) ? " force" : "",
		 (flags & CHECKPOINT_WAIT) ? " wait" : "",
		 (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
		 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
		 (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "");
}
8626 
/*
 * Log end of a checkpoint.
 *
 * Always accumulates the write/sync timings into the bgwriter statistics;
 * the detailed LOG message is emitted only when log_checkpoints is on.
 * 'restartpoint' selects the message prefix ("restartpoint" vs
 * "checkpoint").
 */
static void
LogCheckpointEnd(bool restartpoint)
{
	long		write_msecs,
				sync_msecs,
				total_msecs,
				longest_msecs,
				average_msecs;
	uint64		average_sync_time;

	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();

	write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
												  CheckpointStats.ckpt_sync_t);

	sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
												 CheckpointStats.ckpt_sync_end_t);

	/* Accumulate checkpoint timing summary data, in milliseconds. */
	BgWriterStats.m_checkpoint_write_time += write_msecs;
	BgWriterStats.m_checkpoint_sync_time += sync_msecs;

	/*
	 * All of the published timing statistics are accounted for.  Only
	 * continue if a log message is to be written.  (The stats accumulation
	 * above must happen regardless of log_checkpoints.)
	 */
	if (!log_checkpoints)
		return;

	total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
												  CheckpointStats.ckpt_end_t);

	/*
	 * Timing values returned from CheckpointStats are in microseconds.
	 * Convert to milliseconds for consistent printing, rounding up.
	 */
	longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);

	/* Average per-relation sync time; zero if nothing was synced */
	average_sync_time = 0;
	if (CheckpointStats.ckpt_sync_rels > 0)
		average_sync_time = CheckpointStats.ckpt_agg_sync_time /
			CheckpointStats.ckpt_sync_rels;
	average_msecs = (long) ((average_sync_time + 999) / 1000);

	elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
		 "%d WAL file(s) added, %d removed, %d recycled; "
		 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
		 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
		 "distance=%d kB, estimate=%d kB",
		 restartpoint ? "restartpoint" : "checkpoint",
		 CheckpointStats.ckpt_bufs_written,
		 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
		 CheckpointStats.ckpt_segs_added,
		 CheckpointStats.ckpt_segs_removed,
		 CheckpointStats.ckpt_segs_recycled,
		 write_msecs / 1000, (int) (write_msecs % 1000),
		 sync_msecs / 1000, (int) (sync_msecs % 1000),
		 total_msecs / 1000, (int) (total_msecs % 1000),
		 CheckpointStats.ckpt_sync_rels,
		 longest_msecs / 1000, (int) (longest_msecs % 1000),
		 average_msecs / 1000, (int) (average_msecs % 1000),
		 (int) (PrevCheckPointDistance / 1024.0),
		 (int) (CheckPointDistanceEstimate / 1024.0));
}
8694 
8695 /*
8696  * Update the estimate of distance between checkpoints.
8697  *
8698  * The estimate is used to calculate the number of WAL segments to keep
8699  * preallocated, see XLOGfileslop().
8700  */
8701 static void
UpdateCheckPointDistanceEstimate(uint64 nbytes)8702 UpdateCheckPointDistanceEstimate(uint64 nbytes)
8703 {
8704 	/*
8705 	 * To estimate the number of segments consumed between checkpoints, keep a
8706 	 * moving average of the amount of WAL generated in previous checkpoint
8707 	 * cycles. However, if the load is bursty, with quiet periods and busy
8708 	 * periods, we want to cater for the peak load. So instead of a plain
8709 	 * moving average, let the average decline slowly if the previous cycle
8710 	 * used less WAL than estimated, but bump it up immediately if it used
8711 	 * more.
8712 	 *
8713 	 * When checkpoints are triggered by max_wal_size, this should converge to
8714 	 * CheckpointSegments * wal_segment_size,
8715 	 *
8716 	 * Note: This doesn't pay any attention to what caused the checkpoint.
8717 	 * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
8718 	 * starting a base backup, are counted the same as those created
8719 	 * automatically. The slow-decline will largely mask them out, if they are
8720 	 * not frequent. If they are frequent, it seems reasonable to count them
8721 	 * in as any others; if you issue a manual checkpoint every 5 minutes and
8722 	 * never let a timed checkpoint happen, it makes sense to base the
8723 	 * preallocation on that 5 minute interval rather than whatever
8724 	 * checkpoint_timeout is set to.
8725 	 */
8726 	PrevCheckPointDistance = nbytes;
8727 	if (CheckPointDistanceEstimate < nbytes)
8728 		CheckPointDistanceEstimate = nbytes;
8729 	else
8730 		CheckPointDistanceEstimate =
8731 			(0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
8732 }
8733 
8734 /*
8735  * Perform a checkpoint --- either during shutdown, or on-the-fly
8736  *
8737  * flags is a bitwise OR of the following:
8738  *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
8739  *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
8740  *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
8741  *		ignoring checkpoint_completion_target parameter.
8742  *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
8743  *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
8744  *		CHECKPOINT_END_OF_RECOVERY).
8745  *	CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
8746  *
8747  * Note: flags contains other bits, of interest here only for logging purposes.
8748  * In particular note that this routine is synchronous and does not pay
8749  * attention to CHECKPOINT_WAIT.
8750  *
8751  * If !shutdown then we are writing an online checkpoint. This is a very special
8752  * kind of operation and WAL record because the checkpoint action occurs over
8753  * a period of time yet logically occurs at just a single LSN. The logical
8754  * position of the WAL record (redo ptr) is the same or earlier than the
8755  * physical position. When we replay WAL we locate the checkpoint via its
8756  * physical position then read the redo ptr and actually start replay at the
8757  * earlier logical position. Note that we don't write *anything* to WAL at
8758  * the logical position, so that location could be any other kind of WAL record.
8759  * All of this mechanism allows us to continue working while we checkpoint.
8760  * As a result, timing of actions is critical here and be careful to note that
8761  * this function will likely take minutes to execute on a busy system.
8762  */
void
CreateCheckPoint(int flags)
{
	bool		shutdown;		/* true for shutdown/end-of-recovery ckpt */
	CheckPoint	checkPoint;		/* the WAL record payload we will build */
	XLogRecPtr	recptr;			/* end LSN of the inserted checkpoint record */
	XLogSegNo	_logSegNo;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	uint32		freespace;
	XLogRecPtr	PriorRedoPtr;	/* previous checkpoint's redo pointer */
	XLogRecPtr	curInsert;
	XLogRecPtr	last_important_lsn;
	VirtualTransactionId *vxids;
	int			nvxids;

	/*
	 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
	 * issued at a different time.
	 */
	if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
		shutdown = true;
	else
		shutdown = false;

	/* sanity check */
	if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
		elog(ERROR, "can't create a checkpoint during recovery");

	/*
	 * Initialize InitXLogInsert working areas before entering the critical
	 * section.  Normally, this is done by the first call to
	 * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
	 * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
	 * done below in a critical section, and InitXLogInsert cannot be called
	 * in a critical section.
	 */
	InitXLogInsert();

	/*
	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
	 * (This is just pro forma, since in the present system structure there is
	 * only one process that is allowed to issue checkpoints at any given
	 * time.)
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

	/*
	 * Prepare to accumulate statistics.
	 *
	 * Note: because it is possible for log_checkpoints to change while a
	 * checkpoint proceeds, we always accumulate stats, even if
	 * log_checkpoints is currently off.
	 */
	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

	/*
	 * Use a critical section to force system panic if we have trouble.
	 */
	START_CRIT_SECTION();

	/* For a shutdown checkpoint, record in pg_control that we are on our way down. */
	if (shutdown)
	{
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
		ControlFile->state = DB_SHUTDOWNING;
		ControlFile->time = (pg_time_t) time(NULL);
		UpdateControlFile();
		LWLockRelease(ControlFileLock);
	}

	/*
	 * Let smgr prepare for checkpoint; this has to happen before we determine
	 * the REDO pointer.  Note that smgr must not do anything that'd have to
	 * be undone if we decide no checkpoint is needed.
	 */
	SyncPreCheckpoint();

	/* Begin filling in the checkpoint WAL record */
	MemSet(&checkPoint, 0, sizeof(checkPoint));
	checkPoint.time = (pg_time_t) time(NULL);

	/*
	 * For Hot Standby, derive the oldestActiveXid before we fix the redo
	 * pointer. This allows us to begin accumulating changes to assemble our
	 * starting snapshot of locks and transactions.
	 */
	if (!shutdown && XLogStandbyInfoActive())
		checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
	else
		checkPoint.oldestActiveXid = InvalidTransactionId;

	/*
	 * Get location of last important record before acquiring insert locks (as
	 * GetLastImportantRecPtr() also locks WAL locks).
	 */
	last_important_lsn = GetLastImportantRecPtr();

	/*
	 * We must block concurrent insertions while examining insert state to
	 * determine the checkpoint REDO pointer.
	 */
	WALInsertLockAcquireExclusive();
	curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);

	/*
	 * If this isn't a shutdown or forced checkpoint, and if there has been no
	 * WAL activity requiring a checkpoint, skip it.  The idea here is to
	 * avoid inserting duplicate checkpoints when the system is idle.
	 */
	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
				  CHECKPOINT_FORCE)) == 0)
	{
		if (last_important_lsn == ControlFile->checkPoint)
		{
			/* release locks in the reverse of acquisition order, then bail */
			WALInsertLockRelease();
			LWLockRelease(CheckpointLock);
			END_CRIT_SECTION();
			ereport(DEBUG1,
					(errmsg("checkpoint skipped because system is idle")));
			return;
		}
	}

	/*
	 * An end-of-recovery checkpoint is created before anyone is allowed to
	 * write WAL. To allow us to write the checkpoint record, temporarily
	 * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
	 * initialized, which we need here and in AdvanceXLInsertBuffer.)
	 */
	if (flags & CHECKPOINT_END_OF_RECOVERY)
		LocalSetXLogInsertAllowed();

	checkPoint.ThisTimeLineID = ThisTimeLineID;
	if (flags & CHECKPOINT_END_OF_RECOVERY)
		checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
	else
		checkPoint.PrevTimeLineID = ThisTimeLineID;

	checkPoint.fullPageWrites = Insert->fullPageWrites;

	/*
	 * Compute new REDO record ptr = location of next XLOG record.
	 *
	 * NB: this is NOT necessarily where the checkpoint record itself will be,
	 * since other backends may insert more XLOG records while we're off doing
	 * the buffer flush work.  Those XLOG records are logically after the
	 * checkpoint, even though physically before it.  Got that?
	 */
	freespace = INSERT_FREESPACE(curInsert);
	if (freespace == 0)
	{
		/* skip over the next page header when the page is exactly full */
		if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
			curInsert += SizeOfXLogLongPHD;
		else
			curInsert += SizeOfXLogShortPHD;
	}
	checkPoint.redo = curInsert;

	/*
	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
	 * must be done while holding all the insertion locks.
	 *
	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
	 * pointing past where it really needs to point.  This is okay; the only
	 * consequence is that XLogInsert might back up whole buffers that it
	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
	 * XLogInserts that happen while we are dumping buffers must assume that
	 * their buffer changes are not included in the checkpoint.
	 */
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;

	/*
	 * Now we can release the WAL insertion locks, allowing other xacts to
	 * proceed while we are flushing disk buffers.
	 */
	WALInsertLockRelease();

	/* Update the info_lck-protected copy of RedoRecPtr as well */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->RedoRecPtr = checkPoint.redo;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * If enabled, log checkpoint start.  We postpone this until now so as not
	 * to log anything if we decided to skip the checkpoint.
	 */
	if (log_checkpoints)
		LogCheckpointStart(flags, false);

	/* tracepoint for external monitoring tools (see pg_trace.h) */
	TRACE_POSTGRESQL_CHECKPOINT_START(flags);

	/*
	 * Get the other info we need for the checkpoint record.
	 *
	 * We don't need to save oldestClogXid in the checkpoint, it only matters
	 * for the short period in which clog is being truncated, and if we crash
	 * during that we'll redo the clog truncation and fix up oldestClogXid
	 * there.
	 */
	LWLockAcquire(XidGenLock, LW_SHARED);
	checkPoint.nextFullXid = ShmemVariableCache->nextFullXid;
	checkPoint.oldestXid = ShmemVariableCache->oldestXid;
	checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
	LWLockRelease(XidGenLock);

	LWLockAcquire(CommitTsLock, LW_SHARED);
	checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
	checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
	LWLockRelease(CommitTsLock);

	LWLockAcquire(OidGenLock, LW_SHARED);
	checkPoint.nextOid = ShmemVariableCache->nextOid;
	if (!shutdown)
		checkPoint.nextOid += ShmemVariableCache->oidCount;
	LWLockRelease(OidGenLock);

	/* Collect multixact state for the record as well. */
	MultiXactGetCheckptMulti(shutdown,
							 &checkPoint.nextMulti,
							 &checkPoint.nextMultiOffset,
							 &checkPoint.oldestMulti,
							 &checkPoint.oldestMultiDB);

	/*
	 * Having constructed the checkpoint record, ensure all shmem disk buffers
	 * and commit-log buffers are flushed to disk.
	 *
	 * This I/O could fail for various reasons.  If so, we will fail to
	 * complete the checkpoint, but there is no reason to force a system
	 * panic. Accordingly, exit critical section while doing it.
	 */
	END_CRIT_SECTION();

	/*
	 * In some cases there are groups of actions that must all occur on one
	 * side or the other of a checkpoint record. Before flushing the
	 * checkpoint record we must explicitly wait for any backend currently
	 * performing those groups of actions.
	 *
	 * One example is end of transaction, so we must wait for any transactions
	 * that are currently in commit critical sections.  If an xact inserted
	 * its commit record into XLOG just before the REDO point, then a crash
	 * restart from the REDO point would not replay that record, which means
	 * that our flushing had better include the xact's update of pg_xact.  So
	 * we wait till he's out of his commit critical section before proceeding.
	 * See notes in RecordTransactionCommit().
	 *
	 * Because we've already released the insertion locks, this test is a bit
	 * fuzzy: it is possible that we will wait for xacts we didn't really need
	 * to wait for.  But the delay should be short and it seems better to make
	 * checkpoint take a bit longer than to hold off insertions longer than
	 * necessary. (In fact, the whole reason we have this issue is that xact.c
	 * does commit record XLOG insertion and clog update as two separate steps
	 * protected by different locks, but again that seems best on grounds of
	 * minimizing lock contention.)
	 *
	 * A transaction that has not yet set delayChkpt when we look cannot be at
	 * risk, since he's not inserted his commit record yet; and one that's
	 * already cleared it is not at risk either, since he's done fixing clog
	 * and we will correctly flush the update below.  So we cannot miss any
	 * xacts we need to wait for.
	 */
	vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
	if (nvxids > 0)
	{
		do
		{
			pg_usleep(10000L);	/* wait for 10 msec */
		} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
	}
	pfree(vxids);				/* done with the delaying-xact list */

	/* Flush dirty buffers and related shared state to disk (the bulk of the work). */
	CheckPointGuts(checkPoint.redo, flags);

	/*
	 * Take a snapshot of running transactions and write this to WAL. This
	 * allows us to reconstruct the state of running transactions during
	 * archive recovery, if required. Skip, if this info disabled.
	 *
	 * If we are shutting down, or Startup process is completing crash
	 * recovery we don't need to write running xact data.
	 */
	if (!shutdown && XLogStandbyInfoActive())
		LogStandbySnapshot();

	START_CRIT_SECTION();

	/*
	 * Now insert the checkpoint record into XLOG.
	 */
	XLogBeginInsert();
	XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
	recptr = XLogInsert(RM_XLOG_ID,
						shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
						XLOG_CHECKPOINT_ONLINE);

	XLogFlush(recptr);

	/*
	 * We mustn't write any new WAL after a shutdown checkpoint, or it will be
	 * overwritten at next startup.  No-one should even try, this just allows
	 * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
	 * to just temporarily disable writing until the system has exited
	 * recovery.
	 */
	if (shutdown)
	{
		if (flags & CHECKPOINT_END_OF_RECOVERY)
			LocalXLogInsertAllowed = -1;	/* return to "check" state */
		else
			LocalXLogInsertAllowed = 0; /* never again write WAL */
	}

	/*
	 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
	 * = end of actual checkpoint record.
	 */
	if (shutdown && checkPoint.redo != ProcLastRecPtr)
		ereport(PANIC,
				(errmsg("concurrent write-ahead log activity while database system is shutting down")));

	/*
	 * Remember the prior checkpoint's redo ptr for
	 * UpdateCheckPointDistanceEstimate()
	 */
	PriorRedoPtr = ControlFile->checkPointCopy.redo;

	/*
	 * Update the control file.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	if (shutdown)
		ControlFile->state = DB_SHUTDOWNED;
	ControlFile->checkPoint = ProcLastRecPtr;
	ControlFile->checkPointCopy = checkPoint;
	ControlFile->time = (pg_time_t) time(NULL);
	/* crash recovery should always recover to the end of WAL */
	ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
	ControlFile->minRecoveryPointTLI = 0;

	/*
	 * Persist unloggedLSN value. It's reset on crash recovery, so this goes
	 * unused on non-shutdown checkpoints, but seems useful to store it always
	 * for debugging purposes.
	 */
	SpinLockAcquire(&XLogCtl->ulsn_lck);
	ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
	SpinLockRelease(&XLogCtl->ulsn_lck);

	UpdateControlFile();
	LWLockRelease(ControlFileLock);

	/* Update shared-memory copy of checkpoint XID/epoch */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->ckptFullXid = checkPoint.nextFullXid;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * We are now done with critical updates; no need for system panic if we
	 * have trouble while fooling with old log segments.
	 */
	END_CRIT_SECTION();

	/*
	 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
	 */
	SyncPostCheckpoint();

	/*
	 * Update the average distance between checkpoints if the prior checkpoint
	 * exists.
	 */
	if (PriorRedoPtr != InvalidXLogRecPtr)
		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);

	/*
	 * Delete old log files, those no longer needed for last checkpoint to
	 * prevent the disk holding the xlog from growing full.
	 */
	XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
	KeepLogSeg(recptr, &_logSegNo);
	if (InvalidateObsoleteReplicationSlots(_logSegNo))
	{
		/*
		 * Some slots have been invalidated; recalculate the old-segment
		 * horizon, starting again from RedoRecPtr.
		 */
		XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
		KeepLogSeg(recptr, &_logSegNo);
	}
	_logSegNo--;
	RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);

	/*
	 * Make more log segments if needed.  (Do this after recycling old log
	 * segments, since that may supply some of the needed files.)
	 */
	if (!shutdown)
		PreallocXlogFiles(recptr);

	/*
	 * Truncate pg_subtrans if possible.  We can throw away all data before
	 * the oldest XMIN of any running transaction.  No future transaction will
	 * attempt to reference any pg_subtrans entry older than that (see Asserts
	 * in subtrans.c).  During recovery, though, we mustn't do this because
	 * StartupSUBTRANS hasn't been called yet.
	 */
	if (!RecoveryInProgress())
		TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));

	/* Real work is done, but log and update stats before releasing lock. */
	LogCheckpointEnd(false);

	/* tracepoint for external monitoring tools (see pg_trace.h) */
	TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
									 NBuffers,
									 CheckpointStats.ckpt_segs_added,
									 CheckpointStats.ckpt_segs_removed,
									 CheckpointStats.ckpt_segs_recycled);

	LWLockRelease(CheckpointLock);
}
9183 
9184 /*
9185  * Mark the end of recovery in WAL though without running a full checkpoint.
9186  * We can expect that a restartpoint is likely to be in progress as we
9187  * do this, though we are unwilling to wait for it to complete. So be
9188  * careful to avoid taking the CheckpointLock anywhere here.
9189  *
9190  * CreateRestartPoint() allows for the case where recovery may end before
9191  * the restartpoint completes so there is no concern of concurrent behaviour.
9192  */
static void
CreateEndOfRecoveryRecord(void)
{
	xl_end_of_recovery xlrec;	/* record payload: end time + TLI switch */
	XLogRecPtr	recptr;			/* end LSN of the inserted record */

	/* sanity check */
	if (!RecoveryInProgress())
		elog(ERROR, "can only be used to end recovery");

	xlrec.end_time = GetCurrentTimestamp();

	/* Capture the timeline-switch info while holding all WAL insert locks. */
	WALInsertLockAcquireExclusive();
	xlrec.ThisTimeLineID = ThisTimeLineID;
	xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
	WALInsertLockRelease();

	/* Permit this process to write WAL even though recovery isn't over yet. */
	LocalSetXLogInsertAllowed();

	/* Use a critical section so that any failure here forces a panic. */
	START_CRIT_SECTION();

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
	recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);

	/* Make sure the record is durably on disk before updating pg_control. */
	XLogFlush(recptr);

	/*
	 * Update the control file so that crash recovery can follow the timeline
	 * changes to this point.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	ControlFile->time = (pg_time_t) time(NULL);
	ControlFile->minRecoveryPoint = recptr;
	ControlFile->minRecoveryPointTLI = ThisTimeLineID;
	UpdateControlFile();
	LWLockRelease(ControlFileLock);

	END_CRIT_SECTION();

	LocalXLogInsertAllowed = -1;	/* return to "check" state */
}
9235 
9236 /*
9237  * Write an OVERWRITE_CONTRECORD message.
9238  *
9239  * When on WAL replay we expect a continuation record at the start of a page
9240  * that is not there, recovery ends and WAL writing resumes at that point.
9241  * But it's wrong to resume writing new WAL back at the start of the record
9242  * that was broken, because downstream consumers of that WAL (physical
9243  * replicas) are not prepared to "rewind".  So the first action after
9244  * finishing replay of all valid WAL must be to write a record of this type
9245  * at the point where the contrecord was missing; to support xlogreader
9246  * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
9247  * to the page header where the record occurs.  xlogreader has an ad-hoc
9248  * mechanism to report metadata about the broken record, which is what we
9249  * use here.
9250  *
9251  * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
9252  * skip the record it was reading, and pass back the LSN of the skipped
9253  * record, so that its caller can verify (on "replay" of that record) that the
9254  * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
9255  */
static XLogRecPtr
CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn)
{
	xl_overwrite_contrecord xlrec;	/* payload: LSN being overwritten + time */
	XLogRecPtr	recptr;				/* end LSN of the inserted record */

	/* sanity check */
	if (!RecoveryInProgress())
		elog(ERROR, "can only be used at end of recovery");

	xlrec.overwritten_lsn = aborted_lsn;
	xlrec.overwrite_time = GetCurrentTimestamp();

	/* Use a critical section so that any failure here forces a panic. */
	START_CRIT_SECTION();

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord));

	recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);

	/* Make the record durable before WAL writing resumes past this point. */
	XLogFlush(recptr);

	END_CRIT_SECTION();

	/* Return the end LSN so the caller knows where the record landed. */
	return recptr;
}
9282 
9283 /*
9284  * Flush all data in shared memory to disk, and fsync
9285  *
9286  * This is the common code shared between regular checkpoints and
9287  * recovery restartpoints.
9288  */
static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
	/*
	 * Flush the various shared-memory data structures to disk.  The order
	 * here matters only where noted below.
	 */
	CheckPointCLOG();
	CheckPointCommitTs();
	CheckPointSUBTRANS();
	CheckPointMultiXact();
	CheckPointPredicate();
	CheckPointRelationMap();
	CheckPointReplicationSlots();
	CheckPointSnapBuild();
	CheckPointLogicalRewriteHeap();
	CheckPointBuffers(flags);	/* performs all required fsyncs */
	CheckPointReplicationOrigin();
	/* We deliberately delay 2PC checkpointing as long as possible */
	CheckPointTwoPhase(checkPointRedo);
}
9306 
9307 /*
9308  * Save a checkpoint for recovery restart if appropriate
9309  *
9310  * This function is called each time a checkpoint record is read from XLOG.
9311  * It must determine whether the checkpoint represents a safe restartpoint or
9312  * not.  If so, the checkpoint record is stashed in shared memory so that
9313  * CreateRestartPoint can consult it.  (Note that the latter function is
9314  * executed by the checkpointer, while this one will be executed by the
9315  * startup process.)
9316  */
static void
RecoveryRestartPoint(const CheckPoint *checkPoint)
{
	/*
	 * Refrain from creating a restartpoint if we have seen any references to
	 * non-existent pages. Restarting recovery from the restartpoint would
	 * not see the references, so we would lose the cross-check that the
	 * pages belonged to a relation that was dropped later.
	 */
	if (XLogHaveInvalidPages())
	{
		elog(trace_recovery(DEBUG2),
			 "could not record restart point at %X/%X because there "
			 "are unresolved references to invalid pages",
			 (uint32) (checkPoint->redo >> 32),
			 (uint32) checkPoint->redo);
		return;
	}

	/*
	 * Copy the checkpoint record to shared memory, so that checkpointer can
	 * work out the next time it wants to perform a restartpoint.  The three
	 * fields are updated together under info_lck so readers see a consistent
	 * snapshot.
	 */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
	XLogCtl->lastCheckPointEndPtr = EndRecPtr;
	XLogCtl->lastCheckPoint = *checkPoint;
	SpinLockRelease(&XLogCtl->info_lck);
}
9347 
9348 /*
9349  * Establish a restartpoint if possible.
9350  *
9351  * This is similar to CreateCheckPoint, but is used during WAL recovery
9352  * to establish a point from which recovery can roll forward without
9353  * replaying the entire recovery log.
9354  *
9355  * Returns true if a new restartpoint was established. We can only establish
9356  * a restartpoint if we have replayed a safe checkpoint record since last
9357  * restartpoint.
9358  */
9359 bool
CreateRestartPoint(int flags)9360 CreateRestartPoint(int flags)
9361 {
9362 	XLogRecPtr	lastCheckPointRecPtr;
9363 	XLogRecPtr	lastCheckPointEndPtr;
9364 	CheckPoint	lastCheckPoint;
9365 	XLogRecPtr	PriorRedoPtr;
9366 	XLogRecPtr	receivePtr;
9367 	XLogRecPtr	replayPtr;
9368 	TimeLineID	replayTLI;
9369 	XLogRecPtr	endptr;
9370 	XLogSegNo	_logSegNo;
9371 	TimestampTz xtime;
9372 
9373 	/*
9374 	 * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
9375 	 * happens at a time.
9376 	 */
9377 	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
9378 
9379 	/* Get a local copy of the last safe checkpoint record. */
9380 	SpinLockAcquire(&XLogCtl->info_lck);
9381 	lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
9382 	lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
9383 	lastCheckPoint = XLogCtl->lastCheckPoint;
9384 	SpinLockRelease(&XLogCtl->info_lck);
9385 
9386 	/*
9387 	 * Check that we're still in recovery mode. It's ok if we exit recovery
9388 	 * mode after this check, the restart point is valid anyway.
9389 	 */
9390 	if (!RecoveryInProgress())
9391 	{
9392 		ereport(DEBUG2,
9393 				(errmsg("skipping restartpoint, recovery has already ended")));
9394 		LWLockRelease(CheckpointLock);
9395 		return false;
9396 	}
9397 
9398 	/*
9399 	 * If the last checkpoint record we've replayed is already our last
9400 	 * restartpoint, we can't perform a new restart point. We still update
9401 	 * minRecoveryPoint in that case, so that if this is a shutdown restart
9402 	 * point, we won't start up earlier than before. That's not strictly
9403 	 * necessary, but when hot standby is enabled, it would be rather weird if
9404 	 * the database opened up for read-only connections at a point-in-time
9405 	 * before the last shutdown. Such time travel is still possible in case of
9406 	 * immediate shutdown, though.
9407 	 *
9408 	 * We don't explicitly advance minRecoveryPoint when we do create a
9409 	 * restartpoint. It's assumed that flushing the buffers will do that as a
9410 	 * side-effect.
9411 	 */
9412 	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
9413 		lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
9414 	{
9415 		ereport(DEBUG2,
9416 				(errmsg("skipping restartpoint, already performed at %X/%X",
9417 						(uint32) (lastCheckPoint.redo >> 32),
9418 						(uint32) lastCheckPoint.redo)));
9419 
9420 		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
9421 		if (flags & CHECKPOINT_IS_SHUTDOWN)
9422 		{
9423 			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9424 			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
9425 			ControlFile->time = (pg_time_t) time(NULL);
9426 			UpdateControlFile();
9427 			LWLockRelease(ControlFileLock);
9428 		}
9429 		LWLockRelease(CheckpointLock);
9430 		return false;
9431 	}
9432 
9433 	/*
9434 	 * Update the shared RedoRecPtr so that the startup process can calculate
9435 	 * the number of segments replayed since last restartpoint, and request a
9436 	 * restartpoint if it exceeds CheckPointSegments.
9437 	 *
9438 	 * Like in CreateCheckPoint(), hold off insertions to update it, although
9439 	 * during recovery this is just pro forma, because no WAL insertions are
9440 	 * happening.
9441 	 */
9442 	WALInsertLockAcquireExclusive();
9443 	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
9444 	WALInsertLockRelease();
9445 
9446 	/* Also update the info_lck-protected copy */
9447 	SpinLockAcquire(&XLogCtl->info_lck);
9448 	XLogCtl->RedoRecPtr = lastCheckPoint.redo;
9449 	SpinLockRelease(&XLogCtl->info_lck);
9450 
9451 	/*
9452 	 * Prepare to accumulate statistics.
9453 	 *
9454 	 * Note: because it is possible for log_checkpoints to change while a
9455 	 * checkpoint proceeds, we always accumulate stats, even if
9456 	 * log_checkpoints is currently off.
9457 	 */
9458 	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
9459 	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
9460 
9461 	if (log_checkpoints)
9462 		LogCheckpointStart(flags, true);
9463 
9464 	CheckPointGuts(lastCheckPoint.redo, flags);
9465 
9466 	/*
9467 	 * Remember the prior checkpoint's redo ptr for
9468 	 * UpdateCheckPointDistanceEstimate()
9469 	 */
9470 	PriorRedoPtr = ControlFile->checkPointCopy.redo;
9471 
9472 	/*
9473 	 * Update pg_control, using current time.  Check that it still shows
9474 	 * DB_IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
9475 	 * this is a quick hack to make sure nothing really bad happens if somehow
9476 	 * we get here after the end-of-recovery checkpoint.
9477 	 */
9478 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9479 	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
9480 		ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
9481 	{
9482 		ControlFile->checkPoint = lastCheckPointRecPtr;
9483 		ControlFile->checkPointCopy = lastCheckPoint;
9484 		ControlFile->time = (pg_time_t) time(NULL);
9485 
9486 		/*
9487 		 * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
9488 		 * this will have happened already while writing out dirty buffers,
9489 		 * but not necessarily - e.g. because no buffers were dirtied.  We do
9490 		 * this because a non-exclusive base backup uses minRecoveryPoint to
9491 		 * determine which WAL files must be included in the backup, and the
9492 		 * file (or files) containing the checkpoint record must be included,
9493 		 * at a minimum. Note that for an ordinary restart of recovery there's
9494 		 * no value in having the minimum recovery point any earlier than this
9495 		 * anyway, because redo will begin just after the checkpoint record.
9496 		 */
9497 		if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
9498 		{
9499 			ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
9500 			ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
9501 
9502 			/* update local copy */
9503 			minRecoveryPoint = ControlFile->minRecoveryPoint;
9504 			minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9505 		}
9506 		if (flags & CHECKPOINT_IS_SHUTDOWN)
9507 			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
9508 		UpdateControlFile();
9509 	}
9510 	LWLockRelease(ControlFileLock);
9511 
9512 	/*
9513 	 * Update the average distance between checkpoints/restartpoints if the
9514 	 * prior checkpoint exists.
9515 	 */
9516 	if (PriorRedoPtr != InvalidXLogRecPtr)
9517 		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
9518 
9519 	/*
9520 	 * Delete old log files, those no longer needed for last restartpoint to
9521 	 * prevent the disk holding the xlog from growing full.
9522 	 */
9523 	XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9524 
9525 	/*
9526 	 * Retreat _logSegNo using the current end of xlog replayed or received,
9527 	 * whichever is later.
9528 	 */
9529 	receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
9530 	replayPtr = GetXLogReplayRecPtr(&replayTLI);
9531 	endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
9532 	KeepLogSeg(endptr, &_logSegNo);
9533 	if (InvalidateObsoleteReplicationSlots(_logSegNo))
9534 	{
9535 		/*
9536 		 * Some slots have been invalidated; recalculate the old-segment
9537 		 * horizon, starting again from RedoRecPtr.
9538 		 */
9539 		XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9540 		KeepLogSeg(endptr, &_logSegNo);
9541 	}
9542 	_logSegNo--;
9543 
9544 	/*
9545 	 * Try to recycle segments on a useful timeline. If we've been promoted
9546 	 * since the beginning of this restartpoint, use the new timeline chosen
9547 	 * at end of recovery (RecoveryInProgress() sets ThisTimeLineID in that
9548 	 * case). If we're still in recovery, use the timeline we're currently
9549 	 * replaying.
9550 	 *
9551 	 * There is no guarantee that the WAL segments will be useful on the
9552 	 * current timeline; if recovery proceeds to a new timeline right after
9553 	 * this, the pre-allocated WAL segments on this timeline will not be used,
9554 	 * and will go wasted until recycled on the next restartpoint. We'll live
9555 	 * with that.
9556 	 */
9557 	if (RecoveryInProgress())
9558 		ThisTimeLineID = replayTLI;
9559 
9560 	RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr);
9561 
9562 	/*
9563 	 * Make more log segments if needed.  (Do this after recycling old log
9564 	 * segments, since that may supply some of the needed files.)
9565 	 */
9566 	PreallocXlogFiles(endptr);
9567 
9568 	/*
9569 	 * ThisTimeLineID is normally not set when we're still in recovery.
9570 	 * However, recycling/preallocating segments above needed ThisTimeLineID
9571 	 * to determine which timeline to install the segments on. Reset it now,
9572 	 * to restore the normal state of affairs for debugging purposes.
9573 	 */
9574 	if (RecoveryInProgress())
9575 		ThisTimeLineID = 0;
9576 
9577 	/*
9578 	 * Truncate pg_subtrans if possible.  We can throw away all data before
9579 	 * the oldest XMIN of any running transaction.  No future transaction will
9580 	 * attempt to reference any pg_subtrans entry older than that (see Asserts
9581 	 * in subtrans.c).  When hot standby is disabled, though, we mustn't do
9582 	 * this because StartupSUBTRANS hasn't been called yet.
9583 	 */
9584 	if (EnableHotStandby)
9585 		TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
9586 
9587 	/* Real work is done, but log and update before releasing lock. */
9588 	LogCheckpointEnd(true);
9589 
9590 	xtime = GetLatestXTime();
9591 	ereport((log_checkpoints ? LOG : DEBUG2),
9592 			(errmsg("recovery restart point at %X/%X",
9593 					(uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
9594 			 xtime ? errdetail("Last completed transaction was at log time %s.",
9595 							   timestamptz_to_str(xtime)) : 0));
9596 
9597 	LWLockRelease(CheckpointLock);
9598 
9599 	/*
9600 	 * Finally, execute archive_cleanup_command, if any.
9601 	 */
9602 	if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
9603 		ExecuteRecoveryCommand(archiveCleanupCommand,
9604 							   "archive_cleanup_command",
9605 							   false);
9606 
9607 	return true;
9608 }
9609 
9610 /*
9611  * Report availability of WAL for the given target LSN
9612  *		(typically a slot's restart_lsn)
9613  *
9614  * Returns one of the following enum values:
9615  *
9616  * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
9617  *   max_wal_size.
9618  *
9619  * * WALAVAIL_EXTENDED means it is still available by preserving extra
9620  *   segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
9621  *   than max_wal_size, this state is not returned.
9622  *
9623  * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
9624  *   remove reserved segments. The walsender using this slot may return to the
9625  *   above.
9626  *
9627  * * WALAVAIL_REMOVED means it has been removed. A replication stream on
9628  *   a slot with this LSN cannot continue after a restart.
9629  *
9630  * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
9631  */
9632 WALAvailability
GetWALAvailability(XLogRecPtr targetLSN)9633 GetWALAvailability(XLogRecPtr targetLSN)
9634 {
9635 	XLogRecPtr	currpos;		/* current write LSN */
9636 	XLogSegNo	currSeg;		/* segid of currpos */
9637 	XLogSegNo	targetSeg;		/* segid of targetLSN */
9638 	XLogSegNo	oldestSeg;		/* actual oldest segid */
9639 	XLogSegNo	oldestSegMaxWalSize;	/* oldest segid kept by max_wal_size */
9640 	XLogSegNo	oldestSlotSeg;	/* oldest segid kept by slot */
9641 	uint64		keepSegs;
9642 
9643 	/*
9644 	 * slot does not reserve WAL. Either deactivated, or has never been active
9645 	 */
9646 	if (XLogRecPtrIsInvalid(targetLSN))
9647 		return WALAVAIL_INVALID_LSN;
9648 
9649 	/*
9650 	 * Calculate the oldest segment currently reserved by all slots,
9651 	 * considering wal_keep_size and max_slot_wal_keep_size.  Initialize
9652 	 * oldestSlotSeg to the current segment.
9653 	 */
9654 	currpos = GetXLogWriteRecPtr();
9655 	XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
9656 	KeepLogSeg(currpos, &oldestSlotSeg);
9657 
9658 	/*
9659 	 * Find the oldest extant segment file. We get 1 until checkpoint removes
9660 	 * the first WAL segment file since startup, which causes the status being
9661 	 * wrong under certain abnormal conditions but that doesn't actually harm.
9662 	 */
9663 	oldestSeg = XLogGetLastRemovedSegno() + 1;
9664 
9665 	/* calculate oldest segment by max_wal_size */
9666 	XLByteToSeg(currpos, currSeg, wal_segment_size);
9667 	keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
9668 
9669 	if (currSeg > keepSegs)
9670 		oldestSegMaxWalSize = currSeg - keepSegs;
9671 	else
9672 		oldestSegMaxWalSize = 1;
9673 
9674 	/* the segment we care about */
9675 	XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
9676 
9677 	/*
9678 	 * No point in returning reserved or extended status values if the
9679 	 * targetSeg is known to be lost.
9680 	 */
9681 	if (targetSeg >= oldestSlotSeg)
9682 	{
9683 		/* show "reserved" when targetSeg is within max_wal_size */
9684 		if (targetSeg >= oldestSegMaxWalSize)
9685 			return WALAVAIL_RESERVED;
9686 
9687 		/* being retained by slots exceeding max_wal_size */
9688 		return WALAVAIL_EXTENDED;
9689 	}
9690 
9691 	/* WAL segments are no longer retained but haven't been removed yet */
9692 	if (targetSeg >= oldestSeg)
9693 		return WALAVAIL_UNRESERVED;
9694 
9695 	/* Definitely lost */
9696 	return WALAVAIL_REMOVED;
9697 }
9698 
9699 
9700 /*
9701  * Retreat *logSegNo to the last segment that we need to retain because of
9702  * either wal_keep_size or replication slots.
9703  *
9704  * This is calculated by subtracting wal_keep_size from the given xlog
9705  * location, recptr and by making sure that that result is below the
9706  * requirement of replication slots.  For the latter criterion we do consider
9707  * the effects of max_slot_wal_keep_size: reserve at most that much space back
9708  * from recptr.
9709  *
9710  * Note about replication slots: if this function calculates a value
9711  * that's further ahead than what slots need reserved, then affected
9712  * slots need to be invalidated and this function invoked again.
9713  * XXX it might be a good idea to rewrite this function so that
9714  * invalidation is optionally done here, instead.
9715  */
9716 static void
KeepLogSeg(XLogRecPtr recptr,XLogSegNo * logSegNo)9717 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
9718 {
9719 	XLogSegNo	currSegNo;
9720 	XLogSegNo	segno;
9721 	XLogRecPtr	keep;
9722 
9723 	XLByteToSeg(recptr, currSegNo, wal_segment_size);
9724 	segno = currSegNo;
9725 
9726 	/*
9727 	 * Calculate how many segments are kept by slots first, adjusting for
9728 	 * max_slot_wal_keep_size.
9729 	 */
9730 	keep = XLogGetReplicationSlotMinimumLSN();
9731 	if (keep != InvalidXLogRecPtr)
9732 	{
9733 		XLByteToSeg(keep, segno, wal_segment_size);
9734 
9735 		/* Cap by max_slot_wal_keep_size ... */
9736 		if (max_slot_wal_keep_size_mb >= 0)
9737 		{
9738 			uint64		slot_keep_segs;
9739 
9740 			slot_keep_segs =
9741 				ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
9742 
9743 			if (currSegNo - segno > slot_keep_segs)
9744 				segno = currSegNo - slot_keep_segs;
9745 		}
9746 	}
9747 
9748 	/* but, keep at least wal_keep_size if that's set */
9749 	if (wal_keep_size_mb > 0)
9750 	{
9751 		uint64		keep_segs;
9752 
9753 		keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
9754 		if (currSegNo - segno < keep_segs)
9755 		{
9756 			/* avoid underflow, don't go below 1 */
9757 			if (currSegNo <= keep_segs)
9758 				segno = 1;
9759 			else
9760 				segno = currSegNo - keep_segs;
9761 		}
9762 	}
9763 
9764 	/* don't delete WAL segments newer than the calculated segment */
9765 	if (segno < *logSegNo)
9766 		*logSegNo = segno;
9767 }
9768 
9769 /*
9770  * Write a NEXTOID log record
9771  */
9772 void
XLogPutNextOid(Oid nextOid)9773 XLogPutNextOid(Oid nextOid)
9774 {
9775 	XLogBeginInsert();
9776 	XLogRegisterData((char *) (&nextOid), sizeof(Oid));
9777 	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
9778 
9779 	/*
9780 	 * We need not flush the NEXTOID record immediately, because any of the
9781 	 * just-allocated OIDs could only reach disk as part of a tuple insert or
9782 	 * update that would have its own XLOG record that must follow the NEXTOID
9783 	 * record.  Therefore, the standard buffer LSN interlock applied to those
9784 	 * records will ensure no such OID reaches disk before the NEXTOID record
9785 	 * does.
9786 	 *
9787 	 * Note, however, that the above statement only covers state "within" the
9788 	 * database.  When we use a generated OID as a file or directory name, we
9789 	 * are in a sense violating the basic WAL rule, because that filesystem
9790 	 * change may reach disk before the NEXTOID WAL record does.  The impact
9791 	 * of this is that if a database crash occurs immediately afterward, we
9792 	 * might after restart re-generate the same OID and find that it conflicts
9793 	 * with the leftover file or directory.  But since for safety's sake we
9794 	 * always loop until finding a nonconflicting filename, this poses no real
9795 	 * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
9796 	 */
9797 }
9798 
9799 /*
9800  * Write an XLOG SWITCH record.
9801  *
9802  * Here we just blindly issue an XLogInsert request for the record.
9803  * All the magic happens inside XLogInsert.
9804  *
9805  * The return value is either the end+1 address of the switch record,
9806  * or the end+1 address of the prior segment if we did not need to
9807  * write a switch record because we are already at segment start.
9808  */
9809 XLogRecPtr
RequestXLogSwitch(bool mark_unimportant)9810 RequestXLogSwitch(bool mark_unimportant)
9811 {
9812 	XLogRecPtr	RecPtr;
9813 
9814 	/* XLOG SWITCH has no data */
9815 	XLogBeginInsert();
9816 
9817 	if (mark_unimportant)
9818 		XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
9819 	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
9820 
9821 	return RecPtr;
9822 }
9823 
9824 /*
9825  * Write a RESTORE POINT record
9826  */
9827 XLogRecPtr
XLogRestorePoint(const char * rpName)9828 XLogRestorePoint(const char *rpName)
9829 {
9830 	XLogRecPtr	RecPtr;
9831 	xl_restore_point xlrec;
9832 
9833 	xlrec.rp_time = GetCurrentTimestamp();
9834 	strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
9835 
9836 	XLogBeginInsert();
9837 	XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
9838 
9839 	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
9840 
9841 	ereport(LOG,
9842 			(errmsg("restore point \"%s\" created at %X/%X",
9843 					rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
9844 
9845 	return RecPtr;
9846 }
9847 
9848 /*
9849  * Check if any of the GUC parameters that are critical for hot standby
9850  * have changed, and update the value in pg_control file if necessary.
9851  */
9852 static void
XLogReportParameters(void)9853 XLogReportParameters(void)
9854 {
9855 	if (wal_level != ControlFile->wal_level ||
9856 		wal_log_hints != ControlFile->wal_log_hints ||
9857 		MaxConnections != ControlFile->MaxConnections ||
9858 		max_worker_processes != ControlFile->max_worker_processes ||
9859 		max_wal_senders != ControlFile->max_wal_senders ||
9860 		max_prepared_xacts != ControlFile->max_prepared_xacts ||
9861 		max_locks_per_xact != ControlFile->max_locks_per_xact ||
9862 		track_commit_timestamp != ControlFile->track_commit_timestamp)
9863 	{
9864 		/*
9865 		 * The change in number of backend slots doesn't need to be WAL-logged
9866 		 * if archiving is not enabled, as you can't start archive recovery
9867 		 * with wal_level=minimal anyway. We don't really care about the
9868 		 * values in pg_control either if wal_level=minimal, but seems better
9869 		 * to keep them up-to-date to avoid confusion.
9870 		 */
9871 		if (wal_level != ControlFile->wal_level || XLogIsNeeded())
9872 		{
9873 			xl_parameter_change xlrec;
9874 			XLogRecPtr	recptr;
9875 
9876 			xlrec.MaxConnections = MaxConnections;
9877 			xlrec.max_worker_processes = max_worker_processes;
9878 			xlrec.max_wal_senders = max_wal_senders;
9879 			xlrec.max_prepared_xacts = max_prepared_xacts;
9880 			xlrec.max_locks_per_xact = max_locks_per_xact;
9881 			xlrec.wal_level = wal_level;
9882 			xlrec.wal_log_hints = wal_log_hints;
9883 			xlrec.track_commit_timestamp = track_commit_timestamp;
9884 
9885 			XLogBeginInsert();
9886 			XLogRegisterData((char *) &xlrec, sizeof(xlrec));
9887 
9888 			recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
9889 			XLogFlush(recptr);
9890 		}
9891 
9892 		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9893 
9894 		ControlFile->MaxConnections = MaxConnections;
9895 		ControlFile->max_worker_processes = max_worker_processes;
9896 		ControlFile->max_wal_senders = max_wal_senders;
9897 		ControlFile->max_prepared_xacts = max_prepared_xacts;
9898 		ControlFile->max_locks_per_xact = max_locks_per_xact;
9899 		ControlFile->wal_level = wal_level;
9900 		ControlFile->wal_log_hints = wal_log_hints;
9901 		ControlFile->track_commit_timestamp = track_commit_timestamp;
9902 		UpdateControlFile();
9903 
9904 		LWLockRelease(ControlFileLock);
9905 	}
9906 }
9907 
9908 /*
9909  * Update full_page_writes in shared memory, and write an
9910  * XLOG_FPW_CHANGE record if necessary.
9911  *
9912  * Note: this function assumes there is no other process running
9913  * concurrently that could update it.
9914  */
void
UpdateFullPageWrites(void)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	bool		recoveryInProgress;

	/*
	 * Do nothing if full_page_writes has not been changed.
	 *
	 * It's safe to check the shared full_page_writes without the lock,
	 * because we assume that there is no concurrently running process which
	 * can update it.
	 */
	if (fullPageWrites == Insert->fullPageWrites)
		return;

	/*
	 * Perform this outside critical section so that the WAL insert
	 * initialization done by RecoveryInProgress() doesn't trigger an
	 * assertion failure.
	 */
	recoveryInProgress = RecoveryInProgress();

	START_CRIT_SECTION();

	/*
	 * It's always safe to take full page images, even when not strictly
	 * required, but not the other way round.  So if we're setting
	 * full_page_writes to true, first set it true and then write the WAL
	 * record.  If we're setting it to false, first write the WAL record and
	 * then set the global flag.  The order matters: there must be no window
	 * where the flag is false but the WAL record hasn't been written yet.
	 */
	if (fullPageWrites)
	{
		WALInsertLockAcquireExclusive();
		Insert->fullPageWrites = true;
		WALInsertLockRelease();
	}

	/*
	 * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
	 * full_page_writes during archive recovery, if required.  (No record is
	 * written while in recovery, since we can't insert WAL then.)
	 */
	if (XLogStandbyInfoActive() && !recoveryInProgress)
	{
		XLogBeginInsert();
		XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));

		XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
	}

	/* Turning FPWs off: the record is out, now it's safe to drop the flag. */
	if (!fullPageWrites)
	{
		WALInsertLockAcquireExclusive();
		Insert->fullPageWrites = false;
		WALInsertLockRelease();
	}
	END_CRIT_SECTION();
}
9974 
9975 /*
9976  * Check that it's OK to switch to new timeline during recovery.
9977  *
9978  * 'lsn' is the address of the shutdown checkpoint record we're about to
9979  * replay. (Currently, timeline can only change at a shutdown checkpoint).
9980  */
static void
checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
{
	/* Check that the record agrees on what the current (old) timeline is */
	if (prevTLI != ThisTimeLineID)
		ereport(PANIC,
				(errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
						prevTLI, ThisTimeLineID)));

	/*
	 * The new timeline better be in the list of timelines we expect to see,
	 * according to the timeline history. It should also not decrease.
	 */
	if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
		ereport(PANIC,
				(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
						newTLI, ThisTimeLineID)));

	/*
	 * If we have not yet reached min recovery point, and we're about to
	 * switch to a timeline greater than the timeline of the min recovery
	 * point: trouble. After switching to the new timeline, we could not
	 * possibly visit the min recovery point on the correct timeline anymore.
	 * This can happen if there is a newer timeline in the archive that
	 * branched before the timeline the min recovery point is on, and you
	 * attempt to do PITR to the new timeline.
	 */
	if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
		lsn < minRecoveryPoint &&
		newTLI > minRecoveryPointTLI)
		ereport(PANIC,
				(errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
						newTLI,
						(uint32) (minRecoveryPoint >> 32),
						(uint32) minRecoveryPoint,
						minRecoveryPointTLI)));

	/* Looks good: all three sanity checks passed, switch is permitted. */
}
10020 
10021 /*
10022  * XLOG resource manager's routines
10023  *
10024  * Definitions of info values are in include/catalog/pg_control.h, though
10025  * not all record types are related to control file updates.
10026  */
10027 void
xlog_redo(XLogReaderState * record)10028 xlog_redo(XLogReaderState *record)
10029 {
10030 	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
10031 	XLogRecPtr	lsn = record->EndRecPtr;
10032 
10033 	/* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
10034 	Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
10035 		   !XLogRecHasAnyBlockRefs(record));
10036 
10037 	if (info == XLOG_NEXTOID)
10038 	{
10039 		Oid			nextOid;
10040 
10041 		/*
10042 		 * We used to try to take the maximum of ShmemVariableCache->nextOid
10043 		 * and the recorded nextOid, but that fails if the OID counter wraps
10044 		 * around.  Since no OID allocation should be happening during replay
10045 		 * anyway, better to just believe the record exactly.  We still take
10046 		 * OidGenLock while setting the variable, just in case.
10047 		 */
10048 		memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
10049 		LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
10050 		ShmemVariableCache->nextOid = nextOid;
10051 		ShmemVariableCache->oidCount = 0;
10052 		LWLockRelease(OidGenLock);
10053 	}
10054 	else if (info == XLOG_CHECKPOINT_SHUTDOWN)
10055 	{
10056 		CheckPoint	checkPoint;
10057 
10058 		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
10059 		/* In a SHUTDOWN checkpoint, believe the counters exactly */
10060 		LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
10061 		ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
10062 		LWLockRelease(XidGenLock);
10063 		LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
10064 		ShmemVariableCache->nextOid = checkPoint.nextOid;
10065 		ShmemVariableCache->oidCount = 0;
10066 		LWLockRelease(OidGenLock);
10067 		MultiXactSetNextMXact(checkPoint.nextMulti,
10068 							  checkPoint.nextMultiOffset);
10069 
10070 		MultiXactAdvanceOldest(checkPoint.oldestMulti,
10071 							   checkPoint.oldestMultiDB);
10072 
10073 		/*
10074 		 * No need to set oldestClogXid here as well; it'll be set when we
10075 		 * redo an xl_clog_truncate if it changed since initialization.
10076 		 */
10077 		SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
10078 
10079 		/*
10080 		 * If we see a shutdown checkpoint while waiting for an end-of-backup
10081 		 * record, the backup was canceled and the end-of-backup record will
10082 		 * never arrive.
10083 		 */
10084 		if (ArchiveRecoveryRequested &&
10085 			!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
10086 			XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
10087 			ereport(PANIC,
10088 					(errmsg("online backup was canceled, recovery cannot continue")));
10089 
10090 		/*
10091 		 * If we see a shutdown checkpoint, we know that nothing was running
10092 		 * on the master at this point. So fake-up an empty running-xacts
10093 		 * record and use that here and now. Recover additional standby state
10094 		 * for prepared transactions.
10095 		 */
10096 		if (standbyState >= STANDBY_INITIALIZED)
10097 		{
10098 			TransactionId *xids;
10099 			int			nxids;
10100 			TransactionId oldestActiveXID;
10101 			TransactionId latestCompletedXid;
10102 			RunningTransactionsData running;
10103 
10104 			oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
10105 
10106 			/*
10107 			 * Construct a RunningTransactions snapshot representing a shut
10108 			 * down server, with only prepared transactions still alive. We're
10109 			 * never overflowed at this point because all subxids are listed
10110 			 * with their parent prepared transactions.
10111 			 */
10112 			running.xcnt = nxids;
10113 			running.subxcnt = 0;
10114 			running.subxid_overflow = false;
10115 			running.nextXid = XidFromFullTransactionId(checkPoint.nextFullXid);
10116 			running.oldestRunningXid = oldestActiveXID;
10117 			latestCompletedXid = XidFromFullTransactionId(checkPoint.nextFullXid);
10118 			TransactionIdRetreat(latestCompletedXid);
10119 			Assert(TransactionIdIsNormal(latestCompletedXid));
10120 			running.latestCompletedXid = latestCompletedXid;
10121 			running.xids = xids;
10122 
10123 			ProcArrayApplyRecoveryInfo(&running);
10124 
10125 			StandbyRecoverPreparedTransactions();
10126 		}
10127 
10128 		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
10129 		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10130 		ControlFile->checkPointCopy.nextFullXid = checkPoint.nextFullXid;
10131 		LWLockRelease(ControlFileLock);
10132 
10133 		/* Update shared-memory copy of checkpoint XID/epoch */
10134 		SpinLockAcquire(&XLogCtl->info_lck);
10135 		XLogCtl->ckptFullXid = checkPoint.nextFullXid;
10136 		SpinLockRelease(&XLogCtl->info_lck);
10137 
10138 		/*
10139 		 * We should've already switched to the new TLI before replaying this
10140 		 * record.
10141 		 */
10142 		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
10143 			ereport(PANIC,
10144 					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
10145 							checkPoint.ThisTimeLineID, ThisTimeLineID)));
10146 
10147 		RecoveryRestartPoint(&checkPoint);
10148 	}
10149 	else if (info == XLOG_CHECKPOINT_ONLINE)
10150 	{
10151 		CheckPoint	checkPoint;
10152 
10153 		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
10154 		/* In an ONLINE checkpoint, treat the XID counter as a minimum */
10155 		LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
10156 		if (FullTransactionIdPrecedes(ShmemVariableCache->nextFullXid,
10157 									  checkPoint.nextFullXid))
10158 			ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
10159 		LWLockRelease(XidGenLock);
10160 
10161 		/*
10162 		 * We ignore the nextOid counter in an ONLINE checkpoint, preferring
10163 		 * to track OID assignment through XLOG_NEXTOID records.  The nextOid
10164 		 * counter is from the start of the checkpoint and might well be stale
10165 		 * compared to later XLOG_NEXTOID records.  We could try to take the
10166 		 * maximum of the nextOid counter and our latest value, but since
10167 		 * there's no particular guarantee about the speed with which the OID
10168 		 * counter wraps around, that's a risky thing to do.  In any case,
10169 		 * users of the nextOid counter are required to avoid assignment of
10170 		 * duplicates, so that a somewhat out-of-date value should be safe.
10171 		 */
10172 
10173 		/* Handle multixact */
10174 		MultiXactAdvanceNextMXact(checkPoint.nextMulti,
10175 								  checkPoint.nextMultiOffset);
10176 
10177 		/*
10178 		 * NB: This may perform multixact truncation when replaying WAL
10179 		 * generated by an older primary.
10180 		 */
10181 		MultiXactAdvanceOldest(checkPoint.oldestMulti,
10182 							   checkPoint.oldestMultiDB);
10183 		if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
10184 								  checkPoint.oldestXid))
10185 			SetTransactionIdLimit(checkPoint.oldestXid,
10186 								  checkPoint.oldestXidDB);
10187 		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
10188 		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10189 		ControlFile->checkPointCopy.nextFullXid = checkPoint.nextFullXid;
10190 		LWLockRelease(ControlFileLock);
10191 
10192 		/* Update shared-memory copy of checkpoint XID/epoch */
10193 		SpinLockAcquire(&XLogCtl->info_lck);
10194 		XLogCtl->ckptFullXid = checkPoint.nextFullXid;
10195 		SpinLockRelease(&XLogCtl->info_lck);
10196 
10197 		/* TLI should not change in an on-line checkpoint */
10198 		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
10199 			ereport(PANIC,
10200 					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
10201 							checkPoint.ThisTimeLineID, ThisTimeLineID)));
10202 
10203 		RecoveryRestartPoint(&checkPoint);
10204 	}
10205 	else if (info == XLOG_OVERWRITE_CONTRECORD)
10206 	{
10207 		xl_overwrite_contrecord xlrec;
10208 
10209 		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
10210 		VerifyOverwriteContrecord(&xlrec, record);
10211 	}
10212 	else if (info == XLOG_END_OF_RECOVERY)
10213 	{
10214 		xl_end_of_recovery xlrec;
10215 
10216 		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
10217 
10218 		/*
10219 		 * For Hot Standby, we could treat this like a Shutdown Checkpoint,
10220 		 * but this case is rarer and harder to test, so the benefit doesn't
10221 		 * outweigh the potential extra cost of maintenance.
10222 		 */
10223 
10224 		/*
10225 		 * We should've already switched to the new TLI before replaying this
10226 		 * record.
10227 		 */
10228 		if (xlrec.ThisTimeLineID != ThisTimeLineID)
10229 			ereport(PANIC,
10230 					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
10231 							xlrec.ThisTimeLineID, ThisTimeLineID)));
10232 	}
10233 	else if (info == XLOG_NOOP)
10234 	{
10235 		/* nothing to do here */
10236 	}
10237 	else if (info == XLOG_SWITCH)
10238 	{
10239 		/* nothing to do here */
10240 	}
10241 	else if (info == XLOG_RESTORE_POINT)
10242 	{
10243 		/* nothing to do here */
10244 	}
10245 	else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
10246 	{
10247 		/*
10248 		 * Full-page image (FPI) records contain nothing else but a backup
10249 		 * block (or multiple backup blocks). Every block reference must
10250 		 * include a full-page image - otherwise there would be no point in
10251 		 * this record.
10252 		 *
10253 		 * No recovery conflicts are generated by these generic records - if a
10254 		 * resource manager needs to generate conflicts, it has to define a
10255 		 * separate WAL record type and redo routine.
10256 		 *
10257 		 * XLOG_FPI_FOR_HINT records are generated when a page needs to be
10258 		 * WAL- logged because of a hint bit update. They are only generated
10259 		 * when checksums are enabled. There is no difference in handling
10260 		 * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
10261 		 * code just to distinguish them for statistics purposes.
10262 		 */
10263 		for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++)
10264 		{
10265 			Buffer		buffer;
10266 
10267 			if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
10268 				elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
10269 			UnlockReleaseBuffer(buffer);
10270 		}
10271 	}
10272 	else if (info == XLOG_BACKUP_END)
10273 	{
10274 		XLogRecPtr	startpoint;
10275 
10276 		memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
10277 
10278 		if (ControlFile->backupStartPoint == startpoint)
10279 		{
10280 			/*
10281 			 * We have reached the end of base backup, the point where
10282 			 * pg_stop_backup() was done. The data on disk is now consistent.
10283 			 * Reset backupStartPoint, and update minRecoveryPoint to make
10284 			 * sure we don't allow starting up at an earlier point even if
10285 			 * recovery is stopped and restarted soon after this.
10286 			 */
10287 			elog(DEBUG1, "end of backup reached");
10288 
10289 			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10290 
10291 			if (ControlFile->minRecoveryPoint < lsn)
10292 			{
10293 				ControlFile->minRecoveryPoint = lsn;
10294 				ControlFile->minRecoveryPointTLI = ThisTimeLineID;
10295 			}
10296 			ControlFile->backupStartPoint = InvalidXLogRecPtr;
10297 			ControlFile->backupEndRequired = false;
10298 			UpdateControlFile();
10299 
10300 			LWLockRelease(ControlFileLock);
10301 		}
10302 	}
10303 	else if (info == XLOG_PARAMETER_CHANGE)
10304 	{
10305 		xl_parameter_change xlrec;
10306 
10307 		/* Update our copy of the parameters in pg_control */
10308 		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
10309 
10310 		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10311 		ControlFile->MaxConnections = xlrec.MaxConnections;
10312 		ControlFile->max_worker_processes = xlrec.max_worker_processes;
10313 		ControlFile->max_wal_senders = xlrec.max_wal_senders;
10314 		ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
10315 		ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
10316 		ControlFile->wal_level = xlrec.wal_level;
10317 		ControlFile->wal_log_hints = xlrec.wal_log_hints;
10318 
10319 		/*
10320 		 * Update minRecoveryPoint to ensure that if recovery is aborted, we
10321 		 * recover back up to this point before allowing hot standby again.
10322 		 * This is important if the max_* settings are decreased, to ensure
10323 		 * you don't run queries against the WAL preceding the change. The
10324 		 * local copies cannot be updated as long as crash recovery is
10325 		 * happening and we expect all the WAL to be replayed.
10326 		 */
10327 		if (InArchiveRecovery)
10328 		{
10329 			minRecoveryPoint = ControlFile->minRecoveryPoint;
10330 			minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
10331 		}
10332 		if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn)
10333 		{
10334 			ControlFile->minRecoveryPoint = lsn;
10335 			ControlFile->minRecoveryPointTLI = ThisTimeLineID;
10336 		}
10337 
10338 		CommitTsParameterChange(xlrec.track_commit_timestamp,
10339 								ControlFile->track_commit_timestamp);
10340 		ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
10341 
10342 		UpdateControlFile();
10343 		LWLockRelease(ControlFileLock);
10344 
10345 		/* Check to see if any parameter change gives a problem on recovery */
10346 		CheckRequiredParameterValues();
10347 	}
10348 	else if (info == XLOG_FPW_CHANGE)
10349 	{
10350 		bool		fpw;
10351 
10352 		memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
10353 
10354 		/*
10355 		 * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
10356 		 * do_pg_start_backup() and do_pg_stop_backup() can check whether
10357 		 * full_page_writes has been disabled during online backup.
10358 		 */
10359 		if (!fpw)
10360 		{
10361 			SpinLockAcquire(&XLogCtl->info_lck);
10362 			if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
10363 				XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
10364 			SpinLockRelease(&XLogCtl->info_lck);
10365 		}
10366 
10367 		/* Keep track of full_page_writes */
10368 		lastFullPageWrites = fpw;
10369 	}
10370 }
10371 
10372 /*
10373  * Verify the payload of a XLOG_OVERWRITE_CONTRECORD record.
10374  */
10375 static void
VerifyOverwriteContrecord(xl_overwrite_contrecord * xlrec,XLogReaderState * state)10376 VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, XLogReaderState *state)
10377 {
10378 	if (xlrec->overwritten_lsn != state->overwrittenRecPtr)
10379 		elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
10380 			 (uint32) (xlrec->overwritten_lsn >> 32),
10381 			 (uint32) xlrec->overwritten_lsn,
10382 			 (uint32) (state->overwrittenRecPtr >> 32),
10383 			 (uint32) state->overwrittenRecPtr);
10384 
10385 	ereport(LOG,
10386 			(errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
10387 					(uint32) (xlrec->overwritten_lsn >> 32),
10388 					(uint32) xlrec->overwritten_lsn,
10389 					timestamptz_to_str(xlrec->overwrite_time))));
10390 
10391 	/* Verifying the record should only happen once */
10392 	state->overwrittenRecPtr = InvalidXLogRecPtr;
10393 }
10394 
10395 #ifdef WAL_DEBUG
10396 
10397 static void
xlog_outrec(StringInfo buf,XLogReaderState * record)10398 xlog_outrec(StringInfo buf, XLogReaderState *record)
10399 {
10400 	int			block_id;
10401 
10402 	appendStringInfo(buf, "prev %X/%X; xid %u",
10403 					 (uint32) (XLogRecGetPrev(record) >> 32),
10404 					 (uint32) XLogRecGetPrev(record),
10405 					 XLogRecGetXid(record));
10406 
10407 	appendStringInfo(buf, "; len %u",
10408 					 XLogRecGetDataLen(record));
10409 
10410 	/* decode block references */
10411 	for (block_id = 0; block_id <= record->max_block_id; block_id++)
10412 	{
10413 		RelFileNode rnode;
10414 		ForkNumber	forknum;
10415 		BlockNumber blk;
10416 
10417 		if (!XLogRecHasBlockRef(record, block_id))
10418 			continue;
10419 
10420 		XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
10421 		if (forknum != MAIN_FORKNUM)
10422 			appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
10423 							 block_id,
10424 							 rnode.spcNode, rnode.dbNode, rnode.relNode,
10425 							 forknum,
10426 							 blk);
10427 		else
10428 			appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
10429 							 block_id,
10430 							 rnode.spcNode, rnode.dbNode, rnode.relNode,
10431 							 blk);
10432 		if (XLogRecHasBlockImage(record, block_id))
10433 			appendStringInfoString(buf, " FPW");
10434 	}
10435 }
10436 #endif							/* WAL_DEBUG */
10437 
10438 /*
10439  * Returns a string describing an XLogRecord, consisting of its identity
10440  * optionally followed by a colon, a space, and a further description.
10441  */
10442 static void
xlog_outdesc(StringInfo buf,XLogReaderState * record)10443 xlog_outdesc(StringInfo buf, XLogReaderState *record)
10444 {
10445 	RmgrId		rmid = XLogRecGetRmid(record);
10446 	uint8		info = XLogRecGetInfo(record);
10447 	const char *id;
10448 
10449 	appendStringInfoString(buf, RmgrTable[rmid].rm_name);
10450 	appendStringInfoChar(buf, '/');
10451 
10452 	id = RmgrTable[rmid].rm_identify(info);
10453 	if (id == NULL)
10454 		appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
10455 	else
10456 		appendStringInfo(buf, "%s: ", id);
10457 
10458 	RmgrTable[rmid].rm_desc(buf, record);
10459 }
10460 
10461 
10462 /*
10463  * Return the (possible) sync flag used for opening a file, depending on the
10464  * value of the GUC wal_sync_method.
10465  */
10466 static int
get_sync_bit(int method)10467 get_sync_bit(int method)
10468 {
10469 	int			o_direct_flag = 0;
10470 
10471 	/* If fsync is disabled, never open in sync mode */
10472 	if (!enableFsync)
10473 		return 0;
10474 
10475 	/*
10476 	 * Optimize writes by bypassing kernel cache with O_DIRECT when using
10477 	 * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
10478 	 * disabled, otherwise the archive command or walsender process will read
10479 	 * the WAL soon after writing it, which is guaranteed to cause a physical
10480 	 * read if we bypassed the kernel cache. We also skip the
10481 	 * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
10482 	 * reason.
10483 	 *
10484 	 * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
10485 	 * written by walreceiver is normally read by the startup process soon
10486 	 * after its written. Also, walreceiver performs unaligned writes, which
10487 	 * don't work with O_DIRECT, so it is required for correctness too.
10488 	 */
10489 	if (!XLogIsNeeded() && !AmWalReceiverProcess())
10490 		o_direct_flag = PG_O_DIRECT;
10491 
10492 	switch (method)
10493 	{
10494 			/*
10495 			 * enum values for all sync options are defined even if they are
10496 			 * not supported on the current platform.  But if not, they are
10497 			 * not included in the enum option array, and therefore will never
10498 			 * be seen here.
10499 			 */
10500 		case SYNC_METHOD_FSYNC:
10501 		case SYNC_METHOD_FSYNC_WRITETHROUGH:
10502 		case SYNC_METHOD_FDATASYNC:
10503 			return 0;
10504 #ifdef OPEN_SYNC_FLAG
10505 		case SYNC_METHOD_OPEN:
10506 			return OPEN_SYNC_FLAG | o_direct_flag;
10507 #endif
10508 #ifdef OPEN_DATASYNC_FLAG
10509 		case SYNC_METHOD_OPEN_DSYNC:
10510 			return OPEN_DATASYNC_FLAG | o_direct_flag;
10511 #endif
10512 		default:
10513 			/* can't happen (unless we are out of sync with option array) */
10514 			elog(ERROR, "unrecognized wal_sync_method: %d", method);
10515 			return 0;			/* silence warning */
10516 	}
10517 }
10518 
10519 /*
10520  * GUC support
10521  */
10522 void
assign_xlog_sync_method(int new_sync_method,void * extra)10523 assign_xlog_sync_method(int new_sync_method, void *extra)
10524 {
10525 	if (sync_method != new_sync_method)
10526 	{
10527 		/*
10528 		 * To ensure that no blocks escape unsynced, force an fsync on the
10529 		 * currently open log segment (if any).  Also, if the open flag is
10530 		 * changing, close the log file so it will be reopened (with new flag
10531 		 * bit) at next use.
10532 		 */
10533 		if (openLogFile >= 0)
10534 		{
10535 			pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
10536 			if (pg_fsync(openLogFile) != 0)
10537 			{
10538 				char		xlogfname[MAXFNAMELEN];
10539 				int			save_errno;
10540 
10541 				save_errno = errno;
10542 				XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo,
10543 							 wal_segment_size);
10544 				errno = save_errno;
10545 				ereport(PANIC,
10546 						(errcode_for_file_access(),
10547 						 errmsg("could not fsync file \"%s\": %m", xlogfname)));
10548 			}
10549 
10550 			pgstat_report_wait_end();
10551 			if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
10552 				XLogFileClose();
10553 		}
10554 	}
10555 }
10556 
10557 
10558 /*
10559  * Issue appropriate kind of fsync (if any) for an XLOG output file.
10560  *
10561  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
10562  * 'segno' is for error reporting purposes.
10563  */
void
issue_xlog_fsync(int fd, XLogSegNo segno)
{
	char	   *msg = NULL;		/* error format string; set on sync failure */

	pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
	switch (sync_method)
	{
		case SYNC_METHOD_FSYNC:
			if (pg_fsync_no_writethrough(fd) != 0)
				msg = _("could not fsync file \"%s\": %m");
			break;
#ifdef HAVE_FSYNC_WRITETHROUGH
		case SYNC_METHOD_FSYNC_WRITETHROUGH:
			if (pg_fsync_writethrough(fd) != 0)
				msg = _("could not fsync write-through file \"%s\": %m");
			break;
#endif
#ifdef HAVE_FDATASYNC
		case SYNC_METHOD_FDATASYNC:
			if (pg_fdatasync(fd) != 0)
				msg = _("could not fdatasync file \"%s\": %m");
			break;
#endif
		case SYNC_METHOD_OPEN:
		case SYNC_METHOD_OPEN_DSYNC:
			/* write synced it already */
			break;
		default:
			elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
			break;
	}

	/* PANIC if failed to fsync */
	if (msg)
	{
		char		xlogfname[MAXFNAMELEN];
		int			save_errno = errno;

		/* reconstruct the segment's file name for the report, keeping errno */
		XLogFileName(xlogfname, ThisTimeLineID, segno,
					 wal_segment_size);
		errno = save_errno;
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg(msg, xlogfname)));
	}

	pgstat_report_wait_end();
}
10613 
10614 /*
10615  * do_pg_start_backup
10616  *
10617  * Utility function called at the start of an online backup. It creates the
10618  * necessary starting checkpoint and constructs the backup label file.
10619  *
 * There are two kinds of backups: exclusive and non-exclusive. An exclusive
10621  * backup is started with pg_start_backup(), and there can be only one active
10622  * at a time. The backup and tablespace map files of an exclusive backup are
10623  * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
10624  * removed by pg_stop_backup().
10625  *
10626  * A non-exclusive backup is used for the streaming base backups (see
10627  * src/backend/replication/basebackup.c). The difference to exclusive backups
10628  * is that the backup label and tablespace map files are not written to disk.
10629  * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
10630  * and the caller is responsible for including them in the backup archive as
10631  * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
10632  * active at the same time, and they don't conflict with an exclusive backup
10633  * either.
10634  *
10635  * tablespaces is required only when this function is called while
10636  * the streaming base backup requested by pg_basebackup is running.
10637  * NULL should be specified otherwise.
10638  *
10639  * tblspcmapfile is required mainly for tar format in windows as native windows
10640  * utilities are not able to create symlinks while extracting files from tar.
10641  * However for consistency, the same is used for all platforms.
10642  *
10643  * needtblspcmapfile is true for the cases (exclusive backup and for
10644  * non-exclusive backup only when tar format is used for taking backup)
10645  * when backup needs to generate tablespace_map file, it is used to
10646  * embed escape character before newline character in tablespace path.
10647  *
10648  * Returns the minimum WAL location that must be present to restore from this
10649  * backup, and the corresponding timeline ID in *starttli_p.
10650  *
10651  * Every successfully started non-exclusive backup must be stopped by calling
10652  * do_pg_stop_backup() or do_pg_abort_backup().
10653  *
10654  * It is the responsibility of the caller of this function to verify the
10655  * permissions of the calling user!
10656  */
XLogRecPtr
do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
				   StringInfo labelfile, List **tablespaces,
				   StringInfo tblspcmapfile, bool infotbssize,
				   bool needtblspcmapfile)
{
	bool		exclusive = (labelfile == NULL);	/* NULL labelfile selects exclusive mode */
	bool		backup_started_in_recovery = false;
	XLogRecPtr	checkpointloc;	/* location of the starting checkpoint record */
	XLogRecPtr	startpoint;		/* REDO pointer of that checkpoint = backup start */
	TimeLineID	starttli;
	pg_time_t	stamp_time;
	char		strfbuf[128];
	char		xlogfilename[MAXFNAMELEN];
	XLogSegNo	_logSegNo;
	struct stat stat_buf;
	FILE	   *fp;

	backup_started_in_recovery = RecoveryInProgress();

	/*
	 * Currently only non-exclusive backup can be taken during recovery.
	 */
	if (backup_started_in_recovery && exclusive)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

	/*
	 * During recovery, we don't need to check WAL level. Because, if WAL
	 * level is not sufficient, it's impossible to get here during recovery.
	 */
	if (!backup_started_in_recovery && !XLogIsNeeded())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("WAL level not sufficient for making an online backup"),
				 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));

	if (strlen(backupidstr) > MAXPGPATH)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("backup label too long (max %d bytes)",
						MAXPGPATH)));

	/*
	 * Mark backup active in shared memory.  We must do full-page WAL writes
	 * during an on-line backup even if not doing so at other times, because
	 * it's quite possible for the backup dump to obtain a "torn" (partially
	 * written) copy of a database page if it reads the page concurrently with
	 * our write to the same page.  This can be fixed as long as the first
	 * write to the page in the WAL sequence is a full-page write. Hence, we
	 * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
	 * are no dirty pages in shared memory that might get dumped while the
	 * backup is in progress without having a corresponding WAL record.  (Once
	 * the backup is complete, we need not force full-page writes anymore,
	 * since we expect that any pages not modified during the backup interval
	 * must have been correctly captured by the backup.)
	 *
	 * Note that forcePageWrites has no effect during an online backup from
	 * the standby.
	 *
	 * We must hold all the insertion locks to change the value of
	 * forcePageWrites, to ensure adequate interlocking against
	 * XLogInsertRecord().
	 */
	WALInsertLockAcquireExclusive();
	if (exclusive)
	{
		/*
		 * At first, mark that we're now starting an exclusive backup, to
		 * ensure that there are no other sessions currently running
		 * pg_start_backup() or pg_stop_backup().
		 */
		if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
		{
			WALInsertLockRelease();
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("a backup is already in progress"),
					 errhint("Run pg_stop_backup() and try again.")));
		}
		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
	}
	else
		XLogCtl->Insert.nonExclusiveBackups++;
	XLogCtl->Insert.forcePageWrites = true;
	WALInsertLockRelease();

	/* Ensure we release forcePageWrites if fail below */
	PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
	{
		bool		gotUniqueStartpoint = false;
		DIR		   *tblspcdir;
		struct dirent *de;
		tablespaceinfo *ti;
		int			datadirpathlen;

		/*
		 * Force an XLOG file switch before the checkpoint, to ensure that the
		 * WAL segment the checkpoint is written to doesn't contain pages with
		 * old timeline IDs.  That would otherwise happen if you called
		 * pg_start_backup() right after restoring from a PITR archive: the
		 * first WAL segment containing the startup checkpoint has pages in
		 * the beginning with the old timeline ID.  That can cause trouble at
		 * recovery: we won't have a history file covering the old timeline if
		 * pg_wal directory was not included in the base backup and the WAL
		 * archive was cleared too before starting the backup.
		 *
		 * This also ensures that we have emitted a WAL page header that has
		 * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
		 * Therefore, if a WAL archiver (such as pglesslog) is trying to
		 * compress out removable backup blocks, it won't remove any that
		 * occur after this point.
		 *
		 * During recovery, we skip forcing XLOG file switch, which means that
		 * the backup taken during recovery is not available for the special
		 * recovery case described above.
		 */
		if (!backup_started_in_recovery)
			RequestXLogSwitch(false);

		/* Loop until we get a checkpoint position not used by another backup */
		do
		{
			bool		checkpointfpw;

			/*
			 * Force a CHECKPOINT.  Aside from being necessary to prevent torn
			 * page problems, this guarantees that two successive backup runs
			 * will have different checkpoint positions and hence different
			 * history file names, even if nothing happened in between.
			 *
			 * During recovery, establish a restartpoint if possible. We use
			 * the last restartpoint as the backup starting checkpoint. This
			 * means that two successive backup runs can have same checkpoint
			 * positions.
			 *
			 * Since the fact that we are executing do_pg_start_backup()
			 * during recovery means that checkpointer is running, we can use
			 * RequestCheckpoint() to establish a restartpoint.
			 *
			 * We use CHECKPOINT_IMMEDIATE only if requested by user (via
			 * passing fast = true).  Otherwise this can take awhile.
			 */
			RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
							  (fast ? CHECKPOINT_IMMEDIATE : 0));

			/*
			 * Now we need to fetch the checkpoint record location, and also
			 * its REDO pointer.  The oldest point in WAL that would be needed
			 * to restore starting from the checkpoint is precisely the REDO
			 * pointer.
			 */
			LWLockAcquire(ControlFileLock, LW_SHARED);
			checkpointloc = ControlFile->checkPoint;
			startpoint = ControlFile->checkPointCopy.redo;
			starttli = ControlFile->checkPointCopy.ThisTimeLineID;
			checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
			LWLockRelease(ControlFileLock);

			if (backup_started_in_recovery)
			{
				XLogRecPtr	recptr;

				/*
				 * Check to see if all WAL replayed during online backup
				 * (i.e., since last restartpoint used as backup starting
				 * checkpoint) contain full-page writes.
				 */
				SpinLockAcquire(&XLogCtl->info_lck);
				recptr = XLogCtl->lastFpwDisableRecPtr;
				SpinLockRelease(&XLogCtl->info_lck);

				if (!checkpointfpw || startpoint <= recptr)
					ereport(ERROR,
							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							 errmsg("WAL generated with full_page_writes=off was replayed "
									"since last restartpoint"),
							 errhint("This means that the backup being taken on the standby "
									 "is corrupt and should not be used. "
									 "Enable full_page_writes and run CHECKPOINT on the master, "
									 "and then try an online backup again.")));

				/*
				 * During recovery, since we don't use the end-of-backup WAL
				 * record and don't write the backup history file, the
				 * starting WAL location doesn't need to be unique. This means
				 * that two base backups started at the same time might use
				 * the same checkpoint as starting locations.
				 */
				gotUniqueStartpoint = true;
			}

			/*
			 * If two base backups are started at the same time (in WAL sender
			 * processes), we need to make sure that they use different
			 * checkpoints as starting locations, because we use the starting
			 * WAL location as a unique identifier for the base backup in the
			 * end-of-backup WAL record and when we write the backup history
			 * file. Perhaps it would be better to generate a separate unique
			 * ID for each backup instead of forcing another checkpoint, but
			 * taking a checkpoint right after another is not that expensive
			 * either because only a few buffers have been dirtied yet.
			 */
			WALInsertLockAcquireExclusive();
			if (XLogCtl->Insert.lastBackupStart < startpoint)
			{
				XLogCtl->Insert.lastBackupStart = startpoint;
				gotUniqueStartpoint = true;
			}
			WALInsertLockRelease();
		} while (!gotUniqueStartpoint);

		XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
		XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size);

		/*
		 * Construct tablespace_map file
		 */
		if (exclusive)
			tblspcmapfile = makeStringInfo();

		datadirpathlen = strlen(DataDir);

		/*
		 * Report that we are now estimating the total backup size if we're
		 * streaming base backup as requested by pg_basebackup
		 */
		if (tablespaces)
			pgstat_progress_update_param(PROGRESS_BASEBACKUP_PHASE,
										 PROGRESS_BASEBACKUP_PHASE_ESTIMATE_BACKUP_SIZE);

		/* Collect information about all tablespaces */
		tblspcdir = AllocateDir("pg_tblspc");
		while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
		{
			char		fullpath[MAXPGPATH + 10];
			char		linkpath[MAXPGPATH];
			char	   *relpath = NULL;
			int			rllen;
			StringInfoData buflinkpath;
			char	   *s = linkpath;

			/* Skip special stuff */
			if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
				continue;

			snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);

#if defined(HAVE_READLINK) || defined(WIN32)
			rllen = readlink(fullpath, linkpath, sizeof(linkpath));
			if (rllen < 0)
			{
				ereport(WARNING,
						(errmsg("could not read symbolic link \"%s\": %m",
								fullpath)));
				continue;
			}
			else if (rllen >= sizeof(linkpath))
			{
				ereport(WARNING,
						(errmsg("symbolic link \"%s\" target is too long",
								fullpath)));
				continue;
			}
			/* readlink() does not NUL-terminate; do it ourselves */
			linkpath[rllen] = '\0';

			/*
			 * Add the escape character '\\' before newline in a string to
			 * ensure that we can distinguish between the newline in the
			 * tablespace path and end of line while reading tablespace_map
			 * file during archive recovery.
			 */
			initStringInfo(&buflinkpath);

			while (*s)
			{
				if ((*s == '\n' || *s == '\r') && needtblspcmapfile)
					appendStringInfoChar(&buflinkpath, '\\');
				appendStringInfoChar(&buflinkpath, *s++);
			}

			/*
			 * Relpath holds the relative path of the tablespace directory
			 * when it's located within PGDATA, or NULL if it's located
			 * elsewhere.
			 */
			if (rllen > datadirpathlen &&
				strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
				IS_DIR_SEP(linkpath[datadirpathlen]))
				relpath = linkpath + datadirpathlen + 1;

			ti = palloc(sizeof(tablespaceinfo));
			ti->oid = pstrdup(de->d_name);
			ti->path = pstrdup(buflinkpath.data);
			ti->rpath = relpath ? pstrdup(relpath) : NULL;
			ti->size = infotbssize ?
				sendTablespace(fullpath, ti->oid, true, NULL) : -1;

			if (tablespaces)
				*tablespaces = lappend(*tablespaces, ti);

			appendStringInfo(tblspcmapfile, "%s %s\n", ti->oid, ti->path);

			pfree(buflinkpath.data);
#else

			/*
			 * If the platform does not have symbolic links, it should not be
			 * possible to have tablespaces - clearly somebody else created
			 * them. Warn about it and ignore.
			 */
			ereport(WARNING,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("tablespaces are not supported on this platform")));
#endif
		}
		FreeDir(tblspcdir);

		/*
		 * Construct backup label file
		 */
		if (exclusive)
			labelfile = makeStringInfo();

		/* Use the log timezone here, not the session timezone */
		stamp_time = (pg_time_t) time(NULL);
		pg_strftime(strfbuf, sizeof(strfbuf),
					"%Y-%m-%d %H:%M:%S %Z",
					pg_localtime(&stamp_time, log_timezone));
		appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
						 (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
		appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
						 (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
		appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
						 exclusive ? "pg_start_backup" : "streamed");
		appendStringInfo(labelfile, "BACKUP FROM: %s\n",
						 backup_started_in_recovery ? "standby" : "master");
		appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
		appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
		appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli);

		/*
		 * Okay, write the file, or return its contents to caller.
		 */
		if (exclusive)
		{
			/*
			 * Check for existing backup label --- implies a backup is already
			 * running.  (XXX given that we checked exclusiveBackupState
			 * above, maybe it would be OK to just unlink any such label
			 * file?)
			 */
			if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
			{
				if (errno != ENOENT)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not stat file \"%s\": %m",
									BACKUP_LABEL_FILE)));
			}
			else
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("a backup is already in progress"),
						 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
								 BACKUP_LABEL_FILE)));

			fp = AllocateFile(BACKUP_LABEL_FILE, "w");

			if (!fp)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not create file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			/* write, flush, and fsync so the label survives a crash */
			if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 ||
				fflush(fp) != 0 ||
				pg_fsync(fileno(fp)) != 0 ||
				ferror(fp) ||
				FreeFile(fp))
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not write file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			/* Allocated locally for exclusive backups, so free separately */
			pfree(labelfile->data);
			pfree(labelfile);

			/* Write backup tablespace_map file. */
			if (tblspcmapfile->len > 0)
			{
				if (stat(TABLESPACE_MAP, &stat_buf) != 0)
				{
					if (errno != ENOENT)
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("could not stat file \"%s\": %m",
										TABLESPACE_MAP)));
				}
				else
					ereport(ERROR,
							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							 errmsg("a backup is already in progress"),
							 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
									 TABLESPACE_MAP)));

				fp = AllocateFile(TABLESPACE_MAP, "w");

				if (!fp)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not create file \"%s\": %m",
									TABLESPACE_MAP)));
				if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 ||
					fflush(fp) != 0 ||
					pg_fsync(fileno(fp)) != 0 ||
					ferror(fp) ||
					FreeFile(fp))
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not write file \"%s\": %m",
									TABLESPACE_MAP)));
			}

			/* Allocated locally for exclusive backups, so free separately */
			pfree(tblspcmapfile->data);
			pfree(tblspcmapfile);
		}
	}
	PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));

	/*
	 * Mark that start phase has correctly finished for an exclusive backup.
	 * Session-level locks are updated as well to reflect that state.
	 *
	 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating backup
	 * counters and session-level lock. Otherwise they can be updated
	 * inconsistently, and which might cause do_pg_abort_backup() to fail.
	 */
	if (exclusive)
	{
		WALInsertLockAcquireExclusive();
		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;

		/* Set session-level lock */
		sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
		WALInsertLockRelease();
	}
	else
		sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;

	/*
	 * We're done.  As a convenience, return the starting WAL location.
	 */
	if (starttli_p)
		*starttli_p = starttli;
	return startpoint;
}
11115 
11116 /* Error cleanup callback for pg_start_backup */
11117 static void
pg_start_backup_callback(int code,Datum arg)11118 pg_start_backup_callback(int code, Datum arg)
11119 {
11120 	bool		exclusive = DatumGetBool(arg);
11121 
11122 	/* Update backup counters and forcePageWrites on failure */
11123 	WALInsertLockAcquireExclusive();
11124 	if (exclusive)
11125 	{
11126 		Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
11127 		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
11128 	}
11129 	else
11130 	{
11131 		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11132 		XLogCtl->Insert.nonExclusiveBackups--;
11133 	}
11134 
11135 	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11136 		XLogCtl->Insert.nonExclusiveBackups == 0)
11137 	{
11138 		XLogCtl->Insert.forcePageWrites = false;
11139 	}
11140 	WALInsertLockRelease();
11141 }
11142 
11143 /*
11144  * Error cleanup callback for pg_stop_backup
11145  */
11146 static void
pg_stop_backup_callback(int code,Datum arg)11147 pg_stop_backup_callback(int code, Datum arg)
11148 {
11149 	bool		exclusive = DatumGetBool(arg);
11150 
11151 	/* Update backup status on failure */
11152 	WALInsertLockAcquireExclusive();
11153 	if (exclusive)
11154 	{
11155 		Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
11156 		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
11157 	}
11158 	WALInsertLockRelease();
11159 }
11160 
11161 /*
11162  * Utility routine to fetch the session-level status of a backup running.
11163  */
11164 SessionBackupState
get_backup_status(void)11165 get_backup_status(void)
11166 {
11167 	return sessionBackupState;
11168 }
11169 
11170 /*
11171  * do_pg_stop_backup
11172  *
11173  * Utility function called at the end of an online backup. It cleans up the
11174  * backup state and can optionally wait for WAL segments to be archived.
11175  *
11176  * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
11177  * the non-exclusive backup specified by 'labelfile'.
11178  *
11179  * Returns the last WAL location that must be present to restore from this
11180  * backup, and the corresponding timeline ID in *stoptli_p.
11181  *
11182  * It is the responsibility of the caller of this function to verify the
11183  * permissions of the calling user!
11184  */
11185 XLogRecPtr
do_pg_stop_backup(char * labelfile,bool waitforarchive,TimeLineID * stoptli_p)11186 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
11187 {
11188 	bool		exclusive = (labelfile == NULL);
11189 	bool		backup_started_in_recovery = false;
11190 	XLogRecPtr	startpoint;
11191 	XLogRecPtr	stoppoint;
11192 	TimeLineID	stoptli;
11193 	pg_time_t	stamp_time;
11194 	char		strfbuf[128];
11195 	char		histfilepath[MAXPGPATH];
11196 	char		startxlogfilename[MAXFNAMELEN];
11197 	char		stopxlogfilename[MAXFNAMELEN];
11198 	char		lastxlogfilename[MAXFNAMELEN];
11199 	char		histfilename[MAXFNAMELEN];
11200 	char		backupfrom[20];
11201 	XLogSegNo	_logSegNo;
11202 	FILE	   *lfp;
11203 	FILE	   *fp;
11204 	char		ch;
11205 	int			seconds_before_warning;
11206 	int			waits = 0;
11207 	bool		reported_waiting = false;
11208 	char	   *remaining;
11209 	char	   *ptr;
11210 	uint32		hi,
11211 				lo;
11212 
11213 	backup_started_in_recovery = RecoveryInProgress();
11214 
11215 	/*
11216 	 * Currently only non-exclusive backup can be taken during recovery.
11217 	 */
11218 	if (backup_started_in_recovery && exclusive)
11219 		ereport(ERROR,
11220 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11221 				 errmsg("recovery is in progress"),
11222 				 errhint("WAL control functions cannot be executed during recovery.")));
11223 
11224 	/*
11225 	 * During recovery, we don't need to check WAL level. Because, if WAL
11226 	 * level is not sufficient, it's impossible to get here during recovery.
11227 	 */
11228 	if (!backup_started_in_recovery && !XLogIsNeeded())
11229 		ereport(ERROR,
11230 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11231 				 errmsg("WAL level not sufficient for making an online backup"),
11232 				 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
11233 
11234 	if (exclusive)
11235 	{
11236 		/*
11237 		 * At first, mark that we're now stopping an exclusive backup, to
11238 		 * ensure that there are no other sessions currently running
11239 		 * pg_start_backup() or pg_stop_backup().
11240 		 */
11241 		WALInsertLockAcquireExclusive();
11242 		if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
11243 		{
11244 			WALInsertLockRelease();
11245 			ereport(ERROR,
11246 					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11247 					 errmsg("exclusive backup not in progress")));
11248 		}
11249 		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
11250 		WALInsertLockRelease();
11251 
11252 		/*
11253 		 * Remove backup_label. In case of failure, the state for an exclusive
11254 		 * backup is switched back to in-progress.
11255 		 */
11256 		PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
11257 		{
11258 			/*
11259 			 * Read the existing label file into memory.
11260 			 */
11261 			struct stat statbuf;
11262 			int			r;
11263 
11264 			if (stat(BACKUP_LABEL_FILE, &statbuf))
11265 			{
11266 				/* should not happen per the upper checks */
11267 				if (errno != ENOENT)
11268 					ereport(ERROR,
11269 							(errcode_for_file_access(),
11270 							 errmsg("could not stat file \"%s\": %m",
11271 									BACKUP_LABEL_FILE)));
11272 				ereport(ERROR,
11273 						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11274 						 errmsg("a backup is not in progress")));
11275 			}
11276 
11277 			lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
11278 			if (!lfp)
11279 			{
11280 				ereport(ERROR,
11281 						(errcode_for_file_access(),
11282 						 errmsg("could not read file \"%s\": %m",
11283 								BACKUP_LABEL_FILE)));
11284 			}
11285 			labelfile = palloc(statbuf.st_size + 1);
11286 			r = fread(labelfile, statbuf.st_size, 1, lfp);
11287 			labelfile[statbuf.st_size] = '\0';
11288 
11289 			/*
11290 			 * Close and remove the backup label file
11291 			 */
11292 			if (r != 1 || ferror(lfp) || FreeFile(lfp))
11293 				ereport(ERROR,
11294 						(errcode_for_file_access(),
11295 						 errmsg("could not read file \"%s\": %m",
11296 								BACKUP_LABEL_FILE)));
11297 			durable_unlink(BACKUP_LABEL_FILE, ERROR);
11298 
11299 			/*
11300 			 * Remove tablespace_map file if present, it is created only if
11301 			 * there are tablespaces.
11302 			 */
11303 			durable_unlink(TABLESPACE_MAP, DEBUG1);
11304 		}
11305 		PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
11306 	}
11307 
11308 	/*
11309 	 * OK to update backup counters, forcePageWrites and session-level lock.
11310 	 *
11311 	 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
11312 	 * Otherwise they can be updated inconsistently, and which might cause
11313 	 * do_pg_abort_backup() to fail.
11314 	 */
11315 	WALInsertLockAcquireExclusive();
11316 	if (exclusive)
11317 	{
11318 		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
11319 	}
11320 	else
11321 	{
11322 		/*
11323 		 * The user-visible pg_start/stop_backup() functions that operate on
11324 		 * exclusive backups can be called at any time, but for non-exclusive
11325 		 * backups, it is expected that each do_pg_start_backup() call is
11326 		 * matched by exactly one do_pg_stop_backup() call.
11327 		 */
11328 		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11329 		XLogCtl->Insert.nonExclusiveBackups--;
11330 	}
11331 
11332 	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11333 		XLogCtl->Insert.nonExclusiveBackups == 0)
11334 	{
11335 		XLogCtl->Insert.forcePageWrites = false;
11336 	}
11337 
11338 	/*
11339 	 * Clean up session-level lock.
11340 	 *
11341 	 * You might think that WALInsertLockRelease() can be called before
11342 	 * cleaning up session-level lock because session-level lock doesn't need
11343 	 * to be protected with WAL insertion lock. But since
11344 	 * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
11345 	 * cleaned up before it.
11346 	 */
11347 	sessionBackupState = SESSION_BACKUP_NONE;
11348 
11349 	WALInsertLockRelease();
11350 
11351 	/*
11352 	 * Read and parse the START WAL LOCATION line (this code is pretty crude,
11353 	 * but we are not expecting any variability in the file format).
11354 	 */
11355 	if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
11356 			   &hi, &lo, startxlogfilename,
11357 			   &ch) != 4 || ch != '\n')
11358 		ereport(ERROR,
11359 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11360 				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11361 	startpoint = ((uint64) hi) << 32 | lo;
11362 	remaining = strchr(labelfile, '\n') + 1;	/* %n is not portable enough */
11363 
11364 	/*
11365 	 * Parse the BACKUP FROM line. If we are taking an online backup from the
11366 	 * standby, we confirm that the standby has not been promoted during the
11367 	 * backup.
11368 	 */
11369 	ptr = strstr(remaining, "BACKUP FROM:");
11370 	if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
11371 		ereport(ERROR,
11372 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11373 				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11374 	if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
11375 		ereport(ERROR,
11376 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11377 				 errmsg("the standby was promoted during online backup"),
11378 				 errhint("This means that the backup being taken is corrupt "
11379 						 "and should not be used. "
11380 						 "Try taking another online backup.")));
11381 
11382 	/*
11383 	 * During recovery, we don't write an end-of-backup record. We assume that
11384 	 * pg_control was backed up last and its minimum recovery point can be
11385 	 * available as the backup end location. Since we don't have an
11386 	 * end-of-backup record, we use the pg_control value to check whether
11387 	 * we've reached the end of backup when starting recovery from this
11388 	 * backup. We have no way of checking if pg_control wasn't backed up last
11389 	 * however.
11390 	 *
11391 	 * We don't force a switch to new WAL file but it is still possible to
11392 	 * wait for all the required files to be archived if waitforarchive is
11393 	 * true. This is okay if we use the backup to start a standby and fetch
11394 	 * the missing WAL using streaming replication. But in the case of an
11395 	 * archive recovery, a user should set waitforarchive to true and wait for
11396 	 * them to be archived to ensure that all the required files are
11397 	 * available.
11398 	 *
11399 	 * We return the current minimum recovery point as the backup end
11400 	 * location. Note that it can be greater than the exact backup end
11401 	 * location if the minimum recovery point is updated after the backup of
11402 	 * pg_control. This is harmless for current uses.
11403 	 *
11404 	 * XXX currently a backup history file is for informational and debug
11405 	 * purposes only. It's not essential for an online backup. Furthermore,
11406 	 * even if it's created, it will not be archived during recovery because
11407 	 * an archiver is not invoked. So it doesn't seem worthwhile to write a
11408 	 * backup history file during recovery.
11409 	 */
11410 	if (backup_started_in_recovery)
11411 	{
11412 		XLogRecPtr	recptr;
11413 
11414 		/*
11415 		 * Check to see if all WAL replayed during online backup contain
11416 		 * full-page writes.
11417 		 */
11418 		SpinLockAcquire(&XLogCtl->info_lck);
11419 		recptr = XLogCtl->lastFpwDisableRecPtr;
11420 		SpinLockRelease(&XLogCtl->info_lck);
11421 
11422 		if (startpoint <= recptr)
11423 			ereport(ERROR,
11424 					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11425 					 errmsg("WAL generated with full_page_writes=off was replayed "
11426 							"during online backup"),
11427 					 errhint("This means that the backup being taken on the standby "
11428 							 "is corrupt and should not be used. "
11429 							 "Enable full_page_writes and run CHECKPOINT on the master, "
11430 							 "and then try an online backup again.")));
11431 
11432 
11433 		LWLockAcquire(ControlFileLock, LW_SHARED);
11434 		stoppoint = ControlFile->minRecoveryPoint;
11435 		stoptli = ControlFile->minRecoveryPointTLI;
11436 		LWLockRelease(ControlFileLock);
11437 	}
11438 	else
11439 	{
11440 		/*
11441 		 * Write the backup-end xlog record
11442 		 */
11443 		XLogBeginInsert();
11444 		XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
11445 		stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
11446 		stoptli = ThisTimeLineID;
11447 
11448 		/*
11449 		 * Force a switch to a new xlog segment file, so that the backup is
11450 		 * valid as soon as archiver moves out the current segment file.
11451 		 */
11452 		RequestXLogSwitch(false);
11453 
11454 		XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
11455 		XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size);
11456 
11457 		/* Use the log timezone here, not the session timezone */
11458 		stamp_time = (pg_time_t) time(NULL);
11459 		pg_strftime(strfbuf, sizeof(strfbuf),
11460 					"%Y-%m-%d %H:%M:%S %Z",
11461 					pg_localtime(&stamp_time, log_timezone));
11462 
11463 		/*
11464 		 * Write the backup history file
11465 		 */
11466 		XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
11467 		BackupHistoryFilePath(histfilepath, stoptli, _logSegNo,
11468 							  startpoint, wal_segment_size);
11469 		fp = AllocateFile(histfilepath, "w");
11470 		if (!fp)
11471 			ereport(ERROR,
11472 					(errcode_for_file_access(),
11473 					 errmsg("could not create file \"%s\": %m",
11474 							histfilepath)));
11475 		fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
11476 				(uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
11477 		fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
11478 				(uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
11479 
11480 		/*
11481 		 * Transfer remaining lines including label and start timeline to
11482 		 * history file.
11483 		 */
11484 		fprintf(fp, "%s", remaining);
11485 		fprintf(fp, "STOP TIME: %s\n", strfbuf);
11486 		fprintf(fp, "STOP TIMELINE: %u\n", stoptli);
11487 		if (fflush(fp) || ferror(fp) || FreeFile(fp))
11488 			ereport(ERROR,
11489 					(errcode_for_file_access(),
11490 					 errmsg("could not write file \"%s\": %m",
11491 							histfilepath)));
11492 
11493 		/*
11494 		 * Clean out any no-longer-needed history files.  As a side effect,
11495 		 * this will post a .ready file for the newly created history file,
11496 		 * notifying the archiver that history file may be archived
11497 		 * immediately.
11498 		 */
11499 		CleanupBackupHistory();
11500 	}
11501 
11502 	/*
11503 	 * If archiving is enabled, wait for all the required WAL files to be
11504 	 * archived before returning. If archiving isn't enabled, the required WAL
11505 	 * needs to be transported via streaming replication (hopefully with
11506 	 * wal_keep_size set high enough), or some more exotic mechanism like
11507 	 * polling and copying files from pg_wal with script. We have no knowledge
11508 	 * of those mechanisms, so it's up to the user to ensure that he gets all
11509 	 * the required WAL.
11510 	 *
11511 	 * We wait until both the last WAL file filled during backup and the
11512 	 * history file have been archived, and assume that the alphabetic sorting
11513 	 * property of the WAL files ensures any earlier WAL files are safely
11514 	 * archived as well.
11515 	 *
11516 	 * We wait forever, since archive_command is supposed to work and we
11517 	 * assume the admin wanted his backup to work completely. If you don't
11518 	 * wish to wait, then either waitforarchive should be passed in as false,
11519 	 * or you can set statement_timeout.  Also, some notices are issued to
11520 	 * clue in anyone who might be doing this interactively.
11521 	 */
11522 
11523 	if (waitforarchive &&
11524 		((!backup_started_in_recovery && XLogArchivingActive()) ||
11525 		 (backup_started_in_recovery && XLogArchivingAlways())))
11526 	{
11527 		XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
11528 		XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size);
11529 
11530 		XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
11531 		BackupHistoryFileName(histfilename, stoptli, _logSegNo,
11532 							  startpoint, wal_segment_size);
11533 
11534 		seconds_before_warning = 60;
11535 		waits = 0;
11536 
11537 		while (XLogArchiveIsBusy(lastxlogfilename) ||
11538 			   XLogArchiveIsBusy(histfilename))
11539 		{
11540 			CHECK_FOR_INTERRUPTS();
11541 
11542 			if (!reported_waiting && waits > 5)
11543 			{
11544 				ereport(NOTICE,
11545 						(errmsg("base backup done, waiting for required WAL segments to be archived")));
11546 				reported_waiting = true;
11547 			}
11548 
11549 			pgstat_report_wait_start(WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
11550 			pg_usleep(1000000L);
11551 			pgstat_report_wait_end();
11552 
11553 			if (++waits >= seconds_before_warning)
11554 			{
11555 				seconds_before_warning *= 2;	/* This wraps in >10 years... */
11556 				ereport(WARNING,
11557 						(errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
11558 								waits),
11559 						 errhint("Check that your archive_command is executing properly.  "
11560 								 "You can safely cancel this backup, "
11561 								 "but the database backup will not be usable without all the WAL segments.")));
11562 			}
11563 		}
11564 
11565 		ereport(NOTICE,
11566 				(errmsg("all required WAL segments have been archived")));
11567 	}
11568 	else if (waitforarchive)
11569 		ereport(NOTICE,
11570 				(errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
11571 
11572 	/*
11573 	 * We're done.  As a convenience, return the ending WAL location.
11574 	 */
11575 	if (stoptli_p)
11576 		*stoptli_p = stoptli;
11577 	return stoppoint;
11578 }
11579 
11580 
11581 /*
11582  * do_pg_abort_backup: abort a running backup
11583  *
11584  * This does just the most basic steps of do_pg_stop_backup(), by taking the
11585  * system out of backup mode, thus making it a lot more safe to call from
11586  * an error handler.
11587  *
11588  * The caller can pass 'arg' as 'true' or 'false' to control whether a warning
11589  * is emitted.
11590  *
11591  * NB: This is only for aborting a non-exclusive backup that doesn't write
11592  * backup_label. A backup started with pg_start_backup() needs to be finished
11593  * with pg_stop_backup().
11594  *
11595  * NB: This gets used as a before_shmem_exit handler, hence the odd-looking
11596  * signature.
11597  */
11598 void
do_pg_abort_backup(int code,Datum arg)11599 do_pg_abort_backup(int code, Datum arg)
11600 {
11601 	bool		emit_warning = DatumGetBool(arg);
11602 
11603 	/*
11604 	 * Quick exit if session is not keeping around a non-exclusive backup
11605 	 * already started.
11606 	 */
11607 	if (sessionBackupState != SESSION_BACKUP_NON_EXCLUSIVE)
11608 		return;
11609 
11610 	WALInsertLockAcquireExclusive();
11611 	Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11612 	XLogCtl->Insert.nonExclusiveBackups--;
11613 
11614 	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11615 		XLogCtl->Insert.nonExclusiveBackups == 0)
11616 	{
11617 		XLogCtl->Insert.forcePageWrites = false;
11618 	}
11619 	WALInsertLockRelease();
11620 
11621 	if (emit_warning)
11622 		ereport(WARNING,
11623 				(errmsg("aborting backup due to backend exiting before pg_stop_backup was called")));
11624 }
11625 
11626 /*
11627  * Register a handler that will warn about unterminated backups at end of
11628  * session, unless this has already been done.
11629  */
11630 void
register_persistent_abort_backup_handler(void)11631 register_persistent_abort_backup_handler(void)
11632 {
11633 	static bool already_done = false;
11634 
11635 	if (already_done)
11636 		return;
11637 	before_shmem_exit(do_pg_abort_backup, DatumGetBool(true));
11638 	already_done = true;
11639 }
11640 
11641 /*
11642  * Get latest redo apply position.
11643  *
11644  * Exported to allow WALReceiver to read the pointer directly.
11645  */
11646 XLogRecPtr
GetXLogReplayRecPtr(TimeLineID * replayTLI)11647 GetXLogReplayRecPtr(TimeLineID *replayTLI)
11648 {
11649 	XLogRecPtr	recptr;
11650 	TimeLineID	tli;
11651 
11652 	SpinLockAcquire(&XLogCtl->info_lck);
11653 	recptr = XLogCtl->lastReplayedEndRecPtr;
11654 	tli = XLogCtl->lastReplayedTLI;
11655 	SpinLockRelease(&XLogCtl->info_lck);
11656 
11657 	if (replayTLI)
11658 		*replayTLI = tli;
11659 	return recptr;
11660 }
11661 
11662 /*
11663  * Get latest WAL insert pointer
11664  */
11665 XLogRecPtr
GetXLogInsertRecPtr(void)11666 GetXLogInsertRecPtr(void)
11667 {
11668 	XLogCtlInsert *Insert = &XLogCtl->Insert;
11669 	uint64		current_bytepos;
11670 
11671 	SpinLockAcquire(&Insert->insertpos_lck);
11672 	current_bytepos = Insert->CurrBytePos;
11673 	SpinLockRelease(&Insert->insertpos_lck);
11674 
11675 	return XLogBytePosToRecPtr(current_bytepos);
11676 }
11677 
11678 /*
11679  * Get latest WAL write pointer
11680  */
11681 XLogRecPtr
GetXLogWriteRecPtr(void)11682 GetXLogWriteRecPtr(void)
11683 {
11684 	SpinLockAcquire(&XLogCtl->info_lck);
11685 	LogwrtResult = XLogCtl->LogwrtResult;
11686 	SpinLockRelease(&XLogCtl->info_lck);
11687 
11688 	return LogwrtResult.Write;
11689 }
11690 
11691 /*
11692  * Returns the redo pointer of the last checkpoint or restartpoint. This is
11693  * the oldest point in WAL that we still need, if we have to restart recovery.
11694  */
11695 void
GetOldestRestartPoint(XLogRecPtr * oldrecptr,TimeLineID * oldtli)11696 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
11697 {
11698 	LWLockAcquire(ControlFileLock, LW_SHARED);
11699 	*oldrecptr = ControlFile->checkPointCopy.redo;
11700 	*oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
11701 	LWLockRelease(ControlFileLock);
11702 }
11703 
11704 /*
11705  * read_backup_label: check to see if a backup_label file is present
11706  *
11707  * If we see a backup_label during recovery, we assume that we are recovering
11708  * from a backup dump file, and we therefore roll forward from the checkpoint
11709  * identified by the label file, NOT what pg_control says.  This avoids the
11710  * problem that pg_control might have been archived one or more checkpoints
11711  * later than the start of the dump, and so if we rely on it as the start
11712  * point, we will fail to restore a consistent database state.
11713  *
11714  * Returns true if a backup_label was found (and fills the checkpoint
11715  * location and its REDO location into *checkPointLoc and RedoStartLSN,
11716  * respectively); returns false if not. If this backup_label came from a
11717  * streamed backup, *backupEndRequired is set to true. If this backup_label
11718  * was created during recovery, *backupFromStandby is set to true.
11719  */
11720 static bool
read_backup_label(XLogRecPtr * checkPointLoc,bool * backupEndRequired,bool * backupFromStandby)11721 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
11722 				  bool *backupFromStandby)
11723 {
11724 	char		startxlogfilename[MAXFNAMELEN];
11725 	TimeLineID	tli_from_walseg,
11726 				tli_from_file;
11727 	FILE	   *lfp;
11728 	char		ch;
11729 	char		backuptype[20];
11730 	char		backupfrom[20];
11731 	char		backuplabel[MAXPGPATH];
11732 	char		backuptime[128];
11733 	uint32		hi,
11734 				lo;
11735 
11736 	*backupEndRequired = false;
11737 	*backupFromStandby = false;
11738 
11739 	/*
11740 	 * See if label file is present
11741 	 */
11742 	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
11743 	if (!lfp)
11744 	{
11745 		if (errno != ENOENT)
11746 			ereport(FATAL,
11747 					(errcode_for_file_access(),
11748 					 errmsg("could not read file \"%s\": %m",
11749 							BACKUP_LABEL_FILE)));
11750 		return false;			/* it's not there, all is fine */
11751 	}
11752 
11753 	/*
11754 	 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
11755 	 * is pretty crude, but we are not expecting any variability in the file
11756 	 * format).
11757 	 */
11758 	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
11759 			   &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
11760 		ereport(FATAL,
11761 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11762 				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11763 	RedoStartLSN = ((uint64) hi) << 32 | lo;
11764 	if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
11765 			   &hi, &lo, &ch) != 3 || ch != '\n')
11766 		ereport(FATAL,
11767 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11768 				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11769 	*checkPointLoc = ((uint64) hi) << 32 | lo;
11770 
11771 	/*
11772 	 * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
11773 	 * from an older backup anyway, but since the information on it is not
11774 	 * strictly required, don't error out if it's missing for some reason.
11775 	 */
11776 	if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
11777 	{
11778 		if (strcmp(backuptype, "streamed") == 0)
11779 			*backupEndRequired = true;
11780 	}
11781 
11782 	if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
11783 	{
11784 		if (strcmp(backupfrom, "standby") == 0)
11785 			*backupFromStandby = true;
11786 	}
11787 
11788 	/*
11789 	 * Parse START TIME and LABEL. Those are not mandatory fields for recovery
11790 	 * but checking for their presence is useful for debugging and the next
11791 	 * sanity checks. Cope also with the fact that the result buffers have a
11792 	 * pre-allocated size, hence if the backup_label file has been generated
11793 	 * with strings longer than the maximum assumed here an incorrect parsing
11794 	 * happens. That's fine as only minor consistency checks are done
11795 	 * afterwards.
11796 	 */
11797 	if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
11798 		ereport(DEBUG1,
11799 				(errmsg("backup time %s in file \"%s\"",
11800 						backuptime, BACKUP_LABEL_FILE)));
11801 
11802 	if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
11803 		ereport(DEBUG1,
11804 				(errmsg("backup label %s in file \"%s\"",
11805 						backuplabel, BACKUP_LABEL_FILE)));
11806 
11807 	/*
11808 	 * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
11809 	 * it as a sanity check if present.
11810 	 */
11811 	if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
11812 	{
11813 		if (tli_from_walseg != tli_from_file)
11814 			ereport(FATAL,
11815 					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11816 					 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
11817 					 errdetail("Timeline ID parsed is %u, but expected %u.",
11818 							   tli_from_file, tli_from_walseg)));
11819 
11820 		ereport(DEBUG1,
11821 				(errmsg("backup timeline %u in file \"%s\"",
11822 						tli_from_file, BACKUP_LABEL_FILE)));
11823 	}
11824 
11825 	if (ferror(lfp) || FreeFile(lfp))
11826 		ereport(FATAL,
11827 				(errcode_for_file_access(),
11828 				 errmsg("could not read file \"%s\": %m",
11829 						BACKUP_LABEL_FILE)));
11830 
11831 	return true;
11832 }
11833 
11834 /*
11835  * read_tablespace_map: check to see if a tablespace_map file is present
11836  *
11837  * If we see a tablespace_map file during recovery, we assume that we are
11838  * recovering from a backup dump file, and we therefore need to create symlinks
11839  * as per the information present in tablespace_map file.
11840  *
11841  * Returns true if a tablespace_map file was found (and fills the link
11842  * information for all the tablespace links present in file); returns false
11843  * if not.
11844  */
11845 static bool
read_tablespace_map(List ** tablespaces)11846 read_tablespace_map(List **tablespaces)
11847 {
11848 	tablespaceinfo *ti;
11849 	FILE	   *lfp;
11850 	char		tbsoid[MAXPGPATH];
11851 	char	   *tbslinkpath;
11852 	char		str[MAXPGPATH];
11853 	int			ch,
11854 				prev_ch = -1,
11855 				i = 0,
11856 				n;
11857 
11858 	/*
11859 	 * See if tablespace_map file is present
11860 	 */
11861 	lfp = AllocateFile(TABLESPACE_MAP, "r");
11862 	if (!lfp)
11863 	{
11864 		if (errno != ENOENT)
11865 			ereport(FATAL,
11866 					(errcode_for_file_access(),
11867 					 errmsg("could not read file \"%s\": %m",
11868 							TABLESPACE_MAP)));
11869 		return false;			/* it's not there, all is fine */
11870 	}
11871 
11872 	/*
11873 	 * Read and parse the link name and path lines from tablespace_map file
11874 	 * (this code is pretty crude, but we are not expecting any variability in
11875 	 * the file format).  While taking backup we embed escape character '\\'
11876 	 * before newline in tablespace path, so that during reading of
11877 	 * tablespace_map file, we could distinguish newline in tablespace path
11878 	 * and end of line.  Now while reading tablespace_map file, remove the
11879 	 * escape character that has been added in tablespace path during backup.
11880 	 */
11881 	while ((ch = fgetc(lfp)) != EOF)
11882 	{
11883 		if ((ch == '\n' || ch == '\r') && prev_ch != '\\')
11884 		{
11885 			str[i] = '\0';
11886 			if (sscanf(str, "%s %n", tbsoid, &n) != 1)
11887 				ereport(FATAL,
11888 						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11889 						 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
11890 			tbslinkpath = str + n;
11891 			i = 0;
11892 
11893 			ti = palloc(sizeof(tablespaceinfo));
11894 			ti->oid = pstrdup(tbsoid);
11895 			ti->path = pstrdup(tbslinkpath);
11896 
11897 			*tablespaces = lappend(*tablespaces, ti);
11898 			continue;
11899 		}
11900 		else if ((ch == '\n' || ch == '\r') && prev_ch == '\\')
11901 			str[i - 1] = ch;
11902 		else if (i < sizeof(str) - 1)
11903 			str[i++] = ch;
11904 		prev_ch = ch;
11905 	}
11906 
11907 	if (ferror(lfp) || FreeFile(lfp))
11908 		ereport(FATAL,
11909 				(errcode_for_file_access(),
11910 				 errmsg("could not read file \"%s\": %m",
11911 						TABLESPACE_MAP)));
11912 
11913 	return true;
11914 }
11915 
11916 /*
11917  * Error context callback for errors occurring during rm_redo().
11918  */
11919 static void
rm_redo_error_callback(void * arg)11920 rm_redo_error_callback(void *arg)
11921 {
11922 	XLogReaderState *record = (XLogReaderState *) arg;
11923 	StringInfoData buf;
11924 
11925 	initStringInfo(&buf);
11926 	xlog_outdesc(&buf, record);
11927 
11928 	/* translator: %s is a WAL record description */
11929 	errcontext("WAL redo at %X/%X for %s",
11930 			   (uint32) (record->ReadRecPtr >> 32),
11931 			   (uint32) record->ReadRecPtr,
11932 			   buf.data);
11933 
11934 	pfree(buf.data);
11935 }
11936 
11937 /*
11938  * BackupInProgress: check if online backup mode is active
11939  *
11940  * This is done by checking for existence of the "backup_label" file.
11941  */
11942 bool
BackupInProgress(void)11943 BackupInProgress(void)
11944 {
11945 	struct stat stat_buf;
11946 
11947 	return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
11948 }
11949 
11950 /*
11951  * CancelBackup: rename the "backup_label" and "tablespace_map"
11952  *				 files to cancel backup mode
11953  *
11954  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
11955  * Similarly, if the "tablespace_map" file exists, it will be renamed to
11956  * "tablespace_map.old".
11957  *
11958  * Note that this will render an online backup in progress
11959  * useless. To correctly finish an online backup, pg_stop_backup must be
11960  * called.
11961  */
11962 void
CancelBackup(void)11963 CancelBackup(void)
11964 {
11965 	struct stat stat_buf;
11966 
11967 	/* if the backup_label file is not there, return */
11968 	if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
11969 		return;
11970 
11971 	/* remove leftover file from previously canceled backup if it exists */
11972 	unlink(BACKUP_LABEL_OLD);
11973 
11974 	if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0)
11975 	{
11976 		ereport(WARNING,
11977 				(errcode_for_file_access(),
11978 				 errmsg("online backup mode was not canceled"),
11979 				 errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
11980 						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11981 		return;
11982 	}
11983 
11984 	/* if the tablespace_map file is not there, return */
11985 	if (stat(TABLESPACE_MAP, &stat_buf) < 0)
11986 	{
11987 		ereport(LOG,
11988 				(errmsg("online backup mode canceled"),
11989 				 errdetail("File \"%s\" was renamed to \"%s\".",
11990 						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11991 		return;
11992 	}
11993 
11994 	/* remove leftover file from previously canceled backup if it exists */
11995 	unlink(TABLESPACE_MAP_OLD);
11996 
11997 	if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
11998 	{
11999 		ereport(LOG,
12000 				(errmsg("online backup mode canceled"),
12001 				 errdetail("Files \"%s\" and \"%s\" were renamed to "
12002 						   "\"%s\" and \"%s\", respectively.",
12003 						   BACKUP_LABEL_FILE, TABLESPACE_MAP,
12004 						   BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
12005 	}
12006 	else
12007 	{
12008 		ereport(WARNING,
12009 				(errcode_for_file_access(),
12010 				 errmsg("online backup mode canceled"),
12011 				 errdetail("File \"%s\" was renamed to \"%s\", but "
12012 						   "file \"%s\" could not be renamed to \"%s\": %m.",
12013 						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
12014 						   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
12015 	}
12016 }
12017 
12018 /*
12019  * Read the XLOG page containing RecPtr into readBuf (if not read already).
12020  * Returns number of bytes read, if the page is read successfully, or -1
12021  * in case of errors.  When errors occur, they are ereport'ed, but only
12022  * if they have not been previously reported.
12023  *
12024  * This is responsible for restoring files from archive as needed, as well
12025  * as for waiting for the requested WAL record to arrive in standby mode.
12026  *
12027  * 'emode' specifies the log level used for reporting "file not found" or
12028  * "end of WAL" situations in archive recovery, or in standby mode when a
12029  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
12030  * false in those situations, on higher log levels the ereport() won't
12031  * return.
12032  *
12033  * In standby mode, if after a successful return of XLogPageRead() the
12034  * caller finds the record it's interested in to be broken, it should
12035  * ereport the error with the level determined by
12036  * emode_for_corrupt_record(), and then set lastSourceFailed
12037  * and call XLogPageRead() again with the same arguments. This lets
12038  * XLogPageRead() to try fetching the record from another source, or to
12039  * sleep and retry.
12040  */
12041 static int
XLogPageRead(XLogReaderState * xlogreader,XLogRecPtr targetPagePtr,int reqLen,XLogRecPtr targetRecPtr,char * readBuf)12042 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
12043 			 XLogRecPtr targetRecPtr, char *readBuf)
12044 {
12045 	XLogPageReadPrivate *private =
12046 	(XLogPageReadPrivate *) xlogreader->private_data;
12047 	int			emode = private->emode;
12048 	uint32		targetPageOff;
12049 	XLogSegNo	targetSegNo PG_USED_FOR_ASSERTS_ONLY;
12050 	int			r;
12051 
12052 	XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
12053 	targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
12054 
12055 	/*
12056 	 * See if we need to switch to a new segment because the requested record
12057 	 * is not in the currently open one.
12058 	 */
12059 	if (readFile >= 0 &&
12060 		!XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
12061 	{
12062 		/*
12063 		 * Request a restartpoint if we've replayed too much xlog since the
12064 		 * last one.
12065 		 */
12066 		if (bgwriterLaunched)
12067 		{
12068 			if (XLogCheckpointNeeded(readSegNo))
12069 			{
12070 				(void) GetRedoRecPtr();
12071 				if (XLogCheckpointNeeded(readSegNo))
12072 					RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
12073 			}
12074 		}
12075 
12076 		close(readFile);
12077 		readFile = -1;
12078 		readSource = XLOG_FROM_ANY;
12079 	}
12080 
12081 	XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
12082 
12083 retry:
12084 	/* See if we need to retrieve more data */
12085 	if (readFile < 0 ||
12086 		(readSource == XLOG_FROM_STREAM &&
12087 		 flushedUpto < targetPagePtr + reqLen))
12088 	{
12089 		if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
12090 										 private->randAccess,
12091 										 private->fetching_ckpt,
12092 										 targetRecPtr))
12093 		{
12094 			if (readFile >= 0)
12095 				close(readFile);
12096 			readFile = -1;
12097 			readLen = 0;
12098 			readSource = XLOG_FROM_ANY;
12099 
12100 			return -1;
12101 		}
12102 	}
12103 
12104 	/*
12105 	 * At this point, we have the right segment open and if we're streaming we
12106 	 * know the requested record is in it.
12107 	 */
12108 	Assert(readFile != -1);
12109 
12110 	/*
12111 	 * If the current segment is being streamed from master, calculate how
12112 	 * much of the current page we have received already. We know the
12113 	 * requested record has been received, but this is for the benefit of
12114 	 * future calls, to allow quick exit at the top of this function.
12115 	 */
12116 	if (readSource == XLOG_FROM_STREAM)
12117 	{
12118 		if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
12119 			readLen = XLOG_BLCKSZ;
12120 		else
12121 			readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
12122 				targetPageOff;
12123 	}
12124 	else
12125 		readLen = XLOG_BLCKSZ;
12126 
12127 	/* Read the requested page */
12128 	readOff = targetPageOff;
12129 
12130 	pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
12131 	r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
12132 	if (r != XLOG_BLCKSZ)
12133 	{
12134 		char		fname[MAXFNAMELEN];
12135 		int			save_errno = errno;
12136 
12137 		pgstat_report_wait_end();
12138 		XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
12139 		if (r < 0)
12140 		{
12141 			errno = save_errno;
12142 			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
12143 					(errcode_for_file_access(),
12144 					 errmsg("could not read from log segment %s, offset %u: %m",
12145 							fname, readOff)));
12146 		}
12147 		else
12148 			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
12149 					(errcode(ERRCODE_DATA_CORRUPTED),
12150 					 errmsg("could not read from log segment %s, offset %u: read %d of %zu",
12151 							fname, readOff, r, (Size) XLOG_BLCKSZ)));
12152 		goto next_record_is_invalid;
12153 	}
12154 	pgstat_report_wait_end();
12155 
12156 	Assert(targetSegNo == readSegNo);
12157 	Assert(targetPageOff == readOff);
12158 	Assert(reqLen <= readLen);
12159 
12160 	xlogreader->seg.ws_tli = curFileTLI;
12161 
12162 	/*
12163 	 * Check the page header immediately, so that we can retry immediately if
12164 	 * it's not valid. This may seem unnecessary, because XLogReadRecord()
12165 	 * validates the page header anyway, and would propagate the failure up to
12166 	 * ReadRecord(), which would retry. However, there's a corner case with
12167 	 * continuation records, if a record is split across two pages such that
12168 	 * we would need to read the two pages from different sources. For
12169 	 * example, imagine a scenario where a streaming replica is started up,
12170 	 * and replay reaches a record that's split across two WAL segments. The
12171 	 * first page is only available locally, in pg_wal, because it's already
12172 	 * been recycled in the master. The second page, however, is not present
12173 	 * in pg_wal, and we should stream it from the master. There is a recycled
12174 	 * WAL segment present in pg_wal, with garbage contents, however. We would
12175 	 * read the first page from the local WAL segment, but when reading the
12176 	 * second page, we would read the bogus, recycled, WAL segment. If we
12177 	 * didn't catch that case here, we would never recover, because
12178 	 * ReadRecord() would retry reading the whole record from the beginning.
12179 	 *
12180 	 * Of course, this only catches errors in the page header, which is what
12181 	 * happens in the case of a recycled WAL segment. Other kinds of errors or
12182 	 * corruption still has the same problem. But this at least fixes the
12183 	 * common case, which can happen as part of normal operation.
12184 	 *
12185 	 * Validating the page header is cheap enough that doing it twice
12186 	 * shouldn't be a big deal from a performance point of view.
12187 	 */
12188 	if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
12189 	{
12190 		/* reset any error XLogReaderValidatePageHeader() might have set */
12191 		xlogreader->errormsg_buf[0] = '\0';
12192 		goto next_record_is_invalid;
12193 	}
12194 
12195 	return readLen;
12196 
12197 next_record_is_invalid:
12198 	lastSourceFailed = true;
12199 
12200 	if (readFile >= 0)
12201 		close(readFile);
12202 	readFile = -1;
12203 	readLen = 0;
12204 	readSource = XLOG_FROM_ANY;
12205 
12206 	/* In standby-mode, keep trying */
12207 	if (StandbyMode)
12208 		goto retry;
12209 	else
12210 		return -1;
12211 }
12212 
12213 /*
12214  * Open the WAL segment containing WAL location 'RecPtr'.
12215  *
12216  * The segment can be fetched via restore_command, or via walreceiver having
12217  * streamed the record, or it can already be present in pg_wal. Checking
12218  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
12219  * too, in case someone copies a new segment directly to pg_wal. That is not
12220  * documented or recommended, though.
12221  *
12222  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
12223  * prepare to read WAL starting from RedoStartLSN after this.
12224  *
12225  * 'RecPtr' might not point to the beginning of the record we're interested
12226  * in, it might also point to the page or segment header. In that case,
12227  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
12228  * used to decide which timeline to stream the requested WAL from.
12229  *
12230  * If the record is not immediately available, the function returns false
12231  * if we're not in standby mode. In standby mode, waits for it to become
12232  * available.
12233  *
12234  * When the requested record becomes available, the function opens the file
12235  * containing it (if not open already), and returns true. When end of standby
12236  * mode is triggered by the user, and there is no more WAL available, returns
12237  * false.
12238  */
12239 static bool
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,bool randAccess,bool fetching_ckpt,XLogRecPtr tliRecPtr)12240 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
12241 							bool fetching_ckpt, XLogRecPtr tliRecPtr)
12242 {
12243 	static TimestampTz last_fail_time = 0;
12244 	TimestampTz now;
12245 	bool		streaming_reply_sent = false;
12246 
12247 	/*-------
12248 	 * Standby mode is implemented by a state machine:
12249 	 *
12250 	 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
12251 	 *	  pg_wal (XLOG_FROM_PG_WAL)
12252 	 * 2. Check trigger file
12253 	 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
12254 	 * 4. Rescan timelines
12255 	 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
12256 	 *
12257 	 * Failure to read from the current source advances the state machine to
12258 	 * the next state.
12259 	 *
12260 	 * 'currentSource' indicates the current state. There are no currentSource
12261 	 * values for "check trigger", "rescan timelines", and "sleep" states,
12262 	 * those actions are taken when reading from the previous source fails, as
12263 	 * part of advancing to the next state.
12264 	 *
12265 	 * If standby mode is turned off while reading WAL from stream, we move
12266 	 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
12267 	 * the files (which would be required at end of recovery, e.g., timeline
12268 	 * history file) from archive or pg_wal. We don't need to kill WAL receiver
12269 	 * here because it's already stopped when standby mode is turned off at
12270 	 * the end of recovery.
12271 	 *-------
12272 	 */
12273 	if (!InArchiveRecovery)
12274 		currentSource = XLOG_FROM_PG_WAL;
12275 	else if (currentSource == XLOG_FROM_ANY ||
12276 			 (!StandbyMode && currentSource == XLOG_FROM_STREAM))
12277 	{
12278 		lastSourceFailed = false;
12279 		currentSource = XLOG_FROM_ARCHIVE;
12280 	}
12281 
12282 	for (;;)
12283 	{
12284 		XLogSource	oldSource = currentSource;
12285 		bool		startWalReceiver = false;
12286 
12287 		/*
12288 		 * First check if we failed to read from the current source, and
12289 		 * advance the state machine if so. The failure to read might've
12290 		 * happened outside this function, e.g when a CRC check fails on a
12291 		 * record, or within this loop.
12292 		 */
12293 		if (lastSourceFailed)
12294 		{
12295 			switch (currentSource)
12296 			{
12297 				case XLOG_FROM_ARCHIVE:
12298 				case XLOG_FROM_PG_WAL:
12299 
12300 					/*
12301 					 * Check to see if the trigger file exists. Note that we
12302 					 * do this only after failure, so when you create the
12303 					 * trigger file, we still finish replaying as much as we
12304 					 * can from archive and pg_wal before failover.
12305 					 */
12306 					if (StandbyMode && CheckForStandbyTrigger())
12307 					{
12308 						ShutdownWalRcv();
12309 						return false;
12310 					}
12311 
12312 					/*
12313 					 * Not in standby mode, and we've now tried the archive
12314 					 * and pg_wal.
12315 					 */
12316 					if (!StandbyMode)
12317 						return false;
12318 
12319 					/*
12320 					 * Move to XLOG_FROM_STREAM state, and set to start a
12321 					 * walreceiver if necessary.
12322 					 */
12323 					currentSource = XLOG_FROM_STREAM;
12324 					startWalReceiver = true;
12325 					break;
12326 
12327 				case XLOG_FROM_STREAM:
12328 
12329 					/*
12330 					 * Failure while streaming. Most likely, we got here
12331 					 * because streaming replication was terminated, or
12332 					 * promotion was triggered. But we also get here if we
12333 					 * find an invalid record in the WAL streamed from master,
12334 					 * in which case something is seriously wrong. There's
12335 					 * little chance that the problem will just go away, but
12336 					 * PANIC is not good for availability either, especially
12337 					 * in hot standby mode. So, we treat that the same as
12338 					 * disconnection, and retry from archive/pg_wal again. The
12339 					 * WAL in the archive should be identical to what was
12340 					 * streamed, so it's unlikely that it helps, but one can
12341 					 * hope...
12342 					 */
12343 
12344 					/*
12345 					 * We should be able to move to XLOG_FROM_STREAM only in
12346 					 * standby mode.
12347 					 */
12348 					Assert(StandbyMode);
12349 
12350 					/*
12351 					 * Before we leave XLOG_FROM_STREAM state, make sure that
12352 					 * walreceiver is not active, so that it won't overwrite
12353 					 * WAL that we restore from archive.
12354 					 */
12355 					if (WalRcvStreaming())
12356 						ShutdownWalRcv();
12357 
12358 					/*
12359 					 * Before we sleep, re-scan for possible new timelines if
12360 					 * we were requested to recover to the latest timeline.
12361 					 */
12362 					if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
12363 					{
12364 						if (rescanLatestTimeLine())
12365 						{
12366 							currentSource = XLOG_FROM_ARCHIVE;
12367 							break;
12368 						}
12369 					}
12370 
12371 					/*
12372 					 * XLOG_FROM_STREAM is the last state in our state
12373 					 * machine, so we've exhausted all the options for
12374 					 * obtaining the requested WAL. We're going to loop back
12375 					 * and retry from the archive, but if it hasn't been long
12376 					 * since last attempt, sleep wal_retrieve_retry_interval
12377 					 * milliseconds to avoid busy-waiting.
12378 					 */
12379 					now = GetCurrentTimestamp();
12380 					if (!TimestampDifferenceExceeds(last_fail_time, now,
12381 													wal_retrieve_retry_interval))
12382 					{
12383 						long		wait_time;
12384 
12385 						wait_time = wal_retrieve_retry_interval -
12386 							TimestampDifferenceMilliseconds(last_fail_time, now);
12387 
12388 						(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
12389 										 WL_LATCH_SET | WL_TIMEOUT |
12390 										 WL_EXIT_ON_PM_DEATH,
12391 										 wait_time,
12392 										 WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
12393 						ResetLatch(&XLogCtl->recoveryWakeupLatch);
12394 						now = GetCurrentTimestamp();
12395 
12396 						/* Handle interrupt signals of startup process */
12397 						HandleStartupProcInterrupts();
12398 					}
12399 					last_fail_time = now;
12400 					currentSource = XLOG_FROM_ARCHIVE;
12401 					break;
12402 
12403 				default:
12404 					elog(ERROR, "unexpected WAL source %d", currentSource);
12405 			}
12406 		}
12407 		else if (currentSource == XLOG_FROM_PG_WAL)
12408 		{
12409 			/*
12410 			 * We just successfully read a file in pg_wal. We prefer files in
12411 			 * the archive over ones in pg_wal, so try the next file again
12412 			 * from the archive first.
12413 			 */
12414 			if (InArchiveRecovery)
12415 				currentSource = XLOG_FROM_ARCHIVE;
12416 		}
12417 
12418 		if (currentSource != oldSource)
12419 			elog(DEBUG2, "switched WAL source from %s to %s after %s",
12420 				 xlogSourceNames[oldSource], xlogSourceNames[currentSource],
12421 				 lastSourceFailed ? "failure" : "success");
12422 
12423 		/*
12424 		 * We've now handled possible failure. Try to read from the chosen
12425 		 * source.
12426 		 */
12427 		lastSourceFailed = false;
12428 
12429 		switch (currentSource)
12430 		{
12431 			case XLOG_FROM_ARCHIVE:
12432 			case XLOG_FROM_PG_WAL:
12433 
12434 				/*
12435 				 * WAL receiver must not be running when reading WAL from
12436 				 * archive or pg_wal.
12437 				 */
12438 				Assert(!WalRcvStreaming());
12439 
12440 				/* Close any old file we might have open. */
12441 				if (readFile >= 0)
12442 				{
12443 					close(readFile);
12444 					readFile = -1;
12445 				}
12446 				/* Reset curFileTLI if random fetch. */
12447 				if (randAccess)
12448 					curFileTLI = 0;
12449 
12450 				/*
12451 				 * Try to restore the file from archive, or read an existing
12452 				 * file from pg_wal.
12453 				 */
12454 				readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
12455 											  currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
12456 											  currentSource);
12457 				if (readFile >= 0)
12458 					return true;	/* success! */
12459 
12460 				/*
12461 				 * Nope, not found in archive or pg_wal.
12462 				 */
12463 				lastSourceFailed = true;
12464 				break;
12465 
12466 			case XLOG_FROM_STREAM:
12467 				{
12468 					bool		havedata;
12469 
12470 					/*
12471 					 * We should be able to move to XLOG_FROM_STREAM only in
12472 					 * standby mode.
12473 					 */
12474 					Assert(StandbyMode);
12475 
12476 					/*
12477 					 * First, shutdown walreceiver if its restart has been
12478 					 * requested -- but no point if we're already slated for
12479 					 * starting it.
12480 					 */
12481 					if (pendingWalRcvRestart && !startWalReceiver)
12482 					{
12483 						ShutdownWalRcv();
12484 
12485 						/*
12486 						 * Re-scan for possible new timelines if we were
12487 						 * requested to recover to the latest timeline.
12488 						 */
12489 						if (recoveryTargetTimeLineGoal ==
12490 							RECOVERY_TARGET_TIMELINE_LATEST)
12491 							rescanLatestTimeLine();
12492 
12493 						startWalReceiver = true;
12494 					}
12495 					pendingWalRcvRestart = false;
12496 
12497 					/*
12498 					 * Launch walreceiver if needed.
12499 					 *
12500 					 * If fetching_ckpt is true, RecPtr points to the initial
12501 					 * checkpoint location. In that case, we use RedoStartLSN
12502 					 * as the streaming start position instead of RecPtr, so
12503 					 * that when we later jump backwards to start redo at
12504 					 * RedoStartLSN, we will have the logs streamed already.
12505 					 */
12506 					if (startWalReceiver &&
12507 						PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
12508 					{
12509 						XLogRecPtr	ptr;
12510 						TimeLineID	tli;
12511 
12512 						if (fetching_ckpt)
12513 						{
12514 							ptr = RedoStartLSN;
12515 							tli = ControlFile->checkPointCopy.ThisTimeLineID;
12516 						}
12517 						else
12518 						{
12519 							ptr = RecPtr;
12520 
12521 							/*
12522 							 * Use the record begin position to determine the
12523 							 * TLI, rather than the position we're reading.
12524 							 */
12525 							tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
12526 
12527 							if (curFileTLI > 0 && tli < curFileTLI)
12528 								elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
12529 									 (uint32) (tliRecPtr >> 32),
12530 									 (uint32) tliRecPtr,
12531 									 tli, curFileTLI);
12532 						}
12533 						curFileTLI = tli;
12534 						RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
12535 											 PrimarySlotName,
12536 											 wal_receiver_create_temp_slot);
12537 						flushedUpto = 0;
12538 					}
12539 
12540 					/*
12541 					 * Check if WAL receiver is active or wait to start up.
12542 					 */
12543 					if (!WalRcvStreaming())
12544 					{
12545 						lastSourceFailed = true;
12546 						break;
12547 					}
12548 
12549 					/*
12550 					 * Walreceiver is active, so see if new data has arrived.
12551 					 *
12552 					 * We only advance XLogReceiptTime when we obtain fresh
12553 					 * WAL from walreceiver and observe that we had already
12554 					 * processed everything before the most recent "chunk"
12555 					 * that it flushed to disk.  In steady state where we are
12556 					 * keeping up with the incoming data, XLogReceiptTime will
12557 					 * be updated on each cycle. When we are behind,
12558 					 * XLogReceiptTime will not advance, so the grace time
12559 					 * allotted to conflicting queries will decrease.
12560 					 */
12561 					if (RecPtr < flushedUpto)
12562 						havedata = true;
12563 					else
12564 					{
12565 						XLogRecPtr	latestChunkStart;
12566 
12567 						flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
12568 						if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
12569 						{
12570 							havedata = true;
12571 							if (latestChunkStart <= RecPtr)
12572 							{
12573 								XLogReceiptTime = GetCurrentTimestamp();
12574 								SetCurrentChunkStartTime(XLogReceiptTime);
12575 							}
12576 						}
12577 						else
12578 							havedata = false;
12579 					}
12580 					if (havedata)
12581 					{
12582 						/*
12583 						 * Great, streamed far enough.  Open the file if it's
12584 						 * not open already.  Also read the timeline history
12585 						 * file if we haven't initialized timeline history
12586 						 * yet; it should be streamed over and present in
12587 						 * pg_wal by now.  Use XLOG_FROM_STREAM so that source
12588 						 * info is set correctly and XLogReceiptTime isn't
12589 						 * changed.
12590 						 *
12591 						 * NB: We must set readTimeLineHistory based on
12592 						 * recoveryTargetTLI, not receiveTLI. Normally they'll
12593 						 * be the same, but if recovery_target_timeline is
12594 						 * 'latest' and archiving is configured, then it's
12595 						 * possible that we managed to retrieve one or more
12596 						 * new timeline history files from the archive,
12597 						 * updating recoveryTargetTLI.
12598 						 */
12599 						if (readFile < 0)
12600 						{
12601 							if (!expectedTLEs)
12602 								expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
12603 							readFile = XLogFileRead(readSegNo, PANIC,
12604 													receiveTLI,
12605 													XLOG_FROM_STREAM, false);
12606 							Assert(readFile >= 0);
12607 						}
12608 						else
12609 						{
12610 							/* just make sure source info is correct... */
12611 							readSource = XLOG_FROM_STREAM;
12612 							XLogReceiptSource = XLOG_FROM_STREAM;
12613 							return true;
12614 						}
12615 						break;
12616 					}
12617 
12618 					/*
12619 					 * Data not here yet. Check for trigger, then wait for
12620 					 * walreceiver to wake us up when new WAL arrives.
12621 					 */
12622 					if (CheckForStandbyTrigger())
12623 					{
12624 						/*
12625 						 * Note that we don't "return false" immediately here.
12626 						 * After being triggered, we still want to replay all
12627 						 * the WAL that was already streamed. It's in pg_wal
12628 						 * now, so we just treat this as a failure, and the
12629 						 * state machine will move on to replay the streamed
12630 						 * WAL from pg_wal, and then recheck the trigger and
12631 						 * exit replay.
12632 						 */
12633 						lastSourceFailed = true;
12634 						break;
12635 					}
12636 
12637 					/*
12638 					 * Since we have replayed everything we have received so
12639 					 * far and are about to start waiting for more WAL, let's
12640 					 * tell the upstream server our replay location now so
12641 					 * that pg_stat_replication doesn't show stale
12642 					 * information.
12643 					 */
12644 					if (!streaming_reply_sent)
12645 					{
12646 						WalRcvForceReply();
12647 						streaming_reply_sent = true;
12648 					}
12649 
12650 					/*
12651 					 * Wait for more WAL to arrive. Time out after 5 seconds
12652 					 * to react to a trigger file promptly and to check if the
12653 					 * WAL receiver is still active.
12654 					 */
12655 					(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
12656 									 WL_LATCH_SET | WL_TIMEOUT |
12657 									 WL_EXIT_ON_PM_DEATH,
12658 									 5000L, WAIT_EVENT_RECOVERY_WAL_STREAM);
12659 					ResetLatch(&XLogCtl->recoveryWakeupLatch);
12660 					break;
12661 				}
12662 
12663 			default:
12664 				elog(ERROR, "unexpected WAL source %d", currentSource);
12665 		}
12666 
12667 		/*
12668 		 * This possibly-long loop needs to handle interrupts of startup
12669 		 * process.
12670 		 */
12671 		HandleStartupProcInterrupts();
12672 	}
12673 
12674 	return false;				/* not reached */
12675 }
12676 
12677 /*
12678  * Set flag to signal the walreceiver to restart.  (The startup process calls
12679  * this on noticing a relevant configuration change.)
12680  */
12681 void
StartupRequestWalReceiverRestart(void)12682 StartupRequestWalReceiverRestart(void)
12683 {
12684 	if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
12685 	{
12686 		ereport(LOG,
12687 				(errmsg("WAL receiver process shutdown requested")));
12688 
12689 		pendingWalRcvRestart = true;
12690 	}
12691 }
12692 
12693 /*
12694  * Determine what log level should be used to report a corrupt WAL record
12695  * in the current WAL page, previously read by XLogPageRead().
12696  *
12697  * 'emode' is the error mode that would be used to report a file-not-found
12698  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
12699  * we're retrying the exact same record that we've tried previously, only
12700  * complain the first time to keep the noise down.  However, we only do when
12701  * reading from pg_wal, because we don't expect any invalid records in archive
12702  * or in records streamed from master. Files in the archive should be complete,
12703  * and we should never hit the end of WAL because we stop and wait for more WAL
12704  * to arrive before replaying it.
12705  *
12706  * NOTE: This function remembers the RecPtr value it was last called with,
12707  * to suppress repeated messages about the same record. Only call this when
12708  * you are about to ereport(), or you might cause a later message to be
12709  * erroneously suppressed.
12710  */
12711 static int
emode_for_corrupt_record(int emode,XLogRecPtr RecPtr)12712 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
12713 {
12714 	static XLogRecPtr lastComplaint = 0;
12715 
12716 	if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
12717 	{
12718 		if (RecPtr == lastComplaint)
12719 			emode = DEBUG1;
12720 		else
12721 			lastComplaint = RecPtr;
12722 	}
12723 	return emode;
12724 }
12725 
12726 /*
12727  * Has a standby promotion already been triggered?
12728  *
12729  * Unlike CheckForStandbyTrigger(), this works in any process
12730  * that's connected to shared memory.
12731  */
12732 bool
PromoteIsTriggered(void)12733 PromoteIsTriggered(void)
12734 {
12735 	/*
12736 	 * We check shared state each time only until a standby promotion is
12737 	 * triggered. We can't trigger a promotion again, so there's no need to
12738 	 * keep checking after the shared variable has once been seen true.
12739 	 */
12740 	if (LocalPromoteIsTriggered)
12741 		return true;
12742 
12743 	SpinLockAcquire(&XLogCtl->info_lck);
12744 	LocalPromoteIsTriggered = XLogCtl->SharedPromoteIsTriggered;
12745 	SpinLockRelease(&XLogCtl->info_lck);
12746 
12747 	return LocalPromoteIsTriggered;
12748 }
12749 
/*
 * Mark a standby promotion as triggered, both in shared memory (so that
 * PromoteIsTriggered() works in any backend) and in this process's local
 * cache.
 */
static void
SetPromoteIsTriggered(void)
{
    /* Publish the flag under the info spinlock for other processes. */
    SpinLockAcquire(&XLogCtl->info_lck);
    XLogCtl->SharedPromoteIsTriggered = true;
    SpinLockRelease(&XLogCtl->info_lck);

    /* Cache locally so we never need the spinlock again in this process. */
    LocalPromoteIsTriggered = true;
}
12759 
12760 /*
12761  * Check to see whether the user-specified trigger file exists and whether a
12762  * promote request has arrived.  If either condition holds, return true.
12763  */
12764 static bool
CheckForStandbyTrigger(void)12765 CheckForStandbyTrigger(void)
12766 {
12767 	struct stat stat_buf;
12768 
12769 	if (LocalPromoteIsTriggered)
12770 		return true;
12771 
12772 	if (IsPromoteSignaled())
12773 	{
12774 		/*
12775 		 * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
12776 		 * signal handler. It now leaves the file in place and lets the
12777 		 * Startup process do the unlink. This allows Startup to know whether
12778 		 * it should create a full checkpoint before starting up (fallback
12779 		 * mode). Fast promotion takes precedence.
12780 		 */
12781 		if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12782 		{
12783 			unlink(PROMOTE_SIGNAL_FILE);
12784 			unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12785 			fast_promote = true;
12786 		}
12787 		else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12788 		{
12789 			unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12790 			fast_promote = false;
12791 		}
12792 
12793 		ereport(LOG, (errmsg("received promote request")));
12794 
12795 		ResetPromoteSignaled();
12796 		SetPromoteIsTriggered();
12797 		return true;
12798 	}
12799 
12800 	if (PromoteTriggerFile == NULL || strcmp(PromoteTriggerFile, "") == 0)
12801 		return false;
12802 
12803 	if (stat(PromoteTriggerFile, &stat_buf) == 0)
12804 	{
12805 		ereport(LOG,
12806 				(errmsg("promote trigger file found: %s", PromoteTriggerFile)));
12807 		unlink(PromoteTriggerFile);
12808 		SetPromoteIsTriggered();
12809 		fast_promote = true;
12810 		return true;
12811 	}
12812 	else if (errno != ENOENT)
12813 		ereport(ERROR,
12814 				(errcode_for_file_access(),
12815 				 errmsg("could not stat promote trigger file \"%s\": %m",
12816 						PromoteTriggerFile)));
12817 
12818 	return false;
12819 }
12820 
12821 /*
12822  * Remove the files signaling a standby promotion request.
12823  */
12824 void
RemovePromoteSignalFiles(void)12825 RemovePromoteSignalFiles(void)
12826 {
12827 	unlink(PROMOTE_SIGNAL_FILE);
12828 	unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12829 }
12830 
12831 /*
12832  * Check to see if a promote request has arrived. Should be
12833  * called by postmaster after receiving SIGUSR1.
12834  */
12835 bool
CheckPromoteSignal(void)12836 CheckPromoteSignal(void)
12837 {
12838 	struct stat stat_buf;
12839 
12840 	if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
12841 		stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12842 		return true;
12843 
12844 	return false;
12845 }
12846 
/*
 * Wake up startup process to replay newly arrived WAL, or to notice that
 * failover has been requested.
 *
 * Sets the shared recoveryWakeupLatch, which the startup process waits on
 * in its main recovery loop.
 */
void
WakeupRecovery(void)
{
	SetLatch(&XLogCtl->recoveryWakeupLatch);
}
12856 
/*
 * Update the WalWriterSleeping flag.
 *
 * Stores the caller-supplied value in shared memory under the info_lck
 * spinlock so other processes see a consistent value.
 */
void
SetWalWriterSleeping(bool sleeping)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->WalWriterSleeping = sleeping;
	SpinLockRelease(&XLogCtl->info_lck);
}
12867 
/*
 * Schedule a walreceiver wakeup in the main recovery loop.
 *
 * Only sets a process-local flag; the recovery loop acts on it later.
 */
void
XLogRequestWalReceiverReply(void)
{
	doRequestWalReceiverReply = true;
}
12876